apache · ulysses-you · Mar 24, 2023 · Mar 24, 2023
diff --git a/docs/extensions/engines/spark/rules.md b/docs/extensions/engines/spark/rules.md
diff --git a/...bi-extension-spark-3-3/src/main/scala/org/apache/kyuubi/sql/KyuubiSparkSQLExtension.scala b/...bi-extension-spark-3-3/src/main/scala/org/apache/kyuubi/sql/KyuubiSparkSQLExtension.scala
@@ -17,6 +17,7 @@
 
 package org.apache.kyuubi.sql
 
+import org.apache.spark.FinalStageResourceManager
 import org.apache.spark.sql.SparkSessionExtensions
 
 import org.apache.kyuubi.sql.watchdog.{ForcedMaxOutputRowsRule, MaxPartitionStrategy}
@@ -39,5 +40,7 @@ class KyuubiSparkSQLExtension extends (SparkSessionExtensions => Unit) {
     // watchdog extension
     extensions.injectOptimizerRule(ForcedMaxOutputRowsRule)
     extensions.injectPlannerStrategy(MaxPartitionStrategy)
+
+    extensions.injectQueryStagePrepRule(FinalStageResourceManager)
   }
 }
diff --git a/...yuubi-extension-spark-3-3/src/main/scala/org/apache/spark/CustomResourceProfileExec.scala b/...yuubi-extension-spark-3-3/src/main/scala/org/apache/spark/CustomResourceProfileExec.scala
@@ -0,0 +1,105 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark
+
+import org.apache.spark.network.util.{ByteUnit, JavaUtils}
+import org.apache.spark.rdd.RDD
+import org.apache.spark.resource.{ExecutorResourceRequests, ResourceProfileBuilder}
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.catalyst.expressions.Attribute
+import org.apache.spark.sql.execution.{SparkPlan, SQLExecution, UnaryExecNode}
+import org.apache.spark.sql.execution.metric.{SQLMetric, SQLMetrics}
+import org.apache.spark.sql.vectorized.ColumnarBatch
+
+import org.apache.kyuubi.sql.KyuubiSQLConf._
+
+/**
+ * A pass-through node that wraps the final executed plan and attaches a custom executor
+ * resource profile to the RDD produced by its child.
+ * It assumes that the produced RDD creates the `ResultStage` in `DAGScheduler`, so that
+ * resources are isolated between the previous stages and the final stage.
+ *
+ * Note that Spark does not support configuring `minExecutors` per resource profile, which
+ * means `minExecutors` executors would be retained for each resource profile.
+ * So it is suggested to set `spark.dynamicAllocation.minExecutors` to 0 when enabling
+ * this feature.
+ *
+ * @param child the plan whose output RDD receives the resource profile
+ */
+case class CustomResourceProfileExec(child: SparkPlan) extends UnaryExecNode {
+  override def output: Seq[Attribute] = child.output
+
+  override def supportsColumnar: Boolean = child.supportsColumnar
+
+  override def supportsRowBased: Boolean = child.supportsRowBased
+
+  // The final write stage settings from the SQL conf win; when one is not set, fall back to
+  // the corresponding executor setting of the SparkConf.
+  private val finalStageCores = conf.getConf(FINAL_WRITE_STAGE_EXECUTOR_CORES)
+    .getOrElse(sparkContext.getConf.getInt("spark.executor.cores", 1))
+  private val finalStageMemory = conf.getConf(FINAL_WRITE_STAGE_EXECUTOR_MEMORY)
+    .getOrElse(sparkContext.getConf.get("spark.executor.memory", "2G"))
+  private val finalStageMemoryOverhead = conf.getConf(FINAL_WRITE_STAGE_EXECUTOR_MEMORY_OVERHEAD)
+    .getOrElse(sparkContext.getConf.get("spark.executor.memoryOverhead", "1G"))
+  // Off-heap memory has no fallback: it is only requested (and reported) when configured.
+  private val finalStageOffHeapMemory = conf.getConf(FINAL_WRITE_STAGE_EXECUTOR_OFF_HEAP_MEMORY)
+
+  /** Metrics that expose the requested executor resources of this node. */
+  override lazy val metrics: Map[String, SQLMetric] = {
+    val required = Map(
+      "executorCores" -> SQLMetrics.createMetric(sparkContext, "executor cores"),
+      "executorMemory" -> SQLMetrics.createMetric(sparkContext, "executor memory (MiB)"),
+      "executorMemoryOverhead" -> SQLMetrics.createMetric(
+        sparkContext,
+        "executor memory overhead (MiB)"))
+    val offHeap = finalStageOffHeapMemory.map { _ =>
+      "executorOffHeapMemory" ->
+        SQLMetrics.createMetric(sparkContext, "executor off heap memory (MiB)")
+    }
+    offHeap.fold(required)(entry => required + entry)
+  }
+
+  /** Converts a byte string such as "2G" to its size in MiB. */
+  private def toMiB(size: String): Long = JavaUtils.byteStringAs(size, ByteUnit.MiB)
+
+  /**
+   * Records the requested resources as driver metrics, then sets a resource profile built
+   * from them on the given RDD.
+   *
+   * @param rdd the RDD produced by the child plan
+   * @return the same RDD, with the resource profile attached
+   */
+  private def wrapResourceProfile[T](rdd: RDD[T]): RDD[T] = {
+    metrics("executorCores") += finalStageCores
+    metrics("executorMemory") += toMiB(finalStageMemory)
+    metrics("executorMemoryOverhead") += toMiB(finalStageMemoryOverhead)
+    finalStageOffHeapMemory.foreach { offHeap =>
+      metrics("executorOffHeapMemory") += toMiB(offHeap)
+    }
+    SQLMetrics.postDriverMetricUpdates(
+      sparkContext,
+      sparkContext.getLocalProperty(SQLExecution.EXECUTION_ID_KEY),
+      metrics.values.toSeq)
+
+    val requests = new ExecutorResourceRequests()
+      .cores(finalStageCores)
+      .memory(finalStageMemory)
+      .memoryOverhead(finalStageMemoryOverhead)
+    finalStageOffHeapMemory.foreach(offHeap => requests.offHeapMemory(offHeap))
+    val profile = new ResourceProfileBuilder().require(requests).build()
+    // `withResources` returns the RDD it was called on.
+    rdd.withResources(profile)
+  }
+
+  override protected def doExecute(): RDD[InternalRow] =
+    wrapResourceProfile(child.execute())
+
+  override protected def doExecuteColumnar(): RDD[ColumnarBatch] =
+    wrapResourceProfile(child.executeColumnar())
+
+  override protected def withNewChildInternal(newChild: SparkPlan): SparkPlan =
+    copy(child = newChild)
+}
diff --git a/...yuubi-extension-spark-3-3/src/main/scala/org/apache/spark/FinalStageResourceManager.scala b/...yuubi-extension-spark-3-3/src/main/scala/org/apache/spark/FinalStageResourceManager.scala
@@ -0,0 +1,232 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark
+
+import scala.annotation.tailrec
+import scala.collection.mutable
+import scala.collection.mutable.ArrayBuffer
+
+import org.apache.spark.scheduler.cluster.CoarseGrainedSchedulerBackend
+import org.apache.spark.sql.SparkSession
+import org.apache.spark.sql.catalyst.rules.Rule
+import org.apache.spark.sql.execution.{FilterExec, ProjectExec, SortExec, SparkPlan}
+import org.apache.spark.sql.execution.adaptive._
+import org.apache.spark.sql.execution.exchange.{ENSURE_REQUIREMENTS, ShuffleExchangeExec}
+
+import org.apache.kyuubi.sql.{KyuubiSQLConf, MarkNumOutputColumnsRule}
+
+/**
+ * A rule that tunes the executor resources used by the final write stage.
+ *
+ * This rule assumes the final write stage requires fewer cores than the previous stages;
+ * otherwise this rule takes no effect.
+ *
+ * It provides two features:
+ * 1. Kill redundant executors before running the final write stage
+ * 2. Inject a custom resource profile for the final write stage, so a custom executor
+ *    resource config can be specified
+ *
+ * @param session the session whose SparkContext and scheduler backend are inspected
+ */
+case class FinalStageResourceManager(session: SparkSession) extends Rule[SparkPlan] {
+  /**
+   * Returns the plan unchanged unless it is a write plan whose final stage is a materialized
+   * rebalance shuffle and shrinking the executors is beneficial. In that case it optionally
+   * kills executors and/or wraps the final shuffle stage with [[CustomResourceProfileExec]].
+   */
+  override def apply(plan: SparkPlan): SparkPlan = {
+    if (!conf.getConf(KyuubiSQLConf.FINAL_WRITE_STAGE_RESOURCE_ISOLATION_ENABLED) &&
+      !conf.getConf(KyuubiSQLConf.FINAL_WRITE_STAGE_EAGERLY_KILL_EXECUTORS_ENABLED)) {
+      return plan
+    }
+
+    if (!MarkNumOutputColumnsRule.isWrite(session, plan)) {
+      return plan
+    }
+
+    val sc = session.sparkContext
+    val dra = sc.getConf.getBoolean("spark.dynamicAllocation.enabled", false)
+    val executorCores = sc.getConf.getInt("spark.executor.cores", 1)
+    val minExecutors = sc.getConf.getInt("spark.dynamicAllocation.minExecutors", 0)
+    val maxExecutors = sc.getConf.getInt("spark.dynamicAllocation.maxExecutors", Int.MaxValue)
+    val hasImprovementRoom = maxExecutors - minExecutors > 1
+    // Fast fail if:
+    // 1. DRA is disabled, since resource profile is only supported when DRA is enabled
+    // 2. the scheduler backend is not coarse grained (DRA only works with yarn and k8s)
+    // 3. there is no room between min and max executors; logically, DRA should be able to
+    //    release executors otherwise this rule has no benefits
+    if (!dra || !sc.schedulerBackend.isInstanceOf[CoarseGrainedSchedulerBackend] ||
+      !hasImprovementRoom) {
+      return plan
+    }
+
+    val finalStage = findFinalRebalanceStage(plan) match {
+      case Some(found) => found
+      case None => return plan
+    }
+
+    // Since we are in `prepareQueryStage`, the AQE shuffle read has not been applied.
+    // So we need to apply it by self.
+    val shuffleRead = queryStageOptimizerRules.foldLeft[SparkPlan](finalStage) {
+      case (latest, rule) => rule.apply(latest)
+    }
+    shuffleRead match {
+      case AQEShuffleReadExec(stage: ShuffleQueryStageExec, partitionSpecs) =>
+        val factor = conf.getConf(KyuubiSQLConf.FINAL_WRITE_STAGE_PARTITION_FACTOR)
+        // The condition whether inject custom resource profile:
+        // - target executors * factor < active executors
+        // - target executors > min executors
+        val numActiveExecutors = sc.getExecutorIds().length
+        val expectedCores = partitionSpecs.length
+        val targetExecutors = (expectedCores / executorCores) + 1
+        val hasBenefits = targetExecutors * factor < numActiveExecutors &&
+          targetExecutors > minExecutors
+        if (hasBenefits) {
+          // `findFinalRebalanceStage` only returns stages backed by a `ShuffleExchangeExec`.
+          val exchange = stage.plan.asInstanceOf[ShuffleExchangeExec]
+          val shuffleId = exchange.shuffleDependency.shuffleId
+          val numReduce = exchange.numPartitions
+          // Now, there is only a final stage waiting to execute and all tasks of previous stage
+          // are finished. Here, we kill redundant existed executors eagerly so the tasks of final
+          // stage can be centralized scheduled.
+          if (conf.getConf(KyuubiSQLConf.FINAL_WRITE_STAGE_EAGERLY_KILL_EXECUTORS_ENABLED)) {
+            killExecutors(sc, targetExecutors, shuffleId, numReduce)
+          }
+          if (conf.getConf(KyuubiSQLConf.FINAL_WRITE_STAGE_RESOURCE_ISOLATION_ENABLED)) {
+            // TODO: Logically, We can call `backend.requestTotalExecutors` eagerly
+            //   to reduce the task submit pending time, but it may lose task locality
+            injectCustomResourceProfile(plan, stage.id)
+          } else {
+            plan
+          }
+        } else {
+          logInfo(s"Has no benefits to kill executors or inject custom resource profile, " +
+            s"active executors: $numActiveExecutors, min executor: $minExecutors, " +
+            s"target executors: $targetExecutors.")
+          plan
+        }
+
+      case _ =>
+        plan
+    }
+  }
+
+  /**
+   * Selects the executors to kill so that `targetExecutors` executors remain.
+   *
+   * The priority of kill executors follow:
+   * 1. kill executors which hold no block of the given shuffle, the younger first
+   *    (The older the JIT works better)
+   * 2. kill executors which produced less shuffle data first
+   *
+   * @param sc the active SparkContext
+   * @param targetExecutors the number of executors that should stay alive
+   * @param shuffleId the id of the shuffle read by the final stage
+   * @param numReduce the number of reduce partitions of that shuffle
+   * @return the ids of the executors to kill; empty if nothing should be killed
+   */
+  private def findExecutorToKill(
+      sc: SparkContext,
+      targetExecutors: Int,
+      shuffleId: Int,
+      numReduce: Int): Seq[String] = {
+    val tracker = SparkEnv.get.mapOutputTracker.asInstanceOf[MapOutputTrackerMaster]
+    // Without the shuffle status we can not rank executors by locality, so kill nothing
+    // instead of failing the query with a NoSuchElementException.
+    val shuffleStatus = tracker.shuffleStatuses.get(shuffleId) match {
+      case Some(status) => status
+      case None => return Seq.empty
+    }
+
+    val executorToBlockSize = new mutable.HashMap[String, Long]
+    shuffleStatus.withMapStatuses { mapStatuses =>
+      // NOTE(review): null entries are skipped defensively; presumably a map status can be
+      // missing after its executor is lost - confirm against `ShuffleStatus`.
+      mapStatuses.iterator.filter(_ != null).foreach { status =>
+        var i = 0
+        var sum = 0L
+        while (i < numReduce) {
+          sum += status.getSizeForBlock(i)
+          i += 1
+        }
+        // An executor usually runs many map tasks, so accumulate instead of keeping the
+        // size of the first map status only.
+        val executorId = status.location.executorId
+        executorToBlockSize(executorId) = executorToBlockSize.getOrElse(executorId, 0L) + sum
+      }
+    }
+
+    val backend = sc.schedulerBackend.asInstanceOf[CoarseGrainedSchedulerBackend]
+    val executorsWithRegistrationTs = backend.getExecutorsWithRegistrationTs()
+    val existedExecutors = executorsWithRegistrationTs.keys.toSet
+    val expectedNumExecutorToKill = existedExecutors.size - targetExecutors
+    if (expectedNumExecutorToKill < 1) {
+      return Seq.empty
+    }
+
+    val executorIdsToKill = new ArrayBuffer[String]()
+    // We first kill executors which do not hold any shuffle block. It would happen because
+    // the last stage is running fast and finished in a short time. The existed executors are
+    // from previous stages that have not been killed by DRA, so we can not find it by tracking
+    // shuffle status.
+    // Evict them by their alive time, the youngest (largest registration timestamp) first, so
+    // that executors with better locality for shuffle block are retained as long as possible.
+    executorsWithRegistrationTs.toSeq.sortBy(_._2)(Ordering[Long].reverse).foreach {
+      case (id, _) =>
+        if (executorIdsToKill.length < expectedNumExecutorToKill &&
+          !executorToBlockSize.contains(id)) {
+          executorIdsToKill.append(id)
+        }
+    }
+
+    // Evict the rest executors according to the shuffle block size. Executors that are
+    // no longer registered are skipped so they do not consume the kill budget.
+    executorToBlockSize.toSeq.sortBy(_._2).foreach { case (id, _) =>
+      if (executorIdsToKill.length < expectedNumExecutorToKill && existedExecutors.contains(id)) {
+        executorIdsToKill.append(id)
+      }
+    }
+
+    executorIdsToKill.toSeq
+  }
+
+  /**
+   * Requests the scheduler backend to kill executors before the final stage runs.
+   *
+   * When resource isolation is enabled all existing executors are killed, otherwise only the
+   * ones selected by `findExecutorToKill`.
+   *
+   * @param sc the active SparkContext
+   * @param targetExecutors the number of executors that should stay alive
+   * @param shuffleId the id of the shuffle read by the final stage
+   * @param numReduce the number of reduce partitions of that shuffle
+   */
+  private def killExecutors(
+      sc: SparkContext,
+      targetExecutors: Int,
+      shuffleId: Int,
+      numReduce: Int): Unit = {
+    val executorAllocationClient = sc.schedulerBackend.asInstanceOf[ExecutorAllocationClient]
+
+    val executorsToKill =
+      if (conf.getConf(KyuubiSQLConf.FINAL_WRITE_STAGE_RESOURCE_ISOLATION_ENABLED)) {
+        // If we decide to use custom resource profile, the existed executors have no meaning
+        // any more. So kill all of them.
+        executorAllocationClient.getExecutorIds()
+      } else {
+        findExecutorToKill(sc, targetExecutors, shuffleId, numReduce)
+      }
+
+    logInfo(s"Request to kill executors, total count ${executorsToKill.size}, " +
+      s"[${executorsToKill.mkString(", ")}].")
+
+    // It is a little hack to kill executors with DRA enabled.
+    // It may cause the status in `ExecutorAllocationManager` inconsistent with
+    // `CoarseGrainedSchedulerBackend` for a while. But it should be sync finally.
+    executorAllocationClient.killExecutors(
+      executorIds = executorsToKill,
+      adjustTargetNumExecutors = false,
+      countFailures = false,
+      force = false)
+  }
+
+  /**
+   * Wraps the shuffle query stage with the given id in a [[CustomResourceProfileExec]],
+   * leaving every other node of the plan as it is.
+   */
+  private def injectCustomResourceProfile(plan: SparkPlan, id: Int): SparkPlan = {
+    plan match {
+      case stage: ShuffleQueryStageExec if stage.id == id =>
+        CustomResourceProfileExec(stage)
+      case _ => plan.mapChildren(child => injectCustomResourceProfile(child, id))
+    }
+  }
+
+  /**
+   * Looks through project, filter and local sort nodes for a materialized shuffle query stage
+   * whose exchange was not added by `ENSURE_REQUIREMENTS`.
+   *
+   * @return the stage if the plan has this shape, otherwise `None`
+   */
+  @tailrec
+  private def findFinalRebalanceStage(plan: SparkPlan): Option[ShuffleQueryStageExec] = {
+    plan match {
+      case p: ProjectExec => findFinalRebalanceStage(p.child)
+      case f: FilterExec => findFinalRebalanceStage(f.child)
+      case s: SortExec if !s.global => findFinalRebalanceStage(s.child)
+      case stage: ShuffleQueryStageExec if stage.isMaterialized =>
+        stage.plan match {
+          case exchange: ShuffleExchangeExec if exchange.shuffleOrigin != ENSURE_REQUIREMENTS =>
+            Some(stage)
+          case _ => None
+        }
+      case _ => None
+    }
+  }
+
+  // The AQE rules applied by hand to the final stage to get its shuffle read, in this order.
+  @transient private val queryStageOptimizerRules: Seq[Rule[SparkPlan]] = Seq(
+    OptimizeSkewInRebalancePartitions,
+    CoalesceShufflePartitions(session),
+    OptimizeShuffleWithLocalRead)
+}