2 changes: 2 additions & 0 deletions core/src/main/scala/org/apache/spark/BarrierTaskContext.scala
@@ -185,6 +185,8 @@ class BarrierTaskContext private[spark] (
taskContext.getMetricsSources(sourceName)
}

override def resources(): Map[String, ResourceInformation] = taskContext.resources()

override private[spark] def killTaskIfInterrupted(): Unit = taskContext.killTaskIfInterrupted()

override private[spark] def getKillReason(): Option[String] = taskContext.getKillReason()
33 changes: 9 additions & 24 deletions core/src/main/scala/org/apache/spark/SparkConf.scala
@@ -507,6 +507,15 @@ class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging with Seria
}
}

/**
Contributor: We lost the check to make sure the executor resources are a multiple of the task requirements. Do you want to add that back? Note that I added a check (https://github.com/apache/spark/blob/master/core/src/main/scala/org/apache/spark/SparkConf.scala#L609) to make sure they were large enough, but not that it was an exact fit.

Contributor Author: Will add it back later.

Contributor Author: Added back in SparkContext.checkResourcesPerTask().

* Get task resource requirements.
*/
private[spark] def getTaskResourceRequirements(): Map[String, Int] = {
Contributor: Need a unit test, or leave a TODO, since this might go away with the conf refactoring.

Contributor Author: Added a test case in SparkConfSuite.

getAllWithPrefix(SPARK_TASK_RESOURCE_PREFIX)
.withFilter { case (k, v) => k.endsWith(SPARK_RESOURCE_COUNT_SUFFIX)}
.map { case (k, v) => (k.dropRight(SPARK_RESOURCE_COUNT_SUFFIX.length), v.toInt)}.toMap
}
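To illustrate (not part of the patch): a minimal sketch of what this parsing yields, assuming SPARK_TASK_RESOURCE_PREFIX and SPARK_RESOURCE_COUNT_SUFFIX expand to "spark.task.resource." and ".count", with "gpu" used only as an example resource name. The method is private[spark], so this would run from Spark-internal code or a test.

```scala
import org.apache.spark.SparkConf

val conf = new SparkConf()
  .set("spark.task.resource.gpu.count", "2") // SPARK_TASK_RESOURCE_PREFIX + "gpu" + SPARK_RESOURCE_COUNT_SUFFIX

// getAllWithPrefix strips the prefix, the filter keeps only the *.count keys, and the map
// drops the suffix and parses the value, so this returns Map("gpu" -> 2).
conf.getTaskResourceRequirements()
```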

/**
* Checks for illegal or deprecated config settings. Throws an exception for the former. Not
* idempotent - may mutate this conf object to convert deprecated settings to supported ones.
@@ -603,30 +612,6 @@ class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging with Seria
require(executorTimeoutThresholdMs > executorHeartbeatIntervalMs, "The value of " +
s"${networkTimeout}=${executorTimeoutThresholdMs}ms must be no less than the value of " +
s"${EXECUTOR_HEARTBEAT_INTERVAL.key}=${executorHeartbeatIntervalMs}ms.")

// Make sure the executor resources were specified and are large enough if
// any task resources were specified.
val taskResourcesAndCount =
getAllWithPrefixAndSuffix(SPARK_TASK_RESOURCE_PREFIX, SPARK_RESOURCE_COUNT_SUFFIX).toMap
val executorResourcesAndCounts =
getAllWithPrefixAndSuffix(SPARK_EXECUTOR_RESOURCE_PREFIX, SPARK_RESOURCE_COUNT_SUFFIX).toMap

taskResourcesAndCount.foreach { case (rName, taskCount) =>
val execCount = executorResourcesAndCounts.get(rName).getOrElse(
throw new SparkException(
s"The executor resource config: " +
s"${SPARK_EXECUTOR_RESOURCE_PREFIX + rName + SPARK_RESOURCE_COUNT_SUFFIX} " +
"needs to be specified since a task requirement config: " +
s"${SPARK_TASK_RESOURCE_PREFIX + rName + SPARK_RESOURCE_COUNT_SUFFIX} was specified")
)
if (execCount.toLong < taskCount.toLong) {
throw new SparkException(
s"The executor resource config: " +
s"${SPARK_EXECUTOR_RESOURCE_PREFIX + rName + SPARK_RESOURCE_COUNT_SUFFIX} " +
s"= $execCount has to be >= the task config: " +
s"${SPARK_TASK_RESOURCE_PREFIX + rName + SPARK_RESOURCE_COUNT_SUFFIX} = $taskCount")
}
}
}

/**
78 changes: 59 additions & 19 deletions core/src/main/scala/org/apache/spark/SparkContext.scala
@@ -2707,27 +2707,67 @@ object SparkContext extends Logging {
// When running locally, don't try to re-execute tasks on failure.
val MAX_LOCAL_TASK_FAILURES = 1

// SPARK-26340: Ensure that executor's core num meets at least one task requirement.
def checkCpusPerTask(
clusterMode: Boolean,
maxCoresPerExecutor: Option[Int]): Unit = {
val cpusPerTask = sc.conf.get(CPUS_PER_TASK)
if (clusterMode && sc.conf.contains(EXECUTOR_CORES)) {
if (sc.conf.get(EXECUTOR_CORES) < cpusPerTask) {
throw new SparkException(s"${CPUS_PER_TASK.key}" +
s" must be <= ${EXECUTOR_CORES.key} when run on $master.")
// Ensure that the executor's resources can satisfy at least one task's requirements.
def checkResourcesPerTask(clusterMode: Boolean, executorCores: Option[Int]): Unit = {
val taskCores = sc.conf.get(CPUS_PER_TASK)
val execCores = if (clusterMode) {
executorCores.getOrElse(sc.conf.get(EXECUTOR_CORES))
} else {
executorCores.get
}

// Number of cores per executor must meet at least one task requirement.
if (execCores < taskCores) {
throw new SparkException(s"The number of cores per executor (=$execCores) has to be >= " +
s"the task config: ${CPUS_PER_TASK.key} = $taskCores when run on $master.")
}

// Calculate the max slots each executor can provide based on resources available on each
// executor and resources required by each task.
val taskResourcesAndCount = sc.conf.getTaskResourceRequirements()
val executorResourcesAndCounts = sc.conf.getAllWithPrefixAndSuffix(
SPARK_EXECUTOR_RESOURCE_PREFIX, SPARK_RESOURCE_COUNT_SUFFIX).toMap
val numSlots = (taskResourcesAndCount.map { case (rName, taskCount) =>
// Make sure the executor resources were specified through config.
val execCount = executorResourcesAndCounts.getOrElse(rName,
throw new SparkException(
s"The executor resource config: " +
s"${SPARK_EXECUTOR_RESOURCE_PREFIX + rName + SPARK_RESOURCE_COUNT_SUFFIX} " +
"needs to be specified since a task requirement config: " +
s"${SPARK_TASK_RESOURCE_PREFIX + rName + SPARK_RESOURCE_COUNT_SUFFIX} was specified")
)
// Make sure the executor resources are large enough to launch at least one task.
if (execCount.toLong < taskCount.toLong) {
throw new SparkException(
s"The executor resource config: " +
s"${SPARK_EXECUTOR_RESOURCE_PREFIX + rName + SPARK_RESOURCE_COUNT_SUFFIX} " +
s"= $execCount has to be >= the task config: " +
s"${SPARK_TASK_RESOURCE_PREFIX + rName + SPARK_RESOURCE_COUNT_SUFFIX} = $taskCount")
Contributor: We perhaps want to expand this to say something like "so we don't waste resources".

Contributor Author: Updated.

}
} else if (maxCoresPerExecutor.isDefined) {
if (maxCoresPerExecutor.get < cpusPerTask) {
throw new SparkException(s"Only ${maxCoresPerExecutor.get} cores available per executor" +
s" when run on $master, and ${CPUS_PER_TASK.key} must be <= it.")
execCount.toInt / taskCount
}.toList ++ Seq(execCores / taskCores)).min
// There have been checks inside SparkConf to make sure the executor resources were specified
Contributor: I actually don't think we need the other check in SparkConf with this one, since that check is making sure there is an executor config (which you can add below, see my other comment) and that executor config >= task config, but the >= should also be covered by your numSlots check.

Contributor Author: Updated.

Contributor: Remove the comment here.

// and are large enough if any task resources were specified.
taskResourcesAndCount.foreach { case (rName, taskCount) =>
val execCount = executorResourcesAndCounts(rName)
Contributor: Need to make sure this returns something, or throw an exception saying the executor resource config is required.

Contributor Author: Combined this check with the check from SparkConf.

if (execCount.toInt / taskCount.toInt != numSlots) {
Contributor: 9 / 4 == 2. Use taskCount.toInt * numSlots < execCount.toInt

Contributor Author: Updated.
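To make the reviewer's point concrete, a quick worked example with illustrative numbers showing how plain integer division can hide wasted resources:

```scala
// Illustrative numbers: 9 GPUs per executor, 4 GPUs requested per task.
val execCount = 9
val taskCount = 4
val numSlots = execCount / taskCount // 2 runnable tasks, 1 GPU left idle

execCount / taskCount != numSlots    // false -> the original check misses the idle GPU
taskCount * numSlots < execCount     // true  -> the suggested check flags it
```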

val message = s"The value of executor resource config: " +
s"${SPARK_EXECUTOR_RESOURCE_PREFIX + rName + SPARK_RESOURCE_COUNT_SUFFIX} " +
s"= $execCount is more than that tasks can take: $numSlots * " +
Contributor: I don't think this makes it clear to the user what is wrong; it's really "this ratio isn't the same as some other resource's ratio". Can we change this message to be more like:

The configuration of resource: rName (exec = X, task = y) will result in wasted resources due to resource $limitingResourceName limiting the # of runnable tasks per executor to: numslots. Please adjust your configuration.

Contributor Author: Updated.

s"${SPARK_TASK_RESOURCE_PREFIX + rName + SPARK_RESOURCE_COUNT_SUFFIX} = $taskCount. " +
s"The resources may be wasted."
if (Utils.isTesting) {
throw new SparkException(message)
} else {
logWarning(message)
Contributor: Originally we talked about throwing here to not allow it; just want to make sure we intentionally changed our mind here? I'm really ok either way we go, as there were some people questioning this on the SPIP.

Contributor Author: Since we now have TaskSchedulerImpl.resourcesMeetTaskRequirements() to ensure there are enough resources before scheduling a task, I think it's safe to just place a warning here.

Contributor: I prefer a warning because the discovery script might return more and it is out of the user's control. And the available resources might not happen to be a multiple of the task-requested counts. For example, you have 32 CPU cores and 3 GPUs.
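A rough sketch of the scenario described above (numbers are illustrative, not from the patch), showing why only a warning is logged outside of tests:

```scala
// An executor with 32 CPU cores and 3 GPUs; spark.task.cpus = 1 and 1 GPU per task.
val execCores = 32
val taskCores = 1
val gpusPerExecutor = 3
val gpusPerTask = 1

// numSlots is bounded by the scarcest resource, here the GPUs.
val numSlots = Seq(gpusPerExecutor / gpusPerTask, execCores / taskCores).min // = 3

// Only 3 tasks can run concurrently, so 32 - 3 * 1 = 29 cores sit idle. Since the GPU
// count comes from hardware discovery and is outside the user's control, this case is
// logged as a warning rather than rejected.
```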

}
}
}
}

master match {
case "local" =>
checkCpusPerTask(clusterMode = false, Some(1))
checkResourcesPerTask(clusterMode = false, Some(1))
val scheduler = new TaskSchedulerImpl(sc, MAX_LOCAL_TASK_FAILURES, isLocal = true)
val backend = new LocalSchedulerBackend(sc.getConf, scheduler, 1)
scheduler.initialize(backend)
@@ -2740,7 +2780,7 @@ object SparkContext extends Logging {
if (threadCount <= 0) {
throw new SparkException(s"Asked to run locally with $threadCount threads")
}
checkCpusPerTask(clusterMode = false, Some(threadCount))
checkResourcesPerTask(clusterMode = false, Some(threadCount))
val scheduler = new TaskSchedulerImpl(sc, MAX_LOCAL_TASK_FAILURES, isLocal = true)
val backend = new LocalSchedulerBackend(sc.getConf, scheduler, threadCount)
scheduler.initialize(backend)
@@ -2751,22 +2791,22 @@ object SparkContext extends Logging {
// local[*, M] means the number of cores on the computer with M failures
// local[N, M] means exactly N threads with M failures
val threadCount = if (threads == "*") localCpuCount else threads.toInt
checkCpusPerTask(clusterMode = false, Some(threadCount))
checkResourcesPerTask(clusterMode = false, Some(threadCount))
val scheduler = new TaskSchedulerImpl(sc, maxFailures.toInt, isLocal = true)
val backend = new LocalSchedulerBackend(sc.getConf, scheduler, threadCount)
scheduler.initialize(backend)
(backend, scheduler)

case SPARK_REGEX(sparkUrl) =>
checkCpusPerTask(clusterMode = true, None)
checkResourcesPerTask(clusterMode = true, None)
val scheduler = new TaskSchedulerImpl(sc)
val masterUrls = sparkUrl.split(",").map("spark://" + _)
val backend = new StandaloneSchedulerBackend(scheduler, sc, masterUrls)
scheduler.initialize(backend)
(backend, scheduler)

case LOCAL_CLUSTER_REGEX(numSlaves, coresPerSlave, memoryPerSlave) =>
checkCpusPerTask(clusterMode = true, Some(coresPerSlave.toInt))
checkResourcesPerTask(clusterMode = true, Some(coresPerSlave.toInt))
// Check to make sure memory requested <= memoryPerSlave. Otherwise Spark will just hang.
val memoryPerSlaveInt = memoryPerSlave.toInt
if (sc.executorMemory > memoryPerSlaveInt) {
@@ -2787,7 +2827,7 @@ object SparkContext extends Logging {
(backend, scheduler)

case masterUrl =>
checkCpusPerTask(clusterMode = true, None)
checkResourcesPerTask(clusterMode = true, None)
val cm = getClusterManager(masterUrl) match {
case Some(clusterMgr) => clusterMgr
case None => throw new SparkException("Could not parse Master URL: '" + master + "'")
9 changes: 8 additions & 1 deletion core/src/main/scala/org/apache/spark/TaskContext.scala
@@ -20,7 +20,7 @@ package org.apache.spark
import java.io.Serializable
import java.util.Properties

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.annotation.{DeveloperApi, Evolving}
import org.apache.spark.executor.TaskMetrics
import org.apache.spark.memory.TaskMemoryManager
import org.apache.spark.metrics.source.Source
@@ -176,6 +176,13 @@ abstract class TaskContext extends Serializable {
*/
def getLocalProperty(key: String): String

/**
* Resources allocated to the task. The key is the resource name and the value is information
* about the resource. Please refer to [[ResourceInformation]] for specifics.
*/
@Evolving
def resources(): Map[String, ResourceInformation]
Member: In the doc, it might be better to explain what the keys are.
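For illustration only: a rough sketch of how a task body might read its allocation through this API. It assumes a resource named "gpu" was configured, that ResourceInformation exposes the assigned addresses, and that sc is an existing SparkContext.

```scala
import org.apache.spark.TaskContext

sc.parallelize(1 to 100, numSlices = 4).mapPartitions { iter =>
  // Addresses of the GPUs assigned to this task attempt, e.g. Array("0", "1").
  val gpuAddrs = TaskContext.get().resources()("gpu").addresses
  // ... bind this partition's work to those devices ...
  iter
}.collect()
```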


@DeveloperApi
def taskMetrics(): TaskMetrics

3 changes: 2 additions & 1 deletion core/src/main/scala/org/apache/spark/TaskContextImpl.scala
@@ -51,7 +51,8 @@ private[spark] class TaskContextImpl(
localProperties: Properties,
@transient private val metricsSystem: MetricsSystem,
// The default value is only used in tests.
override val taskMetrics: TaskMetrics = TaskMetrics.empty)
override val taskMetrics: TaskMetrics = TaskMetrics.empty,
override val resources: Map[String, ResourceInformation] = Map.empty)
extends TaskContext
with Logging {

11 changes: 11 additions & 0 deletions core/src/main/scala/org/apache/spark/TestUtils.scala
@@ -38,6 +38,7 @@ import com.google.common.io.{ByteStreams, Files}
import org.apache.log4j.PropertyConfigurator

import org.apache.spark.executor.TaskMetrics
import org.apache.spark.internal.config._
import org.apache.spark.scheduler._
import org.apache.spark.util.Utils

@@ -311,6 +312,16 @@ private[spark] object TestUtils {
current ++ current.filter(_.isDirectory).flatMap(recursiveList)
}

/**
* Set task resource requirement.
*/
def setTaskResourceRequirement(
conf: SparkConf,
resourceName: String,
resourceCount: Int): SparkConf = {
val key = s"${SPARK_TASK_RESOURCE_PREFIX}${resourceName}${SPARK_RESOURCE_COUNT_SUFFIX}"
conf.set(key, resourceCount.toString)
}
}
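A small usage sketch (the "gpu" name is only an example; the key shown assumes the prefix/suffix constants expand to "spark.task.resource." and ".count"):

```scala
// In a test: require one "gpu" per task.
val conf = TestUtils.setTaskResourceRequirement(new SparkConf(), "gpu", 1)
// Equivalent to conf.set("spark.task.resource.gpu.count", "1").
```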


core/src/main/scala/org/apache/spark/executor/CoarseGrainedExecutorBackend.scala
@@ -66,6 +66,13 @@ private[spark] class CoarseGrainedExecutorBackend(
// to be changed so that we don't share the serializer instance across threads
private[this] val ser: SerializerInstance = env.closureSerializer.newInstance()

/**
 * Map each taskId to the information about the resources allocated to it. Please refer to
* [[ResourceInformation]] for specifics.
* Exposed for testing only.
*/
private[executor] val taskResources = new mutable.HashMap[Long, Map[String, ResourceInformation]]

override def onStart() {
logInfo("Connecting to driver: " + driverUrl)
val resources = parseOrFindResources(resourcesFile)
@@ -151,6 +158,7 @@ private[spark] class CoarseGrainedExecutorBackend(
} else {
val taskDesc = TaskDescription.decode(data.value)
logInfo("Got assigned task " + taskDesc.taskId)
taskResources(taskDesc.taskId) = taskDesc.resources
executor.launchTask(this, taskDesc)
}

@@ -197,7 +205,11 @@ private[spark] class CoarseGrainedExecutorBackend(
}

override def statusUpdate(taskId: Long, state: TaskState, data: ByteBuffer) {
val msg = StatusUpdate(executorId, taskId, state, data)
val resources = taskResources.getOrElse(taskId, Map.empty[String, ResourceInformation])
val msg = StatusUpdate(executorId, taskId, state, data, resources)
if (TaskState.isFinished(state)) {
taskResources.remove(taskId)
}
driver match {
case Some(driverRef) => driverRef.send(msg)
case None => logWarning(s"Drop $msg because has not yet connected to driver")
3 changes: 2 additions & 1 deletion core/src/main/scala/org/apache/spark/executor/Executor.scala
@@ -422,7 +422,8 @@ private[spark] class Executor(
val res = task.run(
taskAttemptId = taskId,
attemptNumber = taskDescription.attemptNumber,
metricsSystem = env.metricsSystem)
metricsSystem = env.metricsSystem,
resources = taskDescription.resources)
threwException = false
res
} {
core/src/main/scala/org/apache/spark/scheduler/ExecutorResourceInfo.scala
@@ -0,0 +1,81 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.scheduler

import scala.collection.mutable.HashMap
Contributor: Shall we only import mutable and use mutable.HashMap in code?

Contributor Author: Updated.


import org.apache.spark.SparkException

/**
* Class to hold information about a type of Resource on an Executor. This information is managed
* by SchedulerBackend, and TaskScheduler shall schedule tasks on idle Executors based on the
* information.
* @param name Resource name
* @param addresses Resource addresses provided by the executor
*/
private[spark] class ExecutorResourceInfo(
val name: String,
private val addresses: Seq[String]) extends Serializable {
Contributor: Remove private val. addresses doesn't need to be a member variable.

Contributor Author: Updated.


private val addressesMap = new HashMap[String, Boolean]()
Contributor: Can we call this addressesAllocatedMap or similar?

Contributor Author: Updated.

addresses.foreach(addressesMap.put(_, true))

/**
* Sequence of currently available resource addresses.
*/
def availableAddrs: Seq[String] = addressesMap.toList.filter(_._2 == true).map(_._1)

/**
* Sequence of currently assigned resource addresses.
* Exposed for testing only.
*/
private[scheduler] def assignedAddrs: Seq[String] =
addressesMap.toList.filter(_._2 == false).map(_._1)

/**
 * Acquire a sequence of resource addresses (for a launched task); these addresses must be
 * available. When the task finishes, it will return the acquired resource addresses.
*/
def acquire(addrs: Seq[String]): Unit = {
addrs.foreach { address =>
val isAvailable = addressesMap.getOrElse(address, false)
Contributor: Can we rename isAvailable to isAssigned, or vice versa, to keep acquire and release consistent?

Contributor Author: When the address doesn't exist we may also want to throw an exception. Added more comments to make it clear.

if (isAvailable) {
addressesMap(address) = false
} else {
throw new SparkException(s"Try to acquire address that is not available. $name address " +
s"$address is not available.")
}
}
}

/**
 * Release a sequence of resource addresses; these addresses must have been assigned. Resource
* addresses are released when a task has finished.
*/
def release(addrs: Seq[String]): Unit = {
addrs.foreach { address =>
val isAssigned = addressesMap.getOrElse(address, true)
if (!isAssigned) {
addressesMap(address) = true
} else {
throw new SparkException(s"Try to release address that is not assigned. $name address " +
s"$address is not assigned.")
}
}
}
}
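A minimal, Spark-internal usage sketch of the acquire/release lifecycle this class provides (addresses are illustrative):

```scala
val gpus = new ExecutorResourceInfo("gpu", Seq("0", "1", "2"))

gpus.availableAddrs          // contains "0", "1", "2" (order not guaranteed)
gpus.acquire(Seq("0", "1"))  // addresses handed to a launched task
gpus.availableAddrs          // now only "2"
gpus.release(Seq("0", "1"))  // the task finished; both become available again
gpus.acquire(Seq("9"))       // throws SparkException: "9" was never offered by this executor
```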
7 changes: 5 additions & 2 deletions core/src/main/scala/org/apache/spark/scheduler/Task.scala
@@ -74,12 +74,14 @@ private[spark] abstract class Task[T](
*
* @param taskAttemptId an identifier for this task attempt that is unique within a SparkContext.
* @param attemptNumber how many times this task has been attempted (0 for the first attempt)
* @param resources other host resources (like gpus) that this task attempt can access
* @return the result of the task along with updates of Accumulators.
*/
final def run(
taskAttemptId: Long,
attemptNumber: Int,
metricsSystem: MetricsSystem): T = {
metricsSystem: MetricsSystem,
resources: Map[String, ResourceInformation]): T = {
SparkEnv.get.blockManager.registerTask(taskAttemptId)
// TODO SPARK-24874 Allow create BarrierTaskContext based on partitions, instead of whether
// the stage is barrier.
Expand All @@ -92,7 +94,8 @@ private[spark] abstract class Task[T](
taskMemoryManager,
localProperties,
metricsSystem,
metrics)
metrics,
resources)

context = if (isBarrier) {
new BarrierTaskContext(taskContext)