-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-27371][CORE] Support GPU-aware resources scheduling in Standalone #25047
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 2 commits
8984026
a9160e4
4787498
6bee4ac
30991b0
73027b3
03c0921
3a54e16
d47c8db
0bf0d7e
09c13af
863b220
acfed50
d06985d
ff35213
83222cf
169be97
13e9f93
33ec65b
e89bfe2
0153024
68c2cc9
e19a9b6
ffa3663
8bed3c0
50e6a6c
9d532b3
a2899cc
fa28d7a
7a75713
105abea
c1ca57e
dc82637
26571ff
c2fa13e
5cb2cd4
95111b0
aec8cd5
e69c973
8bb7f18
d7c058d
a29bede
fa14f88
7300813
e9ec52e
8408e73
8aab740
a26e112
83313f0
159929d
8af658a
51818db
f7612b4
9f15819
1208f38
7016aa9
84bebae
0b67150
c2d7132
2055582
c2a9855
4946b01
782eb0f
dca8f8b
756a818
250d9a8
a97c91f
adc74ae
f121f84
b46b243
aa2f63d
ade35fc
2bb50da
16523bd
33fcc95
15a9897
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -380,6 +380,17 @@ class SparkContext(config: SparkConf) extends Logging { | |
|
|
||
| val resourcesFileOpt = conf.get(DRIVER_RESOURCES_FILE) | ||
| _resources = getOrDiscoverAllResources(_conf, SPARK_DRIVER_PREFIX, resourcesFileOpt) | ||
| // driver submitted in client mode under Standalone may have conflict resources with | ||
|
||
| // workers on this host. We should sync driver's resources info into SPARK_RESOURCES | ||
| // to avoid collision. | ||
| if (deployMode == "client" && (master.startsWith("spark://") | ||
| || master.startsWith("local-cluster"))) { | ||
| val requests = parseAllResourceRequests(_conf, SPARK_DRIVER_PREFIX).map {req => | ||
| req.id.resourceName -> req.amount | ||
| }.toMap | ||
| // TODO(wuyi) log driver's acquired resources separately ? | ||
|
||
| _resources = acquireResources(_resources, requests) | ||
| } | ||
|
|
||
| // log out spark.app.name in the Spark driver logs | ||
| logInfo(s"Submitted application: $appName") | ||
|
|
@@ -1935,6 +1946,7 @@ class SparkContext(config: SparkConf) extends Logging { | |
| Utils.tryLogNonFatalError { | ||
| _progressBar.foreach(_.stop()) | ||
| } | ||
| releaseResources(_resources) | ||
| _taskScheduler = null | ||
| // TODO: Cache.stop()? | ||
| if (_env != null) { | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -32,7 +32,9 @@ private[spark] case class ApplicationDescription( | |
| // number of executors this application wants to start with, | ||
| // only used if dynamic allocation is enabled | ||
| initialExecutorLimit: Option[Int] = None, | ||
| user: String = System.getProperty("user.name", "<unknown>")) { | ||
| user: String = System.getProperty("user.name", "<unknown>"), | ||
| // map from resource name to its requested amount by the executor | ||
|
||
| resourceReqsPerExecutor: Map[String, Int] = Map.empty) { | ||
|
|
||
| override def toString: String = "ApplicationDescription(" + name + ")" | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -24,6 +24,7 @@ import org.apache.spark.deploy.master.{ApplicationInfo, DriverInfo, WorkerInfo} | |
| import org.apache.spark.deploy.master.DriverState.DriverState | ||
| import org.apache.spark.deploy.master.RecoveryState.MasterState | ||
| import org.apache.spark.deploy.worker.{DriverRunner, ExecutorRunner} | ||
| import org.apache.spark.resource.ResourceInformation | ||
| import org.apache.spark.rpc.{RpcAddress, RpcEndpointRef} | ||
| import org.apache.spark.util.Utils | ||
|
|
||
|
|
@@ -43,6 +44,7 @@ private[deploy] object DeployMessages { | |
| * @param memory the memory size of worker | ||
| * @param workerWebUiUrl the worker Web UI address | ||
| * @param masterAddress the master address used by the worker to connect | ||
| * @param resources the resources of worker | ||
| */ | ||
| case class RegisterWorker( | ||
| id: String, | ||
|
|
@@ -52,7 +54,8 @@ private[deploy] object DeployMessages { | |
| cores: Int, | ||
| memory: Int, | ||
| workerWebUiUrl: String, | ||
| masterAddress: RpcAddress) | ||
| masterAddress: RpcAddress, | ||
| resources: Map[String, ResourceInformation] = Map.empty) | ||
| extends DeployMessage { | ||
| Utils.checkHost(host) | ||
| assert (port > 0) | ||
|
|
@@ -72,8 +75,10 @@ private[deploy] object DeployMessages { | |
| exception: Option[Exception]) | ||
| extends DeployMessage | ||
|
|
||
| case class WorkerSchedulerStateResponse(id: String, executors: List[ExecutorDescription], | ||
| driverIds: Seq[String]) | ||
| case class WorkerSchedulerStateResponse( | ||
| id: String, | ||
| execWithResources: List[(ExecutorDescription, Map[String, Seq[String]])], | ||
|
||
| driverWithResources: Seq[(String, Map[String, Seq[String]])]) | ||
|
|
||
| /** | ||
| * A worker will send this message to the master when it registers with the master. Then the | ||
|
|
@@ -110,6 +115,12 @@ private[deploy] object DeployMessages { | |
|
|
||
| case class ReconnectWorker(masterUrl: String) extends DeployMessage | ||
|
|
||
| /** | ||
| * Ask the worker to release the indicated resources in ALLOCATED_RESOURCES_JSON_FILE | ||
| * @param toRelease the resources expected to release | ||
| */ | ||
| case class ReleaseResources(toRelease: Map[String, ResourceInformation]) extends DeployMessage | ||
|
|
||
| case class KillExecutor(masterUrl: String, appId: String, execId: Int) extends DeployMessage | ||
|
|
||
| case class LaunchExecutor( | ||
|
|
@@ -118,10 +129,14 @@ private[deploy] object DeployMessages { | |
| execId: Int, | ||
| appDesc: ApplicationDescription, | ||
| cores: Int, | ||
| memory: Int) | ||
| memory: Int, | ||
| resources: Map[String, Seq[String]]) | ||
| extends DeployMessage | ||
|
|
||
| case class LaunchDriver(driverId: String, driverDesc: DriverDescription) extends DeployMessage | ||
| case class LaunchDriver( | ||
| driverId: String, | ||
| driverDesc: DriverDescription, | ||
| resources: Map[String, Seq[String]] = Map.empty) extends DeployMessage | ||
|
|
||
| case class KillDriver(driverId: String) extends DeployMessage | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -64,7 +64,8 @@ class LocalSparkCluster( | |
| /* Start the Workers */ | ||
| for (workerNum <- 1 to numWorkers) { | ||
| val workerEnv = Worker.startRpcEnvAndEndpoint(localHostname, 0, 0, coresPerWorker, | ||
| memoryPerWorker, masters, null, Some(workerNum), _conf) | ||
| memoryPerWorker, masters, null, Some(workerNum), _conf, | ||
| conf.get(config.Worker.SPARK_WORKER_RESOURCE_FILE)) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This seems a bit odd to me in local mode since all workers would get the same file. The intent was really to have the cluster admin pass in different resources per worker, but that goes with my general comment. If we want to keep it this way then perhaps we just need to make sure to document it. If the cluster admin does basically split the resources themselves between the Workers, then we have no need for the acquireResources and locking, so I definitely think we should put in a config to turn that off, and we can document the different ways it can be set up.
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Using the same resources file for different workers (whether local cluster or real cluster) really doesn't make sense if the resources file is intended to let the cluster admin configure different resources. We can just pass None and only use the discovery script for LocalSparkCluster, since it is only used for test purposes. And for a real cluster, how about this: when a user configures a resources file (even if a discovery script is configured concurrently), we just acquire resources from it and do not go through acquireResources() any more, assuming that the user has already configured different resources across workers. If not, then we use the discovery script and call acquireResources() to make sure we get different resources compared to others. And we don't introduce a new configuration here, but just document this more specifically and rely on the file's existence to decide which way we want to go. WDYT ?
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. we could do that, but then again a cluster admin could write a discovery script to make sure different workers get different resources. Then they don't have to manually create the resourcesFile. I also think there are some weird cases like you mention where you have both resources file and discovery script that wouldn't be obvious to the user what happens. one resource they separated but then another the discovery script didn't. I realize these are corner cases but with a config it would be obvious exactly what is going to happen. If we don't do the config then I think we should leave it as is and just document that the resourcesfile/discovery script for Workers/Driver in Standalone mode needs to have all the node resources or they need to configure a different resources Dir for each.
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
I don't know whether it is easy for an admin to do this within a script. But I agree that it would be better to have a separate config option if we do expect an admin to do this. So, let me add it later.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I kind of forgot about this, we should probably document the fact you can't use separate resource file or discovery script so only way to do this properly is with the coordination. I added an item to https://issues.apache.org/jira/browse/SPARK-27492 to make sure to document. |
||
| workerRpcEnvs += workerEnv | ||
| } | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -27,6 +27,9 @@ private[master] class ExecutorDesc( | |
| val memory: Int) { | ||
|
|
||
| var state = ExecutorState.LAUNCHING | ||
| // resources(e.f. gpu/fpga) allocated to this driver | ||
| // map from resource name to its addresses | ||
| private var _resources: Map[String, Seq[String]] = _ | ||
|
||
|
|
||
| /** Copy all state (non-val) variables from the given on-the-wire ExecutorDescription. */ | ||
| def copyState(execDesc: ExecutorDescription) { | ||
|
|
@@ -49,4 +52,8 @@ private[master] class ExecutorDesc( | |
| override def toString: String = fullId | ||
|
|
||
| override def hashCode: Int = toString.hashCode() | ||
|
|
||
| def withResources(r: Map[String, Seq[String]]): Unit = _resources = r | ||
|
|
||
| def resources: Map[String, Seq[String]] = _resources | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is going to print a log message with the resources to the driver, but then below we may change them, and that could be confusing to the user. I think it's also best if we don't assign _resources here yet, as it could change below. We also end up calling parseResourceRequirements twice.
It might be cleaner to either call this one only if !isClientStandalone, or call a different function that does some of this plus the logic to acquire below.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yeah, I noticed the problem, and the worker has the same problem. Maybe we can (1) abstract the resources info print logic into a single method, or (2) pass in a new parameter, e.g.
isLogInfoMessage, to indicate whether to log it (note that method acquireResources also prints resources info now, but I think it'd be better to unify them), which may require less change. WDYT? @tgravescs
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It's unfortunate we need to call parseAllResourceRequests multiple times, but looking at it I don't see an easy way to avoid that without hurting readability. I would say we could separate out the log statements into a different method and call it separately.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Set the result of getOrDiscoverAllResources and acquireResources to a temporary variable, and then after the isClientStandalone check set the final _resources value. That way we aren't changing it.