apache · tgravescs · Apr 17, 2019 · Apr 17, 2019 · Apr 17, 2019 · Apr 17, 2019
diff --git a/core/src/main/scala/org/apache/spark/ResourceDiscoverer.scala b/core/src/main/scala/org/apache/spark/ResourceDiscoverer.scala
@@ -0,0 +1,89 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark
+
+import java.io.File
+
+import com.fasterxml.jackson.core.JsonParseException
+import org.json4s.{DefaultFormats, MappingException}
+import org.json4s.jackson.JsonMethods._
+
+import org.apache.spark.internal.Logging
+import org.apache.spark.internal.config._
+import org.apache.spark.util.Utils.executeAndGetOutput
+
+/**
+ * Discovers resources (GPUs/FPGAs/etc).
+ * This class find resources by running and parses the output of the user specified script
+ * from the config spark.{driver/executor}.{resourceType}.discoveryScript.
+ * The output of the script it runs is expected to be JSON in the format of the
+ * ResourceInformation class, with addresses being optional.
+ *
+ * For example:  {"name": "gpu","count":2, "units":"", "addresses": ["0","1"]}
+ */
+private[spark] object ResourceDiscoverer extends Logging {
+
+  private implicit val formats = DefaultFormats
+
+  def findResources(sparkconf: SparkConf, isDriver: Boolean): Map[String, ResourceInformation] = {
+    val prefix = if (isDriver) {
+      SPARK_DRIVER_RESOURCE_PREFIX
+    } else {
+      SPARK_EXECUTOR_RESOURCE_PREFIX
+    }
+    // get unique resource types by grabbing first part config with multiple periods,
+    // ie resourceType.count, grab resourceType part
+    val resourceTypes = sparkconf.getAllWithPrefix(prefix).map { case (k, _) =>
+      k.split('.').head
+    }.toSet
+    resourceTypes.map { rtype => {
+      val rInfo = getResourceAddrsForType(sparkconf, prefix, rtype)
+      (rtype -> rInfo)
+    }}.toMap
+  }
+
+  private def getResourceAddrsForType(
+      sparkconf: SparkConf,
+      prefix: String,
+      resourceType: String): ResourceInformation = {
+    val discoveryConf = prefix + resourceType + SPARK_RESOURCE_DISCOVERY_SCRIPT_POSTFIX
+    val script = sparkconf.getOption(discoveryConf)
+    val result = if (script.nonEmpty) {
+      val scriptFile = new File(script.get)
+      // check that script exists and try to execute
+      if (scriptFile.exists()) {
+        try {
+          val output = executeAndGetOutput(Seq(script.get), new File("."))
+          val parsedJson = parse(output)
+          parsedJson.extract[ResourceInformation]
+        } catch {
+          case e @ (_: SparkException | _: MappingException | _: JsonParseException) =>
+            throw new SparkException(s"Error running the resource discovery script: $scriptFile" +
+              s" for $resourceType", e)
+        }
+      } else {
+        throw new SparkException(s"Resource script: $scriptFile to discover $resourceType" +
+          s" doesn't exist!")
+      }
+    } else {
+      throw new SparkException(s"User is expecting to use $resourceType resources but " +
+        s"didn't specify a script via conf: $discoveryConf, to find them!")
+    }
+    result
+  }
+}
diff --git a/core/src/main/scala/org/apache/spark/ResourceInformation.scala b/core/src/main/scala/org/apache/spark/ResourceInformation.scala
@@ -0,0 +1,41 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark
+
+import org.apache.spark.annotation.Evolving
+
+/**
+ * Class to hold information about a type of Resource. A resource could be a GPU, FPGA, etc.
+ * The units are resource specific and could be something like MB or GB for memory.
+ * The array of addresses are resource specific and its up to the user to interpret the address.
+ * The units and addresses could be empty if they don't apply to that resource.
+ *
+ * One example is GPUs, where the addresses would be the indices of the GPUs, the count would be the
+ * number of GPUs and the units would be an empty string.
+ *
+ * @param name the name of the resource
+ * @param units the units of the resources, can be an empty string if units don't apply
+ * @param count the number of resources available
+ * @param addresses an optional array of strings describing the addresses of the resource
+ */
+@Evolving
+case class ResourceInformation(
+    val name: String,
+    val units: String,
+    val count: Long,
+    val addresses: Array[String] = Array.empty)
diff --git a/core/src/main/scala/org/apache/spark/executor/CoarseGrainedExecutorBackend.scala b/core/src/main/scala/org/apache/spark/executor/CoarseGrainedExecutorBackend.scala
@@ -17,6 +17,7 @@
 
 package org.apache.spark.executor
 
+import java.io.{BufferedInputStream, FileInputStream}
 import java.net.URL
 import java.nio.ByteBuffer
 import java.util.Locale
@@ -26,11 +27,18 @@ import scala.collection.mutable
 import scala.util.{Failure, Success}
 import scala.util.control.NonFatal
 
+import com.fasterxml.jackson.databind.exc.MismatchedInputException
+import org.json4s.DefaultFormats
+import org.json4s.JsonAST.JArray
+import org.json4s.MappingException
+import org.json4s.jackson.JsonMethods._
+
 import org.apache.spark._
 import org.apache.spark.TaskState.TaskState
 import org.apache.spark.deploy.SparkHadoopUtil
 import org.apache.spark.deploy.worker.WorkerWatcher
 import org.apache.spark.internal.Logging
+import org.apache.spark.internal.config._
 import org.apache.spark.rpc._
 import org.apache.spark.scheduler.{ExecutorLossReason, TaskDescription}
 import org.apache.spark.scheduler.cluster.CoarseGrainedClusterMessages._
@@ -44,9 +52,12 @@ private[spark] class CoarseGrainedExecutorBackend(
     hostname: String,
     cores: Int,
     userClassPath: Seq[URL],
-    env: SparkEnv)
+    env: SparkEnv,
+    resourcesFile: Option[String])
   extends ThreadSafeRpcEndpoint with ExecutorBackend with Logging {
 
+  private implicit val formats = DefaultFormats
+
   private[this] val stopping = new AtomicBoolean(false)
   var executor: Executor = null
   @volatile var driver: Option[RpcEndpointRef] = None
@@ -61,7 +72,7 @@ private[spark] class CoarseGrainedExecutorBackend(
       // This is a very fast action so we can use "ThreadUtils.sameThread"
       driver = Some(ref)
       ref.ask[Boolean](RegisterExecutor(executorId, self, hostname, cores, extractLogUrls,
-        extractAttributes))
+        extractAttributes, parseResources(resourcesFile)))
     }(ThreadUtils.sameThread).onComplete {
       // This is a very fast action so we can use "ThreadUtils.sameThread"
       case Success(msg) =>
@@ -71,6 +82,46 @@ private[spark] class CoarseGrainedExecutorBackend(
     }(ThreadUtils.sameThread)
   }
 
+  // visible for testing
+  def parseResources(resourcesFile: Option[String]): Map[String, ResourceInformation] = {
+    // only parse the resources if a task requires them
+    val taskConfPrefix = SPARK_TASK_RESOURCE_PREFIX
+    val resourceInfo = if (env.conf.getAllWithPrefix(taskConfPrefix).nonEmpty) {
+      val resources = resourcesFile.map { resourceFileStr => {
+        val source = new BufferedInputStream(new FileInputStream(resourceFileStr))
+        val resourceMap = try {
+          val parsedJson = parse(source).asInstanceOf[JArray].arr
+          parsedJson.map(_.extract[ResourceInformation]).map(x => (x.name -> x)).toMap
+        } catch {
+          case e @ (_: MappingException | _: MismatchedInputException | _: ClassCastException) =>
+            throw new SparkException(
+              s"Exception parsing the resources passed in: $resourcesFile", e)
+        } finally {
+          source.close()
+        }
+        resourceMap
+      }}.getOrElse(ResourceDiscoverer.findResources(env.conf, false))
+
+      if (resources.isEmpty) {
+        throw new SparkException(s"User specified resources per task via: $taskConfPrefix," +
+          s" but can't find any resources available on the executor.")
+      }
+      logInfo(s"Executor ${executorId} using resources: ${resources.keys}")
+      if (log.isDebugEnabled) {
+        logDebug("===============================================================================")
+        logDebug("Executor Resources:")
+        resources.foreach{ case (k, v) =>
+          logDebug(s"$k -> [name: ${v.name}, units: ${v.units}, count: ${v.count}," +
+            s" addresses: ${v.addresses.deep}]")}
+        logDebug("===============================================================================")
+      }
+      resources
+    } else {
+      Map.empty[String, ResourceInformation]
+    }
+    resourceInfo
+  }
+
   def extractLogUrls: Map[String, String] = {
     val prefix = "SPARK_LOG_URL_"
     sys.env.filterKeys(_.startsWith(prefix))
@@ -188,13 +239,14 @@ private[spark] object CoarseGrainedExecutorBackend extends Logging {
       cores: Int,
       appId: String,
       workerUrl: Option[String],
-      userClassPath: mutable.ListBuffer[URL])
+      userClassPath: mutable.ListBuffer[URL],
+      resourcesFile: Option[String])
 
   def main(args: Array[String]): Unit = {
     val createFn: (RpcEnv, Arguments, SparkEnv) =>
       CoarseGrainedExecutorBackend = { case (rpcEnv, arguments, env) =>
       new CoarseGrainedExecutorBackend(rpcEnv, arguments.driverUrl, arguments.executorId,
-        arguments.hostname, arguments.cores, arguments.userClassPath, env)
+        arguments.hostname, arguments.cores, arguments.userClassPath, env, arguments.resourcesFile)
     }
     run(parseArguments(args, this.getClass.getCanonicalName.stripSuffix("$")), createFn)
     System.exit(0)
@@ -255,6 +307,7 @@ private[spark] object CoarseGrainedExecutorBackend extends Logging {
     var executorId: String = null
     var hostname: String = null
     var cores: Int = 0
+    var resourcesFile: Option[String] = None
     var appId: String = null
     var workerUrl: Option[String] = None
     val userClassPath = new mutable.ListBuffer[URL]()
@@ -274,6 +327,9 @@ private[spark] object CoarseGrainedExecutorBackend extends Logging {
         case ("--cores") :: value :: tail =>
           cores = value.toInt
           argv = tail
+        case ("--resourcesFile") :: value :: tail =>
+          resourcesFile = Some(value)
+          argv = tail
         case ("--app-id") :: value :: tail =>
           appId = value
           argv = tail
@@ -299,7 +355,7 @@ private[spark] object CoarseGrainedExecutorBackend extends Logging {
     }
 
     Arguments(driverUrl, executorId, hostname, cores, appId, workerUrl,
-      userClassPath)
+      userClassPath, resourcesFile)
   }
 
   private def printUsageAndExit(classNameForEntry: String): Unit = {
@@ -313,6 +369,7 @@ private[spark] object CoarseGrainedExecutorBackend extends Logging {
       |   --executor-id <executorId>
       |   --hostname <hostname>
       |   --cores <cores>
+      |   --resourcesFile <fileWithJSONResourceInformation>
       |   --app-id <appid>
       |   --worker-url <workerUrl>
       |   --user-class-path <url>

diff --git a/core/src/main/scala/org/apache/spark/internal/config/package.scala b/core/src/main/scala/org/apache/spark/internal/config/package.scala
@@ -30,6 +30,13 @@ import org.apache.spark.util.collection.unsafe.sort.UnsafeSorterSpillReader.MAX_
 
 package object config {
 
+  private[spark] val SPARK_DRIVER_RESOURCE_PREFIX = "spark.driver.resource."
+  private[spark] val SPARK_EXECUTOR_RESOURCE_PREFIX = "spark.executor.resource."
+  private[spark] val SPARK_TASK_RESOURCE_PREFIX = "spark.task.resource."
+
+  private[spark] val SPARK_RESOURCE_COUNT_POSTFIX = ".count"
+  private[spark] val SPARK_RESOURCE_DISCOVERY_SCRIPT_POSTFIX = ".discoveryScript"
+
   private[spark] val DRIVER_CLASS_PATH =
     ConfigBuilder(SparkLauncher.DRIVER_EXTRA_CLASSPATH).stringConf.createOptional
 

diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedClusterMessage.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedClusterMessage.scala
@@ -19,6 +19,7 @@ package org.apache.spark.scheduler.cluster
 
 import java.nio.ByteBuffer
 
+import org.apache.spark.ResourceInformation
 import org.apache.spark.TaskState.TaskState
 import org.apache.spark.rpc.RpcEndpointRef
 import org.apache.spark.scheduler.ExecutorLossReason
@@ -64,7 +65,8 @@ private[spark] object CoarseGrainedClusterMessages {
       hostname: String,
       cores: Int,
       logUrls: Map[String, String],
-      attributes: Map[String, String])
+      attributes: Map[String, String],
+      resources: Map[String, ResourceInformation])
     extends CoarseGrainedClusterMessage
 
   case class StatusUpdate(executorId: String, taskId: Long, state: TaskState,

diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala
@@ -185,7 +185,8 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp
 
     override def receiveAndReply(context: RpcCallContext): PartialFunction[Any, Unit] = {
 
-      case RegisterExecutor(executorId, executorRef, hostname, cores, logUrls, attributes) =>
+      case RegisterExecutor(executorId, executorRef, hostname, cores, logUrls,
+          attributes, resources) =>
         if (executorDataMap.contains(executorId)) {
           executorRef.send(RegisterExecutorFailed("Duplicate executor ID: " + executorId))
           context.reply(true)

diff --git a/core/src/test/scala/org/apache/spark/HeartbeatReceiverSuite.scala b/core/src/test/scala/org/apache/spark/HeartbeatReceiverSuite.scala
@@ -174,9 +174,11 @@ class HeartbeatReceiverSuite
     val dummyExecutorEndpointRef1 = rpcEnv.setupEndpoint("fake-executor-1", dummyExecutorEndpoint1)
     val dummyExecutorEndpointRef2 = rpcEnv.setupEndpoint("fake-executor-2", dummyExecutorEndpoint2)
     fakeSchedulerBackend.driverEndpoint.askSync[Boolean](
-      RegisterExecutor(executorId1, dummyExecutorEndpointRef1, "1.2.3.4", 0, Map.empty, Map.empty))
+      RegisterExecutor(executorId1, dummyExecutorEndpointRef1, "1.2.3.4", 0, Map.empty, Map.empty,
+        Map.empty))
     fakeSchedulerBackend.driverEndpoint.askSync[Boolean](
-      RegisterExecutor(executorId2, dummyExecutorEndpointRef2, "1.2.3.5", 0, Map.empty, Map.empty))
+      RegisterExecutor(executorId2, dummyExecutorEndpointRef2, "1.2.3.5", 0, Map.empty, Map.empty,
+        Map.empty))
     heartbeatReceiverRef.askSync[Boolean](TaskSchedulerIsSet)
     addExecutorAndVerify(executorId1)
     addExecutorAndVerify(executorId2)