-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-27366][CORE] Support GPU Resources in Spark job scheduling #24374
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
2494e77
46ceedd
f277572
3b5c5c7
d116591
9a2601f
5e36aa0
41e9440
d3f4a03
26126fc
bbac893
10d5fab
7844e5c
04fa380
dcc147e
cd01cae
e539097
82cd1e3
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -507,6 +507,15 @@ class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging with Seria | |
| } | ||
| } | ||
|
|
||
| /** | ||
| * Get task resource requirements. | ||
| */ | ||
| private[spark] def getTaskResourceRequirements(): Map[String, Int] = { | ||
|
||
| getAllWithPrefix(SPARK_TASK_RESOURCE_PREFIX) | ||
| .withFilter { case (k, v) => k.endsWith(SPARK_RESOURCE_COUNT_SUFFIX)} | ||
| .map { case (k, v) => (k.dropRight(SPARK_RESOURCE_COUNT_SUFFIX.length), v.toInt)}.toMap | ||
| } | ||
|
|
||
| /** | ||
| * Checks for illegal or deprecated config settings. Throws an exception for the former. Not | ||
| * idempotent - may mutate this conf object to convert deprecated settings to supported ones. | ||
|
|
@@ -603,30 +612,6 @@ class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging with Seria | |
| require(executorTimeoutThresholdMs > executorHeartbeatIntervalMs, "The value of " + | ||
| s"${networkTimeout}=${executorTimeoutThresholdMs}ms must be no less than the value of " + | ||
| s"${EXECUTOR_HEARTBEAT_INTERVAL.key}=${executorHeartbeatIntervalMs}ms.") | ||
|
|
||
| // Make sure the executor resources were specified and are large enough if | ||
| // any task resources were specified. | ||
| val taskResourcesAndCount = | ||
| getAllWithPrefixAndSuffix(SPARK_TASK_RESOURCE_PREFIX, SPARK_RESOURCE_COUNT_SUFFIX).toMap | ||
| val executorResourcesAndCounts = | ||
| getAllWithPrefixAndSuffix(SPARK_EXECUTOR_RESOURCE_PREFIX, SPARK_RESOURCE_COUNT_SUFFIX).toMap | ||
|
|
||
| taskResourcesAndCount.foreach { case (rName, taskCount) => | ||
| val execCount = executorResourcesAndCounts.get(rName).getOrElse( | ||
| throw new SparkException( | ||
| s"The executor resource config: " + | ||
| s"${SPARK_EXECUTOR_RESOURCE_PREFIX + rName + SPARK_RESOURCE_COUNT_SUFFIX} " + | ||
| "needs to be specified since a task requirement config: " + | ||
| s"${SPARK_TASK_RESOURCE_PREFIX + rName + SPARK_RESOURCE_COUNT_SUFFIX} was specified") | ||
| ) | ||
| if (execCount.toLong < taskCount.toLong) { | ||
| throw new SparkException( | ||
| s"The executor resource config: " + | ||
| s"${SPARK_EXECUTOR_RESOURCE_PREFIX + rName + SPARK_RESOURCE_COUNT_SUFFIX} " + | ||
| s"= $execCount has to be >= the task config: " + | ||
| s"${SPARK_TASK_RESOURCE_PREFIX + rName + SPARK_RESOURCE_COUNT_SUFFIX} = $taskCount") | ||
| } | ||
| } | ||
| } | ||
|
|
||
| /** | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -2707,27 +2707,73 @@ object SparkContext extends Logging { | |
| // When running locally, don't try to re-execute tasks on failure. | ||
| val MAX_LOCAL_TASK_FAILURES = 1 | ||
|
|
||
| // SPARK-26340: Ensure that executor's core num meets at least one task requirement. | ||
| def checkCpusPerTask( | ||
| clusterMode: Boolean, | ||
| maxCoresPerExecutor: Option[Int]): Unit = { | ||
| val cpusPerTask = sc.conf.get(CPUS_PER_TASK) | ||
| if (clusterMode && sc.conf.contains(EXECUTOR_CORES)) { | ||
| if (sc.conf.get(EXECUTOR_CORES) < cpusPerTask) { | ||
| throw new SparkException(s"${CPUS_PER_TASK.key}" + | ||
| s" must be <= ${EXECUTOR_CORES.key} when run on $master.") | ||
| // Ensure that executor's resources satisfies one or more tasks requirement. | ||
| def checkResourcesPerTask(clusterMode: Boolean, executorCores: Option[Int]): Unit = { | ||
| val taskCores = sc.conf.get(CPUS_PER_TASK) | ||
| val execCores = if (clusterMode) { | ||
| executorCores.getOrElse(sc.conf.get(EXECUTOR_CORES)) | ||
| } else { | ||
| executorCores.get | ||
| } | ||
|
|
||
| // Number of cores per executor must meet at least one task requirement. | ||
| if (execCores < taskCores) { | ||
| throw new SparkException(s"The number of cores per executor (=$execCores) has to be >= " + | ||
| s"the task config: ${CPUS_PER_TASK.key} = $taskCores when run on $master.") | ||
| } | ||
|
|
||
| // Calculate the max slots each executor can provide based on resources available on each | ||
| // executor and resources required by each task. | ||
| val taskResourcesAndCount = sc.conf.getTaskResourceRequirements() | ||
| val executorResourcesAndCounts = sc.conf.getAllWithPrefixAndSuffix( | ||
| SPARK_EXECUTOR_RESOURCE_PREFIX, SPARK_RESOURCE_COUNT_SUFFIX).toMap | ||
| var numSlots = execCores / taskCores | ||
| var limitingResourceName = "CPU" | ||
| taskResourcesAndCount.foreach { case (rName, taskCount) => | ||
| // Make sure the executor resources were specified through config. | ||
| val execCount = executorResourcesAndCounts.getOrElse(rName, | ||
| throw new SparkException( | ||
| s"The executor resource config: " + | ||
| s"${SPARK_EXECUTOR_RESOURCE_PREFIX + rName + SPARK_RESOURCE_COUNT_SUFFIX} " + | ||
| "needs to be specified since a task requirement config: " + | ||
| s"${SPARK_TASK_RESOURCE_PREFIX + rName + SPARK_RESOURCE_COUNT_SUFFIX} was specified") | ||
| ) | ||
| // Make sure the executor resources are large enough to launch at least one task. | ||
| if (execCount.toLong < taskCount.toLong) { | ||
| throw new SparkException( | ||
| s"The executor resource config: " + | ||
| s"${SPARK_EXECUTOR_RESOURCE_PREFIX + rName + SPARK_RESOURCE_COUNT_SUFFIX} " + | ||
| s"= $execCount has to be >= the task config: " + | ||
| s"${SPARK_TASK_RESOURCE_PREFIX + rName + SPARK_RESOURCE_COUNT_SUFFIX} = $taskCount") | ||
|
||
| } | ||
| // Compare and update the max slots each executor can provide. | ||
| val resourceNumSlots = execCount.toInt / taskCount | ||
| if (resourceNumSlots < numSlots) { | ||
| numSlots = resourceNumSlots | ||
| limitingResourceName = rName | ||
| } | ||
| } else if (maxCoresPerExecutor.isDefined) { | ||
| if (maxCoresPerExecutor.get < cpusPerTask) { | ||
| throw new SparkException(s"Only ${maxCoresPerExecutor.get} cores available per executor" + | ||
| s" when run on $master, and ${CPUS_PER_TASK.key} must be <= it.") | ||
| } | ||
| // There have been checks above to make sure the executor resources were specified and are | ||
| // large enough if any task resources were specified. | ||
| taskResourcesAndCount.foreach { case (rName, taskCount) => | ||
| val execCount = executorResourcesAndCounts(rName) | ||
|
||
| if (taskCount.toInt * numSlots < execCount.toInt) { | ||
| val message = s"The configuration of resource: $rName (exec = ${execCount.toInt}, " + | ||
| s"task = ${taskCount}) will result in wasted resources due to resource " + | ||
| s"${limitingResourceName} limiting the number of runnable tasks per executor to: " + | ||
| s"${numSlots}. Please adjust your configuration." | ||
| if (Utils.isTesting) { | ||
| throw new SparkException(message) | ||
| } else { | ||
| logWarning(message) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Originally we talked about throwing here to not allow it — just want to make sure we intentionally changed our mind here? I'm really OK either way we go, as there were some people questioning this on the SPIP.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Since we now have
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I prefer a warning because the discovery script might return more and it is out of user's control. And available resources might not happen to be a multiple of task requested counts. For example, you have 32 CPU Cores and 3 GPUs. |
||
| } | ||
| } | ||
| } | ||
| } | ||
|
|
||
| master match { | ||
| case "local" => | ||
| checkCpusPerTask(clusterMode = false, Some(1)) | ||
| checkResourcesPerTask(clusterMode = false, Some(1)) | ||
| val scheduler = new TaskSchedulerImpl(sc, MAX_LOCAL_TASK_FAILURES, isLocal = true) | ||
| val backend = new LocalSchedulerBackend(sc.getConf, scheduler, 1) | ||
| scheduler.initialize(backend) | ||
|
|
@@ -2740,7 +2786,7 @@ object SparkContext extends Logging { | |
| if (threadCount <= 0) { | ||
| throw new SparkException(s"Asked to run locally with $threadCount threads") | ||
| } | ||
| checkCpusPerTask(clusterMode = false, Some(threadCount)) | ||
| checkResourcesPerTask(clusterMode = false, Some(threadCount)) | ||
| val scheduler = new TaskSchedulerImpl(sc, MAX_LOCAL_TASK_FAILURES, isLocal = true) | ||
| val backend = new LocalSchedulerBackend(sc.getConf, scheduler, threadCount) | ||
| scheduler.initialize(backend) | ||
|
|
@@ -2751,22 +2797,22 @@ object SparkContext extends Logging { | |
| // local[*, M] means the number of cores on the computer with M failures | ||
| // local[N, M] means exactly N threads with M failures | ||
| val threadCount = if (threads == "*") localCpuCount else threads.toInt | ||
| checkCpusPerTask(clusterMode = false, Some(threadCount)) | ||
| checkResourcesPerTask(clusterMode = false, Some(threadCount)) | ||
| val scheduler = new TaskSchedulerImpl(sc, maxFailures.toInt, isLocal = true) | ||
| val backend = new LocalSchedulerBackend(sc.getConf, scheduler, threadCount) | ||
| scheduler.initialize(backend) | ||
| (backend, scheduler) | ||
|
|
||
| case SPARK_REGEX(sparkUrl) => | ||
| checkCpusPerTask(clusterMode = true, None) | ||
| checkResourcesPerTask(clusterMode = true, None) | ||
| val scheduler = new TaskSchedulerImpl(sc) | ||
| val masterUrls = sparkUrl.split(",").map("spark://" + _) | ||
| val backend = new StandaloneSchedulerBackend(scheduler, sc, masterUrls) | ||
| scheduler.initialize(backend) | ||
| (backend, scheduler) | ||
|
|
||
| case LOCAL_CLUSTER_REGEX(numSlaves, coresPerSlave, memoryPerSlave) => | ||
| checkCpusPerTask(clusterMode = true, Some(coresPerSlave.toInt)) | ||
| checkResourcesPerTask(clusterMode = true, Some(coresPerSlave.toInt)) | ||
| // Check to make sure memory requested <= memoryPerSlave. Otherwise Spark will just hang. | ||
| val memoryPerSlaveInt = memoryPerSlave.toInt | ||
| if (sc.executorMemory > memoryPerSlaveInt) { | ||
|
|
@@ -2787,7 +2833,7 @@ object SparkContext extends Logging { | |
| (backend, scheduler) | ||
|
|
||
| case masterUrl => | ||
| checkCpusPerTask(clusterMode = true, None) | ||
| checkResourcesPerTask(clusterMode = true, None) | ||
| val cm = getClusterManager(masterUrl) match { | ||
| case Some(clusterMgr) => clusterMgr | ||
| case None => throw new SparkException("Could not parse Master URL: '" + master + "'") | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -20,7 +20,7 @@ package org.apache.spark | |
| import java.io.Serializable | ||
| import java.util.Properties | ||
|
|
||
| import org.apache.spark.annotation.DeveloperApi | ||
| import org.apache.spark.annotation.{DeveloperApi, Evolving} | ||
| import org.apache.spark.executor.TaskMetrics | ||
| import org.apache.spark.memory.TaskMemoryManager | ||
| import org.apache.spark.metrics.source.Source | ||
|
|
@@ -176,6 +176,13 @@ abstract class TaskContext extends Serializable { | |
| */ | ||
| def getLocalProperty(key: String): String | ||
|
|
||
| /** | ||
| * Resources allocated to the task. The key is the resource name and the value is information | ||
| * about the resource. Please refer to [[ResourceInformation]] for specifics. | ||
| */ | ||
| @Evolving | ||
| def resources(): Map[String, ResourceInformation] | ||
|
||
|
|
||
| @DeveloperApi | ||
| def taskMetrics(): TaskMetrics | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,101 @@ | ||
| /* | ||
| * Licensed to the Apache Software Foundation (ASF) under one or more | ||
| * contributor license agreements. See the NOTICE file distributed with | ||
| * this work for additional information regarding copyright ownership. | ||
| * The ASF licenses this file to You under the Apache License, Version 2.0 | ||
| * (the "License"); you may not use this file except in compliance with | ||
| * the License. You may obtain a copy of the License at | ||
| * | ||
| * http://www.apache.org/licenses/LICENSE-2.0 | ||
| * | ||
| * Unless required by applicable law or agreed to in writing, software | ||
| * distributed under the License is distributed on an "AS IS" BASIS, | ||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| * See the License for the specific language governing permissions and | ||
| * limitations under the License. | ||
| */ | ||
|
|
||
| package org.apache.spark.scheduler | ||
|
|
||
| import scala.collection.mutable | ||
|
|
||
| import org.apache.spark.SparkException | ||
| import org.apache.spark.util.collection.OpenHashMap | ||
|
|
||
| /** | ||
| * Class to hold information about a type of Resource on an Executor. This information is managed | ||
| * by SchedulerBackend, and TaskScheduler shall schedule tasks on idle Executors based on the | ||
| * information. | ||
| * Please note that this class is intended to be used in a single thread. | ||
| * @param name Resource name | ||
| * @param addresses Resource addresses provided by the executor | ||
| */ | ||
| private[spark] class ExecutorResourceInfo( | ||
| val name: String, | ||
| addresses: Seq[String]) extends Serializable { | ||
|
|
||
| /** | ||
| * Map from an address to its availability, the value `true` means the address is available, | ||
| * while value `false` means the address is assigned. | ||
| * TODO Use [[OpenHashMap]] instead to gain better performance. | ||
| */ | ||
| private val addressAvailabilityMap = mutable.HashMap(addresses.map(_ -> true): _*) | ||
|
|
||
| /** | ||
| * Sequence of currently available resource addresses. | ||
| */ | ||
| def availableAddrs: Seq[String] = addressAvailabilityMap.flatMap { case (addr, available) => | ||
| if (available) Some(addr) else None | ||
| }.toSeq | ||
|
|
||
| /** | ||
| * Sequence of currently assigned resource addresses. | ||
| * Exposed for testing only. | ||
| */ | ||
| private[scheduler] def assignedAddrs: Seq[String] = addressAvailabilityMap | ||
| .flatMap { case (addr, available) => | ||
| if (!available) Some(addr) else None | ||
| }.toSeq | ||
|
|
||
| /** | ||
| * Acquire a sequence of resource addresses (to a launched task), these addresses must be | ||
| * available. When the task finishes, it will return the acquired resource addresses. | ||
| * Throw an Exception if an address is not available or doesn't exist. | ||
| */ | ||
| def acquire(addrs: Seq[String]): Unit = { | ||
| addrs.foreach { address => | ||
| if (!addressAvailabilityMap.contains(address)) { | ||
| throw new SparkException(s"Try to acquire an address that doesn't exist. $name address " + | ||
| s"$address doesn't exist.") | ||
| } | ||
| val isAvailable = addressAvailabilityMap(address) | ||
| if (isAvailable) { | ||
| addressAvailabilityMap(address) = false | ||
| } else { | ||
| throw new SparkException(s"Try to acquire an address that is not available. $name " + | ||
| s"address $address is not available.") | ||
| } | ||
| } | ||
| } | ||
|
|
||
| /** | ||
| * Release a sequence of resource addresses, these addresses must have been assigned. Resource | ||
| * addresses are released when a task has finished. | ||
| * Throw an Exception if an address is not assigned or doesn't exist. | ||
| */ | ||
| def release(addrs: Seq[String]): Unit = { | ||
| addrs.foreach { address => | ||
| if (!addressAvailabilityMap.contains(address)) { | ||
| throw new SparkException(s"Try to release an address that doesn't exist. $name address " + | ||
| s"$address doesn't exist.") | ||
| } | ||
| val isAvailable = addressAvailabilityMap(address) | ||
| if (!isAvailable) { | ||
| addressAvailabilityMap(address) = true | ||
| } else { | ||
| throw new SparkException(s"Try to release an address that is not assigned. $name " + | ||
| s"address $address is not assigned.") | ||
| } | ||
| } | ||
| } | ||
| } |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We lost the check to make sure the executor resources are a multiple of the task requirements — do you want to add that back?
note I added a check (https://github.com/apache/spark/blob/master/core/src/main/scala/org/apache/spark/SparkConf.scala#L609) to make sure they were large enough but not that it was an exact fit
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Will add it back later
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
added back in SparkContext.checkResourcesPerTask()