-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-39853][CORE] Support stage level task resource profile for standalone cluster when dynamic allocation disabled #37268
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
d528ac3
0afe9d2
5e456c8
5b2dbb4
844ef71
87726cf
9aae110
0530afc
e01aaae
fb6628d
c4c38fd
bc4a606
a7cc0c5
db47cd4
2fc57c8
31c7693
ed65f0d
0f559a4
21d01ce
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -24,7 +24,7 @@ import javax.annotation.concurrent.GuardedBy | |
| import scala.collection.JavaConverters._ | ||
| import scala.collection.mutable | ||
|
|
||
| import org.apache.spark.{SparkConf, SparkContext, SparkException} | ||
| import org.apache.spark.{SparkConf, SparkContext, SparkEnv, SparkException} | ||
| import org.apache.spark.annotation.{Evolving, Since} | ||
| import org.apache.spark.internal.Logging | ||
| import org.apache.spark.internal.config._ | ||
|
|
@@ -94,6 +94,15 @@ class ResourceProfile( | |
| executorResources.get(ResourceProfile.MEMORY).map(_.amount) | ||
| } | ||
|
|
||
| private[spark] def getCustomTaskResources(): Map[String, TaskResourceRequest] = { | ||
| taskResources.filterKeys(k => !k.equals(ResourceProfile.CPUS)).toMap | ||
| } | ||
|
|
||
| protected[spark] def getCustomExecutorResources(): Map[String, ExecutorResourceRequest] = { | ||
| executorResources. | ||
| filterKeys(k => !ResourceProfile.allSupportedExecutorResources.contains(k)).toMap | ||
| } | ||
|
|
||
| /* | ||
| * This function takes into account fractional amounts for the task resource requirement. | ||
| * Spark only supports fractional amounts < 1 to basically allow for multiple tasks | ||
|
|
@@ -182,8 +191,8 @@ class ResourceProfile( | |
| val numPartsPerResourceMap = new mutable.HashMap[String, Int] | ||
| numPartsPerResourceMap(ResourceProfile.CORES) = 1 | ||
| val taskResourcesToCheck = new mutable.HashMap[String, TaskResourceRequest] | ||
| taskResourcesToCheck ++= ResourceProfile.getCustomTaskResources(this) | ||
| val execResourceToCheck = ResourceProfile.getCustomExecutorResources(this) | ||
| taskResourcesToCheck ++= this.getCustomTaskResources() | ||
| val execResourceToCheck = this.getCustomExecutorResources() | ||
| execResourceToCheck.foreach { case (rName, execReq) => | ||
| val taskReq = taskResources.get(rName).map(_.amount).getOrElse(0.0) | ||
| numPartsPerResourceMap(rName) = 1 | ||
|
|
@@ -242,7 +251,8 @@ class ResourceProfile( | |
|
|
||
| // check that the task resources and executor resources are equal, but id's could be different | ||
| private[spark] def resourcesEqual(rp: ResourceProfile): Boolean = { | ||
| rp.taskResources == taskResources && rp.executorResources == executorResources | ||
| rp.taskResources == taskResources && rp.executorResources == executorResources && | ||
| rp.getClass == this.getClass | ||
| } | ||
|
|
||
| override def hashCode(): Int = Seq(taskResources, executorResources).hashCode() | ||
|
|
@@ -253,6 +263,40 @@ class ResourceProfile( | |
| } | ||
| } | ||
|
|
||
| /** | ||
| * Resource profile which only contains task resources, can be used for stage level task schedule | ||
| * when dynamic allocation is disabled, tasks will be scheduled to executors with default resource | ||
| * profile based on task resources described by this task resource profile. | ||
| * And when dynamic allocation is enabled, will require new executors for this profile based on | ||
| * the default executor resources requested at startup and assign tasks only on executors created | ||
| * with this resource profile. | ||
| * | ||
| * @param taskResources Resource requests for tasks. Mapped from the resource | ||
| * name (e.g., cores, memory, CPU) to its specific request. | ||
| */ | ||
| @Evolving | ||
| @Since("3.4.0") | ||
| private[spark] class TaskResourceProfile( | ||
| override val taskResources: Map[String, TaskResourceRequest]) | ||
| extends ResourceProfile(Map.empty, taskResources) { | ||
|
|
||
| override protected[spark] def getCustomExecutorResources() | ||
| : Map[String, ExecutorResourceRequest] = { | ||
| if (SparkEnv.get == null) { | ||
| // This will be called in standalone master when dynamic allocation enabled. | ||
| return super.getCustomExecutorResources() | ||
| } | ||
|
|
||
| val sparkConf = SparkEnv.get.conf | ||
| if (!Utils.isDynamicAllocationEnabled(sparkConf)) { | ||
| ResourceProfile.getOrCreateDefaultProfile(sparkConf) | ||
| .getCustomExecutorResources() | ||
| } else { | ||
| super.getCustomExecutorResources() | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yes, it is. And that's for compatibility when dynamic allocation is enabled. |
||
| } | ||
| } | ||
| } | ||
|
|
||
| object ResourceProfile extends Logging { | ||
| // task resources | ||
| /** | ||
|
|
@@ -393,17 +437,6 @@ object ResourceProfile extends Logging { | |
| } | ||
| } | ||
|
|
||
| private[spark] def getCustomTaskResources( | ||
| rp: ResourceProfile): Map[String, TaskResourceRequest] = { | ||
| rp.taskResources.filterKeys(k => !k.equals(ResourceProfile.CPUS)).toMap | ||
| } | ||
|
|
||
| private[spark] def getCustomExecutorResources( | ||
| rp: ResourceProfile): Map[String, ExecutorResourceRequest] = { | ||
| rp.executorResources. | ||
| filterKeys(k => !ResourceProfile.allSupportedExecutorResources.contains(k)).toMap | ||
| } | ||
|
|
||
| /* | ||
| * Get the number of cpus per task if its set in the profile, otherwise return the | ||
| * cpus per task for the default profile. | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -59,35 +59,67 @@ private[spark] class ResourceProfileManager(sparkConf: SparkConf, | |
| private val testExceptionThrown = sparkConf.get(RESOURCE_PROFILE_MANAGER_TESTING) | ||
|
|
||
| /** | ||
| * If we use anything except the default profile, it's only supported on YARN and Kubernetes | ||
| * with dynamic allocation enabled. Throw an exception if not supported. | ||
| * If we use anything except the default profile, it's supported on YARN, Kubernetes and | ||
| * Standalone with dynamic allocation enabled, and task resource profile with dynamic allocation | ||
| * disabled on Standalone. Throw an exception if not supported. | ||
| */ | ||
| private[spark] def isSupported(rp: ResourceProfile): Boolean = { | ||
| val isNotDefaultProfile = rp.id != ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID | ||
| val notYarnOrK8sOrStandaloneAndNotDefaultProfile = | ||
| isNotDefaultProfile && !(isYarn || isK8s || isStandalone) | ||
| val YarnOrK8sOrStandaloneNotDynAllocAndNotDefaultProfile = | ||
| isNotDefaultProfile && (isYarn || isK8s || isStandalone) && !dynamicEnabled | ||
| // We want the exception to be thrown only when we are specifically testing for the | ||
| // exception or in a real application. Otherwise in all other testing scenarios we want | ||
| // to skip throwing the exception so that we can test in other modes to make testing easier. | ||
| if ((notRunningUnitTests || testExceptionThrown) && | ||
| if (rp.isInstanceOf[TaskResourceProfile] && !dynamicEnabled) { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Does it mean
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Hey @Ngone51, thanks for the feedback, and for your concerns:
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yes, it can be used with dynamic allocation; in that case it uses the default resource profile executor resources, but it must acquire new executors. The TaskResourceProfile gets a unique rpid just like a standard resource profile, and it should go through the same path to get executors via dynamic allocation like a normal ResourceProfile (i.e., stage submitted kicks off). Is there something I'm not thinking about here?
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
So it should be the default resource profile executor resources but not the default rp id? Then, it makes sense to me.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yes, when dynamic allocation is enabled, it is just like a normal resource profile with a unique id, requesting executors based on the default executor resources requirement. |
||
| if ((notRunningUnitTests || testExceptionThrown) && !isStandalone) { | ||
| throw new SparkException("TaskResourceProfiles are only supported for Standalone " + | ||
| "cluster for now when dynamic allocation is disabled.") | ||
| } | ||
| } else { | ||
| val isNotDefaultProfile = rp.id != ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID | ||
| val notYarnOrK8sOrStandaloneAndNotDefaultProfile = | ||
| isNotDefaultProfile && !(isYarn || isK8s || isStandalone) | ||
| val YarnOrK8sOrStandaloneNotDynAllocAndNotDefaultProfile = | ||
| isNotDefaultProfile && (isYarn || isK8s || isStandalone) && !dynamicEnabled | ||
|
|
||
| // We want the exception to be thrown only when we are specifically testing for the | ||
| // exception or in a real application. Otherwise in all other testing scenarios we want | ||
| // to skip throwing the exception so that we can test in other modes to make testing easier. | ||
| if ((notRunningUnitTests || testExceptionThrown) && | ||
| (notYarnOrK8sOrStandaloneAndNotDefaultProfile || | ||
| YarnOrK8sOrStandaloneNotDynAllocAndNotDefaultProfile)) { | ||
| throw new SparkException("ResourceProfiles are only supported on YARN and Kubernetes " + | ||
| "and Standalone with dynamic allocation enabled.") | ||
| } | ||
| throw new SparkException("ResourceProfiles are only supported on YARN and Kubernetes " + | ||
| "and Standalone with dynamic allocation enabled.") | ||
| } | ||
|
|
||
| if (isStandalone && rp.getExecutorCores.isEmpty && | ||
| sparkConf.getOption(config.EXECUTOR_CORES.key).isEmpty) { | ||
| logWarning("Neither executor cores is set for resource profile, nor spark.executor.cores " + | ||
| "is explicitly set, you may get more executors allocated than expected. It's recommended " + | ||
| "to set executor cores explicitly. Please check SPARK-30299 for more details.") | ||
| if (isStandalone && dynamicEnabled && rp.getExecutorCores.isEmpty && | ||
| sparkConf.getOption(config.EXECUTOR_CORES.key).isEmpty) { | ||
| logWarning("Neither executor cores is set for resource profile, nor spark.executor.cores " + | ||
| "is explicitly set, you may get more executors allocated than expected. " + | ||
| "It's recommended to set executor cores explicitly. " + | ||
| "Please check SPARK-30299 for more details.") | ||
| } | ||
| } | ||
|
|
||
| true | ||
| } | ||
|
|
||
| /** | ||
| * Check whether a task with specific taskRpId can be scheduled to executors | ||
| * with executorRpId. | ||
| * | ||
| * Here are the rules: | ||
| * 1. When dynamic allocation is disabled, only [[TaskResourceProfile]] is supported, | ||
| * and tasks with [[TaskResourceProfile]] can be scheduled to executors with default | ||
| * resource profile. | ||
| * 2. For other scenarios(when dynamic allocation is enabled), tasks can be scheduled to | ||
| * executors where resource profile exactly matches. | ||
| */ | ||
| private[spark] def canBeScheduled(taskRpId: Int, executorRpId: Int): Boolean = { | ||
| assert(resourceProfileIdToResourceProfile.contains(taskRpId) && | ||
| resourceProfileIdToResourceProfile.contains(executorRpId), | ||
| "Tasks and executors must have valid resource profile id") | ||
| val taskRp = resourceProfileFromId(taskRpId) | ||
|
|
||
| // When dynamic allocation disabled, tasks with TaskResourceProfile can always reuse | ||
| // all the executors with default resource profile. | ||
| taskRpId == executorRpId || (!dynamicEnabled && taskRp.isInstanceOf[TaskResourceProfile]) | ||
| } | ||
|
|
||
| def addResourceProfile(rp: ResourceProfile): Unit = { | ||
| isSupported(rp) | ||
| var putNewProfile = false | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.