Changes to support executor recovery behavior during static allocation. #244
@@ -17,14 +17,18 @@
 package org.apache.spark.scheduler.cluster.kubernetes

 import java.io.File
+import java.util.concurrent.{ThreadFactory, ThreadPoolExecutor}

 import com.google.common.base.Charsets
 import com.google.common.io.Files
 import io.fabric8.kubernetes.client.{Config, ConfigBuilder, DefaultKubernetesClient}
+import io.fabric8.kubernetes.client.utils.HttpClientUtils
+import okhttp3.Dispatcher

 import org.apache.spark.SparkConf
 import org.apache.spark.deploy.kubernetes.config._
 import org.apache.spark.deploy.kubernetes.constants._
+import org.apache.spark.util.ThreadUtils

 private[spark] class KubernetesClientBuilder(sparkConf: SparkConf, namespace: String) {
   private val SERVICE_ACCOUNT_TOKEN = new File(Config.KUBERNETES_SERVICE_ACCOUNT_TOKEN_PATH)
@@ -78,6 +82,15 @@ private[spark] class KubernetesClientBuilder(sparkConf: SparkConf, namespace: St
       }
       serviceAccountConfigBuilder
     }
-    new DefaultKubernetesClient(configBuilder.build)
+    // Disable the ping thread that is not daemon, in order to allow
+    // the driver main thread to shut down upon errors. Otherwise, the driver
+    // will hang indefinitely.
+    val config = configBuilder
+      .withWebsocketPingInterval(0)
Review comment: Why are we changing the websocket ping interval here?
Reply: This is to work around a bug in the web socket ping thread, which is created as a non-daemon thread and leaves the driver hanging if an exception is thrown in the driver main thread. See the comment at lines 85-87. More details at the PR 216 comment with the code snippet of how the web socket ping thread is created.
+      .build()
+    val httpClient = HttpClientUtils.createHttpClient(config).newBuilder()
+      .dispatcher(new Dispatcher(ThreadUtils.newDaemonCachedThreadPool("spark-on-k8s")))
+      .build()
+    new DefaultKubernetesClient(httpClient, config)
   }
 }
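The daemon-dispatcher half of this fix is worth spelling out. A minimal sketch of the underlying idea using only JDK APIs (the thread name is illustrative; Spark's ThreadUtils.newDaemonCachedThreadPool wraps essentially this pattern):

import java.util.concurrent.{Executors, ThreadFactory}

// A ThreadFactory that marks every thread as daemon, so background HTTP
// dispatch work can never keep the JVM alive after the driver's main
// thread has exited.
val daemonFactory = new ThreadFactory {
  override def newThread(r: Runnable): Thread = {
    val t = new Thread(r, "spark-on-k8s-dispatcher")
    t.setDaemon(true) // daemon threads do not block JVM shutdown
    t
  }
}
val dispatcherPool = Executors.newCachedThreadPool(daemonFactory)

Passing such a pool to okhttp3.Dispatcher, as the diff does, means the client's worker threads die with the driver instead of pinning the process.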
@@ -16,30 +16,39 @@
  */
 package org.apache.spark.scheduler.cluster.kubernetes

-import java.util.concurrent.atomic.{AtomicInteger, AtomicLong}
+import java.io.Closeable
+import java.util.concurrent.TimeUnit
+import java.util.concurrent.atomic.{AtomicInteger, AtomicLong, AtomicReference}

-import io.fabric8.kubernetes.api.model.{ContainerPortBuilder, EnvVarBuilder,
-  EnvVarSourceBuilder, Pod, QuantityBuilder}
 import scala.collection.JavaConverters._
+import scala.collection.mutable
 import scala.concurrent.{ExecutionContext, Future}

+import io.fabric8.kubernetes.api.model._
+import io.fabric8.kubernetes.client.{KubernetesClientException, Watcher}
+import io.fabric8.kubernetes.client.Watcher.Action

 import org.apache.spark.{SparkContext, SparkException}
 import org.apache.spark.deploy.kubernetes.config._
 import org.apache.spark.deploy.kubernetes.constants._
-import org.apache.spark.rpc.RpcEndpointAddress
-import org.apache.spark.scheduler.TaskSchedulerImpl
+import org.apache.spark.rpc.{RpcAddress, RpcEndpointAddress, RpcEnv}
+import org.apache.spark.scheduler._
 import org.apache.spark.scheduler.cluster.CoarseGrainedSchedulerBackend
 import org.apache.spark.util.{ThreadUtils, Utils}

-private[spark] class KubernetesClusterSchedulerBackend(
-    scheduler: TaskSchedulerImpl,
-    val sc: SparkContext)
+private[spark] class KubernetesClusterSchedulerBackend(scheduler: TaskSchedulerImpl,
+                                                       val sc: SparkContext)
Review comment: The argument indentation style doesn't adhere to Scala convention. I'm guessing this is the IDE you're using. We should revert these unintended changes.
Reply: addressed
   extends CoarseGrainedSchedulerBackend(scheduler, sc.env.rpcEnv) {

   import KubernetesClusterSchedulerBackend._

-  private val EXECUTOR_MODIFICATION_LOCK = new Object
-  private val runningExecutorPods = new scala.collection.mutable.HashMap[String, Pod]
+  private val RUNNING_EXECUTOR_PODS_LOCK = new Object
+  private val runningExecutorsToPods = new mutable.HashMap[String, Pod] // Indexed by executor IDs.
+  private val runningPodsToExecutors = new mutable.HashMap[Pod, String] // Indexed by executor Pods.
Review comment: Can we do without
Reply: Seems like I missed commenting on this. We do use it to check whether the pod has already exited.
Review comment: Seems like the keys can be pod names instead of pod objects, since the only places where keys are used refer to the pod names anyway.
Reply: Did this change.
Review comment: It's worth commenting explicitly that both
Reply: done
+  private val FAILED_PODS_LOCK = new Object
+  private val failedPods = new mutable.HashMap[String, ExecutorLossReason] // Indexed by pod names.
+  private val EXECUTORS_TO_REMOVE_LOCK = new Object
+  private val executorsToRemove = new mutable.HashSet[String]

   private val executorDockerImage = conf.get(EXECUTOR_DOCKER_IMAGE)
   private val kubernetesNamespace = conf.get(KUBERNETES_NAMESPACE)
@@ -68,8 +77,8 @@ private[spark] class KubernetesClusterSchedulerBackend(
   private implicit val requestExecutorContext = ExecutionContext.fromExecutorService(
     ThreadUtils.newDaemonCachedThreadPool("kubernetes-executor-requests"))

-  private val kubernetesClient = new DriverPodKubernetesClientProvider(conf, kubernetesNamespace)
-    .get
+  private val kubernetesClient = new KubernetesClientBuilder(conf, kubernetesNamespace)
+    .buildFromWithinPod()

   private val driverPod = try {
     kubernetesClient.pods().inNamespace(kubernetesNamespace).
@@ -80,21 +89,25 @@ private[spark] class KubernetesClusterSchedulerBackend(
       throw new SparkException(s"Executor cannot find driver pod", throwable)
   }

-  override val minRegisteredRatio =
+  override val minRegisteredRatio: Double =
Review comment: For
Reply: done
     if (conf.getOption("spark.scheduler.minRegisteredResourcesRatio").isEmpty) {
       0.8
     } else {
       super.minRegisteredRatio
     }

+  private val executorWatchResource = new AtomicReference[Closeable]
Review comment: This can be of type
+  private val executorCleanupScheduler = ThreadUtils.newDaemonSingleThreadScheduledExecutor(
+    "executor-recovery-worker")
   protected var totalExpectedExecutors = new AtomicInteger(0)

   private val driverUrl = RpcEndpointAddress(
     sc.getConf.get("spark.driver.host"),
     sc.getConf.getInt("spark.driver.port", DEFAULT_DRIVER_PORT),
     CoarseGrainedSchedulerBackend.ENDPOINT_NAME).toString

-  private val initialExecutors = getInitialTargetExecutorNumber(1)
+  private val initialExecutors = getInitialTargetExecutorNumber()

   private def getInitialTargetExecutorNumber(defaultNumExecutors: Int = 1): Int = {
     if (Utils.isDynamicAllocationEnabled(conf)) {
@@ -119,29 +132,39 @@ private[spark] class KubernetesClusterSchedulerBackend(

   override def start(): Unit = {
     super.start()
+    executorWatchResource.set(kubernetesClient.pods().withLabel(SPARK_APP_ID_LABEL, applicationId())
+      .watch(new ExecutorPodsWatcher()))
     if (!Utils.isDynamicAllocationEnabled(sc.conf)) {
       doRequestTotalExecutors(initialExecutors)
     }
+    executorCleanupScheduler.scheduleWithFixedDelay(executorRecoveryRunnable, 0,
+      TimeUnit.SECONDS.toMillis(5), TimeUnit.MILLISECONDS)
   }

   override def stop(): Unit = {
-    // send stop message to executors so they shut down cleanly
-    super.stop()
-
     // then delete the executor pods
     // TODO investigate why Utils.tryLogNonFatalError() doesn't work in this context.
     // When using Utils.tryLogNonFatalError some of the code fails but without any logs or
     // indication as to why.
     try {
-      runningExecutorPods.values.foreach(kubernetesClient.pods().delete(_))
+      RUNNING_EXECUTOR_PODS_LOCK.synchronized {
+        runningExecutorsToPods.values.foreach(kubernetesClient.pods().delete(_))
Review comment: Should add
Reply: Good point.
+        runningPodsToExecutors.clear()
+      }
+      val resource = executorWatchResource.getAndSet(null)
+      if (resource != null) {
+        resource.close()
+      }
     } catch {
       case e: Throwable => logError("Uncaught exception while shutting down controllers.", e)
     }
     try {
       logInfo("Closing kubernetes client")
       kubernetesClient.close()
     } catch {
       case e: Throwable => logError("Uncaught exception closing Kubernetes client.", e)
     }
+    executorCleanupScheduler.shutdown()
+    super.stop()
Review comment: This should be called in a
Reply: Just realized we don't need this; it's redundant.
   }

   private def allocateNewExecutorPod(): (String, Pod) = {
@@ -231,13 +254,17 @@ private[spark] class KubernetesClusterSchedulerBackend(
   }

   override def doRequestTotalExecutors(requestedTotal: Int): Future[Boolean] = Future[Boolean] {
-    EXECUTOR_MODIFICATION_LOCK.synchronized {
+    RUNNING_EXECUTOR_PODS_LOCK.synchronized {
       if (requestedTotal > totalExpectedExecutors.get) {
-        logInfo(s"Requesting ${requestedTotal - totalExpectedExecutors.get}"
+        logInfo(s"Requesting ${
+          requestedTotal - totalExpectedExecutors.get
+        }"
           + s" additional executors, expecting total $requestedTotal and currently" +
           s" expected ${totalExpectedExecutors.get}")
         for (i <- 0 until (requestedTotal - totalExpectedExecutors.get)) {
-          runningExecutorPods += allocateNewExecutorPod()
+          val (executorId, pod) = allocateNewExecutorPod()
+          runningExecutorsToPods.put(executorId, pod)
+          runningPodsToExecutors.put(pod, executorId)
         }
       }
       totalExpectedExecutors.set(requestedTotal)
@@ -246,19 +273,185 @@ private[spark] class KubernetesClusterSchedulerBackend(
   }

   override def doKillExecutors(executorIds: Seq[String]): Future[Boolean] = Future[Boolean] {
-    EXECUTOR_MODIFICATION_LOCK.synchronized {
+    RUNNING_EXECUTOR_PODS_LOCK.synchronized {
       for (executor <- executorIds) {
-        runningExecutorPods.remove(executor) match {
-          case Some(pod) => kubernetesClient.pods().delete(pod)
+        runningExecutorsToPods.remove(executor) match {
+          case Some(pod) =>
Review comment: There is some Scala community debate about whether
Review comment: Strongly prefer not to match on Options anywhere.
Reply: Thanks, this is addressed now.
+            kubernetesClient.pods().delete(pod)
+            runningPodsToExecutors.remove(pod)
           case None => logWarning(s"Unable to remove pod for unknown executor $executor")
         }
       }
     }
     true
   }
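On the Option-matching point above, a sketch of the fold style the reviewers lean toward, with hypothetical stand-ins for the scheduler state (runnable in a REPL):

import scala.collection.mutable

// Hypothetical stand-in: executor ID -> pod name.
val runningExecutorsToPods = mutable.HashMap("1" -> "exec-pod-1")

def deletePod(podName: String): Unit = println(s"deleting $podName")

def killExecutor(executor: String): Unit =
  runningExecutorsToPods.remove(executor).fold {
    println(s"Unable to remove pod for unknown executor $executor") // the None branch
  } { pod =>
    deletePod(pod) // the Some branch
  }

killExecutor("1") // deletes exec-pod-1
killExecutor("2") // falls into the warning branch, no match needed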
+  private class ExecutorPodsWatcher extends Watcher[Pod] {
+
+    private val DEFAULT_CONTAINER_FAILURE_EXIT_STATUS = -1
+
+    override def eventReceived(action: Action, pod: Pod): Unit = {
+      if (action == Action.ERROR) {
+        val podName = pod.getMetadata.getName
Review comment: nit: instead of creating this val, we can directly write:
+        logInfo(s"Received pod $podName exited event. Reason: " + pod.getStatus.getReason)
+        handleErroredPod(pod)
+      }
+      else if (action == Action.DELETED) {
+        val podName = pod.getMetadata.getName
+        logInfo(s"Received delete pod $podName event. Reason: " + pod.getStatus.getReason)
+        handleDeletedPod(pod)
+      }
+    }
+
+    override def onClose(cause: KubernetesClientException): Unit = {
+      logDebug("Executor pod watch closed.", cause)
+    }
+
+    def getContainerExitStatus(pod: Pod): Int = {
Review comment: Can this be renamed to getExecutorExitStatus?
Reply: done
+      val containerStatuses = pod.getStatus.getContainerStatuses.asScala
+      for (containerStatus <- containerStatuses) {
+        return getContainerExitStatus(containerStatus)
Review comment: If we're always returning the first container's exit status, we can avoid the loop here and perhaps just fetch the status directly?
Reply: addressed.
+      }
+      DEFAULT_CONTAINER_FAILURE_EXIT_STATUS
+    }
+
+    def getContainerExitStatus(containerStatus: ContainerStatus): Int = {
+      containerStatus.getState.getTerminated.getExitCode.intValue
+    }
+
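Per the thread above, a sketch of fetching the first container's status directly instead of looping (method name follows the rename suggested earlier; the default constant is the same as in the diff):

import scala.collection.JavaConverters._
import io.fabric8.kubernetes.api.model.Pod

val DEFAULT_CONTAINER_FAILURE_EXIT_STATUS = -1 // same default as the diff

def getExecutorExitStatus(pod: Pod): Int =
  pod.getStatus.getContainerStatuses.asScala.headOption
    // Assumes the container has reached a terminated state,
    // as the original code also does.
    .map(_.getState.getTerminated.getExitCode.intValue)
    .getOrElse(DEFAULT_CONTAINER_FAILURE_EXIT_STATUS)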
+    def handleErroredPod(pod: Pod): Unit = {
+      def isPodAlreadyReleased(pod: Pod): Boolean = {
Review comment: Avoid inner method definitions - move this outside somewhere.
Reply: done
+        RUNNING_EXECUTOR_PODS_LOCK.synchronized {
+          runningPodsToExecutors.keySet.foreach(runningPod =>
Review comment: This can be simplified to a key existence check if the keys are pod names.
Reply: Indeed, done.
Review comment: You don't need the for loop here. This can be as simple as
Reply: Got rid of the loop.
+            if (runningPod.getMetadata.getName == pod.getMetadata.getName) {
+              return false
+            }
+          )
+        }
+        true
+      }
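With pod names as keys, as settled in the thread above, the whole inner method collapses to one lookup; a sketch (assuming runningPodsToExecutors is keyed by pod name):

def isPodAlreadyReleased(pod: Pod): Boolean =
  RUNNING_EXECUTOR_PODS_LOCK.synchronized {
    // The pod has been released exactly when its name is no longer
    // in the running-pods index: no loop, no early return.
    !runningPodsToExecutors.contains(pod.getMetadata.getName)
  }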
+      val alreadyReleased = isPodAlreadyReleased(pod)
+      val containerExitStatus = getContainerExitStatus(pod)
+      // container was probably actively killed by the driver.
+      val exitReason = if (alreadyReleased) {
+        ExecutorExited(containerExitStatus, exitCausedByApp = false,
+          s"Container in pod " + pod.getMetadata.getName +
+          " exited from explicit termination request.")
+      } else {
+        val containerExitReason = containerExitStatus match {
+          case VMEM_EXCEEDED_EXIT_CODE | PMEM_EXCEEDED_EXIT_CODE =>
+            memLimitExceededLogMessage(pod.getStatus.getReason)
+          case _ =>
+            // Here we can't be sure that that exit was caused by the application but this seems
+            // to be the right default since we know the pod was not explicitly deleted by
+            // the user.
+            "Pod exited with following container exit status code " + containerExitStatus
+        }
+        ExecutorExited(containerExitStatus, exitCausedByApp = true, containerExitReason)
+      }
+      FAILED_PODS_LOCK.synchronized {
Review comment: @mccheah is there a concurrent lock-free version of the map that we can use that doesn't need locking everywhere? like -
Review comment: +1 for this.
Reply: Thanks. Made the failedPods map a concurrent hash map. It is not truly lock-free, as there are implicit locks, but explicit locking by the user is not required.
+        failedPods.put(pod.getMetadata.getName, exitReason)
+      }
+    }
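A sketch of the concurrent-map follow-up described in the reply: wrapping a ConcurrentHashMap yields a scala.collection.concurrent.Map whose puts are thread-safe without a dedicated lock object (ExecutorLossReason and SlaveLost are the Spark classes already used in the diff; the pod name and reason are illustrative):

import java.util.concurrent.ConcurrentHashMap
import scala.collection.JavaConverters._
import org.apache.spark.scheduler.{ExecutorLossReason, SlaveLost}

// Callers no longer need FAILED_PODS_LOCK.synchronized around puts and reads.
val failedPods: scala.collection.concurrent.Map[String, ExecutorLossReason] =
  new ConcurrentHashMap[String, ExecutorLossReason]().asScala

failedPods.put("some-pod-name", SlaveLost("example reason")) // safe from any thread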
+
+    def handleDeletedPod(pod: Pod): Unit = {
+      val exitReason = ExecutorExited(getContainerExitStatus(pod), exitCausedByApp = false,
+        "Pod " + pod.getMetadata.getName + " deleted by K8s master")
Review comment: Not necessarily deleted by the master. Maybe we can say -
Reply: addressed
+      FAILED_PODS_LOCK.synchronized {
+        failedPods.put(pod.getMetadata.getName, exitReason)
+      }
+    }
+  }
+
+  override def createDriverEndpoint(properties: Seq[(String, String)]): DriverEndpoint = {
+    new KubernetesDriverEndpoint(rpcEnv, properties)
+  }
+
+  private class KubernetesDriverEndpoint(rpcEnv: RpcEnv, sparkProperties: Seq[(String, String)])
+    extends DriverEndpoint(rpcEnv, sparkProperties) {
+
+    override def onDisconnected(rpcAddress: RpcAddress): Unit = {
+      addressToExecutorId.get(rpcAddress).foreach { executorId =>
+        if (disableExecutor(executorId)) {
+          EXECUTORS_TO_REMOVE_LOCK.synchronized {
+            executorsToRemove.add(executorId)
+          }
+        }
+      }
+    }
+  }
+
+  private val executorRecoveryRunnable: Runnable = new Runnable {
+
+    private val MAX_EXECUTOR_LOST_REASON_CHECKS = 10
Review comment: I'm guessing we want this knob to be something the user controls, via some Spark property?
Reply: This knob need not be controlled by the user. It is a very specific knob tied to the implementation, and users shouldn't have to worry about tuning it, I think.
+    private val executorsToRecover = new mutable.HashSet[String]
+    private val executorReasonChecks = new mutable.HashMap[String, Int]
Review comment: Can you please add a comment here explaining what
Reply: done
+
+    override def run(): Unit = removeFailedAndRequestNewExecutors()
+
+    def removeFailedAndRequestNewExecutors(): Unit = {
+      val localRunningExecutorsToPods = RUNNING_EXECUTOR_PODS_LOCK.synchronized {
+        runningExecutorsToPods.toMap
Review comment: Does this create a copy?
Reply: Yes, an immutable copy.
+      }
+      val localFailedPods = FAILED_PODS_LOCK.synchronized {
+        failedPods.toMap
+      }
+      val localExecutorsToRemove = EXECUTORS_TO_REMOVE_LOCK.synchronized {
+        executorsToRemove.toSet
+      }
+      localExecutorsToRemove.foreach { case (executorId) =>
+        localRunningExecutorsToPods.get(executorId) match {
+          case Some(pod) =>
+            localFailedPods.get(pod.getMetadata.getName) match {
+              case Some(executorExited: ExecutorExited) =>
+                logDebug(s"Removing executor $executorId with loss reason "
+                  + executorExited.message)
+                removeExecutor(executorId, executorExited)
+                if (!executorExited.exitCausedByApp) {
+                  executorsToRecover.add(executorId)
Review comment: Maybe you want to update the PR description with this code snippet, saying this is the main business logic.
Reply: done
+                }
+              case None =>
+                removeExecutorOrIncrementLossReasonCheckCount(executorId)
+            }
+          case None =>
+            removeExecutorOrIncrementLossReasonCheckCount(executorId)
+        }
+      }
+      executorsToRecover.foreach(executorId =>
+        EXECUTORS_TO_REMOVE_LOCK.synchronized {
+          executorsToRemove -= executorId
+          executorReasonChecks -= executorId
+        }
+      )
+      if (executorsToRecover.nonEmpty) {
+        requestExecutors(executorsToRecover.size)
Review comment: Compute numExecutorsToRecover as Math.min(executorsToRecover.size, MAX_ALLOWED_EXECUTOR_RECOVERY_ATTEMPTS - recoveredExecutorCount) and pass it to requestExecutors so that we don't go above the max?
Reply: Ditto; dropping this code.
+      }
+      executorsToRecover.clear()
+    }
+
+    def removeExecutorOrIncrementLossReasonCheckCount(executorId: String): Unit = {
+      val reasonCheckCount = executorReasonChecks.getOrElse(executorId, 0)
+      if (reasonCheckCount > MAX_EXECUTOR_LOST_REASON_CHECKS) {
+        removeExecutor(executorId, SlaveLost("Executor lost for unknown reasons"))
+        executorsToRecover.add(executorId)
+        executorReasonChecks -= executorId
+      } else {
+        executorReasonChecks.put(executorId, reasonCheckCount + 1)
+      }
+    }
+  }
 }

 private object KubernetesClusterSchedulerBackend {
   private val DEFAULT_STATIC_PORT = 10000
   private val EXECUTOR_ID_COUNTER = new AtomicLong(0L)
+  private val VMEM_EXCEEDED_EXIT_CODE = -103
+  private val PMEM_EXCEEDED_EXIT_CODE = -104
+
+  def memLimitExceededLogMessage(diagnostics: String): String = {
+    s"Pod/Container killed for exceeding memory limits.$diagnostics" +
Review comment: nit: space after the period.
Reply: done
+      " Consider boosting spark executor memory overhead."
+  }
 }
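The nit applied is just a space after the period, so the interpolated diagnostics don't run into the preceding sentence; a sketch of the fixed message builder:

def memLimitExceededLogMessage(diagnostics: String): String =
  s"Pod/Container killed for exceeding memory limits. $diagnostics" +
    " Consider boosting spark executor memory overhead."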
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Unused?