@@ -17,12 +17,15 @@
package org.apache.spark.deploy.k8s.submit

import java.io.StringWriter
import java.net.HttpURLConnection.HTTP_GONE
import java.util.{Collections, UUID}
import java.util.Properties

import io.fabric8.kubernetes.api.model._
import io.fabric8.kubernetes.client.KubernetesClient
import io.fabric8.kubernetes.client.{KubernetesClient, KubernetesClientException, Watch}
import io.fabric8.kubernetes.client.Watcher.Action
import scala.collection.mutable
import scala.util.control.Breaks._
import scala.util.control.NonFatal

import org.apache.spark.SparkConf
@@ -133,29 +136,38 @@ private[spark] class Client(
.endVolume()
.endSpec()
.build()
Utils.tryWithResource(
kubernetesClient
.pods()
.withName(resolvedDriverPod.getMetadata.getName)
.watch(watcher)) { _ =>
val createdDriverPod = kubernetesClient.pods().create(resolvedDriverPod)
try {
val otherKubernetesResources =
resolvedDriverSpec.driverKubernetesResources ++ Seq(configMap)
addDriverOwnerReference(createdDriverPod, otherKubernetesResources)
kubernetesClient.resourceList(otherKubernetesResources: _*).createOrReplace()
} catch {
case NonFatal(e) =>
kubernetesClient.pods().delete(createdDriverPod)
throw e
}

if (waitForAppCompletion) {
logInfo(s"Waiting for application $appName to finish...")
watcher.awaitCompletion()
logInfo(s"Application $appName finished.")
} else {
logInfo(s"Deployed Spark application $appName into Kubernetes.")
val driverPodName = resolvedDriverPod.getMetadata.getName
var watch: Watch = null
val createdDriverPod = kubernetesClient.pods().create(resolvedDriverPod)
try {
val otherKubernetesResources = resolvedDriverSpec.driverKubernetesResources ++ Seq(configMap)
addDriverOwnerReference(createdDriverPod, otherKubernetesResources)
kubernetesClient.resourceList(otherKubernetesResources: _*).createOrReplace()
} catch {
case NonFatal(e) =>
kubernetesClient.pods().delete(createdDriverPod)
throw e
}
val sId = Seq(kubernetesConf.namespace(), driverPodName).mkString(":")
breakable {
while (true) {
val podWithName = kubernetesClient
.pods()
.withName(driverPodName)
// Reset resource to old before we start the watch, this is important for race conditions
watcher.reset()

@dongjoon-hyun (Member), Nov 15, 2020:
Let's remove this empty line like the other branch.

watch = podWithName.watch(watcher)

// Send the latest pod state we know to the watcher to make sure we didn't miss anything
watcher.eventReceived(Action.MODIFIED, podWithName.get())

// Break the while loop if the pod is completed or we don't want to wait
if(watcher.watchOrStop(sId)) {
watch.close()
break
}
}
}
}
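
A minimal, self-contained sketch of the reconnect-on-HTTP-410 pattern behind the loop above, assuming the fabric8 4.x client API used here (a `Watcher[Pod]` with `onClose(KubernetesClientException)`); `GoneAwareWatcher`, `watchUntilDone`, and the fixed one-second poll are illustrative simplifications, not code from this PR:

```scala
import java.net.HttpURLConnection.HTTP_GONE

import io.fabric8.kubernetes.api.model.Pod
import io.fabric8.kubernetes.client.{KubernetesClient, KubernetesClientException, Watch, Watcher}
import io.fabric8.kubernetes.client.Watcher.Action

// Hypothetical watcher: records whether the pod reached a terminal phase and
// whether the watch closed only because the resource version expired (410 Gone).
class GoneAwareWatcher extends Watcher[Pod] {
  @volatile var podCompleted = false
  @volatile var resourceTooOld = false

  override def eventReceived(action: Action, pod: Pod): Unit = {
    val phase = pod.getStatus.getPhase
    if (phase == "Succeeded" || phase == "Failed") podCompleted = true
  }

  override def onClose(e: KubernetesClientException): Unit = {
    // HTTP 410 means the server no longer holds our resource version; the
    // watch must be re-established rather than treated as pod completion.
    if (e != null && e.getCode == HTTP_GONE) resourceTooOld = true
  }
}

object WatchRetryExample {
  // Re-create the watch until the pod completes. After every (re)connect the
  // current pod state is replayed through eventReceived, so a terminal phase
  // reached while we were disconnected is not missed.
  def watchUntilDone(client: KubernetesClient, podName: String): Unit = {
    val watcher = new GoneAwareWatcher
    while (!watcher.podCompleted) {
      watcher.resourceTooOld = false  // analogous to watcher.reset()
      val podResource = client.pods().withName(podName)
      val watch: Watch = podResource.watch(watcher)
      watcher.eventReceived(Action.MODIFIED, podResource.get())
      while (!watcher.podCompleted && !watcher.resourceTooOld) Thread.sleep(1000)
      watch.close()
    }
  }
}
```

The sketch polls with a sleep for brevity; the actual change waits on the watcher and returns from watchOrStop, so the caller only re-watches after a 410 close.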
@@ -230,7 +242,9 @@ private[spark] class KubernetesClientApplication extends SparkApplication {
val master = KubernetesUtils.parseMasterUrl(sparkConf.get("spark.master"))
val loggingInterval = if (waitForAppCompletion) Some(sparkConf.get(REPORT_INTERVAL)) else None

val watcher = new LoggingPodStatusWatcherImpl(kubernetesAppId, loggingInterval)
val watcher = new LoggingPodStatusWatcherImpl(kubernetesAppId,
loggingInterval,
waitForAppCompletion)
@dongjoon-hyun (Member), Nov 24, 2020:
Please revert this change. This is inconsistent with Apache Spark 3.1 and 3.0.

Utils.tryWithResource(SparkKubernetesClientFactory.createKubernetesClient(
master,
@@ -16,6 +16,7 @@
*/
package org.apache.spark.deploy.k8s.submit

import java.net.HttpURLConnection.HTTP_GONE
import java.util.concurrent.{CountDownLatch, TimeUnit}

import scala.collection.JavaConverters._
@@ -29,7 +30,8 @@ import org.apache.spark.internal.Logging
import org.apache.spark.util.ThreadUtils

private[k8s] trait LoggingPodStatusWatcher extends Watcher[Pod] {
def awaitCompletion(): Unit
def watchOrStop(submissionId: String): Boolean
def reset(): Unit
}

/**
@@ -42,13 +44,20 @@ private[k8s] trait LoggingPodStatusWatcher extends Watcher[Pod] {
*/
private[k8s] class LoggingPodStatusWatcherImpl(
appId: String,
maybeLoggingInterval: Option[Long])
maybeLoggingInterval: Option[Long],
waitForCompletion: Boolean)
@dongjoon-hyun (Member), Nov 24, 2020:
Please revert this change. This is inconsistent with Apache Spark 3.1 and 3.0.

extends LoggingPodStatusWatcher with Logging {

private var podCompleted = false

private var resourceTooOldReceived: Boolean = false

private val podCompletedFuture = new CountDownLatch(1)

// start timer for periodic logging
private val scheduler =
ThreadUtils.newDaemonSingleThreadScheduledExecutor("logging-pod-status-watcher")

private val logRunnable: Runnable = new Runnable {
override def run() = logShortStatus()
}
@@ -57,6 +66,10 @@ private[k8s] class LoggingPodStatusWatcherImpl(

private def phase: String = pod.map(_.getStatus.getPhase).getOrElse("unknown")

override def reset(): Unit = {
resourceTooOldReceived = false
}

def start(): Unit = {
maybeLoggingInterval.foreach { interval =>
scheduler.scheduleAtFixedRate(logRunnable, 0, interval, TimeUnit.MILLISECONDS)
@@ -79,7 +92,12 @@ private[k8s] class LoggingPodStatusWatcherImpl(

override def onClose(e: KubernetesClientException): Unit = {
logDebug(s"Stopping watching application $appId with last-observed phase $phase")
closeWatch()
if (e != null && e.getCode == HTTP_GONE) {
resourceTooOldReceived = true
logDebug(s"Got HTTP Gone code, resource version changed in k8s api: $e")
} else {
closeWatch()
}
}

private def logShortStatus() = {
@@ -97,6 +115,7 @@ private[k8s] class LoggingPodStatusWatcherImpl(
private def closeWatch(): Unit = {
podCompletedFuture.countDown()
scheduler.shutdown()
podCompleted = true
}

private def formatPodState(pod: Pod): String = {
@@ -134,13 +153,6 @@ private[k8s] class LoggingPodStatusWatcherImpl(
}.mkString("")
}

override def awaitCompletion(): Unit = {
podCompletedFuture.await()
logInfo(pod.map { p =>
s"Container final statuses:\n\n${containersDescription(p)}"
}.getOrElse("No containers were found in the driver pod."))
}

Member:
This removal looks like part of an independent PR rather than part of SPARK-24266. Could you tell us why this is required and where this came from?

Author:
@shockdm Could you chime in on this one?

Author:
@dongjoon-hyun It does look like it originates from SPARK-28947 02c5b4f, which eliminated the future and was a rename. Since this is a private trait, the logic should be completely self-contained and safe to remove.

Reply:
@jkleckner thank you for following up, that is correct. Sorry for the late response :(

private def containersDescription(p: Pod): String = {
p.getStatus.getContainerStatuses.asScala.map { status =>
Seq(
@@ -177,4 +189,28 @@ private[k8s] class LoggingPodStatusWatcherImpl(
private def formatTime(time: String): String = {
if (time != null || time != "") time else "N/A"
}

override def watchOrStop(sId: String): Boolean = if (waitForCompletion) {
logInfo(s"Waiting for application ${appId} with submission ID $sId to finish...")
val interval = maybeLoggingInterval

synchronized {
while (!podCompleted && !resourceTooOldReceived) {
wait(interval.get)
logInfo(s"Application status for $appId (phase: $phase)")
}
}

if(podCompleted) {
logInfo(
pod.map { p => s"Container final statuses:\n\n${containersDescription(p)}" }
.getOrElse("No containers were found in the driver pod."))
logInfo(s"Application ${appId} with submission ID $sId finished")
}
podCompleted
} else {
logInfo(s"Deployed Spark application ${appId} with submission ID $sId into Kubernetes")
// Always act like the application has completed since we don't want to wait for app completion
true
}
}
@@ -151,6 +151,8 @@ class ClientSuite extends SparkFunSuite with BeforeAndAfter {
createdResourcesArgumentCaptor = ArgumentCaptor.forClass(classOf[HasMetadata])
when(podOperations.create(FULL_EXPECTED_POD)).thenReturn(POD_WITH_OWNER_REFERENCE)
when(namedPods.watch(loggingPodStatusWatcher)).thenReturn(mock[Watch])
when(loggingPodStatusWatcher.watchOrStop(kubernetesConf.namespace() + ":" + POD_NAME))
.thenReturn(true)
doReturn(resourceList)
.when(kubernetesClient)
.resourceList(createdResourcesArgumentCaptor.capture())
@@ -205,6 +207,6 @@
loggingPodStatusWatcher,
KUBERNETES_RESOURCE_PREFIX)
submissionClient.run()
verify(loggingPodStatusWatcher).awaitCompletion()
verify(loggingPodStatusWatcher).watchOrStop(kubernetesConf.namespace + ":driver")
}
}