Changes from all commits
107 commits
b4d625c
Work in progress supporting statefulsets for Spark. TBD if we want st…
holdenk Jul 9, 2021
38676ab
It compiles, yaygit diff!
holdenk Jul 9, 2021
09eb220
Add applicationId to setTotalExpectedExecutors so that we can use thi…
holdenk Jul 9, 2021
922b61a
Put in a restart policy of always. Next TODO (likely we can look at t…
holdenk Jul 9, 2021
2316084
Add podname parsing logic based on https://github.com/spark-volcano-w…
holdenk Jul 9, 2021
1e004d3
Try and plumb through the SPARK_EXECUTOR_POD_NAME so we can process i…
holdenk Jul 9, 2021
10c2922
Move the restart policy change into basic exec feature step instead o…
holdenk Jul 9, 2021
9434bff
Move more of the hijinks into the featuresteps where they fit
holdenk Jul 9, 2021
39ad5f6
Add a parallel pod management property, lets hope this doesn't screw …
holdenk Jul 9, 2021
74e4a67
Fix typo )) -> )
holdenk Jul 13, 2021
5f3bf00
Get it to compile again
holdenk Jul 14, 2021
bb27e9f
Turns out we do want to track snapshots so know about dead pods earli…
holdenk Jul 14, 2021
4d343a4
Refactor the stateful allocator out from the base allocator (TODO: ac…
holdenk Jul 14, 2021
e6fc922
Use scale to update statefulset scale
holdenk Jul 14, 2021
d7a094f
Construct the pod allocator based on user configuration.
holdenk Jul 14, 2021
1c8556f
Start adding new tests (slowly) for statefulset allocator and update …
holdenk Jul 14, 2021
7522169
Initial statefulset mock test
holdenk Jul 14, 2021
64bb5a7
Add second resource profile and scaleup test to StatefulsetAllocatorS…
holdenk Jul 14, 2021
bc15209
Validate the deletions as well
holdenk Jul 14, 2021
37c49a9
Verify that we can allocate with statefulsets. Next up: clean up the …
holdenk Jul 21, 2021
dc602be
Start work to cleanup and validate removal of statefulset on driver exit
holdenk Jul 21, 2021
3c5fb3d
Fix addowner ref
holdenk Jul 21, 2021
57b58e8
Delegate the pod cleanup to the pod allocator so that the statefulset…
holdenk Jul 22, 2021
da8cc6c
Use eventually when checking for set delition because it depends on t…
holdenk Jul 22, 2021
a7da7b2
Make the KubernetesSuite pod log collection resilent to pending pods.
holdenk Jul 22, 2021
799e2ff
Add a minireadwrite test for use with PVCs and not proper DFS
holdenk Jul 23, 2021
ee176f0
Add some tests around the new allocator with PVs
holdenk Jul 23, 2021
a48beb6
maaaybe exec mount
holdenk Jul 27, 2021
500c080
Revert "maaaybe exec mount"
holdenk Jul 27, 2021
d626dc7
Update the mini-read-write test to handle the fact the exec PVCs are …
holdenk Jul 28, 2021
f6540e1
Switch the PV tests back tohaving pvTestTag and MiniKubeTag as needed…
holdenk Jul 28, 2021
5dc1bc4
Scala style cleanups
holdenk Jul 28, 2021
b1ba08c
Delete block when putting over an existing block incase our in-memory…
holdenk Jul 28, 2021
b151d8f
We do the deletion of the pods inside of the executorpodsallocator no…
holdenk Jul 29, 2021
efd2ae7
Handle empty pod specs
holdenk Jul 30, 2021
5e0e939
Update StatefulsetPodsAllocator.scala
holdenk Jul 30, 2021
d8503e7
code review feedback, cleanup the SPARK_LOCAL_DIRS when executing ent…
holdenk Aug 4, 2021
e8eece5
Expose the AbstractPodsAllocator as a @DeveloperApi as suggested/requ…
holdenk Aug 4, 2021
08a24d9
Move the getItems inside of the eventually otherwise we still could h…
holdenk Aug 4, 2021
5362f73
pvTestTag was removed upstream
holdenk Aug 4, 2021
a74598e
Update entrypoint.sh
holdenk Aug 5, 2021
ffa5d24
Fix up how we launch pods allocators
holdenk Aug 5, 2021
ec8bf09
Make a new entry point for executors on Kube so they can request the …
holdenk Aug 5, 2021
2d6dc1c
Add unit tests for dynamically fetching exec id and constructing the …
holdenk Aug 6, 2021
df601c2
Don't parse podnames anymore to get exec ids instead depend on the la…
holdenk Aug 6, 2021
df2af02
Remove the SparkException import we don't need anymore
holdenk Aug 6, 2021
6db2b9f
Add the KubernetesClusterManagerSuite
holdenk Aug 6, 2021
65d89a0
Work in progress supporting statefulsets for Spark. TBD if we want st…
holdenk Jul 9, 2021
88d345c
It compiles, yaygit diff!
holdenk Jul 9, 2021
908d085
Add applicationId to setTotalExpectedExecutors so that we can use thi…
holdenk Jul 9, 2021
7000ff5
Put in a restart policy of always. Next TODO (likely we can look at t…
holdenk Jul 9, 2021
f5375ed
Add podname parsing logic based on https://github.com/spark-volcano-w…
holdenk Jul 9, 2021
b1b04fc
Try and plumb through the SPARK_EXECUTOR_POD_NAME so we can process i…
holdenk Jul 9, 2021
29470f6
Move the restart policy change into basic exec feature step instead o…
holdenk Jul 9, 2021
773ae75
Move more of the hijinks into the featuresteps where they fit
holdenk Jul 9, 2021
315807b
Add a parallel pod management property, lets hope this doesn't screw …
holdenk Jul 9, 2021
f92057e
Fix typo )) -> )
holdenk Jul 13, 2021
e4392aa
Get it to compile again
holdenk Jul 14, 2021
740a16e
Turns out we do want to track snapshots so know about dead pods earli…
holdenk Jul 14, 2021
9ebee9a
Refactor the stateful allocator out from the base allocator (TODO: ac…
holdenk Jul 14, 2021
01a1a97
Use scale to update statefulset scale
holdenk Jul 14, 2021
35c939d
Construct the pod allocator based on user configuration.
holdenk Jul 14, 2021
2052685
Start adding new tests (slowly) for statefulset allocator and update …
holdenk Jul 14, 2021
97255ab
Initial statefulset mock test
holdenk Jul 14, 2021
5a8c298
Add second resource profile and scaleup test to StatefulsetAllocatorS…
holdenk Jul 14, 2021
58dec2c
Validate the deletions as well
holdenk Jul 14, 2021
596236a
Verify that we can allocate with statefulsets. Next up: clean up the …
holdenk Jul 21, 2021
d26e2d9
Start work to cleanup and validate removal of statefulset on driver exit
holdenk Jul 21, 2021
029c682
Fix addowner ref
holdenk Jul 21, 2021
2285340
Delegate the pod cleanup to the pod allocator so that the statefulset…
holdenk Jul 22, 2021
9a51151
Use eventually when checking for set delition because it depends on t…
holdenk Jul 22, 2021
a2d4183
Make the KubernetesSuite pod log collection resilent to pending pods.
holdenk Jul 22, 2021
cd09bc4
Add a minireadwrite test for use with PVCs and not proper DFS
holdenk Jul 23, 2021
d08dd7d
Add some tests around the new allocator with PVs
holdenk Jul 23, 2021
3633936
maaaybe exec mount
holdenk Jul 27, 2021
ce9299b
Revert "maaaybe exec mount"
holdenk Jul 27, 2021
71b9674
Update the mini-read-write test to handle the fact the exec PVCs are …
holdenk Jul 28, 2021
7c21bbc
Switch the PV tests back tohaving pvTestTag and MiniKubeTag as needed…
holdenk Jul 28, 2021
a3c5103
Scala style cleanups
holdenk Jul 28, 2021
3fee3bb
Delete block when putting over an existing block incase our in-memory…
holdenk Jul 28, 2021
f290aef
We do the deletion of the pods inside of the executorpodsallocator no…
holdenk Jul 29, 2021
993ff65
Handle empty pod specs
holdenk Jul 30, 2021
788005d
Update StatefulsetPodsAllocator.scala
holdenk Jul 30, 2021
40c2db3
code review feedback, cleanup the SPARK_LOCAL_DIRS when executing ent…
holdenk Aug 4, 2021
048ea6f
Expose the AbstractPodsAllocator as a @DeveloperApi as suggested/requ…
holdenk Aug 4, 2021
bd79229
Move the getItems inside of the eventually otherwise we still could h…
holdenk Aug 4, 2021
8c81112
pvTestTag was removed upstream
holdenk Aug 4, 2021
2364e4b
Update entrypoint.sh
holdenk Aug 5, 2021
108503e
Fix up how we launch pods allocators
holdenk Aug 5, 2021
ec27fb5
Make a new entry point for executors on Kube so they can request the …
holdenk Aug 5, 2021
7f331c0
Add unit tests for dynamically fetching exec id and constructing the …
holdenk Aug 6, 2021
d87cda2
Don't parse podnames anymore to get exec ids instead depend on the la…
holdenk Aug 6, 2021
9715697
Remove the SparkException import we don't need anymore
holdenk Aug 6, 2021
946d4a7
Add the KubernetesClusterManagerSuite
holdenk Aug 6, 2021
9a432c0
Merge branch 'master' into SPARK-36058-support-replicasets-or-job-api…
holdenk Aug 13, 2021
1ab835a
Merge branch 'SPARK-36058-support-replicasets-or-job-api-like-things'…
holdenk Aug 13, 2021
2e4cd93
Minimize changes by dropping appId from setTotalExpectedExecutors and…
holdenk Aug 13, 2021
7aa79f4
Merge branch 'master' into SPARK-36058-support-replicasets-or-job-api…
holdenk Aug 17, 2021
d1c172d
Merge branch 'master' into SPARK-36058-support-replicasets-or-job-api…
holdenk Aug 17, 2021
fae1cbd
Make sure we call start before setting the total expected execs
holdenk Aug 18, 2021
49ab072
Throw an exception if execs are set before start so it's clearer than…
holdenk Aug 18, 2021
362172c
Merge branch 'master' into SPARK-36058-support-replicasets-or-job-api…
holdenk Aug 20, 2021
0502e19
Merge branch 'master' into SPARK-36058-support-replicasets-or-job-api…
holdenk Aug 23, 2021
6e660a4
Update the Usage instructions for the MiniReadWriteTest
holdenk Aug 23, 2021
a638719
CR feedback from @kbendick - use classof, change config param name, a…
holdenk Aug 23, 2021
4f3c0cc
Merge branch 'master' into SPARK-36058-support-replicasets-or-job-api…
holdenk Aug 24, 2021
5be1942
Merge branch 'master' into SPARK-36058-support-replicasets-or-job-api…
holdenk Aug 25, 2021
@@ -61,7 +61,7 @@ private[spark] class CoarseGrainedExecutorBackend(

private implicit val formats = DefaultFormats

private[executor] val stopping = new AtomicBoolean(false)
private[spark] val stopping = new AtomicBoolean(false)
var executor: Executor = null
@volatile var driver: Option[RpcEndpointRef] = None

9 changes: 8 additions & 1 deletion core/src/main/scala/org/apache/spark/storage/DiskStore.scala
@@ -59,7 +59,14 @@ private[spark] class DiskStore(
*/
def put(blockId: BlockId)(writeFunc: WritableByteChannel => Unit): Unit = {
if (contains(blockId)) {
throw new IllegalStateException(s"Block $blockId is already present in the disk store")
logWarning(s"Block $blockId is already present in the disk store")
try {
diskManager.getFile(blockId).delete()
} catch {
case e: Exception =>
throw new IllegalStateException(
s"Block $blockId is already present in the disk store and could not delete it $e")
}
}
logDebug(s"Attempting to put block $blockId")
val startTimeNs = System.nanoTime()
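The hunk above makes DiskStore.put tolerate a block that is already on disk: it now logs a warning, deletes the stale file, and writes the block again instead of throwing. This matters once executor pods can be restarted in place (restartPolicy Always under the statefulset allocator) with persistent-volume-backed local directories, because the restarted executor may find block files left over from its previous incarnation. A rough sketch of the new semantics, assuming an already-constructed diskStore instance (hypothetical here) and the put signature shown above:

import java.nio.ByteBuffer
import java.nio.channels.WritableByteChannel
import java.nio.charset.StandardCharsets.UTF_8

import org.apache.spark.storage.TestBlockId

// Helper for the sketch: wrap a string as a writeFunc for DiskStore.put.
def writeUtf8(text: String): WritableByteChannel => Unit = { channel =>
  channel.write(ByteBuffer.wrap(text.getBytes(UTF_8)))
}

val blockId = TestBlockId("restart-demo")
diskStore.put(blockId)(writeUtf8("written before the executor restarted"))
// Before this change the second put threw IllegalStateException; now the stale
// file is deleted (with a warning) and the block is rewritten.
diskStore.put(blockId)(writeUtf8("written after the executor restarted"))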
@@ -0,0 +1,139 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

// scalastyle:off println
package org.apache.spark.examples

import java.io.File
import java.io.PrintWriter

import scala.io.Source._

import org.apache.spark.sql.SparkSession
import org.apache.spark.util.Utils

/**
* Simple test for reading and writing to a distributed
* file system. This example does the following:
*
* 1. Reads local file
* 2. Computes word count on local file
* 3. Writes local file to a local dir on each executor
* 4. Reads the file back from each exec
* 5. Computes word count on the file using Spark
* 6. Compares the word count results
*/
object MiniReadWriteTest {

private val NPARAMS = 1

private def readFile(filename: String): List[String] = {
Utils.tryWithResource(fromFile(filename))(_.getLines().toList)
}

private def printUsage(): Unit = {
val usage = """Mini Read-Write Test
|Usage: localFile
|localFile - (string) location of local file to distribute to executors.""".stripMargin

println(usage)
}

private def parseArgs(args: Array[String]): File = {
if (args.length != NPARAMS) {
printUsage()
System.exit(1)
}

var i = 0

val localFilePath = new File(args(i))
if (!localFilePath.exists) {
System.err.println(s"Given path (${args(i)}) does not exist")
printUsage()
System.exit(1)
}

if (!localFilePath.isFile) {
System.err.println(s"Given path (${args(i)}) is not a file")
printUsage()
System.exit(1)
}
localFilePath
}

def runLocalWordCount(fileContents: List[String]): Int = {
fileContents.flatMap(_.split(" "))
.flatMap(_.split("\t"))
.filter(_.nonEmpty)
.groupBy(w => w)
.mapValues(_.size)
.values
.sum
}

def main(args: Array[String]): Unit = {
val localFilePath = parseArgs(args)

println(s"Performing local word count from ${localFilePath}")
val fileContents = readFile(localFilePath.toString())
println(s"File contents are ${fileContents}")
val localWordCount = runLocalWordCount(fileContents)

println("Creating SparkSession")
val spark = SparkSession
.builder
.appName("Mini Read Write Test")
.getOrCreate()

println("Writing local file to executors")

// uses the fact default parallelism is greater than num execs
val misc = spark.sparkContext.parallelize(1.to(10))
misc.foreachPartition {
x =>
new PrintWriter(localFilePath) {
try {
write(fileContents.mkString("\n"))
} finally {
close()
}}
}

println("Reading file from execs and running Word Count")
val readFileRDD = spark.sparkContext.textFile(localFilePath.toString())

val dWordCount = readFileRDD
.flatMap(_.split(" "))
.flatMap(_.split("\t"))
.filter(_.nonEmpty)
.map(w => (w, 1))
.countByKey()
.values
.sum

spark.stop()
if (localWordCount == dWordCount) {
println(s"Success! Local Word Count $localWordCount and " +
s"D Word Count $dWordCount agree.")
} else {
println(s"Failure! Local Word Count $localWordCount " +
s"and D Word Count $dWordCount disagree.")
}
}
}
// scalastyle:on println
@@ -323,6 +323,16 @@ private[spark] object Config extends Logging {
.stringConf
.createOptional

val KUBERNETES_ALLOCATION_PODS_ALLOCATOR =
ConfigBuilder("spark.kubernetes.allocation.pods.allocator")
.doc("Allocator to use for pods. Possible values are direct (the default) and statefulset " +
", or a full class name of a class implementing AbstractPodsAllocator. " +
"Future version may add Job or replicaset. This is a developer API and may change " +
"or be removed at anytime.")
.version("3.3.0")
.stringConf
.createWithDefault("direct")

val KUBERNETES_ALLOCATION_BATCH_SIZE =
ConfigBuilder("spark.kubernetes.allocation.batch.size")
.doc("Number of pods to launch at once in each round of executor allocation.")
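The new property would normally be supplied at submit time via --conf spark.kubernetes.allocation.pods.allocator=statefulset. A minimal sketch of setting it programmatically instead; only the property name and its accepted values come from the diff above, everything else is illustrative:

import org.apache.spark.SparkConf

// Sketch: opt into the statefulset-backed allocator added by this PR.
// "direct" keeps the existing per-pod allocation; a fully qualified class name
// selects a custom AbstractPodsAllocator implementation.
val conf = new SparkConf()
  .setAppName("statefulset-allocator-demo")
  .set("spark.kubernetes.allocation.pods.allocator", "statefulset")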
@@ -59,6 +59,7 @@ private[spark] object Constants {
val ENV_APPLICATION_ID = "SPARK_APPLICATION_ID"
val ENV_EXECUTOR_ID = "SPARK_EXECUTOR_ID"
val ENV_EXECUTOR_POD_IP = "SPARK_EXECUTOR_POD_IP"
val ENV_EXECUTOR_POD_NAME = "SPARK_EXECUTOR_POD_NAME"
val ENV_JAVA_OPT_PREFIX = "SPARK_JAVA_OPT_"
val ENV_CLASSPATH = "SPARK_CLASSPATH"
val ENV_DRIVER_BIND_ADDRESS = "SPARK_DRIVER_BIND_ADDRESS"
@@ -138,6 +138,13 @@ private[spark] class BasicExecutorFeatureStep(
.withNewFieldRef("v1", "status.podIP")
.build())
.build())
} ++ {
Seq(new EnvVarBuilder()
.withName(ENV_EXECUTOR_POD_NAME)
.withValueFrom(new EnvVarSourceBuilder()
.withNewFieldRef("v1", "metadata.name")
.build())
.build())
} ++ {
if (kubernetesConf.get(AUTH_SECRET_FILE_EXECUTOR).isEmpty) {
Option(secMgr.getSecretKey()).map { authSecret =>
@@ -260,16 +267,22 @@ private[spark] class BasicExecutorFeatureStep(
.withUid(pod.getMetadata.getUid)
.build()
}

val policy = kubernetesConf.get(KUBERNETES_ALLOCATION_PODS_ALLOCATOR) match {
case "statefulset" => "Always"
case _ => "Never"
}
val executorPodBuilder = new PodBuilder(pod.pod)
.editOrNewMetadata()
.withName(name)
.addToLabels(kubernetesConf.labels.asJava)
.addToLabels(SPARK_RESOURCE_PROFILE_ID_LABEL, resourceProfile.id.toString)
.addToAnnotations(kubernetesConf.annotations.asJava)
.addToOwnerReferences(ownerReference.toSeq: _*)
.endMetadata()
.editOrNewSpec()
.withHostname(hostname)
.withRestartPolicy("Never")
.withRestartPolicy(policy)
.addToNodeSelector(kubernetesConf.nodeSelector.asJava)
.addToNodeSelector(kubernetesConf.executorNodeSelector.asJava)
.addToImagePullSecrets(kubernetesConf.imagePullSecrets: _*)
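Two things change in this feature step: the pod's own name is injected as SPARK_EXECUTOR_POD_NAME through the Kubernetes downward API (a fieldRef to metadata.name), and the restart policy becomes Always when the statefulset allocator is selected, so a failed executor container is restarted in place instead of being replaced by the driver. As a small illustrative sketch, executor-side code could read the injected variable like this (this is not how the PR resolves executor IDs, which come from the pod label instead):

// Illustrative only: SPARK_EXECUTOR_POD_NAME is populated by the downward-API
// wiring shown above.
val podName: Option[String] = sys.env.get("SPARK_EXECUTOR_POD_NAME")
podName.foreach { name =>
  println(s"Running inside executor pod $name")
}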
@@ -0,0 +1,59 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.spark.scheduler.cluster.k8s

import io.fabric8.kubernetes.api.model.Pod

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.resource.ResourceProfile


/**
* :: DeveloperApi ::
* A abstract interface for allowing different types of pods allocation.
*
* The internal Spark implementations are [[StatefulsetPodsAllocator]]
* and [[ExecutorPodsAllocator]]. This may be useful for folks integrating with custom schedulers
* such as Volcano, Yunikorn, etc.
*
* This API may change or be removed at anytime.
*
* @since 3.3.0
*/
@DeveloperApi
abstract class AbstractPodsAllocator {
/*
* Set the total expected executors for an application
*/
def setTotalExpectedExecutors(resourceProfileToTotalExecs: Map[ResourceProfile, Int]): Unit
/*
* Reference to driver pod.
*/
def driverPod: Option[Pod]
/*
* If the pod for a given exec id is deleted.
*/
def isDeleted(executorId: String): Boolean
/*
* Start hook.
*/
def start(applicationId: String, schedulerBackend: KubernetesClusterSchedulerBackend): Unit
/*
* Stop hook
*/
def stop(applicationId: String): Unit
}
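Because this class is exposed as a DeveloperApi precisely so that custom schedulers can plug in their own allocation strategy, a skeleton implementation might look like the sketch below. It is a do-nothing outline written against the method signatures shown above; the constructor parameters mirror ExecutorPodsAllocator further down and are an assumption about what the cluster manager passes in, and the class sits in Spark's own k8s scheduler package purely so the package-private collaborators resolve:

package org.apache.spark.scheduler.cluster.k8s

import io.fabric8.kubernetes.api.model.Pod
import io.fabric8.kubernetes.client.KubernetesClient

import org.apache.spark.{SecurityManager, SparkConf}
import org.apache.spark.resource.ResourceProfile
import org.apache.spark.util.Clock

// Hypothetical skeleton, not part of this PR.
class NoopPodsAllocator(
    conf: SparkConf,
    secMgr: SecurityManager,
    executorBuilder: KubernetesExecutorBuilder,
    kubernetesClient: KubernetesClient,
    snapshotsStore: ExecutorPodsSnapshotsStore,
    clock: Clock) extends AbstractPodsAllocator {

  // Translate the per-ResourceProfile totals into whatever the backing API scales on.
  override def setTotalExpectedExecutors(
      resourceProfileToTotalExecs: Map[ResourceProfile, Int]): Unit = {}

  // A real allocator would look up the driver pod so executors can be owner-referenced to it.
  override def driverPod: Option[Pod] = None

  override def isDeleted(executorId: String): Boolean = false

  override def start(
      applicationId: String,
      schedulerBackend: KubernetesClusterSchedulerBackend): Unit = {}

  // Tear down anything created for this application.
  override def stop(applicationId: String): Unit = {}
}

Such a class would then be named through the spark.kubernetes.allocation.pods.allocator setting introduced above.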
@@ -37,13 +37,13 @@ import org.apache.spark.internal.config.DYN_ALLOCATION_EXECUTOR_IDLE_TIMEOUT
import org.apache.spark.resource.ResourceProfile
import org.apache.spark.util.{Clock, Utils}

private[spark] class ExecutorPodsAllocator(
class ExecutorPodsAllocator(
conf: SparkConf,
secMgr: SecurityManager,
executorBuilder: KubernetesExecutorBuilder,
kubernetesClient: KubernetesClient,
snapshotsStore: ExecutorPodsSnapshotsStore,
clock: Clock) extends Logging {
clock: Clock) extends AbstractPodsAllocator() with Logging {

private val EXECUTOR_ID_COUNTER = new AtomicInteger(0)

@@ -97,12 +97,15 @@ private[spark] class ExecutorPodsAllocator(

private var lastSnapshot = ExecutorPodsSnapshot()

private var appId: String = _

// Executors that have been deleted by this allocator but not yet detected as deleted in
// a snapshot from the API server. This is used to deny registration from these executors
// if they happen to come up before the deletion takes effect.
@volatile private var deletedExecutorIds = Set.empty[Long]

def start(applicationId: String, schedulerBackend: KubernetesClusterSchedulerBackend): Unit = {
appId = applicationId
driverPod.foreach { pod =>
// Wait until the driver pod is ready before starting executors, as the headless service won't
// be resolvable by DNS until the driver pod is ready.
@@ -461,6 +464,16 @@ private[spark] class ExecutorPodsAllocator(
true
}
}

override def stop(applicationId: String): Unit = {
Utils.tryLogNonFatalError {
kubernetesClient
.pods()
.withLabel(SPARK_APP_ID_LABEL, applicationId)
.withLabel(SPARK_ROLE_LABEL, SPARK_POD_EXECUTOR_ROLE)
.delete()
}
}
}

private[spark] object ExecutorPodsAllocator {
@@ -471,5 +484,4 @@ private[spark] object ExecutorPodsAllocator {
val r = slots % consumers.size
consumers.take(r).map((_, d + 1)) ++ consumers.takeRight(consumers.size - r).map((_, d))
}

}
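The commit history ("Construct the pod allocator based on user configuration", "CR feedback from @kbendick - use classof, ...") indicates that KubernetesClusterManager now chooses between these implementations at startup. That wiring is not part of this excerpt, so the following is only a hedged sketch of what the dispatch could look like, reusing the constructor shape shown above; the exact constructor contract for custom classes is an assumption:

import io.fabric8.kubernetes.client.KubernetesClient

import org.apache.spark.{SecurityManager, SparkConf}
import org.apache.spark.deploy.k8s.Config.KUBERNETES_ALLOCATION_PODS_ALLOCATOR
import org.apache.spark.util.{Clock, Utils}

// Hedged sketch; conf, secMgr, executorBuilder, kubernetesClient, snapshotsStore and clock
// are assumed to be in scope, matching the constructor shown above.
val allocator: AbstractPodsAllocator = conf.get(KUBERNETES_ALLOCATION_PODS_ALLOCATOR) match {
  case "direct" =>
    new ExecutorPodsAllocator(
      conf, secMgr, executorBuilder, kubernetesClient, snapshotsStore, clock)
  case "statefulset" =>
    new StatefulsetPodsAllocator(
      conf, secMgr, executorBuilder, kubernetesClient, snapshotsStore, clock)
  case fullClassName =>
    // A fully qualified class name is assumed to be loaded reflectively and to expose
    // the same constructor shape as the built-in allocators.
    Utils.classForName[AbstractPodsAllocator](fullClassName)
      .getConstructor(classOf[SparkConf], classOf[SecurityManager],
        classOf[KubernetesExecutorBuilder], classOf[KubernetesClient],
        classOf[ExecutorPodsSnapshotsStore], classOf[Clock])
      .newInstance(conf, secMgr, executorBuilder, kubernetesClient, snapshotsStore, clock)
}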
@@ -60,8 +60,15 @@ object ExecutorPodsSnapshot extends Logging {
}

private def toStatesByExecutorId(executorPods: Seq[Pod]): Map[Long, ExecutorPodState] = {
executorPods.map { pod =>
(pod.getMetadata.getLabels.get(SPARK_EXECUTOR_ID_LABEL).toLong, toState(pod))
executorPods.flatMap { pod =>
pod.getMetadata.getLabels.get(SPARK_EXECUTOR_ID_LABEL) match {
case "EXECID" | null =>
// The exec label has not yet been assigned
None
case id =>
// We have a "real" id label
Some((id.toLong, toState(pod)))
}
}.toMap
}

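With the statefulset allocator every replica is stamped from a single pod template, so the executor-id label can only hold a placeholder until the executor actually registers; the match above now skips such pods instead of failing on .toLong. A small hedged sketch of what a not-yet-registered pod's metadata looks like (the "EXECID" placeholder comes from the match above; the label key is the value of SPARK_EXECUTOR_ID_LABEL, i.e. spark-exec-id, and the pod name is made up):

import io.fabric8.kubernetes.api.model.PodBuilder

// Sketch: a template-stamped executor pod before any executor ID has been assigned.
// toStatesByExecutorId now ignores it rather than crashing on "EXECID".toLong.
val pendingExecutorPod = new PodBuilder()
  .withNewMetadata()
    .withName("myapp-exec-0")                 // hypothetical statefulset-style pod name
    .addToLabels("spark-exec-id", "EXECID")   // placeholder until registration
  .endMetadata()
  .build()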