[WIP][SPARK-24375][Prototype] Support barrier scheduling #21494

Changes from all commits
@@ -0,0 +1,78 @@ (new file)

```scala
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.barrier

import java.util.{Timer, TimerTask}

import org.apache.spark.rpc.{RpcCallContext, RpcEnv, ThreadSafeRpcEndpoint}

class BarrierCoordinator(
    numTasks: Int,
    timeout: Long,
    override val rpcEnv: RpcEnv) extends ThreadSafeRpcEndpoint {

  private var epoch = 0
```
> **Contributor:** Will
```scala
  private val timer = new Timer("BarrierCoordinator epoch increment timer")

  private val syncRequests = new scala.collection.mutable.ArrayBuffer[RpcCallContext](numTasks)

  private def replyIfGetAllSyncRequest(): Unit = {
    if (syncRequests.length == numTasks) {
      syncRequests.foreach(_.reply(()))
      syncRequests.clear()
      epoch += 1
    }
  }

  override def receive: PartialFunction[Any, Unit] = {
    case IncreaseEpoch(previousEpoch) =>
      if (previousEpoch == epoch) {
        syncRequests.foreach(_.sendFailure(new RuntimeException(
          s"The coordinator cannot get all barrier sync requests within $timeout ms.")))
```
> **Member:** Have we considered incrementally increasing the timeout when we can't get all barrier sync requests at an epoch?
```scala
        syncRequests.clear()
        epoch += 1
      }
  }

  override def receiveAndReply(context: RpcCallContext): PartialFunction[Any, Unit] = {
    case RequestToSync(epoch) =>
      if (epoch == this.epoch) {
        if (syncRequests.isEmpty) {
          val currentEpoch = epoch
          timer.schedule(new TimerTask {
            override def run(): Unit = {
              // self can be null after this RPC endpoint is stopped.
              if (self != null) self.send(IncreaseEpoch(currentEpoch))
```
> **Member:** Once this epoch fails to sync, the stage will be failed and resubmitted. I think it will begin from a new task set, so

> **Member:** Register a task-level barrier sequence and hierarchy, maybe?
```scala
            }
          }, timeout)
        }

        syncRequests += context
        replyIfGetAllSyncRequest()
      }
```
> **Member:**
>
> ```scala
> if (epoch == this.epoch) {
>   ...
> } else { // Received RpcCallContext from failed previousEpoch.
>   context.sendFailure(new RuntimeException(
>     s"The coordinator cannot get all barrier sync requests within $timeout ms."))
> }
> ```
```scala
  }

  override def onStop(): Unit = timer.cancel()
}

private[barrier] sealed trait BarrierCoordinatorMessage extends Serializable

private[barrier] case class RequestToSync(epoch: Int) extends BarrierCoordinatorMessage

private[barrier] case class IncreaseEpoch(previousEpoch: Int) extends BarrierCoordinatorMessage
```
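For orientation (not part of the diff): the coordinator is meant to run as a named RPC endpoint on the driver, one per barrier stage attempt, under the same name that `BarrierTaskContext` below looks up. A minimal, hypothetical registration sketch, assuming a live `SparkEnv`; the stage identifiers and task count are illustrative values:

```scala
// Hypothetical driver-side setup sketch; not part of this diff.
// The endpoint name must match the lookup in BarrierTaskContext,
// i.e. s"barrier-$stageId-$stageAttemptNumber".
import org.apache.spark.SparkEnv

val stageId = 1            // illustrative values
val stageAttemptNumber = 0
val env = SparkEnv.get
val coordinator = new BarrierCoordinator(
  numTasks = 4,            // tasks that must reach the barrier together
  timeout = 60 * 1000L,    // fail the sync if they don't all arrive in 60s
  rpcEnv = env.rpcEnv)
env.rpcEnv.setupEndpoint(s"barrier-$stageId-$stageAttemptNumber", coordinator)
```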
@@ -0,0 +1,43 @@ (new file)

```scala
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.barrier

import scala.reflect.ClassTag

import org.apache.spark.{Partition, TaskContext}
import org.apache.spark.rdd.RDD

/**
 * An RDD that supports running MPI programs.
 */
class BarrierRDD[T: ClassTag](var prev: RDD[T]) extends RDD[T](prev) {

  override def isBarrier(): Boolean = true

  override def getPartitions: Array[Partition] = prev.partitions

  override def compute(split: Partition, context: TaskContext): Iterator[T] = {
    prev.iterator(split, context)
  }

  override def clearDependencies(): Unit = {
    super.clearDependencies()
    prev = null
  }
}
```
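A hedged usage sketch of the prototype API as it stands: wrapping an RDD in `BarrierRDD` marks its stage as a barrier stage, and tasks would synchronize via `BarrierTaskContext.barrier()` (next file). How the barrier context actually reaches user code is not settled in this prototype, so the cast below is an assumption:

```scala
// Hypothetical usage sketch; not part of this diff.
import org.apache.spark.TaskContext

val rdd = sc.parallelize(1 to 100, numSlices = 4)
val barrierRdd = new BarrierRDD(rdd)
barrierRdd.mapPartitions { iter =>
  // Assumption: the scheduler installs a BarrierTaskContext for barrier tasks.
  val context = TaskContext.get().asInstanceOf[BarrierTaskContext]
  // ... per-task setup, e.g. start a local MPI worker ...
  context.barrier() // block until all 4 tasks reach this point
  iter
}.count()
```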
@@ -0,0 +1,67 @@ (new file)

```scala
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.barrier

import java.util.Properties

import org.apache.spark.{SparkEnv, TaskContextImpl}
import org.apache.spark.executor.TaskMetrics
import org.apache.spark.internal.Logging
import org.apache.spark.memory.TaskMemoryManager
import org.apache.spark.metrics.MetricsSystem
import org.apache.spark.util.RpcUtils

class BarrierTaskContext(
```
> **Member:** `BarrierTaskContextImpl`?
```scala
    override val stageId: Int,
    override val stageAttemptNumber: Int,
    override val partitionId: Int,
    override val taskAttemptId: Long,
    override val attemptNumber: Int,
    override val taskMemoryManager: TaskMemoryManager,
    localProperties: Properties,
    @transient private val metricsSystem: MetricsSystem,
    // The default value is only used in tests.
    override val taskMetrics: TaskMetrics = TaskMetrics.empty)
  // TODO: make this extend TaskContext
  extends TaskContextImpl(stageId, stageAttemptNumber, partitionId, taskAttemptId, attemptNumber,
    taskMemoryManager, localProperties, metricsSystem, taskMetrics)
  with Logging {

  private val barrierCoordinator = {
    val env = SparkEnv.get
    RpcUtils.makeDriverRef(s"barrier-$stageId-$stageAttemptNumber", env.conf, env.rpcEnv)
  }

  private var epoch = 0

  /**
   * Returns an array of the hosts that the barrier tasks are running on.
   */
  def hosts(): Array[String] = {
    val hostsStr = localProperties.getProperty("hosts", "")
    hostsStr.trim().split(",").map(_.trim())
  }

  /**
   * Blocks until all the barrier tasks in the same task set have reached this barrier.
   */
  def barrier(): Unit = synchronized {
    barrierCoordinator.askSync[Unit](RequestToSync(epoch))
    epoch += 1
  }
}
```
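To illustrate how `hosts()` and `barrier()` might combine in an MPI-style job, a hypothetical helper follows; note that nothing in this diff populates the `"hosts"` local property yet, so that is an assumption:

```scala
// Hypothetical sketch; not part of this diff. Assumes the scheduler
// fills in the "hosts" local property, which this prototype does not
// do yet.
import java.nio.charset.StandardCharsets
import java.nio.file.{Files, Paths}

def writeHostfile(context: BarrierTaskContext): Unit = {
  // Every task writes the same host list to a local hostfile.
  Files.write(Paths.get("/tmp/hostfile"),
    context.hosts().mkString("\n").getBytes(StandardCharsets.UTF_8))
  // Make sure every task has written its hostfile before any
  // MPI worker is started against it.
  context.barrier()
}
```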
```diff
@@ -1062,7 +1062,7 @@ class DAGScheduler(
             stage.pendingPartitions += id
             new ShuffleMapTask(stage.id, stage.latestInfo.attemptNumber,
               taskBinary, part, locs, properties, serializedTaskMetrics, Option(jobId),
-              Option(sc.applicationId), sc.applicationAttemptId)
+              Option(sc.applicationId), sc.applicationAttemptId, stage.rdd.isBarrier)
           }

         case stage: ResultStage =>
@@ -1072,7 +1072,8 @@ class DAGScheduler(
             val locs = taskIdToLocations(id)
             new ResultTask(stage.id, stage.latestInfo.attemptNumber,
               taskBinary, part, locs, id, properties, serializedTaskMetrics,
-              Option(jobId), Option(sc.applicationId), sc.applicationAttemptId)
+              Option(jobId), Option(sc.applicationId), sc.applicationAttemptId,
+              stage.rdd.isBarrier())
           }
       }
     } catch {
@@ -1310,6 +1311,44 @@ class DAGScheduler(
         }
       }

+      case failure: TaskFailedReason if task.isBarrier =>
+        // Always fail the current stage and retry all the tasks when a barrier task fails.
+        val failedStage = stageIdToStage(task.stageId)
+        logInfo(s"Marking $failedStage (${failedStage.name}) as failed " +
+          "due to a barrier task failure.")
+        val message = "Stage failed because a barrier task finished unsuccessfully. " +
+          s"${failure.toErrorString}"
+        try { // cancelTasks will fail if a SchedulerBackend does not implement killTask
+          taskScheduler.cancelTasks(stageId, interruptThread = false)
+        } catch {
+          case e: UnsupportedOperationException =>
+            logInfo(s"Could not cancel tasks for stage $stageId", e)
```
> **Member:** Under barrier execution, will it be a problem if we cannot cancel tasks?
```diff
+        }
+        markStageAsFinished(failedStage, Some(message))
+
+        failedStage.fetchFailedAttemptIds.add(task.stageAttemptId)
+        val shouldAbortStage =
+          failedStage.fetchFailedAttemptIds.size >= maxConsecutiveStageAttempts ||
+            disallowStageRetryForTest
+
+        if (shouldAbortStage) {
+          val abortMessage = if (disallowStageRetryForTest) {
+            "Barrier stage will not retry stage due to testing config"
+          } else {
+            s"""$failedStage (${failedStage.name})
+               |has failed the maximum allowable number of
+               |times: $maxConsecutiveStageAttempts.
+               |Most recent failure reason: $message""".stripMargin.replaceAll("\n", " ")
+          }
+          abortStage(failedStage, abortMessage, None)
+        } else { // update failedStages and make sure a ResubmitFailedStages event is enqueued
+          failedStages += failedStage
+          logInfo(s"Resubmitting $failedStage (${failedStage.name}) due to barrier stage failure.")
+          messageScheduler.schedule(new Runnable {
+            override def run(): Unit = eventProcessLoop.post(ResubmitFailedStages)
+          }, DAGScheduler.RESUBMIT_TIMEOUT, TimeUnit.MILLISECONDS)
+        }
+
       case Resubmitted =>
         logInfo("Resubmitted " + task + ", so marking it as still running")
         stage match {
```
> So this would be language dependent? Would need something for the R runner too?