Commit 0458e4e

[SPARK-4964] recovery of generated rdds from checkpoint
1 parent e86317b commit 0458e4e

4 files changed (+118, -35 lines)


external/kafka/src/main/scala/org/apache/spark/rdd/kafka/KafkaRDD.scala

Lines changed: 4 additions & 33 deletions
@@ -31,23 +31,6 @@ import kafka.message.{MessageAndMetadata, MessageAndOffset}
 import kafka.serializer.Decoder
 import kafka.utils.VerifiableProperties
 
-
-case class KafkaRDDPartition(
-  override val index: Int,
-  /** kafka topic name */
-  topic: String,
-  /** kafka partition id */
-  partition: Int,
-  /** inclusive starting offset */
-  fromOffset: Long,
-  /** exclusive ending offset */
-  untilOffset: Long,
-  /** preferred kafka host, i.e. the leader at the time the rdd was created */
-  host: String,
-  /** preferred kafka host's port */
-  port: Int
-) extends Partition
-
 /** A batch-oriented interface for consuming from Kafka.
   * Starting and ending offsets are specified in advance,
   * so that you can control exactly-once semantics.
@@ -57,7 +40,7 @@ case class KafkaRDDPartition(
   * configuration parameters</a>.
   * Requires "metadata.broker.list" or "bootstrap.servers" to be set with Kafka broker(s),
   * NOT zookeeper servers, specified in host1:port1,host2:port2 form.
-  * @param rddPartitions Each RDD partition corresponds to a
+  * @param batch Each KafkaRDDPartition in the batch corresponds to a
   * range of offsets for a given Kafka topic/partition
   * @param messageHandler function for translating each message into the desired type
   */
@@ -69,23 +52,11 @@ class KafkaRDD[
   R: ClassTag](
     sc: SparkContext,
     val kafkaParams: Map[String, String],
-    val rddPartitions: Traversable[KafkaRDDPartition],
+    val batch: Array[KafkaRDDPartition],
     messageHandler: MessageAndMetadata[K, V] => R
   ) extends RDD[R](sc, Nil) with Logging {
 
-  /** per-topic/partition Kafka offsets defining the (inclusive) starting point of the batch */
-  def fromOffsets: Map[TopicAndPartition, Long] =
-    rddPartitions.map { kr =>
-      TopicAndPartition(kr.topic, kr.partition) -> kr.fromOffset
-    }.toMap
-
-  /** per-topic/partition Kafka offsets defining the (exclusive) ending point of the batch */
-  def untilOffsets: Map[TopicAndPartition, Long] =
-    rddPartitions.map { kr =>
-      TopicAndPartition(kr.topic, kr.partition) -> kr.untilOffset
-    }.toMap
-
-  override def getPartitions: Array[Partition] = rddPartitions.toArray
+  override def getPartitions: Array[Partition] = batch.asInstanceOf[Array[Partition]]
 
   override def getPreferredLocations(thePart: Partition): Seq[String] = {
     val part = thePart.asInstanceOf[KafkaRDDPartition]
@@ -222,7 +193,7 @@ object KafkaRDD {
     val partitions = fromOffsets.zipWithIndex.map { case ((tp, from), index) =>
      val lo = untilOffsets(tp)
      new KafkaRDDPartition(index, tp.topic, tp.partition, from, lo.offset, lo.host, lo.port)
-    }
+    }.toArray
 
    new KafkaRDD[K, V, U, T, R](sc, kafkaParams, partitions, messageHandler)
  }
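
With fromOffsets and untilOffsets removed from KafkaRDD, callers that still need the per-topic/partition offset maps can rebuild them from the batch array, as the updated KafkaRDDSuite further down does. A minimal sketch; the helper names are hypothetical and not part of this commit:

import kafka.common.TopicAndPartition
import org.apache.spark.rdd.kafka.KafkaRDDPartition

// Rebuild the maps that the removed KafkaRDD.fromOffsets / untilOffsets methods used to expose.
def fromOffsets(batch: Array[KafkaRDDPartition]): Map[TopicAndPartition, Long] =
  batch.map(p => TopicAndPartition(p.topic, p.partition) -> p.fromOffset).toMap

def untilOffsets(batch: Array[KafkaRDDPartition]): Map[TopicAndPartition, Long] =
  batch.map(p => TopicAndPartition(p.topic, p.partition) -> p.untilOffset).toMap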
external/kafka/src/main/scala/org/apache/spark/rdd/kafka/KafkaRDDPartition.scala

Lines changed: 80 additions & 0 deletions
@@ -0,0 +1,80 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.rdd.kafka
+
+import org.apache.spark.Partition
+
+/** @param topic kafka topic name
+  * @param partition kafka partition id
+  * @param fromOffset inclusive starting offset
+  * @param untilOffset exclusive ending offset
+  * @param host preferred kafka host, i.e. the leader at the time the rdd was created
+  * @param port preferred kafka host's port
+  */
+class KafkaRDDPartition(
+  override val index: Int,
+  val topic: String,
+  val partition: Int,
+  val fromOffset: Long,
+  val untilOffset: Long,
+  val host: String,
+  val port: Int
+) extends Partition {
+  def toTuple: (Int, String, Int, Long, Long, String, Int) = (
+    index,
+    topic,
+    partition,
+    fromOffset,
+    untilOffset,
+    host,
+    port
+  )
+
+}
+
+object KafkaRDDPartition {
+  def apply(
+    index: Int,
+    topic: String,
+    partition: Int,
+    fromOffset: Long,
+    untilOffset: Long,
+    host: String,
+    port: Int
+  ): KafkaRDDPartition = new KafkaRDDPartition(
+    index,
+    topic,
+    partition,
+    fromOffset,
+    untilOffset,
+    host,
+    port
+  )
+
+  def apply(tuple: (Int, String, Int, Long, Long, String, Int)): KafkaRDDPartition = {
+    new KafkaRDDPartition(
+      tuple._1,
+      tuple._2,
+      tuple._3,
+      tuple._4,
+      tuple._5,
+      tuple._6,
+      tuple._7
+    )
+  }
+}
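
KafkaRDDPartition now converts to and from a plain tuple, which is the shape the DStream checkpoint data stores and restores. A small round-trip sketch; the topic, offsets, and broker values are invented for illustration:

import org.apache.spark.rdd.kafka.KafkaRDDPartition

// Serialize a partition to the tuple form used in checkpoints, then rebuild it.
val original = KafkaRDDPartition(0, "events", 3, 100L, 200L, "broker-1", 9092)
val tuple = original.toTuple            // (0, "events", 3, 100L, 200L, "broker-1", 9092)
val restored = KafkaRDDPartition(tuple) // same fields, reconstructed from the tuple
assert(restored.topic == original.topic && restored.untilOffset == original.untilOffset)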

external/kafka/src/main/scala/org/apache/spark/streaming/kafka/DeterministicKafkaInputDStream.scala

Lines changed: 30 additions & 1 deletion
@@ -17,7 +17,9 @@
 
 package org.apache.spark.streaming.kafka
 
+
 import scala.annotation.tailrec
+import scala.collection.mutable
 import scala.reflect.{classTag, ClassTag}
 
 import kafka.common.TopicAndPartition
@@ -26,7 +28,7 @@ import kafka.serializer.Decoder
 
 import org.apache.spark.Logging
 import org.apache.spark.rdd.RDD
-import org.apache.spark.rdd.kafka.{KafkaCluster, KafkaRDD}
+import org.apache.spark.rdd.kafka.{KafkaCluster, KafkaRDD, KafkaRDDPartition}
 import org.apache.spark.rdd.kafka.KafkaCluster.LeaderOffset
 import org.apache.spark.streaming.{StreamingContext, Time}
 import org.apache.spark.streaming.dstream._
@@ -62,6 +64,8 @@ class DeterministicKafkaInputDStream[
   maxRetries: Int = 1
 ) extends InputDStream[R](ssc_) with Logging {
 
+  protected[streaming] override val checkpointData = new DeterministicKafkaInputDStreamCheckpointData
+
   private val kc = new KafkaCluster(kafkaParams)
 
   private val maxMessagesPerPartition: Option[Long] = {
@@ -117,4 +121,29 @@
 
   def stop(): Unit = {
   }
+
+  private[streaming]
+  class DeterministicKafkaInputDStreamCheckpointData extends DStreamCheckpointData(this) {
+    def batchForTime = data.asInstanceOf[mutable.HashMap[
+      Time, Array[(Int, String, Int, Long, Long, String, Int)]]]
+
+    override def update(time: Time) {
+      batchForTime.clear()
+      generatedRDDs.foreach { kv =>
+        val a = kv._2.asInstanceOf[KafkaRDD[K, V, U, T, R]].batch.map(_.toTuple).toArray
+        batchForTime += kv._1 -> a
+      }
+    }
+
+    override def cleanup(time: Time) { }
+
+    override def restore() {
+      batchForTime.toSeq.sortBy(_._1)(Time.ordering).foreach { case (t, b) =>
+        logInfo(s"Restoring KafkaRDD for time $t ${b.mkString("[", ", ", "]")}")
+        generatedRDDs += t -> new KafkaRDD[K, V, U, T, R](
+          context.sparkContext, kafkaParams, b.map(KafkaRDDPartition(_)), messageHandler)
+      }
+    }
+  }
+
 }
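
update() saves each generated KafkaRDD as an array of offset tuples, and restore() turns those tuples back into KafkaRDDPartitions when the streaming context is recovered from a checkpoint. A sketch of the driver-recovery pattern this enables; createContext, the checkpoint path, and the batch interval are illustrative assumptions, not part of this commit:

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

def createContext(): StreamingContext = {
  val conf = new SparkConf().setAppName("kafka-checkpoint-demo")
  val ssc = new StreamingContext(conf, Seconds(5))
  ssc.checkpoint("/tmp/kafka-checkpoint") // enables DStreamCheckpointData update/restore
  // ... build the Kafka input DStream and attach output operations here ...
  ssc
}

// First run calls createContext(); after a driver failure the checkpoint is read instead,
// and restore() above rebuilds the generated KafkaRDDs from the saved tuples.
val ssc = StreamingContext.getOrCreate("/tmp/kafka-checkpoint", createContext _)
ssc.start()
ssc.awaitTermination()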

external/kafka/src/test/scala/org/apache/spark/rdd/kafka/KafkaRDDSuite.scala

Lines changed: 4 additions & 1 deletion
@@ -20,6 +20,7 @@ package org.apache.spark.rdd.kafka
 import scala.util.Random
 
 import kafka.serializer.StringDecoder
+import kafka.common.TopicAndPartition
 import org.scalatest.BeforeAndAfter
 
 import org.apache.spark._
@@ -57,7 +58,9 @@ class KafkaRDDSuite extends KafkaStreamSuiteBase with BeforeAndAfter {
     assert(rdd.isDefined)
     assert(rdd.get.count === sent.values.sum)
 
-    kc.setConsumerOffsets(kafkaParams("group.id"), rdd.get.untilOffsets)
+    kc.setConsumerOffsets(
+      kafkaParams("group.id"),
+      rdd.get.batch.map(kp => TopicAndPartition(kp.topic, kp.partition) -> kp.untilOffset).toMap)
 
     val rdd2 = getRdd(kc, Set(topic))
     val sent2 = Map("d" -> 1)
