Commit d0807b1

Merge pull request #24 from tdas/streaming-df-kafka
Kafka Source
2 parents: d2fc934 + bcfd6a9

14 files changed (+525, -54 lines)

external/kafka/pom.xml

Lines changed: 12 additions & 0 deletions

@@ -41,13 +41,25 @@
       <version>${project.version}</version>
       <scope>provided</scope>
     </dependency>
+    <dependency>
+      <groupId>org.apache.spark</groupId>
+      <artifactId>spark-sql_${scala.binary.version}</artifactId>
+      <version>${project.version}</version>
+    </dependency>
     <dependency>
       <groupId>org.apache.spark</groupId>
       <artifactId>spark-core_${scala.binary.version}</artifactId>
       <version>${project.version}</version>
       <type>test-jar</type>
       <scope>test</scope>
     </dependency>
+    <dependency>
+      <groupId>org.apache.spark</groupId>
+      <artifactId>spark-sql_${scala.binary.version}</artifactId>
+      <version>${project.version}</version>
+      <type>test-jar</type>
+      <scope>test</scope>
+    </dependency>
     <dependency>
       <groupId>org.apache.kafka</groupId>
       <artifactId>kafka_${scala.binary.version}</artifactId>

external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaCluster.scala

Lines changed: 0 additions & 1 deletion

@@ -19,7 +19,6 @@ package org.apache.spark.streaming.kafka
 
 import java.util.Properties
 
-import scala.collection.JavaConverters._
 import scala.collection.mutable.ArrayBuffer
 import scala.util.Random
 import scala.util.control.NonFatal

external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaInputDStream.scala

Lines changed: 1 addition & 1 deletion

@@ -20,7 +20,7 @@ package org.apache.spark.streaming.kafka
 import java.util.Properties
 
 import scala.collection.Map
-import scala.reflect.{classTag, ClassTag}
+import scala.reflect.{ClassTag, classTag}
 
 import kafka.consumer.{Consumer, ConsumerConfig, ConsumerConnector, KafkaStream}
 import kafka.serializer.Decoder

external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaRDD.scala

Lines changed: 2 additions & 2 deletions

@@ -18,7 +18,7 @@
 package org.apache.spark.streaming.kafka
 
 import scala.collection.mutable.ArrayBuffer
-import scala.reflect.{classTag, ClassTag}
+import scala.reflect.{ClassTag, classTag}
 
 import kafka.api.{FetchRequestBuilder, FetchResponse}
 import kafka.common.{ErrorMapping, TopicAndPartition}
@@ -27,10 +27,10 @@ import kafka.message.{MessageAndMetadata, MessageAndOffset}
 import kafka.serializer.Decoder
 import kafka.utils.VerifiableProperties
 
-import org.apache.spark.{Logging, Partition, SparkContext, SparkException, TaskContext}
 import org.apache.spark.partial.{BoundedDouble, PartialResult}
 import org.apache.spark.rdd.RDD
 import org.apache.spark.util.NextIterator
+import org.apache.spark.{Logging, Partition, SparkContext, SparkException, TaskContext}
 
 /**
  * A batch-oriented interface for consuming from Kafka.
Lines changed: 190 additions & 0 deletions (new file)

@@ -0,0 +1,190 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.streaming.kafka

import kafka.common.TopicAndPartition
import kafka.serializer._

import org.apache.spark.Logging
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
import org.apache.spark.sql.execution.streaming.{Batch, Offset, Source, StreamingRelation}
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, Dataset, SQLContext}


/** An [[Offset]] for the [[KafkaSource]]. */
private[kafka]
case class KafkaSourceOffset(offsets: Map[TopicAndPartition, Long]) extends Offset {
  /**
   * Returns a negative integer, zero, or a positive integer as this object is less than, equal to,
   * or greater than the specified object.
   */
  override def compareTo(other: Offset): Int = other match {
    case KafkaSourceOffset(otherOffsets) =>
      val allTopicAndPartitions = (this.offsets.keySet ++ otherOffsets.keySet).toSeq

      val comparisons = allTopicAndPartitions.map { tp =>
        (this.offsets.get(tp), otherOffsets.get(tp)) match {
          case (Some(a), Some(b)) =>
            if (a < b) {
              -1
            } else if (a > b) {
              1
            } else {
              0
            }
          case (None, _) => -1
          case (_, None) => 1
        }
      }
      val nonZeroSigns = comparisons.filter { _ != 0 }.toSet
      nonZeroSigns.size match {
        case 0 => 0                       // if both empty or only 0s
        case 1 => nonZeroSigns.head       // if there are only (0s and 1s) or (0s and -1s)
        case _ =>                         // there are both 1s and -1s
          throw new IllegalArgumentException(
            s"Invalid comparison between non-linear histories: $this <=> $other")
      }

    case _ =>
      throw new IllegalArgumentException(s"Cannot compare $this <=> $other")
  }

  /** Returns a set of offset ranges between `this` and `other` */
  def to(other: KafkaSourceOffset): Seq[OffsetRange] = {

    // Get all the partitions referenced in both sets of offsets
    val allTopicAndPartitions = (this.offsets.keySet ++ other.offsets.keySet).toSeq

    // For each partition, figure out the non-empty ranges of offsets
    allTopicAndPartitions.flatMap { tp =>
      (this.offsets.get(tp), other.offsets.get(tp)) match {

        // Data was read till fromOffset and needs to be read till untilOffset
        case (Some(fromOffset), Some(untilOffset)) =>
          if (untilOffset > fromOffset) {
            Some(OffsetRange(tp, fromOffset, untilOffset))
          } else None

        // TODO: Support cases where topic+partitions are missing from one. Can happen in case of
        // repartitioning.

        case _ =>
          None
      }
    }
  }

  override def toString(): String = {
    offsets.toSeq.sortBy(_._1.topic).mkString("[", ", ", "]")
  }
}

/** Companion object of the [[KafkaSourceOffset]] */
private[kafka] object KafkaSourceOffset {

  /** Returns [[KafkaSourceOffset]] from a Option[Offset]. */
  def from(offsetOption: Option[Offset]): Option[KafkaSourceOffset] = {
    offsetOption.map { offset =>
      offset match {
        case o: KafkaSourceOffset => o
        case _ =>
          throw new IllegalArgumentException(
            s"Invalid conversion from offset of ${offset.getClass} to KafkaSourceOffset")
      }
    }
  }

  /**
   * Returns [[KafkaSourceOffset]] from a variable sequence of (topic, partitionId, offset)
   * tuples.
   */
  def apply(data: (String, Int, Long)*): KafkaSourceOffset = {
    val map = data.map { case (topic, partition, offset) =>
      TopicAndPartition(topic, partition) -> offset }.toMap
    KafkaSourceOffset(map)
  }
}


/** A [[Source]] that reads data from Kafka */
private[kafka] case class KafkaSource(
    topics: Set[String],
    params: Map[String, String])(implicit sqlContext: SQLContext) extends Source with Logging {

  type OffsetMap = Map[TopicAndPartition, Long]

  implicit private val encoder = ExpressionEncoder.tuple(
    ExpressionEncoder[Array[Byte]](), ExpressionEncoder[Array[Byte]]())

  @transient private val logicalPlan = StreamingRelation(this)
  @transient private val kc = new KafkaCluster(params)
  @transient private val topicAndPartitions = KafkaCluster.checkErrors(kc.getPartitions(topics))
  @transient private[kafka] val initialOffsets = getInitialOffsets()

  override def schema: StructType = encoder.schema

  /** Returns the next batch of data that is available after `start`, if any is available. */
  override def getNextBatch(start: Option[Offset]): Option[Batch] = {
    val beginOffset: KafkaSourceOffset = KafkaSourceOffset.from(start).getOrElse(initialOffsets)
    val latestOffset = getLatestOffsets()
    logDebug(s"Latest offset: $latestOffset")

    val offsetRanges = beginOffset to latestOffset
    val kafkaParams = params
    val encodingFunc = encoder.toRow _
    val sparkContext = sqlContext.sparkContext

    if (offsetRanges.nonEmpty) {
      val rdd = KafkaUtils.createRDD[Array[Byte], Array[Byte], DefaultDecoder, DefaultDecoder](
        sparkContext, kafkaParams, offsetRanges.toArray)
      logInfo(s"Creating DF with offset ranges: $offsetRanges")
      Some(new Batch(latestOffset, sqlContext.createDataset(rdd).toDF))
    } else {
      None
    }
  }

  def toDS(): Dataset[(Array[Byte], Array[Byte])] = {
    toDF.as[(Array[Byte], Array[Byte])]
  }

  def toDF(): DataFrame = {
    new DataFrame(sqlContext, logicalPlan)
  }

  /** Get latest offsets from Kafka. */
  private def getLatestOffsets(): KafkaSourceOffset = {
    val partitionLeaders = KafkaCluster.checkErrors(kc.findLeaders(topicAndPartitions))
    val leadersAndOffsets = KafkaCluster.checkErrors(kc.getLatestLeaderOffsets(topicAndPartitions))
    KafkaSourceOffset(leadersAndOffsets.map { x => (x._1, x._2.offset) })
  }

  /** Get the initial offsets from Kafka for the source to start from. */
  private def getInitialOffsets(): KafkaSourceOffset = {
    if (params.get("auto.offset.reset").map(_.toLowerCase) == Some("smallest")) {
      val offsetMap = KafkaCluster.checkErrors(
        kc.getEarliestLeaderOffsets(topicAndPartitions)).mapValues(_.offset)
      KafkaSourceOffset(offsetMap)
    } else {
      getLatestOffsets()
    }
  }

  override def toString(): String = s"KafkaSource[${topics.mkString(", ")}]"
}
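
The new source wires Kafka offsets into the streaming Source API: getNextBatch compares the last seen KafkaSourceOffset against the brokers' latest offsets, turns the difference into OffsetRanges, and materializes them as a DataFrame of (key, value) byte-array pairs. The sketch below is not part of this commit; it only illustrates how the new classes fit together, and it assumes code placed in the org.apache.spark.streaming.kafka package (KafkaSource is private[kafka]), plus a placeholder topic name and broker address.

package org.apache.spark.streaming.kafka

import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}

// Illustrative sketch only -- not part of this diff. Topic name and broker
// address are placeholders.
object KafkaSourceSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(
      new SparkConf().setMaster("local[2]").setAppName("KafkaSourceSketch"))
    implicit val sqlContext: SQLContext = new SQLContext(sc)

    // Same broker-list key that KafkaCluster consumes elsewhere in this PR.
    val source = KafkaSource(
      topics = Set("events"),
      params = Map("metadata.broker.list" -> "localhost:9092"))

    // Records surface as (key, value) byte-array pairs.
    println(source.schema)
    val events = source.toDS()  // Dataset[(Array[Byte], Array[Byte])]

    // KafkaSourceOffset orders offset maps partition by partition, and `to`
    // yields the non-empty ranges between two of them.
    val from = KafkaSourceOffset(("events", 0, 0L), ("events", 1, 5L))
    val until = KafkaSourceOffset(("events", 0, 10L), ("events", 1, 5L))
    assert(from.compareTo(until) < 0)
    println(from to until)  // one range for partition 0; partition 1 is unchanged

    sc.stop()
  }
}

Note that two offset maps which disagree in both directions (some partitions ahead, some behind) are treated as non-linear histories, and compareTo throws rather than guessing an order.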

external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaTestUtils.scala

Lines changed: 40 additions & 14 deletions

@@ -20,8 +20,8 @@ package org.apache.spark.streaming.kafka
 import java.io.File
 import java.lang.{Integer => JInt}
 import java.net.InetSocketAddress
+import java.util.concurrent.{TimeUnit, TimeoutException}
 import java.util.{Map => JMap, Properties}
-import java.util.concurrent.TimeoutException
 
 import scala.annotation.tailrec
 import scala.collection.JavaConverters._
@@ -30,16 +30,17 @@ import scala.util.control.NonFatal
 
 import kafka.admin.AdminUtils
 import kafka.api.Request
-import kafka.producer.{KeyedMessage, Producer, ProducerConfig}
-import kafka.serializer.StringEncoder
+import kafka.common.TopicAndPartition
 import kafka.server.{KafkaConfig, KafkaServer}
 import kafka.utils.{ZKStringSerializer, ZkUtils}
 import org.I0Itec.zkclient.ZkClient
+import org.apache.kafka.clients.producer._
+import org.apache.kafka.common.serialization.StringSerializer
 import org.apache.zookeeper.server.{NIOServerCnxnFactory, ZooKeeperServer}
 
-import org.apache.spark.{Logging, SparkConf}
 import org.apache.spark.streaming.Time
 import org.apache.spark.util.Utils
+import org.apache.spark.{Logging, SparkConf}
 
 /**
  * This is a helper class for Kafka test suites. This has the functionality to set up
@@ -153,9 +154,15 @@ private[kafka] class KafkaTestUtils extends Logging {
 
   /** Create a Kafka topic and wait until it is propagated to the whole cluster */
  def createTopic(topic: String): Unit = {
-    AdminUtils.createTopic(zkClient, topic, 1, 1)
+    createTopic(topic, 1)
+  }
+
+  def createTopic(topic: String, partitions: Int): Unit = {
+    AdminUtils.createTopic(zkClient, topic, partitions, 1)
     // wait until metadata is propagated
-    waitUntilMetadataIsPropagated(topic, 0)
+    for (p <- 0 until partitions) {
+      waitUntilMetadataIsPropagated(topic, p)
+    }
  }
 
   /** Java-friendly function for sending messages to the Kafka broker */
@@ -170,11 +177,29 @@ private[kafka] class KafkaTestUtils extends Logging {
  }
 
   /** Send the array of messages to the Kafka broker */
-  def sendMessages(topic: String, messages: Array[String]): Unit = {
-    producer = new Producer[String, String](new ProducerConfig(producerConfiguration))
-    producer.send(messages.map { new KeyedMessage[String, String](topic, _ ) }: _*)
-    producer.close()
-    producer = null
+  def sendMessages(topic: String, messages: Array[String]): Seq[(String, RecordMetadata)] = {
+    producer = new KafkaProducer[String, String](producerConfiguration)
+    val offsets = try {
+      messages.map { m =>
+        val metadata =
+          producer.send(new ProducerRecord[String, String](topic, m)).get(10, TimeUnit.SECONDS)
+        (m, metadata)
+      }
+    } finally {
+      if (producer != null) {
+        producer.close()
+        producer = null
+      }
+    }
+    offsets
+  }
+
+  /** Get the latest offset of all the partitions in a topic */
+  def getLatestOffsets(topics: Set[String]): Map[TopicAndPartition, Long] = {
+    val kc = new KafkaCluster(Map("metadata.broker.list" -> brokerAddress))
+    val topicPartitions = kc.getPartitions(topics).right.get
+    val offsets = kc.getLatestLeaderOffsets(topicPartitions).right.get
+    offsets.mapValues(_.offset)
  }
 
   private def brokerConfiguration: Properties = {
@@ -191,10 +216,11 @@ private[kafka] class KafkaTestUtils extends Logging {
 
   private def producerConfiguration: Properties = {
     val props = new Properties()
-    props.put("metadata.broker.list", brokerAddress)
-    props.put("serializer.class", classOf[StringEncoder].getName)
+    props.put("bootstrap.servers", brokerAddress)
+    props.put("value.serializer", classOf[StringSerializer].getName)
+    props.put("key.serializer", classOf[StringSerializer].getName)
     // wait for all in-sync replicas to ack sends
-    props.put("request.required.acks", "-1")
+    props.put("acks", "-1")
     props
  }
 
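KafkaTestUtils now sends through the new org.apache.kafka.clients producer, hands back the RecordMetadata for every message, and gains multi-partition topic creation plus a latest-offsets lookup. A rough, test-style sketch of driving those additions follows; it is not part of this diff, and it assumes the helper's existing setup()/teardown() lifecycle methods and a placeholder topic name.

package org.apache.spark.streaming.kafka

// Sketch only -- exercises the APIs added in this commit: createTopic(topic, partitions),
// the RecordMetadata returned by sendMessages, and getLatestOffsets.
object KafkaTestUtilsSketch {
  def main(args: Array[String]): Unit = {
    val kafkaTestUtils = new KafkaTestUtils
    kafkaTestUtils.setup()  // embedded ZooKeeper + Kafka broker
    try {
      kafkaTestUtils.createTopic("sketch-topic", 3)
      val sent = kafkaTestUtils.sendMessages("sketch-topic", Array("a", "b", "c"))
      sent.foreach { case (msg, metadata) =>
        println(s"$msg -> partition ${metadata.partition()}, offset ${metadata.offset()}")
      }
      // Map[TopicAndPartition, Long] with the latest offset of every partition
      println(kafkaTestUtils.getLatestOffsets(Set("sketch-topic")))
    } finally {
      kafkaTestUtils.teardown()
    }
  }
}
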
external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaUtils.scala

Lines changed: 6 additions & 6 deletions

@@ -27,19 +27,19 @@ import scala.reflect.ClassTag
 import com.google.common.base.Charsets.UTF_8
 import kafka.common.TopicAndPartition
 import kafka.message.MessageAndMetadata
-import kafka.serializer.{Decoder, DefaultDecoder, StringDecoder}
-import net.razorvine.pickle.{IObjectPickler, Opcodes, Pickler}
+import kafka.serializer.{DefaultDecoder, Decoder, StringDecoder}
+import net.razorvine.pickle.{Opcodes, Pickler, IObjectPickler}
 
-import org.apache.spark.{SparkContext, SparkException}
-import org.apache.spark.api.java.{JavaPairRDD, JavaRDD, JavaSparkContext}
 import org.apache.spark.api.java.function.{Function => JFunction}
+import org.apache.spark.streaming.util.WriteAheadLogUtils
+import org.apache.spark.{SparkContext, SparkException}
+import org.apache.spark.api.java.{JavaSparkContext, JavaPairRDD, JavaRDD}
 import org.apache.spark.api.python.SerDeUtil
 import org.apache.spark.rdd.RDD
 import org.apache.spark.storage.StorageLevel
 import org.apache.spark.streaming.StreamingContext
 import org.apache.spark.streaming.api.java._
 import org.apache.spark.streaming.dstream.{DStream, InputDStream, ReceiverInputDStream}
-import org.apache.spark.streaming.util.WriteAheadLogUtils
 
 object KafkaUtils {
   /**
@@ -173,7 +173,7 @@ object KafkaUtils {
  }
 
   /** get leaders for the given offset ranges, or throw an exception */
-  private def leadersForRanges(
+  private[spark] def leadersForRanges(
       kc: KafkaCluster,
       offsetRanges: Array[OffsetRange]): Map[TopicAndPartition, (String, Int)] = {
     val topics = offsetRanges.map(o => TopicAndPartition(o.topic, o.partition)).toSet
