/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.streaming.kafka

import kafka.common.TopicAndPartition
import kafka.serializer._

import org.apache.spark.Logging
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
import org.apache.spark.sql.execution.streaming.{Batch, Offset, Source, StreamingRelation}
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, Dataset, SQLContext}


/** An [[Offset]] for the [[KafkaSource]]. */
private[kafka]
case class KafkaSourceOffset(offsets: Map[TopicAndPartition, Long]) extends Offset {
  /**
   * Returns a negative integer, zero, or a positive integer as this object is less than, equal to,
   * or greater than the specified object.
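   * Throws an [[IllegalArgumentException]] if the two offsets represent divergent histories,
   * that is, some partitions are ahead in this offset while others are ahead in `other`.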
   */
  override def compareTo(other: Offset): Int = other match {
    case KafkaSourceOffset(otherOffsets) =>
      val allTopicAndPartitions = (this.offsets.keySet ++ otherOffsets.keySet).toSeq

      val comparisons = allTopicAndPartitions.map { tp =>
        (this.offsets.get(tp), otherOffsets.get(tp)) match {
          case (Some(a), Some(b)) =>
            if (a < b) {
              -1
            } else if (a > b) {
              1
            } else {
              0
            }
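          // A partition tracked on only one side cannot be compared by offset; treat the side
          // that is missing it as being "behind" so the overall comparison stays consistent.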
          case (None, _) => -1
          case (_, None) => 1
        }
      }
      val nonZeroSigns = comparisons.filter { _ != 0 }.toSet
      nonZeroSigns.size match {
        case 0 => 0                       // if both empty or only 0s
        case 1 => nonZeroSigns.head       // if there are only (0s and 1s) or (0s and -1s)
        case _ =>                         // there are both 1s and -1s
          throw new IllegalArgumentException(
            s"Invalid comparison between non-linear histories: $this <=> $other")
      }

    case _ =>
      throw new IllegalArgumentException(s"Cannot compare $this <=> $other")
  }

  /** Returns the [[OffsetRange]]s between `this` and `other`, one per topic+partition with new data. */
  def to(other: KafkaSourceOffset): Seq[OffsetRange] = {

    // Get all the partitions referenced in both sets of offsets
    val allTopicAndPartitions = (this.offsets.keySet ++ other.offsets.keySet).toSeq

    // For each partition, figure out the non-empty ranges of offsets
    allTopicAndPartitions.flatMap { tp =>
      (this.offsets.get(tp), other.offsets.get(tp)) match {

        // Data was read up to `fromOffset`, and needs to be read up to `untilOffset`
        case (Some(fromOffset), Some(untilOffset)) =>
          if (untilOffset > fromOffset) {
            Some(OffsetRange(tp, fromOffset, untilOffset))
          } else None

        // TODO: Support cases where a topic+partition is missing from one side. This can happen
        // after repartitioning.

        case _ =>
          None
      }
    }
  }

  override def toString(): String = {
    offsets.toSeq.sortBy { case (tp, _) => (tp.topic, tp.partition) }.mkString("[", ", ", "]")
  }
}

/** Companion object of [[KafkaSourceOffset]]. */
private[kafka] object KafkaSourceOffset {

  /** Returns a [[KafkaSourceOffset]] from an Option[Offset], failing on any other offset type. */
  def from(offsetOption: Option[Offset]): Option[KafkaSourceOffset] = {
    offsetOption.map {
      case o: KafkaSourceOffset => o
      case offset =>
        throw new IllegalArgumentException(
          s"Invalid conversion from offset of ${offset.getClass} to KafkaSourceOffset")
    }
  }

  /**
   * Returns a [[KafkaSourceOffset]] from a variable sequence of (topic, partitionId, offset)
   * tuples.
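   *
   * For example (topic names here are purely illustrative),
   * `KafkaSourceOffset(("topic-a", 0, 100L), ("topic-a", 1, 250L))` maps partition 0 of
   * "topic-a" to offset 100 and partition 1 to offset 250.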
   */
  def apply(data: (String, Int, Long)*): KafkaSourceOffset = {
    val map = data.map { case (topic, partition, offset) =>
      TopicAndPartition(topic, partition) -> offset
    }.toMap
    KafkaSourceOffset(map)
  }
}


/** A [[Source]] that reads data from the given Kafka topics. */
private[kafka] case class KafkaSource(
    topics: Set[String],
    params: Map[String, String])(implicit sqlContext: SQLContext) extends Source with Logging {

  type OffsetMap = Map[TopicAndPartition, Long]

  // Each record is exposed as a (key, value) pair of raw byte arrays
  implicit private val encoder = ExpressionEncoder.tuple(
    ExpressionEncoder[Array[Byte]](), ExpressionEncoder[Array[Byte]]())

  @transient private val logicalPlan = StreamingRelation(this)
  @transient private val kc = new KafkaCluster(params)
  @transient private val topicAndPartitions = KafkaCluster.checkErrors(kc.getPartitions(topics))
  @transient private[kafka] val initialOffsets = getInitialOffsets()

  override def schema: StructType = encoder.schema

  /** Returns the next batch of data that is available after `start`, if any is available. */
  override def getNextBatch(start: Option[Offset]): Option[Batch] = {
    val beginOffset: KafkaSourceOffset = KafkaSourceOffset.from(start).getOrElse(initialOffsets)
    val latestOffset = getLatestOffsets()
    logDebug(s"Latest offset: $latestOffset")

    val offsetRanges = beginOffset to latestOffset
    val kafkaParams = params
    val sparkContext = sqlContext.sparkContext

    // Only produce a batch if at least one partition has new data
    if (offsetRanges.nonEmpty) {
      val rdd = KafkaUtils.createRDD[Array[Byte], Array[Byte], DefaultDecoder, DefaultDecoder](
        sparkContext, kafkaParams, offsetRanges.toArray)
      logInfo(s"Creating DF with offset ranges: $offsetRanges")
      Some(new Batch(latestOffset, sqlContext.createDataset(rdd).toDF))
    } else {
      None
    }
  }

  def toDS(): Dataset[(Array[Byte], Array[Byte])] = {
    toDF.as[(Array[Byte], Array[Byte])]
  }

  def toDF(): DataFrame = {
    new DataFrame(sqlContext, logicalPlan)
  }

  /** Get the latest offsets from Kafka for all tracked partitions. */
  private def getLatestOffsets(): KafkaSourceOffset = {
    // Fail fast if a leader cannot be found for some partition
    KafkaCluster.checkErrors(kc.findLeaders(topicAndPartitions))
    val leadersAndOffsets = KafkaCluster.checkErrors(kc.getLatestLeaderOffsets(topicAndPartitions))
    KafkaSourceOffset(leadersAndOffsets.map { x => (x._1, x._2.offset) })
  }

  /** Get the initial offsets from Kafka for the source to start from. */
  private def getInitialOffsets(): KafkaSourceOffset = {
    if (params.get("auto.offset.reset").map(_.toLowerCase) == Some("smallest")) {
      // "smallest" means start from the earliest offsets available in Kafka
      val offsetMap = KafkaCluster.checkErrors(
        kc.getEarliestLeaderOffsets(topicAndPartitions)).mapValues(_.offset)
      KafkaSourceOffset(offsetMap)
    } else {
      // Otherwise start from the latest offsets, i.e. only process newly arriving data
      getLatestOffsets()
    }
  }

  override def toString(): String = s"KafkaSource[${topics.mkString(", ")}]"
}
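
// Rough usage sketch (illustrative only, not part of this file): the topic and broker values
// below are placeholders, and an implicit SQLContext plus the Kafka params that KafkaCluster
// expects (e.g. "metadata.broker.list") are assumed to be available.
//
//   implicit val sqlContext: SQLContext = ...
//   val source = KafkaSource(
//     topics = Set("topic-a"),
//     params = Map("metadata.broker.list" -> "broker-host:9092"))
//   val rawStream = source.toDS()   // Dataset[(Array[Byte], Array[Byte])] of key/value bytes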