Commit 93f1c69

Added network receiver information to the Streaming UI.
1 parent 56cc7fb commit 93f1c69

File tree

11 files changed: +301 -108 lines changed

streaming/src/main/scala/org/apache/spark/streaming/dstream/DStream.scala

Lines changed: 0 additions & 9 deletions
@@ -351,15 +351,6 @@ abstract class DStream[T: ClassTag] (
     dependencies.foreach(_.clearMetadata(time))
   }
 
-  /* Adds metadata to the Stream while it is running.
-   * This method should be overwritten by sublcasses of InputDStream.
-   */
-  private[streaming] def addMetadata(metadata: Any) {
-    if (metadata != null) {
-      logInfo("Dropping Metadata: " + metadata.toString)
-    }
-  }
-
   /**
    * Refresh the list of checkpointed RDDs that will be saved along with checkpoint of
    * this stream. This is an internal method that should not be called directly. This is

streaming/src/main/scala/org/apache/spark/streaming/dstream/NetworkInputDStream.scala

Lines changed: 42 additions & 19 deletions
@@ -17,23 +17,23 @@
 
 package org.apache.spark.streaming.dstream
 
-import java.util.concurrent.ArrayBlockingQueue
 import java.nio.ByteBuffer
+import java.util.concurrent.ArrayBlockingQueue
 
-import scala.collection.mutable.ArrayBuffer
+import scala.collection.mutable.{ArrayBuffer, HashMap}
 import scala.concurrent.Await
 import scala.concurrent.duration._
 import scala.reflect.ClassTag
 
-import akka.actor.{Props, Actor}
+import akka.actor.{Actor, Props}
 import akka.pattern.ask
 
-import org.apache.spark.streaming.util.{RecurringTimer, SystemClock}
-import org.apache.spark.streaming._
 import org.apache.spark.{Logging, SparkEnv}
-import org.apache.spark.rdd.{RDD, BlockRDD}
+import org.apache.spark.rdd.{BlockRDD, RDD}
 import org.apache.spark.storage.{BlockId, StorageLevel, StreamBlockId}
-import org.apache.spark.streaming.scheduler.{DeregisterReceiver, AddBlocks, RegisterReceiver}
+import org.apache.spark.streaming._
+import org.apache.spark.streaming.scheduler.{ReceivedBlockInfo, AddBlocks, DeregisterReceiver, RegisterReceiver}
+import org.apache.spark.streaming.util.{RecurringTimer, SystemClock}
 
 /**
  * Abstract class for defining any [[org.apache.spark.streaming.dstream.InputDStream]]
@@ -48,8 +48,10 @@ import org.apache.spark.streaming.scheduler.{DeregisterReceiver, AddBlocks, Regi
 abstract class NetworkInputDStream[T: ClassTag](@transient ssc_ : StreamingContext)
   extends InputDStream[T](ssc_) {
 
-  // This is an unique identifier that is used to match the network receiver with the
-  // corresponding network input stream.
+  /** Keeps all received blocks information */
+  private val receivedBlockInfo = new HashMap[Time, Array[ReceivedBlockInfo]]
+
+  /** This is an unique identifier for the network input stream. */
   val id = ssc.getNewNetworkStreamId()
 
   /**
@@ -64,23 +66,45 @@ abstract class NetworkInputDStream[T: ClassTag](@transient ssc_ : StreamingConte
 
   def stop() {}
 
+  /** Ask NetworkInputTracker for received data blocks and generates RDDs with them. */
   override def compute(validTime: Time): Option[RDD[T]] = {
     // If this is called for any time before the start time of the context,
     // then this returns an empty RDD. This may happen when recovering from a
     // master failure
     if (validTime >= graph.startTime) {
-      val blockIds = ssc.scheduler.networkInputTracker.getBlockIds(id, validTime)
+      val blockInfo = ssc.scheduler.networkInputTracker.getReceivedBlockInfo(id)
+      receivedBlockInfo(validTime) = blockInfo
+      val blockIds = blockInfo.map(_.blockId.asInstanceOf[BlockId])
       Some(new BlockRDD[T](ssc.sc, blockIds))
     } else {
       Some(new BlockRDD[T](ssc.sc, Array[BlockId]()))
     }
   }
+
+  /** Get information on received blocks. */
+  private[streaming] def getReceivedBlockInfo(time: Time) = {
+    receivedBlockInfo(time)
+  }
+
+  /**
+   * Clear metadata that are older than `rememberDuration` of this DStream.
+   * This is an internal method that should not be called directly. This
+   * implementation overrides the default implementation to clear received
+   * block information.
+   */
+  private[streaming] override def clearMetadata(time: Time) {
+    super.clearMetadata(time)
+    val oldReceivedBlocks = receivedBlockInfo.filter(_._1 <= (time - rememberDuration))
+    receivedBlockInfo --= oldReceivedBlocks.keys
+    logDebug("Cleared " + oldReceivedBlocks.size + " RDDs that were older than " +
+      (time - rememberDuration) + ": " + oldReceivedBlocks.keys.mkString(", "))
+  }
 }
 
 
 private[streaming] sealed trait NetworkReceiverMessage
 private[streaming] case class StopReceiver(msg: String) extends NetworkReceiverMessage
-private[streaming] case class ReportBlock(blockId: BlockId, metadata: Any)
+private[streaming] case class ReportBlock(blockId: StreamBlockId, numRecords: Long, metadata: Any)
   extends NetworkReceiverMessage
 private[streaming] case class ReportError(msg: String) extends NetworkReceiverMessage
 
@@ -156,21 +180,20 @@ abstract class NetworkReceiver[T: ClassTag]() extends Serializable with Logging
     actor ! ReportError(e.toString)
   }
 
-
   /**
    * Pushes a block (as an ArrayBuffer filled with data) into the block manager.
    */
-  def pushBlock(blockId: BlockId, arrayBuffer: ArrayBuffer[T], metadata: Any, level: StorageLevel) {
+  def pushBlock(blockId: StreamBlockId, arrayBuffer: ArrayBuffer[T], metadata: Any, level: StorageLevel) {
     env.blockManager.put(blockId, arrayBuffer.asInstanceOf[ArrayBuffer[Any]], level)
-    actor ! ReportBlock(blockId, metadata)
+    actor ! ReportBlock(blockId, arrayBuffer.size, metadata)
   }
 
   /**
    * Pushes a block (as bytes) into the block manager.
    */
-  def pushBlock(blockId: BlockId, bytes: ByteBuffer, metadata: Any, level: StorageLevel) {
+  def pushBlock(blockId: StreamBlockId, bytes: ByteBuffer, metadata: Any, level: StorageLevel) {
     env.blockManager.putBytes(blockId, bytes, level)
-    actor ! ReportBlock(blockId, metadata)
+    actor ! ReportBlock(blockId, -1 , metadata)
   }
 
   /** A helper actor that communicates with the NetworkInputTracker */
@@ -188,8 +211,8 @@ abstract class NetworkReceiver[T: ClassTag]() extends Serializable with Logging
     }
 
     override def receive() = {
-      case ReportBlock(blockId, metadata) =>
-        tracker ! AddBlocks(streamId, Array(blockId), metadata)
+      case ReportBlock(blockId, numRecords, metadata) =>
+        tracker ! AddBlocks(ReceivedBlockInfo(streamId, blockId, numRecords, metadata))
       case ReportError(msg) =>
        tracker ! DeregisterReceiver(streamId, msg)
      case StopReceiver(msg) =>
@@ -211,7 +234,7 @@ abstract class NetworkReceiver[T: ClassTag]() extends Serializable with Logging
   class BlockGenerator(storageLevel: StorageLevel)
     extends Serializable with Logging {
 
-    case class Block(id: BlockId, buffer: ArrayBuffer[T], metadata: Any = null)
+    case class Block(id: StreamBlockId, buffer: ArrayBuffer[T], metadata: Any = null)
 
     val clock = new SystemClock()
     val blockInterval = env.conf.getLong("spark.streaming.blockInterval", 200)
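
Taken together, the stream now does three things: compute() drains whatever the tracker has accumulated for this receiver and snapshots it under the batch time, getReceivedBlockInfo(time) hands that snapshot to the scheduler, and clearMetadata(time) prunes snapshots older than rememberDuration. A minimal standalone sketch of that pruning pattern, using plain Long millisecond timestamps in place of Spark's Time (the names here are illustrative, not Spark API):

import scala.collection.mutable.HashMap

object BlockInfoPruning extends App {
  // batch time (ms) -> number of blocks snapshotted for that batch
  val receivedBlockInfo = new HashMap[Long, Int]
  val rememberDurationMs = 2000L

  // Three batches at one-second intervals report some blocks.
  Seq(1000L -> 3, 2000L -> 5, 3000L -> 2).foreach { case (t, n) => receivedBlockInfo(t) = n }

  // Clearing at t = 4000 drops every batch at or before 4000 - 2000 = 2000.
  val time = 4000L
  val old = receivedBlockInfo.filter(_._1 <= time - rememberDurationMs)
  receivedBlockInfo --= old.keys
  println("cleared: " + old.keys.toSeq.sorted.mkString(", "))   // 1000, 2000
  println("kept:    " + receivedBlockInfo.keys.mkString(", "))  // 3000
}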

streaming/src/main/scala/org/apache/spark/streaming/scheduler/BatchInfo.scala

Lines changed: 1 addition & 0 deletions
@@ -29,6 +29,7 @@ import org.apache.spark.streaming.Time
  */
 case class BatchInfo(
     batchTime: Time,
+    receivedBlockInfo: Map[Int, Array[ReceivedBlockInfo]],
     submissionTime: Long,
     processingStartTime: Option[Long],
     processingEndTime: Option[Long]

streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobGenerator.scala

Lines changed: 8 additions & 2 deletions
@@ -147,7 +147,7 @@ class JobGenerator(jobScheduler: JobScheduler) extends Logging {
     logInfo("Batches to reschedule (" + timesToReschedule.size + " batches): " +
       timesToReschedule.mkString(", "))
     timesToReschedule.foreach(time =>
-      jobScheduler.runJobs(time, graph.generateJobs(time))
+      jobScheduler.submitJobSet(JobSet(time, graph.generateJobs(time)))
     )
 
     // Restart the timer
@@ -159,7 +159,13 @@ class JobGenerator(jobScheduler: JobScheduler) extends Logging {
   private def generateJobs(time: Time) {
     SparkEnv.set(ssc.env)
     Try(graph.generateJobs(time)) match {
-      case Success(jobs) => jobScheduler.runJobs(time, jobs)
+      case Success(jobs) =>
+        val receivedBlockInfo = graph.getNetworkInputStreams.map { stream =>
+          val streamId = stream.id
+          val receivedBlockInfo = stream.getReceivedBlockInfo(time)
+          (streamId, receivedBlockInfo)
+        }.toMap
+        jobScheduler.submitJobSet(JobSet(time, jobs, receivedBlockInfo))
       case Failure(e) => jobScheduler.reportError("Error generating jobs for time " + time, e)
     }
     eventActor ! DoCheckpoint(time)

streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobScheduler.scala

Lines changed: 5 additions & 6 deletions
@@ -82,14 +82,13 @@ class JobScheduler(val ssc: StreamingContext) extends Logging {
     }
   }
 
-  def runJobs(time: Time, jobs: Seq[Job]) {
-    if (jobs.isEmpty) {
-      logInfo("No jobs added for time " + time)
+  def submitJobSet(jobSet: JobSet) {
+    if (jobSet.jobs.isEmpty) {
+      logInfo("No jobs added for time " + jobSet.time)
     } else {
-      val jobSet = new JobSet(time, jobs)
-      jobSets.put(time, jobSet)
+      jobSets.put(jobSet.time, jobSet)
       jobSet.jobs.foreach(job => executor.execute(new JobHandler(job)))
-      logInfo("Added jobs for time " + time)
+      logInfo("Added jobs for time " + jobSet.time)
     }
   }
 
streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobSet.scala

Lines changed: 6 additions & 1 deletion
@@ -24,7 +24,11 @@ import org.apache.spark.streaming.Time
  * belong to the same batch.
  */
 private[streaming]
-case class JobSet(time: Time, jobs: Seq[Job]) {
+case class JobSet(
+    time: Time,
+    jobs: Seq[Job],
+    receivedBlockInfo: Map[Int, Array[ReceivedBlockInfo]] = Map.empty
+  ) {
 
   private val incompleteJobs = new HashSet[Job]()
   private val submissionTime = System.currentTimeMillis() // when this jobset was submitted
@@ -60,6 +64,7 @@ case class JobSet(time: Time, jobs: Seq[Job]) {
   def toBatchInfo: BatchInfo = {
     new BatchInfo(
       time,
+      receivedBlockInfo,
       submissionTime,
       if (processingStartTime >= 0 ) Some(processingStartTime) else None,
       if (processingEndTime >= 0 ) Some(processingEndTime) else None
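
Because receivedBlockInfo defaults to Map.empty, existing JobSet call sites (such as the restart path in JobGenerator above) compile unchanged, while generateJobs supplies the real map; toBatchInfo then forwards it to BatchInfo for listeners and the UI. As a rough sketch of what a consumer can derive from it, using only fields added in this commit (blocks pushed as raw bytes report numRecords = -1, so their counts are unknown):

import org.apache.spark.streaming.scheduler.BatchInfo

// Total records across all streams in one batch, skipping the -1 "unknown" sentinel.
def totalRecords(batch: BatchInfo): Long =
  batch.receivedBlockInfo.values.flatten.map(_.numRecords).filter(_ >= 0).sum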

streaming/src/main/scala/org/apache/spark/streaming/scheduler/NetworkInputTracker.scala

Lines changed: 33 additions & 35 deletions
@@ -17,26 +17,33 @@
 
 package org.apache.spark.streaming.scheduler
 
-import org.apache.spark.streaming.dstream.{NetworkInputDStream, NetworkReceiver}
-import org.apache.spark.streaming.dstream.{StopReceiver, ReportBlock, ReportError}
-import org.apache.spark.{SparkException, Logging, SparkEnv}
-import org.apache.spark.SparkContext._
-
-import scala.collection.mutable.HashMap
-import scala.collection.mutable.Queue
-import scala.concurrent.duration._
+import scala.collection.mutable.{HashMap, SynchronizedQueue, SynchronizedMap}
 
 import akka.actor._
-import akka.pattern.ask
-import akka.dispatch._
-import org.apache.spark.storage.BlockId
-import org.apache.spark.streaming.{Time, StreamingContext}
+
+import org.apache.spark.{Logging, SparkEnv, SparkException}
+import org.apache.spark.SparkContext._
+import org.apache.spark.storage.StreamBlockId
+import org.apache.spark.streaming.{StreamingContext, Time}
+import org.apache.spark.streaming.dstream.{NetworkReceiver, StopReceiver}
 import org.apache.spark.util.AkkaUtils
 
+/** Information about block received by the network receiver */
+case class ReceivedBlockInfo(
+    streamId: Int,
+    blockId: StreamBlockId,
+    numRecords: Long,
+    metadata: Any
+  )
+
+/**
+ * Messages used by the NetworkReceiver and the NetworkInputTracker to communicate
+ * with each other.
+ */
 private[streaming] sealed trait NetworkInputTrackerMessage
 private[streaming] case class RegisterReceiver(streamId: Int, receiverActor: ActorRef)
   extends NetworkInputTrackerMessage
-private[streaming] case class AddBlocks(streamId: Int, blockIds: Seq[BlockId], metadata: Any)
+private[streaming] case class AddBlocks(receivedBlockInfo: ReceivedBlockInfo)
   extends NetworkInputTrackerMessage
 private[streaming] case class DeregisterReceiver(streamId: Int, msg: String)
   extends NetworkInputTrackerMessage
@@ -53,9 +60,10 @@ class NetworkInputTracker(ssc: StreamingContext) extends Logging {
   val networkInputStreamMap = Map(networkInputStreams.map(x => (x.id, x)): _*)
   val receiverExecutor = new ReceiverExecutor()
   val receiverInfo = new HashMap[Int, ActorRef]
-  val receivedBlockIds = new HashMap[Int, Queue[BlockId]]
+  val receivedBlockInfo = new HashMap[Int, SynchronizedQueue[ReceivedBlockInfo]]
+    with SynchronizedMap[Int, SynchronizedQueue[ReceivedBlockInfo]]
   val timeout = AkkaUtils.askTimeout(ssc.conf)
-
+  val listenerBus = ssc.scheduler.listenerBus
 
   // actor is created when generator starts.
   // This not being null means the tracker has been started and not stopped
@@ -87,15 +95,14 @@ class NetworkInputTracker(ssc: StreamingContext) extends Logging {
   }
 
   /** Return all the blocks received from a receiver. */
-  def getBlockIds(receiverId: Int, time: Time): Array[BlockId] = synchronized {
-    val queue = receivedBlockIds.synchronized {
-      receivedBlockIds.getOrElse(receiverId, new Queue[BlockId]())
-    }
-    val result = queue.synchronized {
-      queue.dequeueAll(x => true)
-    }
-    logInfo("Stream " + receiverId + " received " + result.size + " blocks")
-    result.toArray
+  def getReceivedBlockInfo(streamId: Int): Array[ReceivedBlockInfo] = {
+    val receivedBlockInfo = getReceivedBlockInfoQueue(streamId).dequeueAll(x => true)
+    logInfo("Stream " + streamId + " received " + receivedBlockInfo.size + " blocks")
+    receivedBlockInfo.toArray
+  }
+
+  private def getReceivedBlockInfoQueue(streamId: Int) = {
+    receivedBlockInfo.getOrElseUpdate(streamId, new SynchronizedQueue[ReceivedBlockInfo])
   }
 
   /** Actor to receive messages from the receivers. */
@@ -110,17 +117,8 @@ class NetworkInputTracker(ssc: StreamingContext) extends Logging {
             + sender.path.address)
         sender ! true
       }
-      case AddBlocks(streamId, blockIds, metadata) => {
-        val tmp = receivedBlockIds.synchronized {
-          if (!receivedBlockIds.contains(streamId)) {
-            receivedBlockIds += ((streamId, new Queue[BlockId]))
-          }
-          receivedBlockIds(streamId)
-        }
-        tmp.synchronized {
-          tmp ++= blockIds
-        }
-        networkInputStreamMap(streamId).addMetadata(metadata)
+      case AddBlocks(receivedBlockInfo) => {
+        getReceivedBlockInfoQueue(receivedBlockInfo.streamId) += receivedBlockInfo
       }
       case DeregisterReceiver(streamId, msg) => {
         receiverInfo -= streamId
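
The rewrite replaces the hand-rolled synchronized blocks with a SynchronizedMap of SynchronizedQueues: receiver actors append on one side, and each batch's compute() atomically drains with dequeueAll. A minimal sketch of that producer/consumer pattern in isolation (illustrative names, not Spark API):

import scala.collection.mutable.{HashMap, SynchronizedMap, SynchronizedQueue}

object DrainSketch extends App {
  val queues = new HashMap[Int, SynchronizedQueue[String]]
    with SynchronizedMap[Int, SynchronizedQueue[String]]

  // getOrElseUpdate lazily creates a stream's queue on first use.
  def queueFor(streamId: Int) =
    queues.getOrElseUpdate(streamId, new SynchronizedQueue[String])

  // Producer side: a receiver reports blocks as they are stored.
  queueFor(0) += "input-0-block-0"
  queueFor(0) += "input-0-block-1"

  // Consumer side: a batch drains everything reported so far.
  val drained = queueFor(0).dequeueAll(_ => true)
  println(drained.mkString(", "))  // both blocks
  println(queueFor(0).size)        // 0 -- empty after the drain
}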

streaming/src/main/scala/org/apache/spark/streaming/scheduler/StreamingListener.scala

Lines changed: 7 additions & 6 deletions
@@ -23,6 +23,7 @@ import org.apache.spark.util.Distribution
 /** Base trait for events related to StreamingListener */
 sealed trait StreamingListenerEvent
 
+case class StreamingListenerBatchSubmitted(batchInfo: BatchInfo) extends StreamingListenerEvent
 case class StreamingListenerBatchCompleted(batchInfo: BatchInfo) extends StreamingListenerEvent
 case class StreamingListenerBatchStarted(batchInfo: BatchInfo) extends StreamingListenerEvent
 
@@ -34,14 +35,14 @@ private[scheduler] case object StreamingListenerShutdown extends StreamingListen
  * computation.
  */
 trait StreamingListener {
-  /**
-   * Called when processing of a batch has completed
-   */
+
+  /** Called when a batch of jobs has been submitted for processing. */
+  def onBatchSubmitted(batchSubmitted: StreamingListenerBatchSubmitted) { }
+
+  /** Called when processing of a batch of jobs has completed. */
   def onBatchCompleted(batchCompleted: StreamingListenerBatchCompleted) { }
 
-  /**
-   * Called when processing of a batch has started
-   */
+  /** Called when processing of a batch of jobs has started. */
   def onBatchStarted(batchStarted: StreamingListenerBatchStarted) { }
 }
 
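
With the new event and the block information carried on BatchInfo, a monitoring component can observe receiver activity per batch; this is what the Streaming UI pages in the rest of this commit build on. A hedged sketch of a user-side listener (it assumes the scheduler posts StreamingListenerBatchSubmitted when a JobSet is submitted, which the listener-bus wiring elsewhere in this commit provides):

import org.apache.spark.streaming.scheduler._

class ReceiverStatsListener extends StreamingListener {
  override def onBatchSubmitted(batchSubmitted: StreamingListenerBatchSubmitted) {
    val info = batchSubmitted.batchInfo
    info.receivedBlockInfo.foreach { case (streamId, blocks) =>
      println("batch " + info.batchTime + ": stream " + streamId +
        " reported " + blocks.size + " blocks")
    }
  }
}

// Registration uses the existing hook, assuming a StreamingContext `ssc`:
//   ssc.addStreamingListener(new ReceiverStatsListener)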
