@@ -22,7 +22,7 @@ import org.apache.hadoop.conf.Configuration
 
 import org.apache.spark._
 import org.apache.spark.rdd.BlockRDD
-import org.apache.spark.storage.{BlockId, StorageLevel}
+import org.apache.spark.storage.{StreamBlockId, BlockId, StorageLevel}
 import org.apache.spark.streaming.util.{HdfsUtils, WriteAheadLogFileSegment, WriteAheadLogRandomReader}
 
 /**
@@ -37,6 +37,7 @@ private[streaming]
 class WriteAheadLogBackedBlockRDDPartition(
     val index: Int,
     val blockId: BlockId,
+    val isBlockIdValid: Boolean,
     val segment: WriteAheadLogFileSegment)
   extends Partition
 
@@ -45,11 +46,19 @@ class WriteAheadLogBackedBlockRDDPartition(
  * This class represents a special case of the BlockRDD where the data blocks in
  * the block manager are also backed by segments in write ahead logs. For reading
  * the data, this RDD first looks up the blocks by their ids in the block manager.
- * If it does not find them, it looks up the corresponding file segment.
+ * If it does not find them, it looks up the corresponding file segment. The lookup
+ * of the blocks by their ids can be skipped by setting the corresponding element in
+ * isBlockIdValid to false. This is a performance optimization which does not affect
+ * correctness, and it can be used in situations where it is known that the block
+ * does not exist in the Spark executors (e.g., after a failed driver is restarted).
+ *
  *
  * @param sc SparkContext
  * @param blockIds Ids of the blocks that contain this RDD's data
  * @param segments Segments in write ahead logs that contain this RDD's data
+ * @param isBlockIdValid Whether the block ids are valid (i.e., the blocks are present in the
+ *                       Spark executors). If not, then block lookups by the block ids will be
+ *                       skipped. By default, this is an empty array signifying true for all blocks.
  * @param storeInBlockManager Whether to store in the block manager after reading from the segment
  * @param storageLevel Storage level to use when storing in the block manager
  *                     (applicable when storeInBlockManager = true)
@@ -59,23 +68,32 @@ class WriteAheadLogBackedBlockRDD[T: ClassTag](
     @transient sc: SparkContext,
     @transient blockIds: Array[BlockId],
     @transient segments: Array[WriteAheadLogFileSegment],
-    storeInBlockManager: Boolean,
-    storageLevel: StorageLevel)
+    @transient isBlockIdValid: Array[Boolean] = Array.empty,
+    storeInBlockManager: Boolean = false,
+    storageLevel: StorageLevel = StorageLevel.MEMORY_ONLY_SER)
   extends BlockRDD[T](sc, blockIds) {
 
   require(
     blockIds.length == segments.length,
-    s"Number of block ids (${blockIds.length}) must be " +
-      s"the same as number of segments (${segments.length}})!")
+    s"Number of block ids (${blockIds.length}) must be " +
+      s"the same as number of segments (${segments.length})")
+
+  require(
+    isBlockIdValid.isEmpty || isBlockIdValid.length == blockIds.length,
+    s"Number of elements in isBlockIdValid (${isBlockIdValid.length}) must be " +
+      s"the same as number of block ids (${blockIds.length})")
 
   // Hadoop configuration is not serializable, so broadcast it as a serializable.
   @transient private val hadoopConfig = sc.hadoopConfiguration
   private val broadcastedHadoopConf = new SerializableWritable(hadoopConfig)
 
+  setInvalidIfBlocksRemoved(false)
+
   override def getPartitions: Array[Partition] = {
     assertValid()
-    Array.tabulate(blockIds.size) { i =>
-      new WriteAheadLogBackedBlockRDDPartition(i, blockIds(i), segments(i))
+    Array.tabulate(blockIds.length) { i =>
+      val isValid = if (isBlockIdValid.length == 0) true else isBlockIdValid(i)
+      new WriteAheadLogBackedBlockRDDPartition(i, blockIds(i), isValid, segments(i))
     }
   }
 
@@ -90,22 +108,29 @@ class WriteAheadLogBackedBlockRDD[T: ClassTag](
     val blockManager = SparkEnv.get.blockManager
     val partition = split.asInstanceOf[WriteAheadLogBackedBlockRDDPartition]
     val blockId = partition.blockId
-    blockManager.get(blockId) match {
-      case Some(block) => // Data is in Block Manager
-        val iterator = block.data.asInstanceOf[Iterator[T]]
-        logDebug(s"Read partition data of $this from block manager, block $blockId")
-        iterator
-      case None => // Data not found in Block Manager, grab it from write ahead log file
-        val reader = new WriteAheadLogRandomReader(partition.segment.path, hadoopConf)
-        val dataRead = reader.read(partition.segment)
-        reader.close()
-        logInfo(s"Read partition data of $this from write ahead log, segment ${partition.segment}")
-        if (storeInBlockManager) {
-          blockManager.putBytes(blockId, dataRead, storageLevel)
-          logDebug(s"Stored partition data of $this into block manager with level $storageLevel")
-          dataRead.rewind()
-        }
-        blockManager.dataDeserialize(blockId, dataRead).asInstanceOf[Iterator[T]]
+    val segment = partition.segment
+
+    def getBlockFromBlockManager(): Option[Iterator[T]] = {
+      blockManager.get(blockId).map(_.data.asInstanceOf[Iterator[T]])
+    }
+
+    def getBlockFromWriteAheadLog(): Iterator[T] = {
+      val reader = new WriteAheadLogRandomReader(segment.path, hadoopConf)
+      val dataRead = reader.read(segment)
+      reader.close()
+      logDebug(s"Read partition data of $this from write ahead log, segment ${partition.segment}")
+      if (storeInBlockManager) {
+        blockManager.putBytes(blockId, dataRead, storageLevel)
+        logDebug(s"Stored partition data of $this into block manager with level $storageLevel")
+        dataRead.rewind()
+      }
+      blockManager.dataDeserialize(blockId, dataRead).asInstanceOf[Iterator[T]]
+    }
+
+    if (partition.isBlockIdValid) {
+      getBlockFromBlockManager().getOrElse { getBlockFromWriteAheadLog() }
+    } else {
+      getBlockFromWriteAheadLog()
     }
   }
 
@@ -116,7 +141,12 @@ class WriteAheadLogBackedBlockRDD[T: ClassTag](
   */
  override def getPreferredLocations(split: Partition): Seq[String] = {
    val partition = split.asInstanceOf[WriteAheadLogBackedBlockRDDPartition]
-    val blockLocations = getBlockIdLocations().get(partition.blockId)
+    val blockLocations = if (partition.isBlockIdValid) {
+      getBlockIdLocations().get(partition.blockId)
+    } else {
+      None
+    }
+
    blockLocations.getOrElse(
      HdfsUtils.getFileSegmentLocations(
        partition.segment.path, partition.segment.offset, partition.segment.length, hadoopConfig))
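
For context, here is a minimal sketch (not part of the commit) of how a driver recovering from failure might construct this RDD with the new parameter. The object name, WAL paths, block ids, offsets, and lengths below are all hypothetical, and because the class is private[streaming], the sketch assumes it is compiled inside the org.apache.spark.streaming.rdd package of a Spark build containing this change. A restarted driver knows the executors have lost their blocks, so it passes an all-false isBlockIdValid and compute() reads every partition straight from the write ahead log.

package org.apache.spark.streaming.rdd

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.storage.{BlockId, StreamBlockId}
import org.apache.spark.streaming.util.WriteAheadLogFileSegment

// Hypothetical driver-recovery sketch; names and paths are illustrative only.
object WALRecoverySketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[2]").setAppName("wal-recovery"))

    // Block ids and WAL segments as they might be recovered from checkpointed metadata.
    val blockIds: Array[BlockId] = Array(StreamBlockId(0, 1L), StreamBlockId(0, 2L))
    val segments = Array(
      WriteAheadLogFileSegment("hdfs:///checkpoint/wal/log-0", 0L, 1024),
      WriteAheadLogFileSegment("hdfs:///checkpoint/wal/log-0", 1024L, 1024))

    // The executors lost their blocks when the driver died, so mark every block
    // id invalid: compute() then skips the BlockManager lookup entirely and
    // reads each partition directly from its write ahead log segment.
    val rdd = new WriteAheadLogBackedBlockRDD[String](
      sc, blockIds, segments, isBlockIdValid = Array.fill(blockIds.length)(false))

    println(s"Recovered ${rdd.count()} records from the write ahead log")
    sc.stop()
  }
}

Leaving isBlockIdValid empty preserves the old behavior: every partition first tries the BlockManager and only falls back to its WAL segment on a miss.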