
Commit 0df3180

Address code review comments
1 parent e7d5449 commit 0df3180

File tree

3 files changed: +70, -39 lines changed


core/src/main/scala/org/apache/spark/MapOutputTracker.scala

Lines changed: 30 additions & 21 deletions
@@ -21,7 +21,7 @@ import java.io._
 import java.util.concurrent.ConcurrentHashMap
 import java.util.zip.{GZIPInputStream, GZIPOutputStream}
 
-import scala.collection.mutable.{HashSet, Map, HashMap}
+import scala.collection.mutable.{HashMap, HashSet, Map}
 import scala.collection.JavaConversions._
 import scala.reflect.ClassTag
 
@@ -30,6 +30,7 @@ import org.apache.spark.scheduler.MapStatus
 import org.apache.spark.shuffle.MetadataFetchFailedException
 import org.apache.spark.storage.BlockManagerId
 import org.apache.spark.util._
+import org.apache.spark.util.collection.{Utils => CollectionUtils}
 
 private[spark] sealed trait MapOutputTrackerMessage
 private[spark] case class GetMapOutputStatuses(shuffleId: Int)
@@ -232,11 +233,10 @@ private[spark] class MapOutputTrackerMaster(conf: SparkConf)
   protected val mapStatuses = new TimeStampedHashMap[Int, Array[MapStatus]]()
   private val cachedSerializedStatuses = new TimeStampedHashMap[Int, Array[Byte]]()
 
-  // For each shuffleId we also maintain a Map from reducerId -> (location, size)
+  // For each shuffleId we also maintain a Map from reducerId -> (locations with largest outputs)
   // Lazily populated whenever the statuses are requested from DAGScheduler
-  private val statusByReducer =
-    new TimeStampedHashMap[Int, HashMap[Int, Array[(BlockManagerId, Long)]]]()
-
+  private val shuffleIdToReduceLocations =
+    new TimeStampedHashMap[Int, HashMap[Int, Array[BlockManagerId]]]()
 
   // For cleaning up TimeStampedHashMaps
   private val metadataCleaner =
@@ -283,38 +283,47 @@ private[spark] class MapOutputTrackerMaster(conf: SparkConf)
   override def unregisterShuffle(shuffleId: Int) {
     mapStatuses.remove(shuffleId)
     cachedSerializedStatuses.remove(shuffleId)
-    statusByReducer.remove(shuffleId)
+    shuffleIdToReduceLocations.remove(shuffleId)
   }
 
   /** Check if the given shuffle is being tracked */
   def containsShuffle(shuffleId: Int): Boolean = {
     cachedSerializedStatuses.contains(shuffleId) || mapStatuses.contains(shuffleId)
   }
 
-  // Return the list of locations and blockSizes for each reducer.
-  // The map is keyed by reducerId and for each reducer the value contains the array
-  // of (location, size) of map outputs.
-  //
-  // This method is not thread-safe
-  def getStatusByReducer(
+  /**
+   * Return a list of locations which have the largest map outputs given a shuffleId
+   * and a reducerId.
+   *
+   * This method is not thread-safe
+   */
+  def getLocationsWithLargestOutputs(
       shuffleId: Int,
-      numReducers: Int)
-    : Option[Map[Int, Array[(BlockManagerId, Long)]]] = {
-    if (!statusByReducer.contains(shuffleId) && mapStatuses.contains(shuffleId)) {
+      reducerId: Int,
+      numReducers: Int,
+      numTopLocs: Int)
+    : Option[Array[BlockManagerId]] = {
+    if (!shuffleIdToReduceLocations.contains(shuffleId) && mapStatuses.contains(shuffleId)) {
+      // Pre-compute the top locations for each reducer and cache it
      val statuses = mapStatuses(shuffleId)
-      if (statuses.length > 0) {
-        statusByReducer(shuffleId) = new HashMap[Int, Array[(BlockManagerId, Long)]]
+      if (statuses.nonEmpty) {
+        val ordering = Ordering.by[(BlockManagerId, Long), Long](_._2).reverse
+        shuffleIdToReduceLocations(shuffleId) = new HashMap[Int, Array[BlockManagerId]]
         var r = 0
         while (r < numReducers) {
+          // Add up sizes of all blocks at the same location
           val locs = statuses.map { s =>
             (s.location, s.getSizeForBlock(r))
-          }
-          statusByReducer(shuffleId) += (r -> locs)
+          }.groupBy(_._1).mapValues { sizes =>
+            sizes.map(_._2).reduceLeft(_ + _)
+          }.toIterator
+          val topLocs = CollectionUtils.takeOrdered(locs, numTopLocs)(ordering)
+          shuffleIdToReduceLocations(shuffleId) += (r -> topLocs.map(_._1).toArray)
           r = r + 1
         }
       }
     }
-    statusByReducer.get(shuffleId)
+    shuffleIdToReduceLocations.get(shuffleId).flatMap(_.get(reducerId))
   }
 
   def incrementEpoch() {
@@ -364,7 +373,7 @@ private[spark] class MapOutputTrackerMaster(conf: SparkConf)
   private def cleanup(cleanupTime: Long) {
     mapStatuses.clearOldValues(cleanupTime)
     cachedSerializedStatuses.clearOldValues(cleanupTime)
-    statusByReducer.clearOldValues(cleanupTime)
+    shuffleIdToReduceLocations.clearOldValues(cleanupTime)
   }
 }

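The new getLocationsWithLargestOutputs groups each reducer's map-output sizes by location, sums them, and keeps only the locations with the largest totals. Below is a minimal, self-contained Scala sketch of that idea; plain strings stand in for BlockManagerId and sortBy/take stands in for CollectionUtils.takeOrdered, so it shows the shape of the computation rather than the exact Spark code.

// Sketch only: strings replace BlockManagerId; sortBy/take replaces CollectionUtils.takeOrdered.
object TopLocationsSketch {
  // outputs: one (location, size) pair per map task, for a single reducer
  def topLocations(outputs: Seq[(String, Long)], numTopLocs: Int): Seq[String] = {
    outputs
      .groupBy(_._1)                  // all blocks written at the same location
      .mapValues(_.map(_._2).sum)     // total bytes that location holds for this reducer
      .toSeq
      .sortBy(-_._2)                  // largest aggregate size first
      .take(numTopLocs)
      .map(_._1)
  }

  def main(args: Array[String]): Unit = {
    val outputs = Seq(("hostA", 3L), ("hostA", 2L), ("hostB", 4L), ("hostC", 1L))
    // totals: hostA -> 5, hostB -> 4, hostC -> 1, so the top two are hostA then hostB
    println(topLocations(outputs, numTopLocs = 2))
  }
}
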
core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala

Lines changed: 15 additions & 18 deletions
@@ -38,7 +38,6 @@ import org.apache.spark.rdd.RDD
 import org.apache.spark.storage._
 import org.apache.spark.unsafe.memory.TaskMemoryManager
 import org.apache.spark.util._
-import org.apache.spark.util.collection.{Utils => CollectionUtils}
 import org.apache.spark.storage.BlockManagerMessages.BlockManagerHeartbeat
 
 /**
@@ -139,10 +138,11 @@ class DAGScheduler(
   taskScheduler.setDAGScheduler(this)
 
   // Number of map, reduce tasks above which we do not assign preferred locations
-  // based on map output sizes.
-  private val SHUFFLE_PREF_MAP_THRESHOLD = 1000
+  // based on map output sizes. We limit the size of jobs for which assign preferred locations
+  // as sorting the locations by size becomes expensive.
+  private[this] val SHUFFLE_PREF_MAP_THRESHOLD = 1000
   // NOTE: This should be less than 2000 as we use HighlyCompressedMapStatus beyond that
-  private val SHUFFLE_PREF_REDUCE_THRESHOLD = 1000
+  private[this] val SHUFFLE_PREF_REDUCE_THRESHOLD = 1000
   // Number of preferred locations to use for reducer tasks
   private[scheduler] val NUM_REDUCER_PREF_LOCS = 5
 
@@ -1393,31 +1393,28 @@ class DAGScheduler(
     if (rddPrefs.nonEmpty) {
       return rddPrefs.map(TaskLocation(_))
     }
-    // If the RDD has narrow dependencies, pick the first partition of the first narrow dep
-    // that has any placement preferences. Ideally we would choose based on transfer sizes,
-    // but this will do for now.
+
     rdd.dependencies.foreach {
       case n: NarrowDependency[_] =>
+        // If the RDD has narrow dependencies, pick the first partition of the first narrow dep
+        // that has any placement preferences. Ideally we would choose based on transfer sizes,
+        // but this will do for now.
        for (inPart <- n.getParents(partition)) {
          val locs = getPreferredLocsInternal(n.rdd, inPart, visited)
          if (locs != Nil) {
            return locs
          }
        }
      case s: ShuffleDependency[_, _, _] =>
+        // For shuffle dependencies, pick the 5 locations with the largest map outputs as preferred
+        // locations
        if (rdd.partitions.size < SHUFFLE_PREF_REDUCE_THRESHOLD &&
            s.rdd.partitions.size < SHUFFLE_PREF_MAP_THRESHOLD) {
-          // Assign preferred locations for reducers by looking at map output location and sizes
-          val mapStatuses = mapOutputTracker.getStatusByReducer(s.shuffleId, rdd.partitions.size)
-          mapStatuses.map { status =>
-            // Get the map output locations for this reducer
-            if (status.contains(partition)) {
-              // Select first few locations as preferred locations for the reducer
-              val topLocs = CollectionUtils.takeOrdered(
-                status(partition).iterator, NUM_REDUCER_PREF_LOCS)(
-                Ordering.by[(BlockManagerId, Long), Long](_._2).reverse).toSeq
-              return topLocs.map(_._1).map(loc => TaskLocation(loc.host, loc.executorId))
-            }
+          // Get the preferred map output locations for this reducer
+          val topLocsForReducer = mapOutputTracker.getLocationsWithLargestOutputs(s.shuffleId, partition,
+            rdd.partitions.size, NUM_REDUCER_PREF_LOCS)
+          if (topLocsForReducer.nonEmpty) {
+            return topLocsForReducer.get.map(loc => TaskLocation(loc.host, loc.executorId))
          }
        }

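In the DAGScheduler change, the shuffle-dependency branch only consults the tracker when both sides of the shuffle are below the thresholds, then converts the returned block-manager locations into task locations. A hedged sketch of that control flow follows; Location and TaskLocation are hypothetical stand-ins for Spark's BlockManagerId and TaskLocation, and the lookup parameter stands in for getLocationsWithLargestOutputs.

// Sketch only: stand-in types and a stand-in lookup function, not Spark's classes.
object ReducerPrefLocsSketch {
  case class Location(host: String, executorId: String)      // stand-in for BlockManagerId
  case class TaskLocation(host: String, executorId: String)  // stand-in for Spark's TaskLocation

  val SHUFFLE_PREF_MAP_THRESHOLD = 1000
  val SHUFFLE_PREF_REDUCE_THRESHOLD = 1000
  val NUM_REDUCER_PREF_LOCS = 5

  // `lookup` stands in for MapOutputTrackerMaster.getLocationsWithLargestOutputs for one shuffle
  def preferredLocs(
      numMapPartitions: Int,
      numReducePartitions: Int,
      reducerId: Int,
      lookup: (Int, Int) => Option[Array[Location]]): Seq[TaskLocation] = {
    // Skip the lookup for large shuffles, where ranking every location by size gets expensive.
    if (numReducePartitions < SHUFFLE_PREF_REDUCE_THRESHOLD &&
        numMapPartitions < SHUFFLE_PREF_MAP_THRESHOLD) {
      lookup(reducerId, NUM_REDUCER_PREF_LOCS)
        .map(_.toSeq.map(loc => TaskLocation(loc.host, loc.executorId)))
        .getOrElse(Nil)
    } else {
      Nil
    }
  }

  def main(args: Array[String]): Unit = {
    val fakeLookup = (reducerId: Int, numTopLocs: Int) =>
      Option(Array(Location("hostA", "1"), Location("hostB", "2")).take(numTopLocs))
    println(preferredLocs(numMapPartitions = 10, numReducePartitions = 4, reducerId = 0, fakeLookup))
  }
}
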
core/src/test/scala/org/apache/spark/MapOutputTrackerSuite.scala

Lines changed: 25 additions & 0 deletions
@@ -205,4 +205,29 @@ class MapOutputTrackerSuite extends SparkFunSuite {
     // masterTracker.stop() // this throws an exception
     rpcEnv.shutdown()
   }
+
+  test("getLocationsWithLargestOutputs with multiple outputs in same machine") {
+    val rpcEnv = createRpcEnv("test")
+    val tracker = new MapOutputTrackerMaster(conf)
+    tracker.trackerEndpoint = rpcEnv.setupEndpoint(MapOutputTracker.ENDPOINT_NAME,
+      new MapOutputTrackerMasterEndpoint(rpcEnv, tracker, conf))
+    // Setup 3 map tasks
+    // on hostA with output size 1
+    // on hostA with output size 1
+    // on hostB with output size 2
+    tracker.registerShuffle(10, 3)
+    tracker.registerMapOutput(10, 0, MapStatus(BlockManagerId("a", "hostA", 1000),
+      Array(1L)))
+    tracker.registerMapOutput(10, 1, MapStatus(BlockManagerId("a", "hostA", 1000),
+      Array(1L)))
+    tracker.registerMapOutput(10, 2, MapStatus(BlockManagerId("b", "hostB", 1000),
+      Array(2L)))
+
+    val topLocs = tracker.getLocationsWithLargestOutputs(10, 0, 1, 1)
+    assert(topLocs.nonEmpty)
+    assert(topLocs.get.size === 1)
+    assert(topLocs.get.head === BlockManagerId("a", "hostA", 1000))
+    tracker.stop()
+    rpcEnv.shutdown()
+  }
 }

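The new test registers two size-1 blocks on hostA and one size-2 block on hostB, so only after per-location aggregation does hostA's total (1 + 1 = 2) catch up with hostB's 2; taken block by block, each hostA output would rank below hostB's. A tiny script-style sketch of that arithmetic, with plain strings standing in for BlockManagerId:

// Sketch of the aggregation applied to the test data above (strings replace BlockManagerId).
val blockSizes = Seq(("hostA", 1L), ("hostA", 1L), ("hostB", 2L))
val totalsByHost = blockSizes.groupBy(_._1).mapValues(_.map(_._2).sum)
println(totalsByHost)  // hostA -> 2, hostB -> 2: both hosts now hold two bytes in total

The assertion then expects hostA, the host with multiple map outputs, to come back as the single top location for reducer 0.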