
Commit 68e2c72

Made replication peer selection logic more efficient.

1 parent 08afaa9 · commit 68e2c72

2 files changed: +60 −18 lines


core/src/main/scala/org/apache/spark/storage/BlockManager.scala

Lines changed: 43 additions & 18 deletions
@@ -22,7 +22,8 @@ import java.nio.{ByteBuffer, MappedByteBuffer}
 
 import scala.concurrent.ExecutionContext.Implicits.global
 
-import scala.collection.mutable.{ArrayBuffer, HashMap, HashSet}
+import scala.collection.mutable
+import scala.collection.mutable.{ArrayBuffer, HashMap}
 import scala.concurrent.{Await, Future}
 import scala.concurrent.duration._
 import scala.util.Random
@@ -111,7 +112,7 @@ private[spark] class BlockManager(
     MetadataCleanerType.BLOCK_MANAGER, this.dropOldNonBroadcastBlocks, conf)
   private val broadcastCleaner = new MetadataCleaner(
     MetadataCleanerType.BROADCAST_VARS, this.dropOldBroadcastBlocks, conf)
-  private val cachedPeers = new HashSet[BlockManagerId]
+  private val cachedPeers = new mutable.HashSet[BlockManagerId]
   private var lastPeerFetchTime = 0L
 
   initialize()
@@ -791,11 +792,10 @@ private[spark] class BlockManager(
   /**
    * Get peer block managers in the system.
    */
-  private def getPeers(forceFetch: Boolean): HashSet[BlockManagerId] = {
-    val cachedPeersTtl = conf.getInt("spark.storage.cachedPeersTtl", 60 * 1000) // milliseconds
-    val timeout = System.currentTimeMillis - lastPeerFetchTime > cachedPeersTtl
-
+  private def getPeers(forceFetch: Boolean): mutable.HashSet[BlockManagerId] = {
     cachedPeers.synchronized {
+      val cachedPeersTtl = conf.getInt("spark.storage.cachedPeersTtl", 60 * 1000) // milliseconds
+      val timeout = System.currentTimeMillis - lastPeerFetchTime > cachedPeersTtl
       if (cachedPeers.isEmpty || forceFetch || timeout) {
         cachedPeers.clear()
         cachedPeers ++= master.getPeers(blockManagerId).sortBy(_.hashCode)
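
For readers skimming the hunk above: getPeers now performs the TTL check inside the cachedPeers.synchronized block, so the fetch-or-reuse decision and the cache update happen under one lock. A minimal, self-contained sketch of the same pattern follows; PeerCache, fetchFromMaster, and the String peer ids are illustrative stand-ins, not Spark API.

import scala.collection.mutable

// Minimal sketch of the TTL-based cache behind getPeers(forceFetch).
// `fetchFromMaster` stands in for master.getPeers(blockManagerId).
class PeerCache(ttlMs: Long, fetchFromMaster: () => Seq[String]) {
  private val cachedPeers = new mutable.HashSet[String]
  private var lastFetchTime = 0L

  def get(forceFetch: Boolean): mutable.HashSet[String] = cachedPeers.synchronized {
    val expired = System.currentTimeMillis - lastFetchTime > ttlMs
    if (cachedPeers.isEmpty || forceFetch || expired) {
      cachedPeers.clear()
      cachedPeers ++= fetchFromMaster().sortBy(_.hashCode) // mirrors the sortBy in the commit
      lastFetchTime = System.currentTimeMillis
    }
    cachedPeers
  }
}

In the commit itself the set holds BlockManagerId values and the TTL comes from spark.storage.cachedPeersTtl, which defaults to 60 seconds.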
@@ -812,27 +812,52 @@
   private def replicate(blockId: BlockId, data: ByteBuffer, level: StorageLevel): Unit = {
     val maxReplicationFailures = conf.getInt("spark.storage.maxReplicationFailures", 1)
     val numPeersToReplicateTo = level.replication - 1
-    val peersReplicatedTo = new HashSet[BlockManagerId]
-    val peersFailedToReplicateTo = new HashSet[BlockManagerId]
+    val peersForReplication = new ArrayBuffer[BlockManagerId]
+    val peersReplicatedTo = new ArrayBuffer[BlockManagerId]
+    val peersFailedToReplicateTo = new ArrayBuffer[BlockManagerId]
     val tLevel = StorageLevel(
       level.useDisk, level.useMemory, level.useOffHeap, level.deserialized, 1)
     val startTime = System.nanoTime
     val random = new Random(blockId.hashCode)
 
-    var forceFetchPeers = false
+    var replicationFailed = false
     var failures = 0
     var done = false
 
-    // Get a random peer
+    // Get cached list of peers
+    peersForReplication ++= getPeers(forceFetch = false)
+
+    // Get a random peer. Note that this selection of a peer is deterministic on the block id.
+    // So assuming the list of peers does not change and no replication failures,
+    // if there are multiple attempts in the same node to replicate the same block,
+    // the same set of peers will be selected.
     def getRandomPeer(): Option[BlockManagerId] = {
-      val peers = getPeers(forceFetchPeers) -- peersReplicatedTo -- peersFailedToReplicateTo
-      if (!peers.isEmpty) Some(peers.toSeq(random.nextInt(peers.size))) else None
+      // If replication had failed, then force update the cached list of peers and remove the peers
+      // that have been already used
+      if (replicationFailed) {
+        peersForReplication.clear()
+        peersForReplication ++= getPeers(forceFetch = true)
+        peersForReplication --= peersReplicatedTo
+        peersForReplication --= peersFailedToReplicateTo
+      }
+      if (!peersForReplication.isEmpty) {
+        Some(peersForReplication(random.nextInt(peersForReplication.size)))
+      } else {
+        None
+      }
     }
 
     // One by one choose a random peer and try uploading the block to it
     // If replication fails (e.g., target peer is down), force the list of cached peers
     // to be re-fetched from driver and then pick another random peer for replication. Also
     // temporarily black list the peer for which replication failed.
+    //
+    // This selection of a peer and replication is continued in a loop until one of the
+    // following 3 conditions is fulfilled:
+    // (i) specified number of peers have been replicated to
+    // (ii) too many failures in replicating to peers
+    // (iii) no peer left to replicate to
+    //
     while (!done) {
       getRandomPeer() match {
         case Some(peer) =>
@@ -845,22 +870,22 @@
             logTrace(s"Replicated $blockId of ${data.limit()} bytes to $peer in %f ms"
               .format((System.nanoTime - onePeerStartTime) / 1e6))
             peersReplicatedTo += peer
-            forceFetchPeers = false
+            peersForReplication -= peer
+            replicationFailed = false
             if (peersReplicatedTo.size == numPeersToReplicateTo) {
-              done = true
+              done = true // specified number of peers have been replicated to
             }
           } catch {
             case e: Exception =>
               logWarning(s"Failed to replicate $blockId to $peer, failure #$failures", e)
               failures += 1
-              forceFetchPeers = true
+              replicationFailed = true
               peersFailedToReplicateTo += peer
-              if (failures > maxReplicationFailures) {
+              if (failures > maxReplicationFailures) { // too many failures in replicating to peers
                 done = true
               }
           }
-        case None =>
-          // no peer left to replicate to
+        case None => // no peer left to replicate to
           done = true
       }
     }
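
The determinism called out in the new comments comes from seeding Random with blockId.hashCode and indexing into a buffer of peers: repeated attempts to replicate the same block over an unchanged peer list pick the same peers, and a 2x pick is a prefix of the corresponding 3x pick. A rough, self-contained model of the success path only; selectPeers and the "bm-*" names are made up for the example (real peers are BlockManagerId values).

import scala.collection.mutable.ArrayBuffer
import scala.util.Random

// Rough model of the success path of replicate(): draw peers with a Random
// seeded on the block id, removing each chosen peer from the candidate list.
def selectPeers(blockId: String, peers: Seq[String], numPeers: Int): Seq[String] = {
  val candidates = new ArrayBuffer[String]
  candidates ++= peers
  val random = new Random(blockId.hashCode) // same seeding idea as in replicate()
  val chosen = new ArrayBuffer[String]
  while (chosen.size < numPeers && candidates.nonEmpty) {
    val peer = candidates(random.nextInt(candidates.size))
    candidates -= peer // mirrors `peersForReplication -= peer` after a successful upload
    chosen += peer
  }
  chosen.toSeq
}

val peers = Seq("bm-1", "bm-2", "bm-3", "bm-4")
// Same block, same peer list => same selection (what the suite's determinism test checks).
assert(selectPeers("a1", peers, 2) == selectPeers("a1", peers, 2))
// Fewer replicas select a prefix of the larger selection, which is why the
// subset/superset assertions in the test diff below are expected to hold.
assert(selectPeers("a2", peers, 3).startsWith(selectPeers("a2", peers, 2)))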

core/src/test/scala/org/apache/spark/storage/BlockManagerSuite.scala

Lines changed: 17 additions & 0 deletions
@@ -1316,6 +1316,13 @@ class BlockManagerSuite extends FunSuite with Matchers with BeforeAndAfter
     testReplication(5, storageLevels)
   }
 
+  test("block replication - 2x replication without peers") {
+    intercept[org.scalatest.exceptions.TestFailedException] {
+      testReplication(1,
+        Seq(StorageLevel.MEMORY_AND_DISK_2, StorageLevel(true, false, false, false, 3)))
+    }
+  }
+
   test("block replication - deterministic node selection") {
     val blockSize = 1000
     val storeSize = 10000
@@ -1332,25 +1339,35 @@ class BlockManagerSuite extends FunSuite with Matchers with BeforeAndAfter
       locations
     }
 
+    // Test if two attempts to 2x replication returns same set of locations
     val a1Locs = putBlockAndGetLocations("a1", storageLevel2x)
     assert(putBlockAndGetLocations("a1", storageLevel2x) === a1Locs,
       "Inserting a 2x replicated block second time gave different locations from the first")
 
+    // Test if two attempts to 3x replication returns same set of locations
     val a2Locs3x = putBlockAndGetLocations("a2", storageLevel3x)
     assert(putBlockAndGetLocations("a2", storageLevel3x) === a2Locs3x,
       "Inserting a 3x replicated block second time gave different locations from the first")
+
+    // Test if 2x replication of a2 returns a strict subset of the locations of 3x replication
     val a2Locs2x = putBlockAndGetLocations("a2", storageLevel2x)
     assert(
       a2Locs2x.subsetOf(a2Locs3x),
       "Inserting a with 2x replication gave locations that are not a subset of locations" +
       s" with 3x replication [3x: ${a2Locs3x.mkString(",")}; 2x: ${a2Locs2x.mkString(",")}"
     )
+
+    // Test if 4x replication of a2 returns a strict superset of the locations of 3x replication
     val a2Locs4x = putBlockAndGetLocations("a2", storageLevel4x)
     assert(
       a2Locs3x.subsetOf(a2Locs4x),
       "Inserting a with 4x replication gave locations that are not a superset of locations " +
       s"with 3x replication [3x: ${a2Locs3x.mkString(",")}; 4x: ${a2Locs4x.mkString(",")}"
     )
+
+    // Test if 3x replication of two different blocks gives two different sets of locations
+    val a3Locs3x = putBlockAndGetLocations("a3", storageLevel3x)
+    assert(a3Locs3x !== a2Locs3x, "Two blocks gave same locations with 3x replication")
   }
 
   test("block replication - addition and deletion of block managers") {
