-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-6640][Core] Fix the race condition of creating HeartbeatReceiver and retrieving HeartbeatReceiver #5306
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 2 commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -37,16 +37,20 @@ private[spark] case class Heartbeat( | |
| taskMetrics: Array[(Long, TaskMetrics)], // taskId -> TaskMetrics | ||
| blockManagerId: BlockManagerId) | ||
|
|
||
| private[spark] case class RegisterTaskScheduler(scheduler: TaskScheduler) | ||
|
|
||
| private[spark] case object ExpireDeadHosts | ||
|
|
||
| private[spark] case class HeartbeatResponse(reregisterBlockManager: Boolean) | ||
|
|
||
| /** | ||
| * Lives in the driver to receive heartbeats from executors. | ||
| */ | ||
| private[spark] class HeartbeatReceiver(sc: SparkContext, scheduler: TaskScheduler) | ||
| private[spark] class HeartbeatReceiver(sc: SparkContext) | ||
| extends Actor with ActorLogReceive with Logging { | ||
|
|
||
| private var scheduler: TaskScheduler = null | ||
|
|
||
| // executor ID -> timestamp of when the last heartbeat from this executor was received | ||
| private val executorLastSeen = new mutable.HashMap[String, Long] | ||
|
|
||
|
|
@@ -71,12 +75,22 @@ private[spark] class HeartbeatReceiver(sc: SparkContext, scheduler: TaskSchedule | |
| } | ||
|
|
||
| override def receiveWithLogging: PartialFunction[Any, Unit] = { | ||
| case Heartbeat(executorId, taskMetrics, blockManagerId) => | ||
| val unknownExecutor = !scheduler.executorHeartbeatReceived( | ||
| executorId, taskMetrics, blockManagerId) | ||
| val response = HeartbeatResponse(reregisterBlockManager = unknownExecutor) | ||
| executorLastSeen(executorId) = System.currentTimeMillis() | ||
| sender ! response | ||
| case RegisterTaskScheduler(scheduler) => | ||
| this.scheduler = scheduler | ||
| case heartbeat @ Heartbeat(executorId, taskMetrics, blockManagerId) => | ||
| if (scheduler != null) { | ||
| val unknownExecutor = !scheduler.executorHeartbeatReceived( | ||
| executorId, taskMetrics, blockManagerId) | ||
| val response = HeartbeatResponse(reregisterBlockManager = unknownExecutor) | ||
| executorLastSeen(executorId) = System.currentTimeMillis() | ||
| sender ! response | ||
| } else { | ||
| // Because Executor will sleep several seconds then send the first "Heartbeat", this case | ||
| // rarely happens. However, if it really happens, log it and ask the executor to register | ||
| // itself again. | ||
| logWarning(s"Dropping $heartbeat because TaskScheduler has not been ready yet") | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. is not ready yet |
||
| sender ! HeartbeatResponse(reregisterBlockManager = true) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why the need for reregistering vs. just ignoring the heartbeat?
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. For |
||
| } | ||
| case ExpireDeadHosts => | ||
| expireDeadHosts() | ||
| } | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -356,11 +356,17 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli | |
| val sparkUser = Utils.getCurrentUserName() | ||
| executorEnvs("SPARK_USER") = sparkUser | ||
|
|
||
| // We need to register "HeartbeatReceiver" before "createTaskScheduler" because Executor will | ||
| // retrieve "HeartbeatReceiver" in the constructor. | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Great, mind adding |
||
| private val heartbeatReceiver = env.actorSystem.actorOf( | ||
| Props(new HeartbeatReceiver(this)), "HeartbeatReceiver") | ||
|
|
||
| // Create and start the scheduler | ||
| private[spark] var (schedulerBackend, taskScheduler) = | ||
| SparkContext.createTaskScheduler(this, master) | ||
| private val heartbeatReceiver = env.actorSystem.actorOf( | ||
| Props(new HeartbeatReceiver(this, taskScheduler)), "HeartbeatReceiver") | ||
|
|
||
| heartbeatReceiver ! RegisterTaskScheduler(taskScheduler) | ||
|
|
||
| @volatile private[spark] var dagScheduler: DAGScheduler = _ | ||
| try { | ||
| dagScheduler = new DAGScheduler(this) | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
sleep several seconds before sending