-
Notifications
You must be signed in to change notification settings - Fork 7k
[core] Make CancelTask RPC Fault Tolerant #58018
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 24 commits
f8150c0
0a630a7
8ae4e3a
a733422
8a2e428
901099d
7d4ab2e
9070db5
dcec398
9df37aa
d846b90
873a17c
430a4a6
a253c81
9d5cf6f
d0fddda
c8e0ed6
49250fb
3dbcc22
73445ab
f737eef
0429f79
2f9c24e
bed7884
2a66834
0fb240e
6bab852
c1f1e0f
758ecd6
22a53f5
e053c2f
6b2674b
07da167
41fa586
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -21,6 +21,7 @@ | |
| #include <vector> | ||
|
|
||
| #include "ray/common/protobuf_utils.h" | ||
| #include "ray/core_worker/task_submission/task_submission_util.h" | ||
| #include "ray/util/time.h" | ||
|
|
||
| namespace ray { | ||
|
|
@@ -912,17 +913,16 @@ std::string ActorTaskSubmitter::DebugString(const ActorID &actor_id) const { | |
| return stream.str(); | ||
| } | ||
|
|
||
| void ActorTaskSubmitter::RetryCancelTask(TaskSpecification task_spec, | ||
| bool recursive, | ||
| int64_t milliseconds) { | ||
| void ActorTaskSubmitter::RetryCancelTask(TaskSpecification task_spec, bool recursive) { | ||
| auto delay_ms = RayConfig::instance().cancellation_retry_ms(); | ||
| RAY_LOG(DEBUG).WithField(task_spec.TaskId()) | ||
| << "Task cancelation will be retried in " << milliseconds << " ms"; | ||
| << "Task cancelation will be retried in " << delay_ms << " ms"; | ||
| execute_after( | ||
| io_service_, | ||
| [this, task_spec = std::move(task_spec), recursive] { | ||
| CancelTask(task_spec, recursive); | ||
| }, | ||
| std::chrono::milliseconds(milliseconds)); | ||
| std::chrono::milliseconds(delay_ms)); | ||
| } | ||
|
|
||
| void ActorTaskSubmitter::CancelTask(TaskSpecification task_spec, bool recursive) { | ||
|
|
@@ -997,44 +997,64 @@ void ActorTaskSubmitter::CancelTask(TaskSpecification task_spec, bool recursive) | |
| // an executor tells us to stop retrying. | ||
|
|
||
| // If there's no client, it means the actor has not been created yet. | ||
| // Retry in 1 second. | ||
| // Retry after the configured delay. | ||
| NodeID node_id; | ||
| std::string executor_worker_id; | ||
| { | ||
| absl::MutexLock lock(&mu_); | ||
| RAY_LOG(DEBUG).WithField(task_id) << "Task was sent to an actor. Send a cancel RPC."; | ||
| auto queue = client_queues_.find(actor_id); | ||
| RAY_CHECK(queue != client_queues_.end()); | ||
| if (!queue->second.client_address_.has_value()) { | ||
| RetryCancelTask(task_spec, recursive, 1000); | ||
| RetryCancelTask(task_spec, recursive); | ||
| return; | ||
| } | ||
|
|
||
| rpc::CancelTaskRequest request; | ||
| request.set_intended_task_id(task_spec.TaskIdBinary()); | ||
| request.set_force_kill(force_kill); | ||
| request.set_recursive(recursive); | ||
| request.set_caller_worker_id(task_spec.CallerWorkerIdBinary()); | ||
| auto client = core_worker_client_pool_.GetOrConnect(*queue->second.client_address_); | ||
| client->CancelTask(request, | ||
| [this, task_spec = std::move(task_spec), recursive, task_id]( | ||
| const Status &status, const rpc::CancelTaskReply &reply) { | ||
| RAY_LOG(DEBUG).WithField(task_spec.TaskId()) | ||
| << "CancelTask RPC response received with status " | ||
| << status.ToString(); | ||
|
|
||
| // Keep retrying every 2 seconds until a task is officially | ||
| // finished. | ||
| if (!task_manager_.GetTaskSpec(task_id)) { | ||
| // Task is already finished. | ||
| RAY_LOG(DEBUG).WithField(task_spec.TaskId()) | ||
| << "Task is finished. Stop a cancel request."; | ||
| return; | ||
| } | ||
|
|
||
| if (!reply.attempt_succeeded()) { | ||
| RetryCancelTask(task_spec, recursive, 2000); | ||
| } | ||
| }); | ||
| node_id = NodeID::FromBinary(queue->second.client_address_.value().node_id()); | ||
| executor_worker_id = queue->second.client_address_.value().worker_id(); | ||
| } | ||
|
|
||
| auto do_cancel_local_task = | ||
| [this, task_spec = std::move(task_spec), force_kill, recursive, executor_worker_id]( | ||
| const rpc::Address &raylet_address) mutable { | ||
| rpc::CancelLocalTaskRequest request; | ||
| request.set_intended_task_id(task_spec.TaskIdBinary()); | ||
| request.set_force_kill(force_kill); | ||
| request.set_recursive(recursive); | ||
| request.set_caller_worker_id(task_spec.CallerWorkerIdBinary()); | ||
| request.set_executor_worker_id(executor_worker_id); | ||
|
|
||
| auto raylet_client = raylet_client_pool_.GetOrConnectByAddress(raylet_address); | ||
| raylet_client->CancelLocalTask( | ||
| request, | ||
| [this, task_spec = std::move(task_spec), recursive]( | ||
| const Status &status, const rpc::CancelLocalTaskReply &reply) mutable { | ||
| if (!status.ok()) { | ||
| RAY_LOG(INFO) << "CancelLocalTask RPC failed for task " | ||
| << task_spec.TaskId() << ": " << status.ToString() | ||
| << " due to node death"; | ||
| return; | ||
| } else { | ||
| RAY_LOG(INFO) << "CancelLocalTask RPC response received for " | ||
| << task_spec.TaskId() | ||
| << " with attempt_succeeded: " << reply.attempt_succeeded() | ||
| << " requested_task_running: " | ||
| << reply.requested_task_running(); | ||
| } | ||
| // Keep retrying until a task is officially finished. | ||
| if (!reply.attempt_succeeded()) { | ||
Sparks0219 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| RetryCancelTask(std::move(task_spec), recursive); | ||
| } | ||
| }); | ||
| }; | ||
|
|
||
| // Cancel can execute on the user's Python thread, but the GCS node cache is updated on | ||
| // the io service thread and is not thread-safe. Hence we need to post the entire | ||
| // cache access to the io service thread. | ||
| io_service_.post( | ||
| [this, node_id, do_cancel_local_task = std::move(do_cancel_local_task)]() mutable { | ||
| SendCancelLocalTask(gcs_client_, node_id, std::move(do_cancel_local_task)); | ||
| }, | ||
| "ActorTaskSubmitter.CancelTask"); | ||
|
||
| } | ||
|
|
||
| bool ActorTaskSubmitter::QueueGeneratorForResubmit(const TaskSpecification &spec) { | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.