[core] Make KillActor RPC Fault Tolerant #57648
@@ -228,7 +228,7 @@ void GcsActorScheduler::CancelOnLeasing(const NodeID &node_id,
   address.set_node_id(node_info->node_id());
   address.set_ip_address(node_info->node_manager_address());
   address.set_port(node_info->node_manager_port());
-  auto raylet_client = GetOrConnectRayletClient(address);
+  auto raylet_client = raylet_client_pool_.GetOrConnectByAddress(address);
   raylet_client->CancelWorkerLease(
       lease_id, [](const Status &status, const rpc::CancelWorkerLeaseReply &reply) {});
 }
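The same one-line substitution repeats in the next two hunks. It is mechanical because the helper being dropped was only a thin pass-through to the shared raylet client pool; its definition, removed further down in this diff, reads:

// Removed later in this change: a pass-through wrapper around the pool lookup,
// so every former call site can talk to raylet_client_pool_ directly.
std::shared_ptr<RayletClientInterface> GcsActorScheduler::GetOrConnectRayletClient(
    const rpc::Address &raylet_address) {
  return raylet_client_pool_.GetOrConnectByAddress(raylet_address);
}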
@@ -270,7 +270,7 @@ void GcsActorScheduler::ReleaseUnusedActorWorkers(
     address.set_node_id(alive_node.second->node_id());
     address.set_ip_address(alive_node.second->node_manager_address());
     address.set_port(alive_node.second->node_manager_port());
-    auto raylet_client = GetOrConnectRayletClient(address);
+    auto raylet_client = raylet_client_pool_.GetOrConnectByAddress(address);
     auto release_unused_workers_callback =
         [this, node_id](const Status &status,
                         const rpc::ReleaseUnusedActorWorkersReply &reply) {
@@ -309,7 +309,7 @@ void GcsActorScheduler::LeaseWorkerFromNode(
   remote_address.set_node_id(node->node_id());
   remote_address.set_ip_address(node->node_manager_address());
   remote_address.set_port(node->node_manager_port());
-  auto raylet_client = GetOrConnectRayletClient(remote_address);
+  auto raylet_client = raylet_client_pool_.GetOrConnectByAddress(remote_address);
   // Actor leases should be sent to the raylet immediately, so we should never build up a
   // backlog in GCS.
   // Counter for generating unique lease IDs.
@@ -350,7 +350,9 @@ void GcsActorScheduler::DoRetryLeasingWorkerFromNode(
 }

 void GcsActorScheduler::HandleWorkerLeaseGrantedReply(
-    std::shared_ptr<GcsActor> actor, const ray::rpc::RequestWorkerLeaseReply &reply) {
+    std::shared_ptr<GcsActor> actor,
+    const ray::rpc::RequestWorkerLeaseReply &reply,
+    std::shared_ptr<const rpc::GcsNodeInfo> node) {
   const auto &retry_at_raylet_address = reply.retry_at_raylet_address();
   const auto &worker_address = reply.worker_address();
   if (worker_address.node_id().empty()) {
@@ -390,6 +392,11 @@ void GcsActorScheduler::HandleWorkerLeaseGrantedReply(
   RAY_CHECK(node_to_workers_when_creating_[node_id]
                 .emplace(leased_worker->GetWorkerID(), leased_worker)
                 .second);
+  rpc::Address actor_local_raylet_address;
+  actor_local_raylet_address.set_node_id(node->node_id());
+  actor_local_raylet_address.set_ip_address(node->node_manager_address());
+  actor_local_raylet_address.set_port(node->node_manager_port());
+  actor->UpdateLocalRayletAddress(actor_local_raylet_address);
Review comment (Contributor): On actor death in between restarts, there's probably a point where the actor doesn't have a local raylet. local_raylet_address should probably be an optional, and we shouldn't make the RPC if it's nullopt.

Reply (Author): Ahh, good catch, thanks. Yup, I modified it so that we don't make the RPC if it's nullopt.
   actor->UpdateAddress(leased_worker->GetAddress());
   actor->GetMutableActorTableData()->set_pid(reply.worker_pid());
   actor->GetMutableTaskSpec()->set_lease_grant_timestamp_ms(current_sys_time_ms());
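The LocalRayletAddress() / UpdateLocalRayletAddress() pair used in this change is not itself shown in the diff; per the review thread above, the address is kept optional so the kill RPC can be skipped while a restarting actor has no local raylet. A minimal sketch of what that could look like on GcsActor, where the member name and exact types are assumptions (only the two method names appear in this change):

#include <optional>

// Hypothetical sketch only; the real GcsActor carries much more state. The local
// raylet address is set when a lease is granted and is nullopt in between, e.g.
// while an actor is restarting, in which case callers skip KillActorOnWorker.
class GcsActorSketch {
 public:
  void UpdateLocalRayletAddress(const rpc::Address &address) {
    local_raylet_address_ = address;
  }

  const std::optional<rpc::Address> &LocalRayletAddress() const {
    return local_raylet_address_;
  }

 private:
  std::optional<rpc::Address> local_raylet_address_;
};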
@@ -491,7 +498,10 @@ void GcsActorScheduler::CreateActorOnWorker(std::shared_ptr<GcsActor> actor,
                            << " has been removed from creating map. Actor status "
                            << actor->GetState();
             auto actor_id = status.ok() ? actor->GetActorID() : ActorID::Nil();
-            KillActorOnWorker(worker->GetAddress(), actor_id);
+            if (actor->LocalRayletAddress().has_value()) {
+              KillActorOnWorker(
+                  actor->LocalRayletAddress().value(), worker->GetAddress(), actor_id);
+            }
           }
         });
 }
@@ -521,27 +531,27 @@ void GcsActorScheduler::DoRetryCreatingActorOnWorker(
   }
 }

-std::shared_ptr<RayletClientInterface> GcsActorScheduler::GetOrConnectRayletClient(
-    const rpc::Address &raylet_address) {
-  return raylet_client_pool_.GetOrConnectByAddress(raylet_address);
-}
-
-bool GcsActorScheduler::KillActorOnWorker(const rpc::Address &worker_address,
+bool GcsActorScheduler::KillActorOnWorker(const rpc::Address &raylet_address,
+                                          const rpc::Address &worker_address,
                                           ActorID actor_id) {
-  if (worker_address.node_id().empty()) {
-    RAY_LOG(DEBUG) << "Invalid worker address, skip the killing of actor " << actor_id;
+  if (raylet_address.node_id().empty() || worker_address.node_id().empty()) {
+    RAY_LOG(DEBUG) << "Invalid raylet or worker address, skip the killing of actor "
+                   << actor_id;
     return false;
   }

-  auto cli = worker_client_pool_.GetOrConnect(worker_address);
-  rpc::KillActorRequest request;
+  auto raylet_client = raylet_client_pool_.GetOrConnectByAddress(raylet_address);
+  rpc::KillLocalActorRequest request;
   // Set it to be Nil() since it hasn't been setup yet.
   request.set_intended_actor_id(actor_id.Binary());
   request.set_worker_id(worker_address.worker_id());
   request.set_force_kill(true);
-  cli->KillActor(request, [actor_id](auto &status, auto &&) {
-    RAY_LOG(DEBUG) << "Killing actor " << actor_id
-                   << " with return status: " << status.ToString();
-  });
+  raylet_client->KillLocalActor(
+      request, [actor_id](const Status &status, const rpc::KillLocalActorReply &) {
+        RAY_LOG(DEBUG) << "Killing actor " << actor_id
+                       << " with return status: " << status.ToString();
+      });
   return true;
 }
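Taken together, this rewrite changes which process the GCS dials for a kill: previously the GCS sent KillActor straight to the worker over a core worker client; now it asks the raylet co-located with that worker to perform the kill via KillLocalActor, in line with the PR title's fault-tolerance goal (the raylet-side handling is not part of this diff). A condensed before/after sketch, using only the calls visible above and assuming the surrounding GCS scheduler members:

// Before: GCS -> worker directly, via the core worker client pool.
auto worker_client = worker_client_pool_.GetOrConnect(worker_address);
rpc::KillActorRequest old_request;
old_request.set_intended_actor_id(actor_id.Binary());
old_request.set_worker_id(worker_address.worker_id());
old_request.set_force_kill(true);
worker_client->KillActor(old_request, [](auto &status, auto &&) {});

// After: GCS -> raylet on the worker's node, addressed by raylet_address.
auto raylet_client = raylet_client_pool_.GetOrConnectByAddress(raylet_address);
rpc::KillLocalActorRequest new_request;
new_request.set_intended_actor_id(actor_id.Binary());
new_request.set_worker_id(worker_address.worker_id());
new_request.set_force_kill(true);
raylet_client->KillLocalActor(
    new_request, [](const Status &status, const rpc::KillLocalActorReply &) {});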
@@ -579,7 +589,9 @@ void GcsActorScheduler::HandleWorkerLeaseReply(
       // If the actor has been killed, we need to kill the worker too
       // otherwise, the worker will be leaked.
       RAY_LOG(DEBUG) << "Actor " << actor->GetActorID() << " is dead, kill the worker.";
-      KillActorOnWorker(reply.worker_address(), ActorID::Nil());
+      auto raylet_address = rpc::RayletClientPool::GenerateRayletAddress(
+          node_id, node->node_manager_address(), node->node_manager_port());
+      KillActorOnWorker(raylet_address, reply.worker_address(), ActorID::Nil());
     }
     return;
   }
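rpc::RayletClientPool::GenerateRayletAddress is not defined in this diff. Given how raylet addresses are assembled by hand elsewhere in this file (node id, node manager IP, node manager port), it presumably amounts to something like the helper below; the name MakeRayletAddress and the exact signature are placeholders, not the real static method:

// Hypothetical stand-in mirroring the manual set_node_id / set_ip_address /
// set_port pattern used in the earlier hunks of this file.
rpc::Address MakeRayletAddress(const NodeID &node_id,
                               const std::string &ip_address,
                               int port) {
  rpc::Address address;
  address.set_node_id(node_id.Binary());
  address.set_ip_address(ip_address);
  address.set_port(port);
  return address;
}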
@@ -621,7 +633,7 @@ void GcsActorScheduler::HandleWorkerLeaseReply(
         RAY_LOG(INFO) << "Finished leasing worker from " << node_id << " for actor "
                       << actor->GetActorID()
                       << ", job id = " << actor->GetActorID().JobId();
-        HandleWorkerLeaseGrantedReply(actor, reply);
+        HandleWorkerLeaseGrantedReply(actor, reply, node);
       }
     } else {
       RetryLeasingWorkerFromNode(actor, node);
@@ -630,7 +642,9 @@ void GcsActorScheduler::HandleWorkerLeaseReply(
     // If the actor has been killed, we need to kill the worker too
     // otherwise, the worker will be leaked.
     RAY_LOG(DEBUG) << "Actor " << actor->GetActorID() << " is dead, kill the worker.";
-    KillActorOnWorker(reply.worker_address(), ActorID::Nil());
+    auto raylet_address = rpc::RayletClientPool::GenerateRayletAddress(
+        node_id, node->node_manager_address(), node->node_manager_port());
+    KillActorOnWorker(raylet_address, reply.worker_address(), ActorID::Nil());
   }
 }