@@ -545,9 +545,8 @@ void NodeManager::HandleJobFinished(const JobID &job_id, const JobTableData &job
545545 (worker->GetAssignedJobId () == job_id)) {
546546 // Don't kill worker processes belonging to the detached actor
547547 // since those are expected to outlive the job.
548- RAY_LOG (INFO).WithField (worker->WorkerId ())
549- << " The leased worker "
550- << " is killed because the job " << job_id << " finished." ;
548+ RAY_LOG (INFO).WithField (worker->WorkerId ()).WithField (job_id)
549+ << " Killing leased worker because its job finished." ;
551550 rpc::ExitRequest request;
552551 request.set_force_exit (true );
553552 worker->rpc_client ()->Exit (
@@ -948,7 +947,7 @@ void NodeManager::NodeRemoved(const NodeID &node_id) {
948947 // If the leased worker's owner was on the failed node, then kill the leased
949948 // worker.
950949 RAY_LOG (INFO).WithField (worker->WorkerId ()).WithField (owner_node_id)
951- << " The leased worker is killed because the owner node died." ;
950+ << " Killing leased worker because its owner's node died." ;
952951 worker->KillAsync (io_service_);
953952 }
954953
@@ -989,9 +988,10 @@ void NodeManager::HandleUnexpectedWorkerFailure(const WorkerID &worker_id) {
989988 continue ;
990989 }
991990 // If the failed worker was a leased worker's owner, then kill the leased worker.
992- RAY_LOG (INFO) << " The leased worker " << worker->WorkerId ()
993- << " is killed because the owner process " << owner_worker_id
994- << " died." ;
991+ RAY_LOG (INFO)
992+ .WithField (worker->WorkerId ())
993+ .WithField (" owner_worker_id" , owner_worker_id)
994+ << " Killing leased worker because its owner died." ;
995995 worker->KillAsync (io_service_);
996996 }
997997}
@@ -1053,6 +1053,8 @@ bool NodeManager::ResourceDeleted(const NodeID &node_id,
10531053void NodeManager::HandleNotifyGCSRestart (rpc::NotifyGCSRestartRequest request,
10541054 rpc::NotifyGCSRestartReply *reply,
10551055 rpc::SendReplyCallback send_reply_callback) {
1056+ RAY_LOG (INFO)
1057+ << " The GCS has restarted. Resubscribing to pubsub and notifying local workers." ;
10561058 // When GCS restarts, it'll notify raylet to do some initialization work
10571059 // (resubscribing). Raylet will also notify all workers to do this job. Workers are
10581060 // registered to raylet first (blocking call) and then connect to GCS, so there is no
@@ -1090,10 +1092,9 @@ void NodeManager::HandleClientConnectionError(
10901092 error.value (),
10911093 " . " ,
10921094 error.message (),
1093- " . There are some potential root causes. (1) The process is killed by "
1094- " SIGKILL by OOM killer due to high memory usage. (2) ray stop --force is "
1095- " called. (3) The worker is crashed unexpectedly due to SIGSEGV or other "
1096- " unexpected errors." );
1095+ " . Some common causes include: (1) the process was killed by the OOM killer "
1096+ " due to high memory usage, (2) ray stop --force was called, or (3) the worker "
1097+ " crashed unexpectedly due to SIGSEGV or another unexpected error." );
10971098
10981099 // Disconnect the client and don't process more messages.
10991100 DisconnectClient (
@@ -1413,31 +1414,27 @@ void NodeManager::DisconnectClient(const std::shared_ptr<ClientConnection> &clie
14131414 rpc::WorkerExitType disconnect_type,
14141415 const std::string &disconnect_detail,
14151416 const rpc::RayException *creation_task_exception) {
1416- std::shared_ptr<WorkerInterface> worker = worker_pool_.GetRegisteredWorker (client);
14171417 bool is_worker = false , is_driver = false ;
1418- if ( worker) {
1419- // The client is a worker.
1418+ std::shared_ptr<WorkerInterface> worker;
1419+ if ((worker = worker_pool_. GetRegisteredWorker ( client))) {
14201420 is_worker = true ;
1421+ RAY_LOG (INFO).WithField (worker->WorkerId ()).WithField (worker->GetAssignedJobId ())
1422+ << " Disconnecting worker, graceful=" << std::boolalpha << graceful
1423+ << " , disconnect_type=" << disconnect_type
1424+ << " , has_creation_task_exception=" << std::boolalpha
1425+ << (creation_task_exception != nullptr );
1426+ } else if ((worker = worker_pool_.GetRegisteredDriver (client))) {
1427+ is_driver = true ;
1428+ RAY_LOG (INFO).WithField (worker->WorkerId ()).WithField (worker->GetAssignedJobId ())
1429+ << " Disconnecting driver, graceful=" << std::boolalpha << graceful
1430+ << " , disconnect_type=" << disconnect_type;
14211431 } else {
1422- worker = worker_pool_.GetRegisteredDriver (client);
1423- if (worker) {
1424- // The client is a driver.
1425- is_driver = true ;
1426- } else {
1427- RAY_LOG (INFO)
1428- << " Not disconnecting client disconnect it has already been disconnected." ;
1429- return ;
1430- }
1432+ RAY_LOG (INFO) << " Got disconnect message from an unregistered client, ignoring." ;
1433+ return ;
14311434 }
14321435
1433- RAY_LOG (INFO).WithField (worker->WorkerId ())
1434- << " Disconnecting client, graceful=" << std::boolalpha << graceful
1435- << " , disconnect_type=" << disconnect_type
1436- << " , has_creation_task_exception=" << std::boolalpha
1437- << (creation_task_exception != nullptr );
1436+ RAY_CHECK (is_worker != is_driver) << " Client must be a registered worker or driver." ;
14381437
1439- RAY_CHECK (worker != nullptr );
1440- RAY_CHECK (!(is_worker && is_driver));
14411438 // Clean up any open ray.get or ray.wait calls that the worker made.
14421439 lease_dependency_manager_.CancelGetRequest (worker->WorkerId ());
14431440 lease_dependency_manager_.CancelWaitRequest (worker->WorkerId ());
0 commit comments