@@ -546,21 +546,15 @@ bool LocalTaskManager::PoppedWorkerHandler(
546546 not_detached_with_owner_failed = true ;
547547 }
548548
549- const auto &required_resource =
550- task.GetTaskSpecification ().GetRequiredResources ().GetResourceMap ();
551- for (auto &entry : required_resource) {
552- if (!cluster_resource_scheduler_->GetLocalResourceManager ().ResourcesExist (
553- scheduling::ResourceID (entry.first ))) {
554- RAY_CHECK (task.GetTaskSpecification ().PlacementGroupBundleId ().first !=
555- PlacementGroupID::Nil ());
556- RAY_LOG (DEBUG) << " The placement group: "
557- << task.GetTaskSpecification ().PlacementGroupBundleId ().first
558- << " was removed when poping workers for task: " << task_id
559- << " , will cancel the task." ;
560- CancelTask (
561- task_id,
562- rpc::RequestWorkerLeaseReply::SCHEDULING_CANCELLED_PLACEMENT_GROUP_REMOVED);
563- canceled = true ;
549+ if (!canceled) {
550+ const auto &required_resource =
551+ task.GetTaskSpecification ().GetRequiredResources ().GetResourceMap ();
552+ for (auto &entry : required_resource) {
553+ // Invariant: a placement group (PG) resource is never deleted while a worker
554+ // is being popped, unless the lease request itself has been cancelled.
555+ RAY_CHECK (cluster_resource_scheduler_->GetLocalResourceManager ().ResourcesExist (
556+ scheduling::ResourceID (entry.first )))
557+ << entry.first ;
564558 }
565559 }
566560
@@ -855,7 +849,7 @@ void LocalTaskManager::ReleaseTaskArgs(const TaskID &task_id) {
855849}
856850
857851namespace {
858- void ReplyCancelled (std::shared_ptr<internal::Work> &work,
852+ void ReplyCancelled (const std::shared_ptr<internal::Work> &work,
859853 rpc::RequestWorkerLeaseReply::SchedulingFailureType failure_type,
860854 const std::string &scheduling_failure_message) {
861855 auto reply = work->reply ;
@@ -867,55 +861,67 @@ void ReplyCancelled(std::shared_ptr<internal::Work> &work,
867861}
868862} // namespace
869863
870- bool LocalTaskManager::CancelTask (
871- const TaskID &task_id ,
864+ bool LocalTaskManager::CancelTasks (
865+ std::function< bool ( const std::shared_ptr<internal::Work> &)> predicate ,
872866 rpc::RequestWorkerLeaseReply::SchedulingFailureType failure_type,
873867 const std::string &scheduling_failure_message) {
874- for (auto shapes_it = tasks_to_dispatch_.begin (); shapes_it != tasks_to_dispatch_.end ();
875- shapes_it++) {
876- auto &work_queue = shapes_it->second ;
877- for (auto work_it = work_queue.begin (); work_it != work_queue.end (); work_it++) {
878- const auto &task = (*work_it)->task ;
879- if (task.GetTaskSpecification ().TaskId () == task_id) {
880- RAY_LOG (DEBUG) << " Canceling task " << task_id << " from dispatch queue." ;
881- ReplyCancelled (*work_it, failure_type, scheduling_failure_message);
882- if ((*work_it)->GetState () == internal::WorkStatus::WAITING_FOR_WORKER) {
883- // We've already acquired resources so we need to release them.
884- cluster_resource_scheduler_->GetLocalResourceManager ().ReleaseWorkerResources (
885- (*work_it)->allocated_instances );
886- // Release pinned task args.
887- ReleaseTaskArgs (task_id);
888- }
889- if (!task.GetTaskSpecification ().GetDependencies ().empty ()) {
890- task_dependency_manager_.RemoveTaskDependencies (
891- task.GetTaskSpecification ().TaskId ());
868+ bool tasks_cancelled = false ;
869+
870+ ray::erase_if<SchedulingClass, std::shared_ptr<internal::Work>>(
871+ tasks_to_dispatch_, [&](const std::shared_ptr<internal::Work> &work) {
872+ if (predicate (work)) {
873+ const TaskID task_id = work->task .GetTaskSpecification ().TaskId ();
874+ RAY_LOG (DEBUG) << " Canceling task " << task_id << " from dispatch queue." ;
875+ ReplyCancelled (work, failure_type, scheduling_failure_message);
876+ if (work->GetState () == internal::WorkStatus::WAITING_FOR_WORKER) {
877+ // We've already acquired resources so we need to release them.
878+ cluster_resource_scheduler_->GetLocalResourceManager ().ReleaseWorkerResources (
879+ work->allocated_instances );
880+ // Release pinned task args.
881+ ReleaseTaskArgs (task_id);
882+ }
883+ if (!work->task .GetTaskSpecification ().GetDependencies ().empty ()) {
884+ task_dependency_manager_.RemoveTaskDependencies (
885+ work->task .GetTaskSpecification ().TaskId ());
886+ }
887+ RemoveFromRunningTasksIfExists (work->task );
888+ work->SetStateCancelled ();
889+ tasks_cancelled = true ;
890+ return true ;
891+ } else {
892+ return false ;
892893 }
893- RemoveFromRunningTasksIfExists (task);
894- (*work_it)->SetStateCancelled ();
895- work_queue.erase (work_it);
896- if (work_queue.empty ()) {
897- tasks_to_dispatch_.erase (shapes_it);
894+ });
895+
896+ ray::erase_if<std::shared_ptr<internal::Work>>(
897+ waiting_task_queue_, [&](const std::shared_ptr<internal::Work> &work) {
898+ if (predicate (work)) {
899+ ReplyCancelled (work, failure_type, scheduling_failure_message);
900+ if (!work->task .GetTaskSpecification ().GetDependencies ().empty ()) {
901+ task_dependency_manager_.RemoveTaskDependencies (
902+ work->task .GetTaskSpecification ().TaskId ());
903+ }
904+ waiting_tasks_index_.erase (work->task .GetTaskSpecification ().TaskId ());
905+ tasks_cancelled = true ;
906+ return true ;
907+ } else {
908+ return false ;
898909 }
899- return true ;
900- }
901- }
902- }
910+ });
903911
904- auto iter = waiting_tasks_index_.find (task_id);
905- if (iter != waiting_tasks_index_.end ()) {
906- const auto &task = (*iter->second )->task ;
907- ReplyCancelled (*iter->second , failure_type, scheduling_failure_message);
908- if (!task.GetTaskSpecification ().GetDependencies ().empty ()) {
909- task_dependency_manager_.RemoveTaskDependencies (
910- task.GetTaskSpecification ().TaskId ());
911- }
912- waiting_task_queue_.erase (iter->second );
913- waiting_tasks_index_.erase (iter);
914-
915- return true ;
916- }
912+ return tasks_cancelled;
913+ }
917914
918- return false ;
915+ bool LocalTaskManager::CancelTask (
916+ const TaskID &task_id,
917+ rpc::RequestWorkerLeaseReply::SchedulingFailureType failure_type,
918+ const std::string &scheduling_failure_message) {
919+ return CancelTasks (
920+ [task_id](const std::shared_ptr<internal::Work> &work) {
921+ return work->task .GetTaskSpecification ().TaskId () == task_id;
922+ },
923+ failure_type,
924+ scheduling_failure_message);
919925}
920926
921927bool LocalTaskManager::AnyPendingTasksForResourceAcquisition (
0 commit comments