diff --git a/ray-operator/controllers/ray/raycluster_controller.go b/ray-operator/controllers/ray/raycluster_controller.go index 18b57f19922..7de38e35271 100644 --- a/ray-operator/controllers/ray/raycluster_controller.go +++ b/ray-operator/controllers/ray/raycluster_controller.go @@ -621,6 +621,17 @@ func (r *RayClusterReconciler) reconcilePods(ctx context.Context, instance *rayv } else if len(headPods.Items) == 0 { originatedFrom := utils.GetCRDType(instance.Labels[utils.RayOriginatedFromCRDLabelKey]) if originatedFrom == utils.RayJobCRD { + // Once a RayCluster created by a RayJob has been provisioned, recreating its head Pod doesn't help the RayJob. + // + // Case 1: GCS fault tolerance is disabled + // + // In this case, the worker Pods will be killed by the new head Pod when it is created, so the new Ray job will not be running in + // a "provisioned" cluster. + // + // Case 2: GCS fault tolerance is enabled + // + // In this case, the worker Pods will not be killed by the new head Pod when it is created, but the submission ID has already been + // used by the old Ray job, so the new Ray job will fail. if meta.IsStatusConditionTrue(instance.Status.Conditions, string(rayv1.RayClusterProvisioned)) { logger.Info( "reconcilePods: Found 0 head Pods for a RayJob-managed RayCluster; skipping head creation to let RayJob controller handle the failure",