diff --git a/ray-operator/controllers/ray/common/pod.go b/ray-operator/controllers/ray/common/pod.go
index 50dece986da..ba34f569079 100644
--- a/ray-operator/controllers/ray/common/pod.go
+++ b/ray-operator/controllers/ray/common/pod.go
@@ -240,8 +240,20 @@ func DefaultWorkerPodTemplate(ctx context.Context, instance rayv1.RayCluster, wo
 }
 
 func initLivenessAndReadinessProbe(rayContainer *corev1.Container, rayNodeType rayv1.RayNodeType, creatorCRDType utils.CRDType) {
-	rayAgentRayletHealthCommand := fmt.Sprintf(utils.BaseWgetHealthCommand, utils.DefaultDashboardAgentListenPort, utils.RayAgentRayletHealthPath)
-	rayDashboardGCSHealthCommand := fmt.Sprintf(utils.BaseWgetHealthCommand, utils.DefaultDashboardPort, utils.RayDashboardGCSHealthPath)
+	// BaseWgetHealthCommand takes three arguments: the value for wget's `-T` (timeout, in seconds),
+	// the port, and the URL path. Every health command passes the probe timeout as the first argument.
+	rayAgentRayletHealthCommand := fmt.Sprintf(
+		utils.BaseWgetHealthCommand,
+		utils.DefaultReadinessProbeTimeoutSeconds,
+		utils.DefaultDashboardAgentListenPort,
+		utils.RayAgentRayletHealthPath,
+	)
+	rayDashboardGCSHealthCommand := fmt.Sprintf(
+		utils.BaseWgetHealthCommand,
+		utils.DefaultReadinessProbeTimeoutSeconds,
+		utils.DefaultDashboardPort,
+		utils.RayDashboardGCSHealthPath,
+	)
 
 	// Generally, the liveness and readiness probes perform the same checks.
 	// For head node => Check GCS and Raylet status.
@@ -279,8 +291,13 @@ func initLivenessAndReadinessProbe(rayContainer *corev1.Container, rayNodeType r
 		// See https://github.com/ray-project/kuberay/pull/1808 for reasons.
 		if creatorCRDType == utils.RayServiceCRD && rayNodeType == rayv1.WorkerNode {
 			rayContainer.ReadinessProbe.FailureThreshold = utils.ServeReadinessProbeFailureThreshold
-			rayServeProxyHealthCommand := fmt.Sprintf(utils.BaseWgetHealthCommand,
-				utils.FindContainerPort(rayContainer, utils.ServingPortName, utils.DefaultServingPort), utils.RayServeProxyHealthPath)
+			// The first argument is wget's `-T` timeout in seconds, so pass the probe timeout here.
+			rayServeProxyHealthCommand := fmt.Sprintf(
+				utils.BaseWgetHealthCommand,
+				utils.DefaultReadinessProbeTimeoutSeconds,
+				utils.FindContainerPort(rayContainer, utils.ServingPortName, utils.DefaultServingPort),
+				utils.RayServeProxyHealthPath,
+			)
 			commands = append(commands, rayServeProxyHealthCommand)
 			rayContainer.ReadinessProbe.Exec = &corev1.ExecAction{Command: []string{"bash", "-c", strings.Join(commands, " && ")}}
 		}
diff --git a/ray-operator/controllers/ray/utils/constant.go b/ray-operator/controllers/ray/utils/constant.go
index fe7ab4c9522..5a487d0f730 100644
--- a/ray-operator/controllers/ray/utils/constant.go
+++ b/ray-operator/controllers/ray/utils/constant.go
@@ -148,7 +148,7 @@ const (
 	LOCAL_HOST = "127.0.0.1"
 	// Ray FT default readiness probe values
 	DefaultReadinessProbeInitialDelaySeconds = 10
-	DefaultReadinessProbeTimeoutSeconds      = 1
+	DefaultReadinessProbeTimeoutSeconds      = 2
 	DefaultReadinessProbePeriodSeconds       = 5
 	DefaultReadinessProbeSuccessThreshold    = 1
 	DefaultReadinessProbeFailureThreshold    = 10
@@ -156,7 +156,7 @@ const (
 
 	// Ray FT default liveness probe values
 	DefaultLivenessProbeInitialDelaySeconds = 30
-	DefaultLivenessProbeTimeoutSeconds      = 1
+	DefaultLivenessProbeTimeoutSeconds      = 2
 	DefaultLivenessProbePeriodSeconds       = 5
 	DefaultLivenessProbeSuccessThreshold    = 1
 	DefaultLivenessProbeFailureThreshold    = 120
@@ -169,7 +169,7 @@ const (
 	RayAgentRayletHealthPath  = "api/local_raylet_healthz"
 	RayDashboardGCSHealthPath = "api/gcs_healthz"
 	RayServeProxyHealthPath   = "-/healthz"
-	BaseWgetHealthCommand     = "wget -T 2 -q -O- http://localhost:%d/%s | grep success"
+	BaseWgetHealthCommand     = "wget -T %d -q -O- http://localhost:%d/%s | grep success" // args: wget timeout (seconds), port, path
 
 	// Finalizers for RayJob
 	RayJobStopJobFinalizer = "ray.io/rayjob-finalizer"