diff --git a/ray-operator/controllers/ray/utils/validation.go b/ray-operator/controllers/ray/utils/validation.go index 07d4591afc1..b75500b57ea 100644 --- a/ray-operator/controllers/ray/utils/validation.go +++ b/ray-operator/controllers/ray/utils/validation.go @@ -194,6 +194,13 @@ func ValidateRayClusterSpec(spec *rayv1.RayClusterSpec, annotations map[string]s } } + // Validate AutoscalerOptions.IdleTimeoutSeconds (works with both v1 and v2 autoscaler) + if spec.AutoscalerOptions != nil && spec.AutoscalerOptions.IdleTimeoutSeconds != nil { + if *spec.AutoscalerOptions.IdleTimeoutSeconds < 0 { + return fmt.Errorf("autoscalerOptions.idleTimeoutSeconds must be non-negative, got %d", *spec.AutoscalerOptions.IdleTimeoutSeconds) + } + } + if IsAuthEnabled(spec) { if spec.RayVersion == "" { return fmt.Errorf("authOptions.mode is 'token' but RayVersion was not specified. Ray version 2.52.0 or later is required") @@ -603,21 +610,23 @@ func validateLegacyDeletionPolicies(rayJob *rayv1.RayJob) error { // validateWorkerGroupIdleTimeout validates the idleTimeoutSeconds field in a worker group spec func validateWorkerGroupIdleTimeout(workerGroup rayv1.WorkerGroupSpec, spec *rayv1.RayClusterSpec) error { idleTimeoutSeconds := workerGroup.IdleTimeoutSeconds - if idleTimeoutSeconds != nil { - if *idleTimeoutSeconds < 0 { - return fmt.Errorf("idleTimeoutSeconds must be non-negative, got %d", *idleTimeoutSeconds) - } + if idleTimeoutSeconds == nil { + return nil + } - // idleTimeoutSeconds only allowed on autoscaler v2 - if IsAutoscalingV2Enabled(spec) { - return nil - } - envVar, exists := EnvVarByName(RAY_ENABLE_AUTOSCALER_V2, spec.HeadGroupSpec.Template.Spec.Containers[RayContainerIndex].Env) - if exists && (envVar.Value == "1" || envVar.Value == "true") { - return nil - } - return fmt.Errorf("worker group %s has idleTimeoutSeconds set, but autoscaler v2 is not enabled. Please set .spec.autoscalerOptions.version to 'v2' (or set %s environment variable to 'true' in the head pod if using KubeRay < 1.4.0)", workerGroup.GroupName, RAY_ENABLE_AUTOSCALER_V2) + if *idleTimeoutSeconds < 0 { + return fmt.Errorf("worker group %s: idleTimeoutSeconds must be non-negative, got %d", workerGroup.GroupName, *idleTimeoutSeconds) } - return nil + // WorkerGroupSpec.idleTimeoutSeconds only allowed on autoscaler v2 + if IsAutoscalingV2Enabled(spec) { + return nil + } + + envVar, exists := EnvVarByName(RAY_ENABLE_AUTOSCALER_V2, spec.HeadGroupSpec.Template.Spec.Containers[RayContainerIndex].Env) + if exists && (envVar.Value == "1" || envVar.Value == "true") { + return nil + } + + return fmt.Errorf("worker group %s: idleTimeoutSeconds is set, but autoscaler v2 is not enabled. Please set .spec.autoscalerOptions.version to 'v2' (or set %s environment variable to 'true' in the head pod if using KubeRay < 1.4.0)", workerGroup.GroupName, RAY_ENABLE_AUTOSCALER_V2) } diff --git a/ray-operator/controllers/ray/utils/validation_test.go b/ray-operator/controllers/ray/utils/validation_test.go index 43fa6af48e9..d575ef9cee3 100644 --- a/ray-operator/controllers/ray/utils/validation_test.go +++ b/ray-operator/controllers/ray/utils/validation_test.go @@ -1883,20 +1883,29 @@ func TestValidateClusterUpgradeOptions(t *testing.T) { } } -func TestValidateWorkerGroupIdleTimeout(t *testing.T) { +func TestValidateRayClusterSpec_IdleTimeoutSeconds(t *testing.T) { + // Util function to create a RayCluster spec. + createSpec := func() rayv1.RayClusterSpec { + return rayv1.RayClusterSpec{ + EnableInTreeAutoscaling: ptr.To(true), + HeadGroupSpec: rayv1.HeadGroupSpec{ + Template: podTemplateSpec(nil, nil), + }, + } + } + tests := map[string]struct { expectedErr string spec rayv1.RayClusterSpec }{ - "should accept worker group with valid idleTimeoutSeconds": { - spec: rayv1.RayClusterSpec{ - EnableInTreeAutoscaling: ptr.To(true), - HeadGroupSpec: rayv1.HeadGroupSpec{ - Template: podTemplateSpec([]corev1.EnvVar{ - {Name: RAY_ENABLE_AUTOSCALER_V2, Value: "1"}, - }, nil), - }, - WorkerGroupSpecs: []rayv1.WorkerGroupSpec{ + // Worker group tests + "Valid: Worker group with valid idleTimeoutSeconds and v2 spec field": { + spec: func() rayv1.RayClusterSpec { + s := createSpec() + s.AutoscalerOptions = &rayv1.AutoscalerOptions{ + Version: ptr.To(rayv1.AutoscalerVersionV2), + } + s.WorkerGroupSpecs = []rayv1.WorkerGroupSpec{ { GroupName: "worker-group-1", Template: podTemplateSpec(nil, nil), @@ -1904,39 +1913,37 @@ func TestValidateWorkerGroupIdleTimeout(t *testing.T) { MinReplicas: ptr.To(int32(0)), MaxReplicas: ptr.To(int32(10)), }, - }, - }, + } + return s + }(), expectedErr: "", }, - "should reject negative idleTimeoutSeconds": { - spec: rayv1.RayClusterSpec{ - EnableInTreeAutoscaling: ptr.To(true), - HeadGroupSpec: rayv1.HeadGroupSpec{ - Template: podTemplateSpec([]corev1.EnvVar{ - {Name: RAY_ENABLE_AUTOSCALER_V2, Value: "1"}, - }, nil), - }, - WorkerGroupSpecs: []rayv1.WorkerGroupSpec{ + "Valid: Worker group with idleTimeoutSeconds and v2 env var": { + spec: func() rayv1.RayClusterSpec { + s := createSpec() + s.HeadGroupSpec.Template = podTemplateSpec([]corev1.EnvVar{ + {Name: RAY_ENABLE_AUTOSCALER_V2, Value: "1"}, + }, nil) + s.WorkerGroupSpecs = []rayv1.WorkerGroupSpec{ { GroupName: "worker-group-1", Template: podTemplateSpec(nil, nil), - IdleTimeoutSeconds: ptr.To(int32(-10)), + IdleTimeoutSeconds: ptr.To(int32(60)), MinReplicas: ptr.To(int32(0)), MaxReplicas: ptr.To(int32(10)), }, - }, - }, - expectedErr: "idleTimeoutSeconds must be non-negative, got -10", + } + return s + }(), + expectedErr: "", }, - "should accept zero idleTimeoutSeconds": { - spec: rayv1.RayClusterSpec{ - EnableInTreeAutoscaling: ptr.To(true), - HeadGroupSpec: rayv1.HeadGroupSpec{ - Template: podTemplateSpec([]corev1.EnvVar{ - {Name: RAY_ENABLE_AUTOSCALER_V2, Value: "1"}, - }, nil), - }, - WorkerGroupSpecs: []rayv1.WorkerGroupSpec{ + "Valid: Worker group with zero idleTimeoutSeconds": { + spec: func() rayv1.RayClusterSpec { + s := createSpec() + s.AutoscalerOptions = &rayv1.AutoscalerOptions{ + Version: ptr.To(rayv1.AutoscalerVersionV2), + } + s.WorkerGroupSpecs = []rayv1.WorkerGroupSpec{ { GroupName: "worker-group-1", Template: podTemplateSpec(nil, nil), @@ -1944,73 +1951,34 @@ func TestValidateWorkerGroupIdleTimeout(t *testing.T) { MinReplicas: ptr.To(int32(0)), MaxReplicas: ptr.To(int32(10)), }, - }, - }, + } + return s + }(), expectedErr: "", }, - "should reject idleTimeoutSeconds when autoscaler version is not v2": { - spec: rayv1.RayClusterSpec{ - EnableInTreeAutoscaling: ptr.To(true), - HeadGroupSpec: rayv1.HeadGroupSpec{ - Template: podTemplateSpec(nil, nil), - }, - WorkerGroupSpecs: []rayv1.WorkerGroupSpec{ - { - GroupName: "worker-group-1", - Template: podTemplateSpec(nil, nil), - IdleTimeoutSeconds: ptr.To(int32(60)), - MinReplicas: ptr.To(int32(0)), - MaxReplicas: ptr.To(int32(10)), - }, - }, - }, - expectedErr: fmt.Sprintf("worker group worker-group-1 has idleTimeoutSeconds set, but autoscaler v2 is not enabled. Please set .spec.autoscalerOptions.version to 'v2' (or set %s environment variable to 'true' in the head pod if using KubeRay < 1.4.0)", RAY_ENABLE_AUTOSCALER_V2), - }, - "should reject idleTimeoutSeconds when autoscaler version is not set": { - spec: rayv1.RayClusterSpec{ - EnableInTreeAutoscaling: ptr.To(true), - HeadGroupSpec: rayv1.HeadGroupSpec{ - Template: podTemplateSpec(nil, nil), - }, - WorkerGroupSpecs: []rayv1.WorkerGroupSpec{ - { - GroupName: "worker-group-1", - Template: podTemplateSpec(nil, nil), - IdleTimeoutSeconds: ptr.To(int32(60)), - MinReplicas: ptr.To(int32(0)), - MaxReplicas: ptr.To(int32(10)), - }, - }, - }, - expectedErr: fmt.Sprintf("worker group worker-group-1 has idleTimeoutSeconds set, but autoscaler v2 is not enabled. Please set .spec.autoscalerOptions.version to 'v2' (or set %s environment variable to 'true' in the head pod if using KubeRay < 1.4.0)", RAY_ENABLE_AUTOSCALER_V2), - }, - "should reject idleTimeoutSeconds when AutoscalerOptions is nil": { - spec: rayv1.RayClusterSpec{ - EnableInTreeAutoscaling: ptr.To(true), - HeadGroupSpec: rayv1.HeadGroupSpec{ - Template: podTemplateSpec(nil, nil), - }, - WorkerGroupSpecs: []rayv1.WorkerGroupSpec{ + "Invalid: Worker group with negative idleTimeoutSeconds": { + spec: func() rayv1.RayClusterSpec { + s := createSpec() + s.AutoscalerOptions = &rayv1.AutoscalerOptions{ + Version: ptr.To(rayv1.AutoscalerVersionV2), + } + s.WorkerGroupSpecs = []rayv1.WorkerGroupSpec{ { GroupName: "worker-group-1", Template: podTemplateSpec(nil, nil), - IdleTimeoutSeconds: ptr.To(int32(60)), + IdleTimeoutSeconds: ptr.To(int32(-10)), MinReplicas: ptr.To(int32(0)), MaxReplicas: ptr.To(int32(10)), }, - }, - }, - expectedErr: fmt.Sprintf("worker group worker-group-1 has idleTimeoutSeconds set, but autoscaler v2 is not enabled. Please set .spec.autoscalerOptions.version to 'v2' (or set %s environment variable to 'true' in the head pod if using KubeRay < 1.4.0)", RAY_ENABLE_AUTOSCALER_V2), + } + return s + }(), + expectedErr: "worker group worker-group-1: idleTimeoutSeconds must be non-negative, got -10", }, - "should reject idleTimeoutSeconds when env var is set to invalid value": { - spec: rayv1.RayClusterSpec{ - EnableInTreeAutoscaling: ptr.To(true), - HeadGroupSpec: rayv1.HeadGroupSpec{ - Template: podTemplateSpec([]corev1.EnvVar{ - {Name: RAY_ENABLE_AUTOSCALER_V2, Value: "false"}, - }, nil), - }, - WorkerGroupSpecs: []rayv1.WorkerGroupSpec{ + "Invalid: Worker group idleTimeoutSeconds without v2": { + spec: func() rayv1.RayClusterSpec { + s := createSpec() + s.WorkerGroupSpecs = []rayv1.WorkerGroupSpec{ { GroupName: "worker-group-1", Template: podTemplateSpec(nil, nil), @@ -2018,19 +1986,18 @@ func TestValidateWorkerGroupIdleTimeout(t *testing.T) { MinReplicas: ptr.To(int32(0)), MaxReplicas: ptr.To(int32(10)), }, - }, - }, - expectedErr: fmt.Sprintf("worker group worker-group-1 has idleTimeoutSeconds set, but autoscaler v2 is not enabled. Please set .spec.autoscalerOptions.version to 'v2' (or set %s environment variable to 'true' in the head pod if using KubeRay < 1.4.0)", RAY_ENABLE_AUTOSCALER_V2), + } + return s + }(), + expectedErr: fmt.Sprintf("worker group worker-group-1: idleTimeoutSeconds is set, but autoscaler v2 is not enabled. Please set .spec.autoscalerOptions.version to 'v2' (or set %s environment variable to 'true' in the head pod if using KubeRay < 1.4.0)", RAY_ENABLE_AUTOSCALER_V2), }, - "should accept worker group with idleTimeoutSeconds when env var is set to true": { - spec: rayv1.RayClusterSpec{ - EnableInTreeAutoscaling: ptr.To(true), - HeadGroupSpec: rayv1.HeadGroupSpec{ - Template: podTemplateSpec([]corev1.EnvVar{ - {Name: RAY_ENABLE_AUTOSCALER_V2, Value: "true"}, - }, nil), - }, - WorkerGroupSpecs: []rayv1.WorkerGroupSpec{ + "Invalid: Worker group idleTimeoutSeconds with invalid env var": { + spec: func() rayv1.RayClusterSpec { + s := createSpec() + s.HeadGroupSpec.Template = podTemplateSpec([]corev1.EnvVar{ + {Name: RAY_ENABLE_AUTOSCALER_V2, Value: "false"}, + }, nil) + s.WorkerGroupSpecs = []rayv1.WorkerGroupSpec{ { GroupName: "worker-group-1", Template: podTemplateSpec(nil, nil), @@ -2038,25 +2005,96 @@ func TestValidateWorkerGroupIdleTimeout(t *testing.T) { MinReplicas: ptr.To(int32(0)), MaxReplicas: ptr.To(int32(10)), }, - }, - }, - expectedErr: "", + } + return s + }(), + expectedErr: fmt.Sprintf("worker group worker-group-1: idleTimeoutSeconds is set, but autoscaler v2 is not enabled. Please set .spec.autoscalerOptions.version to 'v2' (or set %s environment variable to 'true' in the head pod if using KubeRay < 1.4.0)", RAY_ENABLE_AUTOSCALER_V2), }, - "should accept worker group without idleTimeoutSeconds and without autoscaler v2": { - spec: rayv1.RayClusterSpec{ - EnableInTreeAutoscaling: ptr.To(true), - HeadGroupSpec: rayv1.HeadGroupSpec{ - Template: podTemplateSpec(nil, nil), - }, - WorkerGroupSpecs: []rayv1.WorkerGroupSpec{ + "Valid: Worker group without idleTimeoutSeconds": { + spec: func() rayv1.RayClusterSpec { + s := createSpec() + s.WorkerGroupSpecs = []rayv1.WorkerGroupSpec{ { GroupName: "worker-group-1", Template: podTemplateSpec(nil, nil), MinReplicas: ptr.To(int32(0)), MaxReplicas: ptr.To(int32(10)), }, - }, - }, + } + return s + }(), + expectedErr: "", + }, + + // AutoscalerOptions tests (idleTimeoutSeconds works with both v1 and v2) + "Valid: AutoscalerOptions with idleTimeoutSeconds and v2": { + spec: func() rayv1.RayClusterSpec { + s := createSpec() + s.AutoscalerOptions = &rayv1.AutoscalerOptions{ + Version: ptr.To(rayv1.AutoscalerVersionV2), + IdleTimeoutSeconds: ptr.To(int32(120)), + } + return s + }(), + expectedErr: "", + }, + "Valid: AutoscalerOptions with idleTimeoutSeconds and v1": { + spec: func() rayv1.RayClusterSpec { + s := createSpec() + s.AutoscalerOptions = &rayv1.AutoscalerOptions{ + Version: ptr.To(rayv1.AutoscalerVersionV1), + IdleTimeoutSeconds: ptr.To(int32(120)), + } + return s + }(), + expectedErr: "", + }, + "Valid: AutoscalerOptions with idleTimeoutSeconds and no version": { + spec: func() rayv1.RayClusterSpec { + s := createSpec() + s.AutoscalerOptions = &rayv1.AutoscalerOptions{ + IdleTimeoutSeconds: ptr.To(int32(120)), + } + return s + }(), + expectedErr: "", + }, + "Valid: AutoscalerOptions with zero idleTimeoutSeconds": { + spec: func() rayv1.RayClusterSpec { + s := createSpec() + s.AutoscalerOptions = &rayv1.AutoscalerOptions{ + IdleTimeoutSeconds: ptr.To(int32(0)), + } + return s + }(), + expectedErr: "", + }, + "Invalid: AutoscalerOptions with negative idleTimeoutSeconds": { + spec: func() rayv1.RayClusterSpec { + s := createSpec() + s.AutoscalerOptions = &rayv1.AutoscalerOptions{ + IdleTimeoutSeconds: ptr.To(int32(-10)), + } + return s + }(), + expectedErr: "autoscalerOptions.idleTimeoutSeconds must be non-negative, got -10", + }, + "Valid: AutoscalerOptions without idleTimeoutSeconds": { + spec: func() rayv1.RayClusterSpec { + s := createSpec() + s.AutoscalerOptions = &rayv1.AutoscalerOptions{ + Version: ptr.To(rayv1.AutoscalerVersionV1), + } + return s + }(), + expectedErr: "", + }, + "Valid: AutoscalerOptions is nil": { + spec: func() rayv1.RayClusterSpec { + s := createSpec() + s.AutoscalerOptions = nil + return s + }(), expectedErr: "", }, }