From 5b9bc1daa6e89c3aaf78fa95a924d4fda5f9c630 Mon Sep 17 00:00:00 2001 From: Ryan O'Leary Date: Tue, 4 Mar 2025 04:41:00 +0000 Subject: [PATCH 01/56] Add incremental upgrade API changes to KubeRay Signed-off-by: Ryan O'Leary Update go mod dependencies for gateway v1 Signed-off-by: Ryan O'Leary Add reconcile Gateway and HTTPRoute Signed-off-by: Ryan O'Leary Add TargetCapacity and TrafficRoutedPercent to RayServiceStatus Signed-off-by: Ryan O'Leary Add controller logic initial commit Signed-off-by: Ryan O'Leary Add IncrementalUpgrade check to ShouldUpdate Signed-off-by: Ryan O'Leary Update controller logic to reconcile incremental upgrade Signed-off-by: Ryan O'Leary TrafficRoutedPercent should not set default value Signed-off-by: Ryan O'Leary Remove test changes to TPU manifest Signed-off-by: Ryan O'Leary Move helper function to utils Signed-off-by: Ryan O'Leary Fix lint Signed-off-by: Ryan O'Leary Fix field alignment Signed-off-by: Ryan O'Leary Fix bad merge Signed-off-by: Ryan O'Leary Fix CRDs and add validation test case Signed-off-by: Ryan O'Leary Test create HTTPRoute and create Gateway Signed-off-by: Ryan O'Leary Add reconcile tests for Gateway and HTTPRoute Signed-off-by: Ryan O'Leary Fix lint Signed-off-by: Ryan O'Leary Add tests for util functions and fix golangci-lint Signed-off-by: Ryan O'Leary Add basic e2e test case Signed-off-by: Ryan O'Leary Fix GetGatewayListeners logic and test Signed-off-by: Ryan O'Leary Add gatewayv1 scheme to util runtime Signed-off-by: Ryan O'Leary Check if IncrementalUpgrade is enabled before checking Gateway Signed-off-by: Ryan O'Leary Fix reconcile logic for Gateway and HTTPRoute Signed-off-by: Ryan O'Leary Add feature gate Signed-off-by: Ryan O'Leary Always create Gateway and HTTPRoute for IncrementalUpgrade Signed-off-by: Ryan O'Leary Fix target_capacity reonciliation logic Signed-off-by: Ryan O'Leary Add additional unit tests Signed-off-by: Ryan O'Leary Move e2e test and add another unit test Signed-off-by: Ryan O'Leary --- docs/reference/api.md | 22 + .../crds/ray.io_rayservices.yaml | 1799 ++++++++++++++++- ray-operator/Makefile | 4 + ray-operator/apis/ray/v1/rayservice_types.go | 77 +- .../apis/ray/v1/zz_generated.deepcopy.go | 68 +- .../config/crd/bases/ray.io_rayservices.yaml | 1799 ++++++++++++++++- ray-operator/config/rbac/role.yaml | 13 + .../controllers/ray/common/association.go | 14 + .../controllers/ray/rayservice_controller.go | 481 ++++- .../ray/rayservice_controller_unit_test.go | 681 +++++++ .../controllers/ray/utils/consistency.go | 15 + .../controllers/ray/utils/constant.go | 8 + .../ray/utils/fake_serve_httpclient.go | 10 +- ray-operator/controllers/ray/utils/util.go | 79 + .../controllers/ray/utils/util_test.go | 290 +++ .../controllers/ray/utils/validation.go | 41 +- .../controllers/ray/utils/validation_test.go | 105 + ray-operator/main.go | 5 + .../ray/v1/incrementalupgradeoptions.go | 50 + .../ray/v1/rayservicespec.go | 19 + .../ray/v1/rayservicestatus.go | 37 +- .../ray/v1/rayservicestatuses.go | 62 +- .../ray/v1/rayserviceupgradestrategy.go | 11 +- .../pkg/client/applyconfiguration/utils.go | 2 + ray-operator/pkg/features/features.go | 12 +- .../rayservice_incremental_upgrade_test.go | 157 ++ .../test/e2eincrementalupgrade/support.go | 88 + ray-operator/test/support/client.go | 13 + ray-operator/test/support/ray.go | 21 + 29 files changed, 5888 insertions(+), 95 deletions(-) create mode 100644 ray-operator/pkg/client/applyconfiguration/ray/v1/incrementalupgradeoptions.go create mode 100644 
ray-operator/test/e2eincrementalupgrade/rayservice_incremental_upgrade_test.go create mode 100644 ray-operator/test/e2eincrementalupgrade/support.go diff --git a/docs/reference/api.md b/docs/reference/api.md index c7d9e46ffda..b367e562ea6 100644 --- a/docs/reference/api.md +++ b/docs/reference/api.md @@ -199,6 +199,25 @@ _Appears in:_ | `serviceType` _[ServiceType](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#servicetype-v1-core)_ | ServiceType is Kubernetes service type of the head service. it will be used by the workers to connect to the head pod | | | +#### IncrementalUpgradeOptions + + + + + + + +_Appears in:_ +- [RayServiceUpgradeStrategy](#rayserviceupgradestrategy) + +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `maxSurgePercent` _integer_ | The capacity of serve requests the upgraded cluster should scale to handle each interval.
Defaults to 100%. | 100 | | +| `stepSizePercent` _integer_ | The percentage of traffic to switch to the upgraded RayCluster at a set interval after scaling by MaxSurgePercent. | | | +| `intervalSeconds` _integer_ | The interval in seconds between transferring StepSize traffic from the old to new RayCluster. | | | +| `gatewayClassName` _string_ | The name of the Gateway Class installed by the Kubernetes Cluster admin. | | | + + #### JobSubmissionMode @@ -355,6 +374,8 @@ _Appears in:_ | `serviceUnhealthySecondThreshold` _integer_ | Deprecated: This field is not used anymore. ref: https://github.com/ray-project/kuberay/issues/1685 | | | | `deploymentUnhealthySecondThreshold` _integer_ | Deprecated: This field is not used anymore. ref: https://github.com/ray-project/kuberay/issues/1685 | | | | `serveService` _[Service](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#service-v1-core)_ | ServeService is the Kubernetes service for head node and worker nodes who have healthy http proxy to serve traffics. | | | +| `gateway` _[Gateway](#gateway)_ | Gateway is the Gateway object for the RayService to serve traffics during an IncrementalUpgrade. | | | +| `httpRoute` _[HTTPRoute](#httproute)_ | HTTPRoute is the HTTPRoute object for the RayService to split traffics during an IncrementalUpgrade. | | | | `upgradeStrategy` _[RayServiceUpgradeStrategy](#rayserviceupgradestrategy)_ | UpgradeStrategy defines the scaling policy used when upgrading the RayService. | | | | `serveConfigV2` _string_ | Important: Run "make" to regenerate code after modifying this file
Defines the applications and deployments to deploy, should be a YAML multi-line scalar string. | | | | `rayClusterConfig` _[RayClusterSpec](#rayclusterspec)_ | | | | @@ -377,6 +398,7 @@ _Appears in:_ | Field | Description | Default | Validation | | --- | --- | --- | --- | | `type` _[RayServiceUpgradeType](#rayserviceupgradetype)_ | Type represents the strategy used when upgrading the RayService. Currently supports `NewCluster` and `None`. | | | +| `incrementalUpgradeOptions` _[IncrementalUpgradeOptions](#incrementalupgradeoptions)_ | IncrementalUpgradeOptions defines the behavior of an IncrementalUpgrade. | | | #### RayServiceUpgradeType diff --git a/helm-chart/kuberay-operator/crds/ray.io_rayservices.yaml b/helm-chart/kuberay-operator/crds/ray.io_rayservices.yaml index e2d61172a3c..02e449d4726 100644 --- a/helm-chart/kuberay-operator/crds/ray.io_rayservices.yaml +++ b/helm-chart/kuberay-operator/crds/ray.io_rayservices.yaml @@ -40,6 +40,1765 @@ spec: type: integer excludeHeadPodFromServeSvc: type: boolean + gateway: + properties: + apiVersion: + type: string + kind: + type: string + metadata: + properties: + annotations: + additionalProperties: + type: string + type: object + finalizers: + items: + type: string + type: array + labels: + additionalProperties: + type: string + type: object + name: + type: string + namespace: + type: string + type: object + spec: + properties: + addresses: + items: + properties: + type: + default: IPAddress + maxLength: 253 + minLength: 1 + pattern: ^Hostname|IPAddress|NamedAddress|[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*\/[A-Za-z0-9\/\-._~%!$&'()*+,;=:]+$ + type: string + value: + maxLength: 253 + minLength: 1 + type: string + required: + - value + type: object + x-kubernetes-validations: + - message: Hostname value must only contain valid characters + (matching ^(\*\.)?[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$) + rule: 'self.type == ''Hostname'' ? self.value.matches(r"""^(\*\.)?[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$"""): + true' + maxItems: 16 + type: array + x-kubernetes-validations: + - message: IPAddress values must be unique + rule: 'self.all(a1, a1.type == ''IPAddress'' ? self.exists_one(a2, + a2.type == a1.type && a2.value == a1.value) : true )' + - message: Hostname values must be unique + rule: 'self.all(a1, a1.type == ''Hostname'' ? self.exists_one(a2, + a2.type == a1.type && a2.value == a1.value) : true )' + backendTLS: + properties: + clientCertificateRef: + properties: + group: + default: "" + maxLength: 253 + pattern: ^$|^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ + type: string + kind: + default: Secret + maxLength: 63 + minLength: 1 + pattern: ^[a-zA-Z]([-a-zA-Z0-9]*[a-zA-Z0-9])?$ + type: string + name: + maxLength: 253 + minLength: 1 + type: string + namespace: + maxLength: 63 + minLength: 1 + pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?$ + type: string + required: + - name + type: object + type: object + gatewayClassName: + maxLength: 253 + minLength: 1 + type: string + infrastructure: + properties: + annotations: + additionalProperties: + maxLength: 4096 + minLength: 0 + type: string + maxProperties: 8 + type: object + x-kubernetes-validations: + - message: Annotation keys must be in the form of an optional + DNS subdomain prefix followed by a required name segment + of up to 63 characters. 
+ rule: self.all(key, key.matches(r"""^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?([A-Za-z0-9][-A-Za-z0-9_.]{0,61})?[A-Za-z0-9]$""")) + - message: If specified, the annotation key's prefix must + be a DNS subdomain not longer than 253 characters + in total. + rule: self.all(key, key.split("/")[0].size() < 253) + labels: + additionalProperties: + maxLength: 63 + minLength: 0 + pattern: ^(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])?$ + type: string + maxProperties: 8 + type: object + x-kubernetes-validations: + - message: Label keys must be in the form of an optional + DNS subdomain prefix followed by a required name segment + of up to 63 characters. + rule: self.all(key, key.matches(r"""^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?([A-Za-z0-9][-A-Za-z0-9_.]{0,61})?[A-Za-z0-9]$""")) + - message: If specified, the label key's prefix must be + a DNS subdomain not longer than 253 characters in + total. + rule: self.all(key, key.split("/")[0].size() < 253) + parametersRef: + properties: + group: + maxLength: 253 + pattern: ^$|^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ + type: string + kind: + maxLength: 63 + minLength: 1 + pattern: ^[a-zA-Z]([-a-zA-Z0-9]*[a-zA-Z0-9])?$ + type: string + name: + maxLength: 253 + minLength: 1 + type: string + required: + - group + - kind + - name + type: object + type: object + listeners: + items: + properties: + allowedRoutes: + default: + namespaces: + from: Same + properties: + kinds: + items: + properties: + group: + default: gateway.networking.k8s.io + maxLength: 253 + pattern: ^$|^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ + type: string + kind: + maxLength: 63 + minLength: 1 + pattern: ^[a-zA-Z]([-a-zA-Z0-9]*[a-zA-Z0-9])?$ + type: string + required: + - kind + type: object + maxItems: 8 + type: array + namespaces: + default: + from: Same + properties: + from: + default: Same + enum: + - All + - Selector + - Same + type: string + selector: + properties: + matchExpressions: + items: + properties: + key: + type: string + operator: + type: string + values: + items: + type: string + type: array + x-kubernetes-list-type: atomic + required: + - key + - operator + type: object + type: array + x-kubernetes-list-type: atomic + matchLabels: + additionalProperties: + type: string + type: object + type: object + x-kubernetes-map-type: atomic + type: object + type: object + hostname: + maxLength: 253 + minLength: 1 + pattern: ^(\*\.)?[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ + type: string + name: + maxLength: 253 + minLength: 1 + pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ + type: string + port: + format: int32 + maximum: 65535 + minimum: 1 + type: integer + protocol: + maxLength: 255 + minLength: 1 + pattern: ^[a-zA-Z0-9]([-a-zA-Z0-9]*[a-zA-Z0-9])?$|[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*\/[A-Za-z0-9]+$ + type: string + tls: + properties: + certificateRefs: + items: + properties: + group: + default: "" + maxLength: 253 + pattern: ^$|^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ + type: string + kind: + default: Secret + maxLength: 63 + minLength: 1 + pattern: ^[a-zA-Z]([-a-zA-Z0-9]*[a-zA-Z0-9])?$ + type: string + name: + maxLength: 253 + minLength: 1 + type: string + namespace: + maxLength: 63 + minLength: 1 + pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?$ + type: string + required: + - name + type: object + maxItems: 64 + type: array + frontendValidation: + properties: + caCertificateRefs: 
+ items: + properties: + group: + maxLength: 253 + pattern: ^$|^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ + type: string + kind: + maxLength: 63 + minLength: 1 + pattern: ^[a-zA-Z]([-a-zA-Z0-9]*[a-zA-Z0-9])?$ + type: string + name: + maxLength: 253 + minLength: 1 + type: string + namespace: + maxLength: 63 + minLength: 1 + pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?$ + type: string + required: + - group + - kind + - name + type: object + maxItems: 8 + minItems: 1 + type: array + type: object + mode: + default: Terminate + enum: + - Terminate + - Passthrough + type: string + options: + additionalProperties: + maxLength: 4096 + minLength: 0 + type: string + maxProperties: 16 + type: object + type: object + x-kubernetes-validations: + - message: certificateRefs or options must be specified + when mode is Terminate + rule: 'self.mode == ''Terminate'' ? size(self.certificateRefs) + > 0 || size(self.options) > 0 : true' + required: + - name + - port + - protocol + type: object + maxItems: 64 + minItems: 1 + type: array + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map + x-kubernetes-validations: + - message: tls must not be specified for protocols ['HTTP', + 'TCP', 'UDP'] + rule: 'self.all(l, l.protocol in [''HTTP'', ''TCP'', ''UDP''] + ? !has(l.tls) : true)' + - message: tls mode must be Terminate for protocol HTTPS + rule: 'self.all(l, (l.protocol == ''HTTPS'' && has(l.tls)) + ? (l.tls.mode == '''' || l.tls.mode == ''Terminate'') + : true)' + - message: hostname must not be specified for protocols ['TCP', + 'UDP'] + rule: 'self.all(l, l.protocol in [''TCP'', ''UDP''] ? (!has(l.hostname) + || l.hostname == '''') : true)' + - message: Listener name must be unique within the Gateway + rule: self.all(l1, self.exists_one(l2, l1.name == l2.name)) + - message: Combination of port, protocol and hostname must + be unique for each listener + rule: 'self.all(l1, self.exists_one(l2, l1.port == l2.port + && l1.protocol == l2.protocol && (has(l1.hostname) && + has(l2.hostname) ? l1.hostname == l2.hostname : !has(l1.hostname) + && !has(l2.hostname))))' + required: + - gatewayClassName + - listeners + type: object + status: + default: + conditions: + - lastTransitionTime: "1970-01-01T00:00:00Z" + message: Waiting for controller + reason: Pending + status: Unknown + type: Accepted + - lastTransitionTime: "1970-01-01T00:00:00Z" + message: Waiting for controller + reason: Pending + status: Unknown + type: Programmed + properties: + addresses: + items: + properties: + type: + default: IPAddress + maxLength: 253 + minLength: 1 + pattern: ^Hostname|IPAddress|NamedAddress|[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*\/[A-Za-z0-9\/\-._~%!$&'()*+,;=:]+$ + type: string + value: + maxLength: 253 + minLength: 1 + type: string + required: + - value + type: object + x-kubernetes-validations: + - message: Hostname value must only contain valid characters + (matching ^(\*\.)?[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$) + rule: 'self.type == ''Hostname'' ? 
self.value.matches(r"""^(\*\.)?[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$"""): + true' + maxItems: 16 + type: array + conditions: + default: + - lastTransitionTime: "1970-01-01T00:00:00Z" + message: Waiting for controller + reason: Pending + status: Unknown + type: Accepted + - lastTransitionTime: "1970-01-01T00:00:00Z" + message: Waiting for controller + reason: Pending + status: Unknown + type: Programmed + items: + properties: + lastTransitionTime: + format: date-time + type: string + message: + maxLength: 32768 + type: string + observedGeneration: + format: int64 + minimum: 0 + type: integer + reason: + maxLength: 1024 + minLength: 1 + pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ + type: string + status: + enum: + - "True" + - "False" + - Unknown + type: string + type: + maxLength: 316 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ + type: string + required: + - lastTransitionTime + - message + - reason + - status + - type + type: object + maxItems: 8 + type: array + x-kubernetes-list-map-keys: + - type + x-kubernetes-list-type: map + listeners: + items: + properties: + attachedRoutes: + format: int32 + type: integer + conditions: + items: + properties: + lastTransitionTime: + format: date-time + type: string + message: + maxLength: 32768 + type: string + observedGeneration: + format: int64 + minimum: 0 + type: integer + reason: + maxLength: 1024 + minLength: 1 + pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ + type: string + status: + enum: + - "True" + - "False" + - Unknown + type: string + type: + maxLength: 316 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ + type: string + required: + - lastTransitionTime + - message + - reason + - status + - type + type: object + maxItems: 8 + type: array + x-kubernetes-list-map-keys: + - type + x-kubernetes-list-type: map + name: + maxLength: 253 + minLength: 1 + pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ + type: string + supportedKinds: + items: + properties: + group: + default: gateway.networking.k8s.io + maxLength: 253 + pattern: ^$|^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ + type: string + kind: + maxLength: 63 + minLength: 1 + pattern: ^[a-zA-Z]([-a-zA-Z0-9]*[a-zA-Z0-9])?$ + type: string + required: + - kind + type: object + maxItems: 8 + type: array + required: + - attachedRoutes + - conditions + - name + - supportedKinds + type: object + maxItems: 64 + type: array + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map + type: object + required: + - spec + type: object + httpRoute: + properties: + apiVersion: + type: string + kind: + type: string + metadata: + properties: + annotations: + additionalProperties: + type: string + type: object + finalizers: + items: + type: string + type: array + labels: + additionalProperties: + type: string + type: object + name: + type: string + namespace: + type: string + type: object + spec: + properties: + hostnames: + items: + maxLength: 253 + minLength: 1 + pattern: ^(\*\.)?[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ + type: string + maxItems: 16 + type: array + parentRefs: + items: + properties: + group: + default: gateway.networking.k8s.io + maxLength: 253 + pattern: ^$|^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ + type: string + kind: + default: Gateway + maxLength: 63 + minLength: 1 + pattern: 
^[a-zA-Z]([-a-zA-Z0-9]*[a-zA-Z0-9])?$ + type: string + name: + maxLength: 253 + minLength: 1 + type: string + namespace: + maxLength: 63 + minLength: 1 + pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?$ + type: string + port: + format: int32 + maximum: 65535 + minimum: 1 + type: integer + sectionName: + maxLength: 253 + minLength: 1 + pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ + type: string + required: + - name + type: object + maxItems: 32 + type: array + rules: + default: + - matches: + - path: + type: PathPrefix + value: / + items: + properties: + backendRefs: + items: + properties: + filters: + items: + properties: + extensionRef: + properties: + group: + maxLength: 253 + pattern: ^$|^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ + type: string + kind: + maxLength: 63 + minLength: 1 + pattern: ^[a-zA-Z]([-a-zA-Z0-9]*[a-zA-Z0-9])?$ + type: string + name: + maxLength: 253 + minLength: 1 + type: string + required: + - group + - kind + - name + type: object + requestHeaderModifier: + properties: + add: + items: + properties: + name: + maxLength: 256 + minLength: 1 + pattern: ^[A-Za-z0-9!#$%&'*+\-.^_\x60|~]+$ + type: string + value: + maxLength: 4096 + minLength: 1 + type: string + required: + - name + - value + type: object + maxItems: 16 + type: array + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map + remove: + items: + type: string + maxItems: 16 + type: array + x-kubernetes-list-type: set + set: + items: + properties: + name: + maxLength: 256 + minLength: 1 + pattern: ^[A-Za-z0-9!#$%&'*+\-.^_\x60|~]+$ + type: string + value: + maxLength: 4096 + minLength: 1 + type: string + required: + - name + - value + type: object + maxItems: 16 + type: array + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map + type: object + requestMirror: + properties: + backendRef: + properties: + group: + default: "" + maxLength: 253 + pattern: ^$|^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ + type: string + kind: + default: Service + maxLength: 63 + minLength: 1 + pattern: ^[a-zA-Z]([-a-zA-Z0-9]*[a-zA-Z0-9])?$ + type: string + name: + maxLength: 253 + minLength: 1 + type: string + namespace: + maxLength: 63 + minLength: 1 + pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?$ + type: string + port: + format: int32 + maximum: 65535 + minimum: 1 + type: integer + required: + - name + type: object + x-kubernetes-validations: + - message: Must have port for Service + reference + rule: '(size(self.group) == 0 && self.kind + == ''Service'') ? 
has(self.port) + : true' + fraction: + properties: + denominator: + default: 100 + format: int32 + minimum: 1 + type: integer + numerator: + format: int32 + minimum: 0 + type: integer + required: + - numerator + type: object + x-kubernetes-validations: + - message: numerator must be less than + or equal to denominator + rule: self.numerator <= self.denominator + percent: + format: int32 + maximum: 100 + minimum: 0 + type: integer + required: + - backendRef + type: object + requestRedirect: + properties: + hostname: + maxLength: 253 + minLength: 1 + pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ + type: string + path: + properties: + replaceFullPath: + maxLength: 1024 + type: string + replacePrefixMatch: + maxLength: 1024 + type: string + type: + enum: + - ReplaceFullPath + - ReplacePrefixMatch + type: string + required: + - type + type: object + x-kubernetes-validations: + - message: replaceFullPath must be specified + when type is set to 'ReplaceFullPath' + rule: 'self.type == ''ReplaceFullPath'' + ? has(self.replaceFullPath) : true' + - message: type must be 'ReplaceFullPath' + when replaceFullPath is set + rule: 'has(self.replaceFullPath) ? + self.type == ''ReplaceFullPath'' + : true' + - message: replacePrefixMatch must be + specified when type is set to 'ReplacePrefixMatch' + rule: 'self.type == ''ReplacePrefixMatch'' + ? has(self.replacePrefixMatch) : + true' + - message: type must be 'ReplacePrefixMatch' + when replacePrefixMatch is set + rule: 'has(self.replacePrefixMatch) + ? self.type == ''ReplacePrefixMatch'' + : true' + port: + format: int32 + maximum: 65535 + minimum: 1 + type: integer + scheme: + enum: + - http + - https + type: string + statusCode: + default: 302 + enum: + - 301 + - 302 + type: integer + type: object + responseHeaderModifier: + properties: + add: + items: + properties: + name: + maxLength: 256 + minLength: 1 + pattern: ^[A-Za-z0-9!#$%&'*+\-.^_\x60|~]+$ + type: string + value: + maxLength: 4096 + minLength: 1 + type: string + required: + - name + - value + type: object + maxItems: 16 + type: array + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map + remove: + items: + type: string + maxItems: 16 + type: array + x-kubernetes-list-type: set + set: + items: + properties: + name: + maxLength: 256 + minLength: 1 + pattern: ^[A-Za-z0-9!#$%&'*+\-.^_\x60|~]+$ + type: string + value: + maxLength: 4096 + minLength: 1 + type: string + required: + - name + - value + type: object + maxItems: 16 + type: array + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map + type: object + type: + enum: + - RequestHeaderModifier + - ResponseHeaderModifier + - RequestMirror + - RequestRedirect + - URLRewrite + - ExtensionRef + type: string + urlRewrite: + properties: + hostname: + maxLength: 253 + minLength: 1 + pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ + type: string + path: + properties: + replaceFullPath: + maxLength: 1024 + type: string + replacePrefixMatch: + maxLength: 1024 + type: string + type: + enum: + - ReplaceFullPath + - ReplacePrefixMatch + type: string + required: + - type + type: object + x-kubernetes-validations: + - message: replaceFullPath must be specified + when type is set to 'ReplaceFullPath' + rule: 'self.type == ''ReplaceFullPath'' + ? has(self.replaceFullPath) : true' + - message: type must be 'ReplaceFullPath' + when replaceFullPath is set + rule: 'has(self.replaceFullPath) ? 
+ self.type == ''ReplaceFullPath'' + : true' + - message: replacePrefixMatch must be + specified when type is set to 'ReplacePrefixMatch' + rule: 'self.type == ''ReplacePrefixMatch'' + ? has(self.replacePrefixMatch) : + true' + - message: type must be 'ReplacePrefixMatch' + when replacePrefixMatch is set + rule: 'has(self.replacePrefixMatch) + ? self.type == ''ReplacePrefixMatch'' + : true' + type: object + required: + - type + type: object + x-kubernetes-validations: + - message: filter.requestHeaderModifier must + be nil if the filter.type is not RequestHeaderModifier + rule: '!(has(self.requestHeaderModifier) && + self.type != ''RequestHeaderModifier'')' + - message: filter.requestHeaderModifier must + be specified for RequestHeaderModifier filter.type + rule: '!(!has(self.requestHeaderModifier) + && self.type == ''RequestHeaderModifier'')' + - message: filter.responseHeaderModifier must + be nil if the filter.type is not ResponseHeaderModifier + rule: '!(has(self.responseHeaderModifier) + && self.type != ''ResponseHeaderModifier'')' + - message: filter.responseHeaderModifier must + be specified for ResponseHeaderModifier + filter.type + rule: '!(!has(self.responseHeaderModifier) + && self.type == ''ResponseHeaderModifier'')' + - message: filter.requestMirror must be nil + if the filter.type is not RequestMirror + rule: '!(has(self.requestMirror) && self.type + != ''RequestMirror'')' + - message: filter.requestMirror must be specified + for RequestMirror filter.type + rule: '!(!has(self.requestMirror) && self.type + == ''RequestMirror'')' + - message: filter.requestRedirect must be nil + if the filter.type is not RequestRedirect + rule: '!(has(self.requestRedirect) && self.type + != ''RequestRedirect'')' + - message: filter.requestRedirect must be specified + for RequestRedirect filter.type + rule: '!(!has(self.requestRedirect) && self.type + == ''RequestRedirect'')' + - message: filter.urlRewrite must be nil if + the filter.type is not URLRewrite + rule: '!(has(self.urlRewrite) && self.type + != ''URLRewrite'')' + - message: filter.urlRewrite must be specified + for URLRewrite filter.type + rule: '!(!has(self.urlRewrite) && self.type + == ''URLRewrite'')' + - message: filter.extensionRef must be nil if + the filter.type is not ExtensionRef + rule: '!(has(self.extensionRef) && self.type + != ''ExtensionRef'')' + - message: filter.extensionRef must be specified + for ExtensionRef filter.type + rule: '!(!has(self.extensionRef) && self.type + == ''ExtensionRef'')' + maxItems: 16 + type: array + x-kubernetes-validations: + - message: May specify either httpRouteFilterRequestRedirect + or httpRouteFilterRequestRewrite, but not + both + rule: '!(self.exists(f, f.type == ''RequestRedirect'') + && self.exists(f, f.type == ''URLRewrite''))' + - message: May specify either httpRouteFilterRequestRedirect + or httpRouteFilterRequestRewrite, but not + both + rule: '!(self.exists(f, f.type == ''RequestRedirect'') + && self.exists(f, f.type == ''URLRewrite''))' + - message: RequestHeaderModifier filter cannot + be repeated + rule: self.filter(f, f.type == 'RequestHeaderModifier').size() + <= 1 + - message: ResponseHeaderModifier filter cannot + be repeated + rule: self.filter(f, f.type == 'ResponseHeaderModifier').size() + <= 1 + - message: RequestRedirect filter cannot be repeated + rule: self.filter(f, f.type == 'RequestRedirect').size() + <= 1 + - message: URLRewrite filter cannot be repeated + rule: self.filter(f, f.type == 'URLRewrite').size() + <= 1 + group: + default: "" + maxLength: 253 + 
pattern: ^$|^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ + type: string + kind: + default: Service + maxLength: 63 + minLength: 1 + pattern: ^[a-zA-Z]([-a-zA-Z0-9]*[a-zA-Z0-9])?$ + type: string + name: + maxLength: 253 + minLength: 1 + type: string + namespace: + maxLength: 63 + minLength: 1 + pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?$ + type: string + port: + format: int32 + maximum: 65535 + minimum: 1 + type: integer + weight: + default: 1 + format: int32 + maximum: 1000000 + minimum: 0 + type: integer + required: + - name + type: object + x-kubernetes-validations: + - message: Must have port for Service reference + rule: '(size(self.group) == 0 && self.kind == ''Service'') + ? has(self.port) : true' + maxItems: 16 + type: array + filters: + items: + properties: + extensionRef: + properties: + group: + maxLength: 253 + pattern: ^$|^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ + type: string + kind: + maxLength: 63 + minLength: 1 + pattern: ^[a-zA-Z]([-a-zA-Z0-9]*[a-zA-Z0-9])?$ + type: string + name: + maxLength: 253 + minLength: 1 + type: string + required: + - group + - kind + - name + type: object + requestHeaderModifier: + properties: + add: + items: + properties: + name: + maxLength: 256 + minLength: 1 + pattern: ^[A-Za-z0-9!#$%&'*+\-.^_\x60|~]+$ + type: string + value: + maxLength: 4096 + minLength: 1 + type: string + required: + - name + - value + type: object + maxItems: 16 + type: array + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map + remove: + items: + type: string + maxItems: 16 + type: array + x-kubernetes-list-type: set + set: + items: + properties: + name: + maxLength: 256 + minLength: 1 + pattern: ^[A-Za-z0-9!#$%&'*+\-.^_\x60|~]+$ + type: string + value: + maxLength: 4096 + minLength: 1 + type: string + required: + - name + - value + type: object + maxItems: 16 + type: array + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map + type: object + requestMirror: + properties: + backendRef: + properties: + group: + default: "" + maxLength: 253 + pattern: ^$|^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ + type: string + kind: + default: Service + maxLength: 63 + minLength: 1 + pattern: ^[a-zA-Z]([-a-zA-Z0-9]*[a-zA-Z0-9])?$ + type: string + name: + maxLength: 253 + minLength: 1 + type: string + namespace: + maxLength: 63 + minLength: 1 + pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?$ + type: string + port: + format: int32 + maximum: 65535 + minimum: 1 + type: integer + required: + - name + type: object + x-kubernetes-validations: + - message: Must have port for Service reference + rule: '(size(self.group) == 0 && self.kind + == ''Service'') ? 
has(self.port) : true' + fraction: + properties: + denominator: + default: 100 + format: int32 + minimum: 1 + type: integer + numerator: + format: int32 + minimum: 0 + type: integer + required: + - numerator + type: object + x-kubernetes-validations: + - message: numerator must be less than or + equal to denominator + rule: self.numerator <= self.denominator + percent: + format: int32 + maximum: 100 + minimum: 0 + type: integer + required: + - backendRef + type: object + requestRedirect: + properties: + hostname: + maxLength: 253 + minLength: 1 + pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ + type: string + path: + properties: + replaceFullPath: + maxLength: 1024 + type: string + replacePrefixMatch: + maxLength: 1024 + type: string + type: + enum: + - ReplaceFullPath + - ReplacePrefixMatch + type: string + required: + - type + type: object + x-kubernetes-validations: + - message: replaceFullPath must be specified + when type is set to 'ReplaceFullPath' + rule: 'self.type == ''ReplaceFullPath'' + ? has(self.replaceFullPath) : true' + - message: type must be 'ReplaceFullPath' + when replaceFullPath is set + rule: 'has(self.replaceFullPath) ? self.type + == ''ReplaceFullPath'' : true' + - message: replacePrefixMatch must be specified + when type is set to 'ReplacePrefixMatch' + rule: 'self.type == ''ReplacePrefixMatch'' + ? has(self.replacePrefixMatch) : true' + - message: type must be 'ReplacePrefixMatch' + when replacePrefixMatch is set + rule: 'has(self.replacePrefixMatch) ? self.type + == ''ReplacePrefixMatch'' : true' + port: + format: int32 + maximum: 65535 + minimum: 1 + type: integer + scheme: + enum: + - http + - https + type: string + statusCode: + default: 302 + enum: + - 301 + - 302 + type: integer + type: object + responseHeaderModifier: + properties: + add: + items: + properties: + name: + maxLength: 256 + minLength: 1 + pattern: ^[A-Za-z0-9!#$%&'*+\-.^_\x60|~]+$ + type: string + value: + maxLength: 4096 + minLength: 1 + type: string + required: + - name + - value + type: object + maxItems: 16 + type: array + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map + remove: + items: + type: string + maxItems: 16 + type: array + x-kubernetes-list-type: set + set: + items: + properties: + name: + maxLength: 256 + minLength: 1 + pattern: ^[A-Za-z0-9!#$%&'*+\-.^_\x60|~]+$ + type: string + value: + maxLength: 4096 + minLength: 1 + type: string + required: + - name + - value + type: object + maxItems: 16 + type: array + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map + type: object + type: + enum: + - RequestHeaderModifier + - ResponseHeaderModifier + - RequestMirror + - RequestRedirect + - URLRewrite + - ExtensionRef + type: string + urlRewrite: + properties: + hostname: + maxLength: 253 + minLength: 1 + pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ + type: string + path: + properties: + replaceFullPath: + maxLength: 1024 + type: string + replacePrefixMatch: + maxLength: 1024 + type: string + type: + enum: + - ReplaceFullPath + - ReplacePrefixMatch + type: string + required: + - type + type: object + x-kubernetes-validations: + - message: replaceFullPath must be specified + when type is set to 'ReplaceFullPath' + rule: 'self.type == ''ReplaceFullPath'' + ? has(self.replaceFullPath) : true' + - message: type must be 'ReplaceFullPath' + when replaceFullPath is set + rule: 'has(self.replaceFullPath) ? 
self.type + == ''ReplaceFullPath'' : true' + - message: replacePrefixMatch must be specified + when type is set to 'ReplacePrefixMatch' + rule: 'self.type == ''ReplacePrefixMatch'' + ? has(self.replacePrefixMatch) : true' + - message: type must be 'ReplacePrefixMatch' + when replacePrefixMatch is set + rule: 'has(self.replacePrefixMatch) ? self.type + == ''ReplacePrefixMatch'' : true' + type: object + required: + - type + type: object + x-kubernetes-validations: + - message: filter.requestHeaderModifier must be nil + if the filter.type is not RequestHeaderModifier + rule: '!(has(self.requestHeaderModifier) && self.type + != ''RequestHeaderModifier'')' + - message: filter.requestHeaderModifier must be specified + for RequestHeaderModifier filter.type + rule: '!(!has(self.requestHeaderModifier) && self.type + == ''RequestHeaderModifier'')' + - message: filter.responseHeaderModifier must be nil + if the filter.type is not ResponseHeaderModifier + rule: '!(has(self.responseHeaderModifier) && self.type + != ''ResponseHeaderModifier'')' + - message: filter.responseHeaderModifier must be specified + for ResponseHeaderModifier filter.type + rule: '!(!has(self.responseHeaderModifier) && self.type + == ''ResponseHeaderModifier'')' + - message: filter.requestMirror must be nil if the + filter.type is not RequestMirror + rule: '!(has(self.requestMirror) && self.type != + ''RequestMirror'')' + - message: filter.requestMirror must be specified + for RequestMirror filter.type + rule: '!(!has(self.requestMirror) && self.type == + ''RequestMirror'')' + - message: filter.requestRedirect must be nil if the + filter.type is not RequestRedirect + rule: '!(has(self.requestRedirect) && self.type + != ''RequestRedirect'')' + - message: filter.requestRedirect must be specified + for RequestRedirect filter.type + rule: '!(!has(self.requestRedirect) && self.type + == ''RequestRedirect'')' + - message: filter.urlRewrite must be nil if the filter.type + is not URLRewrite + rule: '!(has(self.urlRewrite) && self.type != ''URLRewrite'')' + - message: filter.urlRewrite must be specified for + URLRewrite filter.type + rule: '!(!has(self.urlRewrite) && self.type == ''URLRewrite'')' + - message: filter.extensionRef must be nil if the + filter.type is not ExtensionRef + rule: '!(has(self.extensionRef) && self.type != + ''ExtensionRef'')' + - message: filter.extensionRef must be specified for + ExtensionRef filter.type + rule: '!(!has(self.extensionRef) && self.type == + ''ExtensionRef'')' + maxItems: 16 + type: array + x-kubernetes-validations: + - message: May specify either httpRouteFilterRequestRedirect + or httpRouteFilterRequestRewrite, but not both + rule: '!(self.exists(f, f.type == ''RequestRedirect'') + && self.exists(f, f.type == ''URLRewrite''))' + - message: RequestHeaderModifier filter cannot be repeated + rule: self.filter(f, f.type == 'RequestHeaderModifier').size() + <= 1 + - message: ResponseHeaderModifier filter cannot be repeated + rule: self.filter(f, f.type == 'ResponseHeaderModifier').size() + <= 1 + - message: RequestRedirect filter cannot be repeated + rule: self.filter(f, f.type == 'RequestRedirect').size() + <= 1 + - message: URLRewrite filter cannot be repeated + rule: self.filter(f, f.type == 'URLRewrite').size() + <= 1 + matches: + default: + - path: + type: PathPrefix + value: / + items: + properties: + headers: + items: + properties: + name: + maxLength: 256 + minLength: 1 + pattern: ^[A-Za-z0-9!#$%&'*+\-.^_\x60|~]+$ + type: string + type: + default: Exact + enum: + - Exact + - RegularExpression 
+ type: string + value: + maxLength: 4096 + minLength: 1 + type: string + required: + - name + - value + type: object + maxItems: 16 + type: array + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map + method: + enum: + - GET + - HEAD + - POST + - PUT + - DELETE + - CONNECT + - OPTIONS + - TRACE + - PATCH + type: string + path: + default: + type: PathPrefix + value: / + properties: + type: + default: PathPrefix + enum: + - Exact + - PathPrefix + - RegularExpression + type: string + value: + default: / + maxLength: 1024 + type: string + type: object + x-kubernetes-validations: + - message: value must be an absolute path and + start with '/' when type one of ['Exact', + 'PathPrefix'] + rule: '(self.type in [''Exact'',''PathPrefix'']) + ? self.value.startsWith(''/'') : true' + - message: must not contain '//' when type one + of ['Exact', 'PathPrefix'] + rule: '(self.type in [''Exact'',''PathPrefix'']) + ? !self.value.contains(''//'') : true' + - message: must not contain '/./' when type one + of ['Exact', 'PathPrefix'] + rule: '(self.type in [''Exact'',''PathPrefix'']) + ? !self.value.contains(''/./'') : true' + - message: must not contain '/../' when type one + of ['Exact', 'PathPrefix'] + rule: '(self.type in [''Exact'',''PathPrefix'']) + ? !self.value.contains(''/../'') : true' + - message: must not contain '%2f' when type one + of ['Exact', 'PathPrefix'] + rule: '(self.type in [''Exact'',''PathPrefix'']) + ? !self.value.contains(''%2f'') : true' + - message: must not contain '%2F' when type one + of ['Exact', 'PathPrefix'] + rule: '(self.type in [''Exact'',''PathPrefix'']) + ? !self.value.contains(''%2F'') : true' + - message: must not contain '#' when type one + of ['Exact', 'PathPrefix'] + rule: '(self.type in [''Exact'',''PathPrefix'']) + ? !self.value.contains(''#'') : true' + - message: must not end with '/..' when type one + of ['Exact', 'PathPrefix'] + rule: '(self.type in [''Exact'',''PathPrefix'']) + ? !self.value.endsWith(''/..'') : true' + - message: must not end with '/.' when type one + of ['Exact', 'PathPrefix'] + rule: '(self.type in [''Exact'',''PathPrefix'']) + ? !self.value.endsWith(''/.'') : true' + - message: type must be one of ['Exact', 'PathPrefix', + 'RegularExpression'] + rule: self.type in ['Exact','PathPrefix'] || + self.type == 'RegularExpression' + - message: must only contain valid characters + (matching ^(?:[-A-Za-z0-9/._~!$&'()*+,;=:@]|[%][0-9a-fA-F]{2})+$) + for types ['Exact', 'PathPrefix'] + rule: '(self.type in [''Exact'',''PathPrefix'']) + ? 
self.value.matches(r"""^(?:[-A-Za-z0-9/._~!$&''()*+,;=:@]|[%][0-9a-fA-F]{2})+$""") + : true' + queryParams: + items: + properties: + name: + maxLength: 256 + minLength: 1 + pattern: ^[A-Za-z0-9!#$%&'*+\-.^_\x60|~]+$ + type: string + type: + default: Exact + enum: + - Exact + - RegularExpression + type: string + value: + maxLength: 1024 + minLength: 1 + type: string + required: + - name + - value + type: object + maxItems: 16 + type: array + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map + type: object + maxItems: 64 + type: array + name: + maxLength: 253 + minLength: 1 + pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ + type: string + retry: + properties: + attempts: + type: integer + backoff: + pattern: ^([0-9]{1,5}(h|m|s|ms)){1,4}$ + type: string + codes: + items: + maximum: 599 + minimum: 400 + type: integer + type: array + type: object + sessionPersistence: + properties: + absoluteTimeout: + pattern: ^([0-9]{1,5}(h|m|s|ms)){1,4}$ + type: string + cookieConfig: + properties: + lifetimeType: + default: Session + enum: + - Permanent + - Session + type: string + type: object + idleTimeout: + pattern: ^([0-9]{1,5}(h|m|s|ms)){1,4}$ + type: string + sessionName: + maxLength: 128 + type: string + type: + default: Cookie + enum: + - Cookie + - Header + type: string + type: object + x-kubernetes-validations: + - message: AbsoluteTimeout must be specified when cookie + lifetimeType is Permanent + rule: '!has(self.cookieConfig) || !has(self.cookieConfig.lifetimeType) + || self.cookieConfig.lifetimeType != ''Permanent'' + || has(self.absoluteTimeout)' + timeouts: + properties: + backendRequest: + pattern: ^([0-9]{1,5}(h|m|s|ms)){1,4}$ + type: string + request: + pattern: ^([0-9]{1,5}(h|m|s|ms)){1,4}$ + type: string + type: object + x-kubernetes-validations: + - message: backendRequest timeout cannot be longer than + request timeout + rule: '!(has(self.request) && has(self.backendRequest) + && duration(self.request) != duration(''0s'') && + duration(self.backendRequest) > duration(self.request))' + type: object + x-kubernetes-validations: + - message: RequestRedirect filter must not be used together + with backendRefs + rule: '(has(self.backendRefs) && size(self.backendRefs) + > 0) ? (!has(self.filters) || self.filters.all(f, !has(f.requestRedirect))): + true' + - message: When using RequestRedirect filter with path.replacePrefixMatch, + exactly one PathPrefix match must be specified + rule: '(has(self.filters) && self.filters.exists_one(f, + has(f.requestRedirect) && has(f.requestRedirect.path) + && f.requestRedirect.path.type == ''ReplacePrefixMatch'' + && has(f.requestRedirect.path.replacePrefixMatch))) + ? ((size(self.matches) != 1 || !has(self.matches[0].path) + || self.matches[0].path.type != ''PathPrefix'') ? false + : true) : true' + - message: When using URLRewrite filter with path.replacePrefixMatch, + exactly one PathPrefix match must be specified + rule: '(has(self.filters) && self.filters.exists_one(f, + has(f.urlRewrite) && has(f.urlRewrite.path) && f.urlRewrite.path.type + == ''ReplacePrefixMatch'' && has(f.urlRewrite.path.replacePrefixMatch))) + ? ((size(self.matches) != 1 || !has(self.matches[0].path) + || self.matches[0].path.type != ''PathPrefix'') ? 
false + : true) : true' + - message: Within backendRefs, when using RequestRedirect + filter with path.replacePrefixMatch, exactly one PathPrefix + match must be specified + rule: '(has(self.backendRefs) && self.backendRefs.exists_one(b, + (has(b.filters) && b.filters.exists_one(f, has(f.requestRedirect) + && has(f.requestRedirect.path) && f.requestRedirect.path.type + == ''ReplacePrefixMatch'' && has(f.requestRedirect.path.replacePrefixMatch))) + )) ? ((size(self.matches) != 1 || !has(self.matches[0].path) + || self.matches[0].path.type != ''PathPrefix'') ? false + : true) : true' + - message: Within backendRefs, When using URLRewrite filter + with path.replacePrefixMatch, exactly one PathPrefix + match must be specified + rule: '(has(self.backendRefs) && self.backendRefs.exists_one(b, + (has(b.filters) && b.filters.exists_one(f, has(f.urlRewrite) + && has(f.urlRewrite.path) && f.urlRewrite.path.type + == ''ReplacePrefixMatch'' && has(f.urlRewrite.path.replacePrefixMatch))) + )) ? ((size(self.matches) != 1 || !has(self.matches[0].path) + || self.matches[0].path.type != ''PathPrefix'') ? false + : true) : true' + maxItems: 16 + type: array + x-kubernetes-validations: + - message: While 16 rules and 64 matches per rule are allowed, + the total number of matches across all rules in a route + must be less than 128 + rule: '(self.size() > 0 ? self[0].matches.size() : 0) + + (self.size() > 1 ? self[1].matches.size() : 0) + (self.size() + > 2 ? self[2].matches.size() : 0) + (self.size() > 3 ? + self[3].matches.size() : 0) + (self.size() > 4 ? self[4].matches.size() + : 0) + (self.size() > 5 ? self[5].matches.size() : 0) + + (self.size() > 6 ? self[6].matches.size() : 0) + (self.size() + > 7 ? self[7].matches.size() : 0) + (self.size() > 8 ? + self[8].matches.size() : 0) + (self.size() > 9 ? self[9].matches.size() + : 0) + (self.size() > 10 ? self[10].matches.size() : 0) + + (self.size() > 11 ? self[11].matches.size() : 0) + (self.size() + > 12 ? self[12].matches.size() : 0) + (self.size() > 13 + ? self[13].matches.size() : 0) + (self.size() > 14 ? self[14].matches.size() + : 0) + (self.size() > 15 ? 
self[15].matches.size() : 0) + <= 128' + type: object + status: + properties: + parents: + items: + properties: + conditions: + items: + properties: + lastTransitionTime: + format: date-time + type: string + message: + maxLength: 32768 + type: string + observedGeneration: + format: int64 + minimum: 0 + type: integer + reason: + maxLength: 1024 + minLength: 1 + pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ + type: string + status: + enum: + - "True" + - "False" + - Unknown + type: string + type: + maxLength: 316 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ + type: string + required: + - lastTransitionTime + - message + - reason + - status + - type + type: object + maxItems: 8 + minItems: 1 + type: array + x-kubernetes-list-map-keys: + - type + x-kubernetes-list-type: map + controllerName: + maxLength: 253 + minLength: 1 + pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*\/[A-Za-z0-9\/\-._~%!$&'()*+,;=:]+$ + type: string + parentRef: + properties: + group: + default: gateway.networking.k8s.io + maxLength: 253 + pattern: ^$|^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ + type: string + kind: + default: Gateway + maxLength: 63 + minLength: 1 + pattern: ^[a-zA-Z]([-a-zA-Z0-9]*[a-zA-Z0-9])?$ + type: string + name: + maxLength: 253 + minLength: 1 + type: string + namespace: + maxLength: 63 + minLength: 1 + pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?$ + type: string + port: + format: int32 + maximum: 65535 + minimum: 1 + type: integer + sectionName: + maxLength: 253 + minLength: 1 + pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ + type: string + required: + - name + type: object + required: + - controllerName + - parentRef + type: object + maxItems: 32 + type: array + required: + - parents + type: object + required: + - spec + type: object rayClusterConfig: properties: autoscalerOptions: @@ -8241,6 +10000,25 @@ spec: type: integer upgradeStrategy: properties: + incrementalUpgradeOptions: + properties: + gatewayClassName: + type: string + intervalSeconds: + format: int32 + type: integer + maxSurgePercent: + default: 100 + format: int32 + type: integer + stepSizePercent: + format: int32 + type: integer + required: + - gatewayClassName + - intervalSeconds + - stepSizePercent + type: object type: type: string type: object @@ -8269,6 +10047,9 @@ spec: type: string type: object type: object + lastTrafficMigratedTime: + format: date-time + type: string rayClusterName: type: string rayClusterStatus: @@ -8383,6 +10164,12 @@ spec: type: string type: object type: object + targetCapacity: + format: int32 + type: integer + trafficRoutedPercent: + format: int32 + type: integer type: object conditions: items: @@ -8420,9 +10207,6 @@ spec: - type type: object type: array - x-kubernetes-list-map-keys: - - type - x-kubernetes-list-type: map lastUpdateTime: format: date-time type: string @@ -8452,6 +10236,9 @@ spec: type: string type: object type: object + lastTrafficMigratedTime: + format: date-time + type: string rayClusterName: type: string rayClusterStatus: @@ -8566,6 +10353,12 @@ spec: type: string type: object type: object + targetCapacity: + format: int32 + type: integer + trafficRoutedPercent: + format: int32 + type: integer type: object serviceStatus: type: string diff --git a/ray-operator/Makefile b/ray-operator/Makefile index faab31894b5..83a2b6f5c4e 100644 --- a/ray-operator/Makefile +++ b/ray-operator/Makefile @@ -75,6 +75,10 @@ test-e2e-autoscaler: WHAT ?= 
./test/e2eautoscaler test-e2e-autoscaler: manifests fmt vet ## Run e2e autoscaler tests. go test -timeout 30m -v $(WHAT) +test-e2e-rayservice: WHAT ?= ./test/e2erayservice +test-e2e-rayservice: manifests fmt vet ## Run e2e RayService tests. + go test -timeout 30m -v $(WHAT) + test-e2e-upgrade: WHAT ?= ./test/e2eupgrade test-e2e-upgrade: manifests fmt vet ## Run e2e tests. go test -timeout 30m -v $(WHAT) diff --git a/ray-operator/apis/ray/v1/rayservice_types.go b/ray-operator/apis/ray/v1/rayservice_types.go index e7d73e07d8e..224b7096be8 100644 --- a/ray-operator/apis/ray/v1/rayservice_types.go +++ b/ray-operator/apis/ray/v1/rayservice_types.go @@ -3,6 +3,7 @@ package v1 import ( corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + gwv1 "sigs.k8s.io/gateway-api/apis/v1" ) // EDIT THIS FILE! THIS IS SCAFFOLDING FOR YOU TO OWN! @@ -22,6 +23,9 @@ const ( type RayServiceUpgradeType string const ( + // During upgrade, IncrementalUpgrade strategy will create an upgraded cluster to gradually scale + // and migrate traffic to using Gateway API. + IncrementalUpgrade RayServiceUpgradeType = "IncrementalUpgrade" // During upgrade, NewCluster strategy will create new upgraded cluster and switch to it when it becomes ready NewCluster RayServiceUpgradeType = "NewCluster" // No new cluster will be created while the strategy is set to None @@ -57,10 +61,25 @@ var DeploymentStatusEnum = struct { UNHEALTHY: "UNHEALTHY", } +type IncrementalUpgradeOptions struct { + // The capacity of serve requests the upgraded cluster should scale to handle each interval. + // Defaults to 100%. + // +kubebuilder:default:=100 + MaxSurgePercent *int32 `json:"maxSurgePercent,omitempty"` + // The percentage of traffic to switch to the upgraded RayCluster at a set interval after scaling by MaxSurgePercent. + StepSizePercent *int32 `json:"stepSizePercent"` + // The interval in seconds between transferring StepSize traffic from the old to new RayCluster. + IntervalSeconds *int32 `json:"intervalSeconds"` + // The name of the Gateway Class installed by the Kubernetes Cluster admin. + GatewayClassName string `json:"gatewayClassName"` +} + type RayServiceUpgradeStrategy struct { // Type represents the strategy used when upgrading the RayService. Currently supports `NewCluster` and `None`. // +optional Type *RayServiceUpgradeType `json:"type,omitempty"` + // IncrementalUpgradeOptions defines the behavior of an IncrementalUpgrade. + IncrementalUpgradeOptions *IncrementalUpgradeOptions `json:"incrementalUpgradeOptions,omitempty"` } // RayServiceSpec defines the desired state of RayService @@ -79,6 +98,10 @@ type RayServiceSpec struct { // ServeService is the Kubernetes service for head node and worker nodes who have healthy http proxy to serve traffics. // +optional ServeService *corev1.Service `json:"serveService,omitempty"` + // Gateway is the Gateway object for the RayService to serve traffics during an IncrementalUpgrade. + Gateway *gwv1.Gateway `json:"gateway,omitempty"` + // HTTPRoute is the HTTPRoute object for the RayService to split traffics during an IncrementalUpgrade. + HTTPRoute *gwv1.HTTPRoute `json:"httpRoute,omitempty"` // UpgradeStrategy defines the scaling policy used when upgrading the RayService. 
// +optional UpgradeStrategy *RayServiceUpgradeStrategy `json:"upgradeStrategy,omitempty"` @@ -95,44 +118,22 @@ type RayServiceSpec struct { // RayServiceStatuses defines the observed state of RayService type RayServiceStatuses struct { - // Represents the latest available observations of a RayService's current state. - // +patchMergeKey=type - // +patchStrategy=merge - // +listType=map - // +listMapKey=type - // +optional - Conditions []metav1.Condition `json:"conditions,omitempty" patchStrategy:"merge" patchMergeKey:"type"` - // LastUpdateTime represents the timestamp when the RayService status was last updated. - // +optional - LastUpdateTime *metav1.Time `json:"lastUpdateTime,omitempty"` - // Deprecated: `ServiceStatus` is deprecated - use `Conditions` instead. `Running` means the RayService is ready to - // serve requests. An empty `ServiceStatus` means the RayService is not ready to serve requests. The definition of - // `ServiceStatus` is equivalent to the `RayServiceReady` condition. - // +optional - ServiceStatus ServiceStatus `json:"serviceStatus,omitempty"` - // +optional - ActiveServiceStatus RayServiceStatus `json:"activeServiceStatus,omitempty"` - // Pending Service Status indicates a RayCluster will be created or is being created. - // +optional - PendingServiceStatus RayServiceStatus `json:"pendingServiceStatus,omitempty"` - // NumServeEndpoints indicates the number of Ray Pods that are actively serving or have been selected by the serve service. - // Ray Pods without a proxy actor or those that are unhealthy will not be counted. - // +optional - NumServeEndpoints int32 `json:"numServeEndpoints,omitempty"` - // observedGeneration is the most recent generation observed for this RayService. It corresponds to the - // RayService's generation, which is updated on mutation by the API Server. 
- // +optional - ObservedGeneration int64 `json:"observedGeneration,omitempty"` + LastUpdateTime *metav1.Time `json:"lastUpdateTime,omitempty"` + ServiceStatus ServiceStatus `json:"serviceStatus,omitempty"` + Conditions []metav1.Condition `json:"conditions,omitempty" patchStrategy:"merge" patchMergeKey:"type"` + ActiveServiceStatus RayServiceStatus `json:"activeServiceStatus,omitempty"` + PendingServiceStatus RayServiceStatus `json:"pendingServiceStatus,omitempty"` + ObservedGeneration int64 `json:"observedGeneration,omitempty"` + NumServeEndpoints int32 `json:"numServeEndpoints,omitempty"` } type RayServiceStatus struct { - // Important: Run "make" to regenerate code after modifying this file - // +optional - Applications map[string]AppStatus `json:"applicationStatuses,omitempty"` - // +optional - RayClusterName string `json:"rayClusterName,omitempty"` - // +optional - RayClusterStatus RayClusterStatus `json:"rayClusterStatus,omitempty"` + Applications map[string]AppStatus `json:"applicationStatuses,omitempty"` + TargetCapacity *int32 `json:"targetCapacity,omitempty"` + TrafficRoutedPercent *int32 `json:"trafficRoutedPercent,omitempty"` + LastTrafficMigratedTime *metav1.Time `json:"lastTrafficMigratedTime,omitempty"` + RayClusterName string `json:"rayClusterName,omitempty"` + RayClusterStatus RayClusterStatus `json:"rayClusterStatus,omitempty"` } type AppStatus struct { @@ -184,10 +185,8 @@ const ( type RayService struct { metav1.TypeMeta `json:",inline"` metav1.ObjectMeta `json:"metadata,omitempty"` - - Spec RayServiceSpec `json:"spec,omitempty"` - // +optional - Status RayServiceStatuses `json:"status,omitempty"` + Spec RayServiceSpec `json:"spec,omitempty"` + Status RayServiceStatuses `json:"status,omitempty"` } //+kubebuilder:object:root=true diff --git a/ray-operator/apis/ray/v1/zz_generated.deepcopy.go b/ray-operator/apis/ray/v1/zz_generated.deepcopy.go index 5a6ce86bc10..8200a01f43f 100644 --- a/ray-operator/apis/ray/v1/zz_generated.deepcopy.go +++ b/ray-operator/apis/ray/v1/zz_generated.deepcopy.go @@ -8,6 +8,7 @@ import ( corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" runtime "k8s.io/apimachinery/pkg/runtime" + apisv1 "sigs.k8s.io/gateway-api/apis/v1" ) // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. @@ -271,6 +272,36 @@ func (in *HeadInfo) DeepCopy() *HeadInfo { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *IncrementalUpgradeOptions) DeepCopyInto(out *IncrementalUpgradeOptions) { + *out = *in + if in.MaxSurgePercent != nil { + in, out := &in.MaxSurgePercent, &out.MaxSurgePercent + *out = new(int32) + **out = **in + } + if in.StepSizePercent != nil { + in, out := &in.StepSizePercent, &out.StepSizePercent + *out = new(int32) + **out = **in + } + if in.IntervalSeconds != nil { + in, out := &in.IntervalSeconds, &out.IntervalSeconds + *out = new(int32) + **out = **in + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new IncrementalUpgradeOptions. +func (in *IncrementalUpgradeOptions) DeepCopy() *IncrementalUpgradeOptions { + if in == nil { + return nil + } + out := new(IncrementalUpgradeOptions) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
func (in *RayCluster) DeepCopyInto(out *RayCluster) { *out = *in @@ -693,6 +724,16 @@ func (in *RayServiceSpec) DeepCopyInto(out *RayServiceSpec) { *out = new(corev1.Service) (*in).DeepCopyInto(*out) } + if in.Gateway != nil { + in, out := &in.Gateway, &out.Gateway + *out = new(apisv1.Gateway) + (*in).DeepCopyInto(*out) + } + if in.HTTPRoute != nil { + in, out := &in.HTTPRoute, &out.HTTPRoute + *out = new(apisv1.HTTPRoute) + (*in).DeepCopyInto(*out) + } if in.UpgradeStrategy != nil { in, out := &in.UpgradeStrategy, &out.UpgradeStrategy *out = new(RayServiceUpgradeStrategy) @@ -721,6 +762,20 @@ func (in *RayServiceStatus) DeepCopyInto(out *RayServiceStatus) { (*out)[key] = *val.DeepCopy() } } + if in.TargetCapacity != nil { + in, out := &in.TargetCapacity, &out.TargetCapacity + *out = new(int32) + **out = **in + } + if in.TrafficRoutedPercent != nil { + in, out := &in.TrafficRoutedPercent, &out.TrafficRoutedPercent + *out = new(int32) + **out = **in + } + if in.LastTrafficMigratedTime != nil { + in, out := &in.LastTrafficMigratedTime, &out.LastTrafficMigratedTime + *out = (*in).DeepCopy() + } in.RayClusterStatus.DeepCopyInto(&out.RayClusterStatus) } @@ -737,6 +792,10 @@ func (in *RayServiceStatus) DeepCopy() *RayServiceStatus { // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *RayServiceStatuses) DeepCopyInto(out *RayServiceStatuses) { *out = *in + if in.LastUpdateTime != nil { + in, out := &in.LastUpdateTime, &out.LastUpdateTime + *out = (*in).DeepCopy() + } if in.Conditions != nil { in, out := &in.Conditions, &out.Conditions *out = make([]metav1.Condition, len(*in)) @@ -744,10 +803,6 @@ func (in *RayServiceStatuses) DeepCopyInto(out *RayServiceStatuses) { (*in)[i].DeepCopyInto(&(*out)[i]) } } - if in.LastUpdateTime != nil { - in, out := &in.LastUpdateTime, &out.LastUpdateTime - *out = (*in).DeepCopy() - } in.ActiveServiceStatus.DeepCopyInto(&out.ActiveServiceStatus) in.PendingServiceStatus.DeepCopyInto(&out.PendingServiceStatus) } @@ -770,6 +825,11 @@ func (in *RayServiceUpgradeStrategy) DeepCopyInto(out *RayServiceUpgradeStrategy *out = new(RayServiceUpgradeType) **out = **in } + if in.IncrementalUpgradeOptions != nil { + in, out := &in.IncrementalUpgradeOptions, &out.IncrementalUpgradeOptions + *out = new(IncrementalUpgradeOptions) + (*in).DeepCopyInto(*out) + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new RayServiceUpgradeStrategy. 
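For context, a minimal usage sketch (not taken from this patch) of how a RayService could opt into the new upgrade strategy using the Go types added above. Field names follow rayservice_types.go and the generated deepcopy code; the GatewayClass name "istio" and the numeric values are placeholders chosen for illustration only.

package example

import (
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/utils/ptr"

	rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1"
)

// incrementalUpgradeRayService sketches how the new UpgradeStrategy fields could be populated.
func incrementalUpgradeRayService() *rayv1.RayService {
	upgradeType := rayv1.IncrementalUpgrade
	return &rayv1.RayService{
		ObjectMeta: metav1.ObjectMeta{Name: "rayservice-sample", Namespace: "default"},
		Spec: rayv1.RayServiceSpec{
			UpgradeStrategy: &rayv1.RayServiceUpgradeStrategy{
				Type: &upgradeType,
				IncrementalUpgradeOptions: &rayv1.IncrementalUpgradeOptions{
					GatewayClassName: "istio",            // placeholder: name of an installed GatewayClass
					MaxSurgePercent:  ptr.To(int32(100)), // CRD default is 100
					StepSizePercent:  ptr.To(int32(20)),  // traffic shifted per interval via HTTPRoute weights
					IntervalSeconds:  ptr.To(int32(30)),  // seconds between traffic migration steps
				},
			},
		},
	}
}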
diff --git a/ray-operator/config/crd/bases/ray.io_rayservices.yaml b/ray-operator/config/crd/bases/ray.io_rayservices.yaml index e2d61172a3c..02e449d4726 100644 --- a/ray-operator/config/crd/bases/ray.io_rayservices.yaml +++ b/ray-operator/config/crd/bases/ray.io_rayservices.yaml @@ -40,6 +40,1765 @@ spec: type: integer excludeHeadPodFromServeSvc: type: boolean + gateway: + properties: + apiVersion: + type: string + kind: + type: string + metadata: + properties: + annotations: + additionalProperties: + type: string + type: object + finalizers: + items: + type: string + type: array + labels: + additionalProperties: + type: string + type: object + name: + type: string + namespace: + type: string + type: object + spec: + properties: + addresses: + items: + properties: + type: + default: IPAddress + maxLength: 253 + minLength: 1 + pattern: ^Hostname|IPAddress|NamedAddress|[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*\/[A-Za-z0-9\/\-._~%!$&'()*+,;=:]+$ + type: string + value: + maxLength: 253 + minLength: 1 + type: string + required: + - value + type: object + x-kubernetes-validations: + - message: Hostname value must only contain valid characters + (matching ^(\*\.)?[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$) + rule: 'self.type == ''Hostname'' ? self.value.matches(r"""^(\*\.)?[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$"""): + true' + maxItems: 16 + type: array + x-kubernetes-validations: + - message: IPAddress values must be unique + rule: 'self.all(a1, a1.type == ''IPAddress'' ? self.exists_one(a2, + a2.type == a1.type && a2.value == a1.value) : true )' + - message: Hostname values must be unique + rule: 'self.all(a1, a1.type == ''Hostname'' ? self.exists_one(a2, + a2.type == a1.type && a2.value == a1.value) : true )' + backendTLS: + properties: + clientCertificateRef: + properties: + group: + default: "" + maxLength: 253 + pattern: ^$|^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ + type: string + kind: + default: Secret + maxLength: 63 + minLength: 1 + pattern: ^[a-zA-Z]([-a-zA-Z0-9]*[a-zA-Z0-9])?$ + type: string + name: + maxLength: 253 + minLength: 1 + type: string + namespace: + maxLength: 63 + minLength: 1 + pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?$ + type: string + required: + - name + type: object + type: object + gatewayClassName: + maxLength: 253 + minLength: 1 + type: string + infrastructure: + properties: + annotations: + additionalProperties: + maxLength: 4096 + minLength: 0 + type: string + maxProperties: 8 + type: object + x-kubernetes-validations: + - message: Annotation keys must be in the form of an optional + DNS subdomain prefix followed by a required name segment + of up to 63 characters. + rule: self.all(key, key.matches(r"""^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?([A-Za-z0-9][-A-Za-z0-9_.]{0,61})?[A-Za-z0-9]$""")) + - message: If specified, the annotation key's prefix must + be a DNS subdomain not longer than 253 characters + in total. + rule: self.all(key, key.split("/")[0].size() < 253) + labels: + additionalProperties: + maxLength: 63 + minLength: 0 + pattern: ^(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])?$ + type: string + maxProperties: 8 + type: object + x-kubernetes-validations: + - message: Label keys must be in the form of an optional + DNS subdomain prefix followed by a required name segment + of up to 63 characters. 
+ rule: self.all(key, key.matches(r"""^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?([A-Za-z0-9][-A-Za-z0-9_.]{0,61})?[A-Za-z0-9]$""")) + - message: If specified, the label key's prefix must be + a DNS subdomain not longer than 253 characters in + total. + rule: self.all(key, key.split("/")[0].size() < 253) + parametersRef: + properties: + group: + maxLength: 253 + pattern: ^$|^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ + type: string + kind: + maxLength: 63 + minLength: 1 + pattern: ^[a-zA-Z]([-a-zA-Z0-9]*[a-zA-Z0-9])?$ + type: string + name: + maxLength: 253 + minLength: 1 + type: string + required: + - group + - kind + - name + type: object + type: object + listeners: + items: + properties: + allowedRoutes: + default: + namespaces: + from: Same + properties: + kinds: + items: + properties: + group: + default: gateway.networking.k8s.io + maxLength: 253 + pattern: ^$|^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ + type: string + kind: + maxLength: 63 + minLength: 1 + pattern: ^[a-zA-Z]([-a-zA-Z0-9]*[a-zA-Z0-9])?$ + type: string + required: + - kind + type: object + maxItems: 8 + type: array + namespaces: + default: + from: Same + properties: + from: + default: Same + enum: + - All + - Selector + - Same + type: string + selector: + properties: + matchExpressions: + items: + properties: + key: + type: string + operator: + type: string + values: + items: + type: string + type: array + x-kubernetes-list-type: atomic + required: + - key + - operator + type: object + type: array + x-kubernetes-list-type: atomic + matchLabels: + additionalProperties: + type: string + type: object + type: object + x-kubernetes-map-type: atomic + type: object + type: object + hostname: + maxLength: 253 + minLength: 1 + pattern: ^(\*\.)?[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ + type: string + name: + maxLength: 253 + minLength: 1 + pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ + type: string + port: + format: int32 + maximum: 65535 + minimum: 1 + type: integer + protocol: + maxLength: 255 + minLength: 1 + pattern: ^[a-zA-Z0-9]([-a-zA-Z0-9]*[a-zA-Z0-9])?$|[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*\/[A-Za-z0-9]+$ + type: string + tls: + properties: + certificateRefs: + items: + properties: + group: + default: "" + maxLength: 253 + pattern: ^$|^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ + type: string + kind: + default: Secret + maxLength: 63 + minLength: 1 + pattern: ^[a-zA-Z]([-a-zA-Z0-9]*[a-zA-Z0-9])?$ + type: string + name: + maxLength: 253 + minLength: 1 + type: string + namespace: + maxLength: 63 + minLength: 1 + pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?$ + type: string + required: + - name + type: object + maxItems: 64 + type: array + frontendValidation: + properties: + caCertificateRefs: + items: + properties: + group: + maxLength: 253 + pattern: ^$|^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ + type: string + kind: + maxLength: 63 + minLength: 1 + pattern: ^[a-zA-Z]([-a-zA-Z0-9]*[a-zA-Z0-9])?$ + type: string + name: + maxLength: 253 + minLength: 1 + type: string + namespace: + maxLength: 63 + minLength: 1 + pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?$ + type: string + required: + - group + - kind + - name + type: object + maxItems: 8 + minItems: 1 + type: array + type: object + mode: + default: Terminate + enum: + - Terminate + - Passthrough + type: string + options: + additionalProperties: + maxLength: 4096 + minLength: 0 + type: 
string + maxProperties: 16 + type: object + type: object + x-kubernetes-validations: + - message: certificateRefs or options must be specified + when mode is Terminate + rule: 'self.mode == ''Terminate'' ? size(self.certificateRefs) + > 0 || size(self.options) > 0 : true' + required: + - name + - port + - protocol + type: object + maxItems: 64 + minItems: 1 + type: array + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map + x-kubernetes-validations: + - message: tls must not be specified for protocols ['HTTP', + 'TCP', 'UDP'] + rule: 'self.all(l, l.protocol in [''HTTP'', ''TCP'', ''UDP''] + ? !has(l.tls) : true)' + - message: tls mode must be Terminate for protocol HTTPS + rule: 'self.all(l, (l.protocol == ''HTTPS'' && has(l.tls)) + ? (l.tls.mode == '''' || l.tls.mode == ''Terminate'') + : true)' + - message: hostname must not be specified for protocols ['TCP', + 'UDP'] + rule: 'self.all(l, l.protocol in [''TCP'', ''UDP''] ? (!has(l.hostname) + || l.hostname == '''') : true)' + - message: Listener name must be unique within the Gateway + rule: self.all(l1, self.exists_one(l2, l1.name == l2.name)) + - message: Combination of port, protocol and hostname must + be unique for each listener + rule: 'self.all(l1, self.exists_one(l2, l1.port == l2.port + && l1.protocol == l2.protocol && (has(l1.hostname) && + has(l2.hostname) ? l1.hostname == l2.hostname : !has(l1.hostname) + && !has(l2.hostname))))' + required: + - gatewayClassName + - listeners + type: object + status: + default: + conditions: + - lastTransitionTime: "1970-01-01T00:00:00Z" + message: Waiting for controller + reason: Pending + status: Unknown + type: Accepted + - lastTransitionTime: "1970-01-01T00:00:00Z" + message: Waiting for controller + reason: Pending + status: Unknown + type: Programmed + properties: + addresses: + items: + properties: + type: + default: IPAddress + maxLength: 253 + minLength: 1 + pattern: ^Hostname|IPAddress|NamedAddress|[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*\/[A-Za-z0-9\/\-._~%!$&'()*+,;=:]+$ + type: string + value: + maxLength: 253 + minLength: 1 + type: string + required: + - value + type: object + x-kubernetes-validations: + - message: Hostname value must only contain valid characters + (matching ^(\*\.)?[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$) + rule: 'self.type == ''Hostname'' ? 
self.value.matches(r"""^(\*\.)?[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$"""): + true' + maxItems: 16 + type: array + conditions: + default: + - lastTransitionTime: "1970-01-01T00:00:00Z" + message: Waiting for controller + reason: Pending + status: Unknown + type: Accepted + - lastTransitionTime: "1970-01-01T00:00:00Z" + message: Waiting for controller + reason: Pending + status: Unknown + type: Programmed + items: + properties: + lastTransitionTime: + format: date-time + type: string + message: + maxLength: 32768 + type: string + observedGeneration: + format: int64 + minimum: 0 + type: integer + reason: + maxLength: 1024 + minLength: 1 + pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ + type: string + status: + enum: + - "True" + - "False" + - Unknown + type: string + type: + maxLength: 316 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ + type: string + required: + - lastTransitionTime + - message + - reason + - status + - type + type: object + maxItems: 8 + type: array + x-kubernetes-list-map-keys: + - type + x-kubernetes-list-type: map + listeners: + items: + properties: + attachedRoutes: + format: int32 + type: integer + conditions: + items: + properties: + lastTransitionTime: + format: date-time + type: string + message: + maxLength: 32768 + type: string + observedGeneration: + format: int64 + minimum: 0 + type: integer + reason: + maxLength: 1024 + minLength: 1 + pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ + type: string + status: + enum: + - "True" + - "False" + - Unknown + type: string + type: + maxLength: 316 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ + type: string + required: + - lastTransitionTime + - message + - reason + - status + - type + type: object + maxItems: 8 + type: array + x-kubernetes-list-map-keys: + - type + x-kubernetes-list-type: map + name: + maxLength: 253 + minLength: 1 + pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ + type: string + supportedKinds: + items: + properties: + group: + default: gateway.networking.k8s.io + maxLength: 253 + pattern: ^$|^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ + type: string + kind: + maxLength: 63 + minLength: 1 + pattern: ^[a-zA-Z]([-a-zA-Z0-9]*[a-zA-Z0-9])?$ + type: string + required: + - kind + type: object + maxItems: 8 + type: array + required: + - attachedRoutes + - conditions + - name + - supportedKinds + type: object + maxItems: 64 + type: array + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map + type: object + required: + - spec + type: object + httpRoute: + properties: + apiVersion: + type: string + kind: + type: string + metadata: + properties: + annotations: + additionalProperties: + type: string + type: object + finalizers: + items: + type: string + type: array + labels: + additionalProperties: + type: string + type: object + name: + type: string + namespace: + type: string + type: object + spec: + properties: + hostnames: + items: + maxLength: 253 + minLength: 1 + pattern: ^(\*\.)?[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ + type: string + maxItems: 16 + type: array + parentRefs: + items: + properties: + group: + default: gateway.networking.k8s.io + maxLength: 253 + pattern: ^$|^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ + type: string + kind: + default: Gateway + maxLength: 63 + minLength: 1 + pattern: 
^[a-zA-Z]([-a-zA-Z0-9]*[a-zA-Z0-9])?$ + type: string + name: + maxLength: 253 + minLength: 1 + type: string + namespace: + maxLength: 63 + minLength: 1 + pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?$ + type: string + port: + format: int32 + maximum: 65535 + minimum: 1 + type: integer + sectionName: + maxLength: 253 + minLength: 1 + pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ + type: string + required: + - name + type: object + maxItems: 32 + type: array + rules: + default: + - matches: + - path: + type: PathPrefix + value: / + items: + properties: + backendRefs: + items: + properties: + filters: + items: + properties: + extensionRef: + properties: + group: + maxLength: 253 + pattern: ^$|^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ + type: string + kind: + maxLength: 63 + minLength: 1 + pattern: ^[a-zA-Z]([-a-zA-Z0-9]*[a-zA-Z0-9])?$ + type: string + name: + maxLength: 253 + minLength: 1 + type: string + required: + - group + - kind + - name + type: object + requestHeaderModifier: + properties: + add: + items: + properties: + name: + maxLength: 256 + minLength: 1 + pattern: ^[A-Za-z0-9!#$%&'*+\-.^_\x60|~]+$ + type: string + value: + maxLength: 4096 + minLength: 1 + type: string + required: + - name + - value + type: object + maxItems: 16 + type: array + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map + remove: + items: + type: string + maxItems: 16 + type: array + x-kubernetes-list-type: set + set: + items: + properties: + name: + maxLength: 256 + minLength: 1 + pattern: ^[A-Za-z0-9!#$%&'*+\-.^_\x60|~]+$ + type: string + value: + maxLength: 4096 + minLength: 1 + type: string + required: + - name + - value + type: object + maxItems: 16 + type: array + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map + type: object + requestMirror: + properties: + backendRef: + properties: + group: + default: "" + maxLength: 253 + pattern: ^$|^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ + type: string + kind: + default: Service + maxLength: 63 + minLength: 1 + pattern: ^[a-zA-Z]([-a-zA-Z0-9]*[a-zA-Z0-9])?$ + type: string + name: + maxLength: 253 + minLength: 1 + type: string + namespace: + maxLength: 63 + minLength: 1 + pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?$ + type: string + port: + format: int32 + maximum: 65535 + minimum: 1 + type: integer + required: + - name + type: object + x-kubernetes-validations: + - message: Must have port for Service + reference + rule: '(size(self.group) == 0 && self.kind + == ''Service'') ? 
has(self.port) + : true' + fraction: + properties: + denominator: + default: 100 + format: int32 + minimum: 1 + type: integer + numerator: + format: int32 + minimum: 0 + type: integer + required: + - numerator + type: object + x-kubernetes-validations: + - message: numerator must be less than + or equal to denominator + rule: self.numerator <= self.denominator + percent: + format: int32 + maximum: 100 + minimum: 0 + type: integer + required: + - backendRef + type: object + requestRedirect: + properties: + hostname: + maxLength: 253 + minLength: 1 + pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ + type: string + path: + properties: + replaceFullPath: + maxLength: 1024 + type: string + replacePrefixMatch: + maxLength: 1024 + type: string + type: + enum: + - ReplaceFullPath + - ReplacePrefixMatch + type: string + required: + - type + type: object + x-kubernetes-validations: + - message: replaceFullPath must be specified + when type is set to 'ReplaceFullPath' + rule: 'self.type == ''ReplaceFullPath'' + ? has(self.replaceFullPath) : true' + - message: type must be 'ReplaceFullPath' + when replaceFullPath is set + rule: 'has(self.replaceFullPath) ? + self.type == ''ReplaceFullPath'' + : true' + - message: replacePrefixMatch must be + specified when type is set to 'ReplacePrefixMatch' + rule: 'self.type == ''ReplacePrefixMatch'' + ? has(self.replacePrefixMatch) : + true' + - message: type must be 'ReplacePrefixMatch' + when replacePrefixMatch is set + rule: 'has(self.replacePrefixMatch) + ? self.type == ''ReplacePrefixMatch'' + : true' + port: + format: int32 + maximum: 65535 + minimum: 1 + type: integer + scheme: + enum: + - http + - https + type: string + statusCode: + default: 302 + enum: + - 301 + - 302 + type: integer + type: object + responseHeaderModifier: + properties: + add: + items: + properties: + name: + maxLength: 256 + minLength: 1 + pattern: ^[A-Za-z0-9!#$%&'*+\-.^_\x60|~]+$ + type: string + value: + maxLength: 4096 + minLength: 1 + type: string + required: + - name + - value + type: object + maxItems: 16 + type: array + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map + remove: + items: + type: string + maxItems: 16 + type: array + x-kubernetes-list-type: set + set: + items: + properties: + name: + maxLength: 256 + minLength: 1 + pattern: ^[A-Za-z0-9!#$%&'*+\-.^_\x60|~]+$ + type: string + value: + maxLength: 4096 + minLength: 1 + type: string + required: + - name + - value + type: object + maxItems: 16 + type: array + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map + type: object + type: + enum: + - RequestHeaderModifier + - ResponseHeaderModifier + - RequestMirror + - RequestRedirect + - URLRewrite + - ExtensionRef + type: string + urlRewrite: + properties: + hostname: + maxLength: 253 + minLength: 1 + pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ + type: string + path: + properties: + replaceFullPath: + maxLength: 1024 + type: string + replacePrefixMatch: + maxLength: 1024 + type: string + type: + enum: + - ReplaceFullPath + - ReplacePrefixMatch + type: string + required: + - type + type: object + x-kubernetes-validations: + - message: replaceFullPath must be specified + when type is set to 'ReplaceFullPath' + rule: 'self.type == ''ReplaceFullPath'' + ? has(self.replaceFullPath) : true' + - message: type must be 'ReplaceFullPath' + when replaceFullPath is set + rule: 'has(self.replaceFullPath) ? 
+ self.type == ''ReplaceFullPath'' + : true' + - message: replacePrefixMatch must be + specified when type is set to 'ReplacePrefixMatch' + rule: 'self.type == ''ReplacePrefixMatch'' + ? has(self.replacePrefixMatch) : + true' + - message: type must be 'ReplacePrefixMatch' + when replacePrefixMatch is set + rule: 'has(self.replacePrefixMatch) + ? self.type == ''ReplacePrefixMatch'' + : true' + type: object + required: + - type + type: object + x-kubernetes-validations: + - message: filter.requestHeaderModifier must + be nil if the filter.type is not RequestHeaderModifier + rule: '!(has(self.requestHeaderModifier) && + self.type != ''RequestHeaderModifier'')' + - message: filter.requestHeaderModifier must + be specified for RequestHeaderModifier filter.type + rule: '!(!has(self.requestHeaderModifier) + && self.type == ''RequestHeaderModifier'')' + - message: filter.responseHeaderModifier must + be nil if the filter.type is not ResponseHeaderModifier + rule: '!(has(self.responseHeaderModifier) + && self.type != ''ResponseHeaderModifier'')' + - message: filter.responseHeaderModifier must + be specified for ResponseHeaderModifier + filter.type + rule: '!(!has(self.responseHeaderModifier) + && self.type == ''ResponseHeaderModifier'')' + - message: filter.requestMirror must be nil + if the filter.type is not RequestMirror + rule: '!(has(self.requestMirror) && self.type + != ''RequestMirror'')' + - message: filter.requestMirror must be specified + for RequestMirror filter.type + rule: '!(!has(self.requestMirror) && self.type + == ''RequestMirror'')' + - message: filter.requestRedirect must be nil + if the filter.type is not RequestRedirect + rule: '!(has(self.requestRedirect) && self.type + != ''RequestRedirect'')' + - message: filter.requestRedirect must be specified + for RequestRedirect filter.type + rule: '!(!has(self.requestRedirect) && self.type + == ''RequestRedirect'')' + - message: filter.urlRewrite must be nil if + the filter.type is not URLRewrite + rule: '!(has(self.urlRewrite) && self.type + != ''URLRewrite'')' + - message: filter.urlRewrite must be specified + for URLRewrite filter.type + rule: '!(!has(self.urlRewrite) && self.type + == ''URLRewrite'')' + - message: filter.extensionRef must be nil if + the filter.type is not ExtensionRef + rule: '!(has(self.extensionRef) && self.type + != ''ExtensionRef'')' + - message: filter.extensionRef must be specified + for ExtensionRef filter.type + rule: '!(!has(self.extensionRef) && self.type + == ''ExtensionRef'')' + maxItems: 16 + type: array + x-kubernetes-validations: + - message: May specify either httpRouteFilterRequestRedirect + or httpRouteFilterRequestRewrite, but not + both + rule: '!(self.exists(f, f.type == ''RequestRedirect'') + && self.exists(f, f.type == ''URLRewrite''))' + - message: May specify either httpRouteFilterRequestRedirect + or httpRouteFilterRequestRewrite, but not + both + rule: '!(self.exists(f, f.type == ''RequestRedirect'') + && self.exists(f, f.type == ''URLRewrite''))' + - message: RequestHeaderModifier filter cannot + be repeated + rule: self.filter(f, f.type == 'RequestHeaderModifier').size() + <= 1 + - message: ResponseHeaderModifier filter cannot + be repeated + rule: self.filter(f, f.type == 'ResponseHeaderModifier').size() + <= 1 + - message: RequestRedirect filter cannot be repeated + rule: self.filter(f, f.type == 'RequestRedirect').size() + <= 1 + - message: URLRewrite filter cannot be repeated + rule: self.filter(f, f.type == 'URLRewrite').size() + <= 1 + group: + default: "" + maxLength: 253 + 
pattern: ^$|^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ + type: string + kind: + default: Service + maxLength: 63 + minLength: 1 + pattern: ^[a-zA-Z]([-a-zA-Z0-9]*[a-zA-Z0-9])?$ + type: string + name: + maxLength: 253 + minLength: 1 + type: string + namespace: + maxLength: 63 + minLength: 1 + pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?$ + type: string + port: + format: int32 + maximum: 65535 + minimum: 1 + type: integer + weight: + default: 1 + format: int32 + maximum: 1000000 + minimum: 0 + type: integer + required: + - name + type: object + x-kubernetes-validations: + - message: Must have port for Service reference + rule: '(size(self.group) == 0 && self.kind == ''Service'') + ? has(self.port) : true' + maxItems: 16 + type: array + filters: + items: + properties: + extensionRef: + properties: + group: + maxLength: 253 + pattern: ^$|^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ + type: string + kind: + maxLength: 63 + minLength: 1 + pattern: ^[a-zA-Z]([-a-zA-Z0-9]*[a-zA-Z0-9])?$ + type: string + name: + maxLength: 253 + minLength: 1 + type: string + required: + - group + - kind + - name + type: object + requestHeaderModifier: + properties: + add: + items: + properties: + name: + maxLength: 256 + minLength: 1 + pattern: ^[A-Za-z0-9!#$%&'*+\-.^_\x60|~]+$ + type: string + value: + maxLength: 4096 + minLength: 1 + type: string + required: + - name + - value + type: object + maxItems: 16 + type: array + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map + remove: + items: + type: string + maxItems: 16 + type: array + x-kubernetes-list-type: set + set: + items: + properties: + name: + maxLength: 256 + minLength: 1 + pattern: ^[A-Za-z0-9!#$%&'*+\-.^_\x60|~]+$ + type: string + value: + maxLength: 4096 + minLength: 1 + type: string + required: + - name + - value + type: object + maxItems: 16 + type: array + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map + type: object + requestMirror: + properties: + backendRef: + properties: + group: + default: "" + maxLength: 253 + pattern: ^$|^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ + type: string + kind: + default: Service + maxLength: 63 + minLength: 1 + pattern: ^[a-zA-Z]([-a-zA-Z0-9]*[a-zA-Z0-9])?$ + type: string + name: + maxLength: 253 + minLength: 1 + type: string + namespace: + maxLength: 63 + minLength: 1 + pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?$ + type: string + port: + format: int32 + maximum: 65535 + minimum: 1 + type: integer + required: + - name + type: object + x-kubernetes-validations: + - message: Must have port for Service reference + rule: '(size(self.group) == 0 && self.kind + == ''Service'') ? 
has(self.port) : true' + fraction: + properties: + denominator: + default: 100 + format: int32 + minimum: 1 + type: integer + numerator: + format: int32 + minimum: 0 + type: integer + required: + - numerator + type: object + x-kubernetes-validations: + - message: numerator must be less than or + equal to denominator + rule: self.numerator <= self.denominator + percent: + format: int32 + maximum: 100 + minimum: 0 + type: integer + required: + - backendRef + type: object + requestRedirect: + properties: + hostname: + maxLength: 253 + minLength: 1 + pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ + type: string + path: + properties: + replaceFullPath: + maxLength: 1024 + type: string + replacePrefixMatch: + maxLength: 1024 + type: string + type: + enum: + - ReplaceFullPath + - ReplacePrefixMatch + type: string + required: + - type + type: object + x-kubernetes-validations: + - message: replaceFullPath must be specified + when type is set to 'ReplaceFullPath' + rule: 'self.type == ''ReplaceFullPath'' + ? has(self.replaceFullPath) : true' + - message: type must be 'ReplaceFullPath' + when replaceFullPath is set + rule: 'has(self.replaceFullPath) ? self.type + == ''ReplaceFullPath'' : true' + - message: replacePrefixMatch must be specified + when type is set to 'ReplacePrefixMatch' + rule: 'self.type == ''ReplacePrefixMatch'' + ? has(self.replacePrefixMatch) : true' + - message: type must be 'ReplacePrefixMatch' + when replacePrefixMatch is set + rule: 'has(self.replacePrefixMatch) ? self.type + == ''ReplacePrefixMatch'' : true' + port: + format: int32 + maximum: 65535 + minimum: 1 + type: integer + scheme: + enum: + - http + - https + type: string + statusCode: + default: 302 + enum: + - 301 + - 302 + type: integer + type: object + responseHeaderModifier: + properties: + add: + items: + properties: + name: + maxLength: 256 + minLength: 1 + pattern: ^[A-Za-z0-9!#$%&'*+\-.^_\x60|~]+$ + type: string + value: + maxLength: 4096 + minLength: 1 + type: string + required: + - name + - value + type: object + maxItems: 16 + type: array + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map + remove: + items: + type: string + maxItems: 16 + type: array + x-kubernetes-list-type: set + set: + items: + properties: + name: + maxLength: 256 + minLength: 1 + pattern: ^[A-Za-z0-9!#$%&'*+\-.^_\x60|~]+$ + type: string + value: + maxLength: 4096 + minLength: 1 + type: string + required: + - name + - value + type: object + maxItems: 16 + type: array + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map + type: object + type: + enum: + - RequestHeaderModifier + - ResponseHeaderModifier + - RequestMirror + - RequestRedirect + - URLRewrite + - ExtensionRef + type: string + urlRewrite: + properties: + hostname: + maxLength: 253 + minLength: 1 + pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ + type: string + path: + properties: + replaceFullPath: + maxLength: 1024 + type: string + replacePrefixMatch: + maxLength: 1024 + type: string + type: + enum: + - ReplaceFullPath + - ReplacePrefixMatch + type: string + required: + - type + type: object + x-kubernetes-validations: + - message: replaceFullPath must be specified + when type is set to 'ReplaceFullPath' + rule: 'self.type == ''ReplaceFullPath'' + ? has(self.replaceFullPath) : true' + - message: type must be 'ReplaceFullPath' + when replaceFullPath is set + rule: 'has(self.replaceFullPath) ? 
self.type + == ''ReplaceFullPath'' : true' + - message: replacePrefixMatch must be specified + when type is set to 'ReplacePrefixMatch' + rule: 'self.type == ''ReplacePrefixMatch'' + ? has(self.replacePrefixMatch) : true' + - message: type must be 'ReplacePrefixMatch' + when replacePrefixMatch is set + rule: 'has(self.replacePrefixMatch) ? self.type + == ''ReplacePrefixMatch'' : true' + type: object + required: + - type + type: object + x-kubernetes-validations: + - message: filter.requestHeaderModifier must be nil + if the filter.type is not RequestHeaderModifier + rule: '!(has(self.requestHeaderModifier) && self.type + != ''RequestHeaderModifier'')' + - message: filter.requestHeaderModifier must be specified + for RequestHeaderModifier filter.type + rule: '!(!has(self.requestHeaderModifier) && self.type + == ''RequestHeaderModifier'')' + - message: filter.responseHeaderModifier must be nil + if the filter.type is not ResponseHeaderModifier + rule: '!(has(self.responseHeaderModifier) && self.type + != ''ResponseHeaderModifier'')' + - message: filter.responseHeaderModifier must be specified + for ResponseHeaderModifier filter.type + rule: '!(!has(self.responseHeaderModifier) && self.type + == ''ResponseHeaderModifier'')' + - message: filter.requestMirror must be nil if the + filter.type is not RequestMirror + rule: '!(has(self.requestMirror) && self.type != + ''RequestMirror'')' + - message: filter.requestMirror must be specified + for RequestMirror filter.type + rule: '!(!has(self.requestMirror) && self.type == + ''RequestMirror'')' + - message: filter.requestRedirect must be nil if the + filter.type is not RequestRedirect + rule: '!(has(self.requestRedirect) && self.type + != ''RequestRedirect'')' + - message: filter.requestRedirect must be specified + for RequestRedirect filter.type + rule: '!(!has(self.requestRedirect) && self.type + == ''RequestRedirect'')' + - message: filter.urlRewrite must be nil if the filter.type + is not URLRewrite + rule: '!(has(self.urlRewrite) && self.type != ''URLRewrite'')' + - message: filter.urlRewrite must be specified for + URLRewrite filter.type + rule: '!(!has(self.urlRewrite) && self.type == ''URLRewrite'')' + - message: filter.extensionRef must be nil if the + filter.type is not ExtensionRef + rule: '!(has(self.extensionRef) && self.type != + ''ExtensionRef'')' + - message: filter.extensionRef must be specified for + ExtensionRef filter.type + rule: '!(!has(self.extensionRef) && self.type == + ''ExtensionRef'')' + maxItems: 16 + type: array + x-kubernetes-validations: + - message: May specify either httpRouteFilterRequestRedirect + or httpRouteFilterRequestRewrite, but not both + rule: '!(self.exists(f, f.type == ''RequestRedirect'') + && self.exists(f, f.type == ''URLRewrite''))' + - message: RequestHeaderModifier filter cannot be repeated + rule: self.filter(f, f.type == 'RequestHeaderModifier').size() + <= 1 + - message: ResponseHeaderModifier filter cannot be repeated + rule: self.filter(f, f.type == 'ResponseHeaderModifier').size() + <= 1 + - message: RequestRedirect filter cannot be repeated + rule: self.filter(f, f.type == 'RequestRedirect').size() + <= 1 + - message: URLRewrite filter cannot be repeated + rule: self.filter(f, f.type == 'URLRewrite').size() + <= 1 + matches: + default: + - path: + type: PathPrefix + value: / + items: + properties: + headers: + items: + properties: + name: + maxLength: 256 + minLength: 1 + pattern: ^[A-Za-z0-9!#$%&'*+\-.^_\x60|~]+$ + type: string + type: + default: Exact + enum: + - Exact + - RegularExpression 
+ type: string + value: + maxLength: 4096 + minLength: 1 + type: string + required: + - name + - value + type: object + maxItems: 16 + type: array + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map + method: + enum: + - GET + - HEAD + - POST + - PUT + - DELETE + - CONNECT + - OPTIONS + - TRACE + - PATCH + type: string + path: + default: + type: PathPrefix + value: / + properties: + type: + default: PathPrefix + enum: + - Exact + - PathPrefix + - RegularExpression + type: string + value: + default: / + maxLength: 1024 + type: string + type: object + x-kubernetes-validations: + - message: value must be an absolute path and + start with '/' when type one of ['Exact', + 'PathPrefix'] + rule: '(self.type in [''Exact'',''PathPrefix'']) + ? self.value.startsWith(''/'') : true' + - message: must not contain '//' when type one + of ['Exact', 'PathPrefix'] + rule: '(self.type in [''Exact'',''PathPrefix'']) + ? !self.value.contains(''//'') : true' + - message: must not contain '/./' when type one + of ['Exact', 'PathPrefix'] + rule: '(self.type in [''Exact'',''PathPrefix'']) + ? !self.value.contains(''/./'') : true' + - message: must not contain '/../' when type one + of ['Exact', 'PathPrefix'] + rule: '(self.type in [''Exact'',''PathPrefix'']) + ? !self.value.contains(''/../'') : true' + - message: must not contain '%2f' when type one + of ['Exact', 'PathPrefix'] + rule: '(self.type in [''Exact'',''PathPrefix'']) + ? !self.value.contains(''%2f'') : true' + - message: must not contain '%2F' when type one + of ['Exact', 'PathPrefix'] + rule: '(self.type in [''Exact'',''PathPrefix'']) + ? !self.value.contains(''%2F'') : true' + - message: must not contain '#' when type one + of ['Exact', 'PathPrefix'] + rule: '(self.type in [''Exact'',''PathPrefix'']) + ? !self.value.contains(''#'') : true' + - message: must not end with '/..' when type one + of ['Exact', 'PathPrefix'] + rule: '(self.type in [''Exact'',''PathPrefix'']) + ? !self.value.endsWith(''/..'') : true' + - message: must not end with '/.' when type one + of ['Exact', 'PathPrefix'] + rule: '(self.type in [''Exact'',''PathPrefix'']) + ? !self.value.endsWith(''/.'') : true' + - message: type must be one of ['Exact', 'PathPrefix', + 'RegularExpression'] + rule: self.type in ['Exact','PathPrefix'] || + self.type == 'RegularExpression' + - message: must only contain valid characters + (matching ^(?:[-A-Za-z0-9/._~!$&'()*+,;=:@]|[%][0-9a-fA-F]{2})+$) + for types ['Exact', 'PathPrefix'] + rule: '(self.type in [''Exact'',''PathPrefix'']) + ? 
self.value.matches(r"""^(?:[-A-Za-z0-9/._~!$&''()*+,;=:@]|[%][0-9a-fA-F]{2})+$""") + : true' + queryParams: + items: + properties: + name: + maxLength: 256 + minLength: 1 + pattern: ^[A-Za-z0-9!#$%&'*+\-.^_\x60|~]+$ + type: string + type: + default: Exact + enum: + - Exact + - RegularExpression + type: string + value: + maxLength: 1024 + minLength: 1 + type: string + required: + - name + - value + type: object + maxItems: 16 + type: array + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map + type: object + maxItems: 64 + type: array + name: + maxLength: 253 + minLength: 1 + pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ + type: string + retry: + properties: + attempts: + type: integer + backoff: + pattern: ^([0-9]{1,5}(h|m|s|ms)){1,4}$ + type: string + codes: + items: + maximum: 599 + minimum: 400 + type: integer + type: array + type: object + sessionPersistence: + properties: + absoluteTimeout: + pattern: ^([0-9]{1,5}(h|m|s|ms)){1,4}$ + type: string + cookieConfig: + properties: + lifetimeType: + default: Session + enum: + - Permanent + - Session + type: string + type: object + idleTimeout: + pattern: ^([0-9]{1,5}(h|m|s|ms)){1,4}$ + type: string + sessionName: + maxLength: 128 + type: string + type: + default: Cookie + enum: + - Cookie + - Header + type: string + type: object + x-kubernetes-validations: + - message: AbsoluteTimeout must be specified when cookie + lifetimeType is Permanent + rule: '!has(self.cookieConfig) || !has(self.cookieConfig.lifetimeType) + || self.cookieConfig.lifetimeType != ''Permanent'' + || has(self.absoluteTimeout)' + timeouts: + properties: + backendRequest: + pattern: ^([0-9]{1,5}(h|m|s|ms)){1,4}$ + type: string + request: + pattern: ^([0-9]{1,5}(h|m|s|ms)){1,4}$ + type: string + type: object + x-kubernetes-validations: + - message: backendRequest timeout cannot be longer than + request timeout + rule: '!(has(self.request) && has(self.backendRequest) + && duration(self.request) != duration(''0s'') && + duration(self.backendRequest) > duration(self.request))' + type: object + x-kubernetes-validations: + - message: RequestRedirect filter must not be used together + with backendRefs + rule: '(has(self.backendRefs) && size(self.backendRefs) + > 0) ? (!has(self.filters) || self.filters.all(f, !has(f.requestRedirect))): + true' + - message: When using RequestRedirect filter with path.replacePrefixMatch, + exactly one PathPrefix match must be specified + rule: '(has(self.filters) && self.filters.exists_one(f, + has(f.requestRedirect) && has(f.requestRedirect.path) + && f.requestRedirect.path.type == ''ReplacePrefixMatch'' + && has(f.requestRedirect.path.replacePrefixMatch))) + ? ((size(self.matches) != 1 || !has(self.matches[0].path) + || self.matches[0].path.type != ''PathPrefix'') ? false + : true) : true' + - message: When using URLRewrite filter with path.replacePrefixMatch, + exactly one PathPrefix match must be specified + rule: '(has(self.filters) && self.filters.exists_one(f, + has(f.urlRewrite) && has(f.urlRewrite.path) && f.urlRewrite.path.type + == ''ReplacePrefixMatch'' && has(f.urlRewrite.path.replacePrefixMatch))) + ? ((size(self.matches) != 1 || !has(self.matches[0].path) + || self.matches[0].path.type != ''PathPrefix'') ? 
false + : true) : true' + - message: Within backendRefs, when using RequestRedirect + filter with path.replacePrefixMatch, exactly one PathPrefix + match must be specified + rule: '(has(self.backendRefs) && self.backendRefs.exists_one(b, + (has(b.filters) && b.filters.exists_one(f, has(f.requestRedirect) + && has(f.requestRedirect.path) && f.requestRedirect.path.type + == ''ReplacePrefixMatch'' && has(f.requestRedirect.path.replacePrefixMatch))) + )) ? ((size(self.matches) != 1 || !has(self.matches[0].path) + || self.matches[0].path.type != ''PathPrefix'') ? false + : true) : true' + - message: Within backendRefs, When using URLRewrite filter + with path.replacePrefixMatch, exactly one PathPrefix + match must be specified + rule: '(has(self.backendRefs) && self.backendRefs.exists_one(b, + (has(b.filters) && b.filters.exists_one(f, has(f.urlRewrite) + && has(f.urlRewrite.path) && f.urlRewrite.path.type + == ''ReplacePrefixMatch'' && has(f.urlRewrite.path.replacePrefixMatch))) + )) ? ((size(self.matches) != 1 || !has(self.matches[0].path) + || self.matches[0].path.type != ''PathPrefix'') ? false + : true) : true' + maxItems: 16 + type: array + x-kubernetes-validations: + - message: While 16 rules and 64 matches per rule are allowed, + the total number of matches across all rules in a route + must be less than 128 + rule: '(self.size() > 0 ? self[0].matches.size() : 0) + + (self.size() > 1 ? self[1].matches.size() : 0) + (self.size() + > 2 ? self[2].matches.size() : 0) + (self.size() > 3 ? + self[3].matches.size() : 0) + (self.size() > 4 ? self[4].matches.size() + : 0) + (self.size() > 5 ? self[5].matches.size() : 0) + + (self.size() > 6 ? self[6].matches.size() : 0) + (self.size() + > 7 ? self[7].matches.size() : 0) + (self.size() > 8 ? + self[8].matches.size() : 0) + (self.size() > 9 ? self[9].matches.size() + : 0) + (self.size() > 10 ? self[10].matches.size() : 0) + + (self.size() > 11 ? self[11].matches.size() : 0) + (self.size() + > 12 ? self[12].matches.size() : 0) + (self.size() > 13 + ? self[13].matches.size() : 0) + (self.size() > 14 ? self[14].matches.size() + : 0) + (self.size() > 15 ? 
self[15].matches.size() : 0) + <= 128' + type: object + status: + properties: + parents: + items: + properties: + conditions: + items: + properties: + lastTransitionTime: + format: date-time + type: string + message: + maxLength: 32768 + type: string + observedGeneration: + format: int64 + minimum: 0 + type: integer + reason: + maxLength: 1024 + minLength: 1 + pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ + type: string + status: + enum: + - "True" + - "False" + - Unknown + type: string + type: + maxLength: 316 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ + type: string + required: + - lastTransitionTime + - message + - reason + - status + - type + type: object + maxItems: 8 + minItems: 1 + type: array + x-kubernetes-list-map-keys: + - type + x-kubernetes-list-type: map + controllerName: + maxLength: 253 + minLength: 1 + pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*\/[A-Za-z0-9\/\-._~%!$&'()*+,;=:]+$ + type: string + parentRef: + properties: + group: + default: gateway.networking.k8s.io + maxLength: 253 + pattern: ^$|^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ + type: string + kind: + default: Gateway + maxLength: 63 + minLength: 1 + pattern: ^[a-zA-Z]([-a-zA-Z0-9]*[a-zA-Z0-9])?$ + type: string + name: + maxLength: 253 + minLength: 1 + type: string + namespace: + maxLength: 63 + minLength: 1 + pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?$ + type: string + port: + format: int32 + maximum: 65535 + minimum: 1 + type: integer + sectionName: + maxLength: 253 + minLength: 1 + pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ + type: string + required: + - name + type: object + required: + - controllerName + - parentRef + type: object + maxItems: 32 + type: array + required: + - parents + type: object + required: + - spec + type: object rayClusterConfig: properties: autoscalerOptions: @@ -8241,6 +10000,25 @@ spec: type: integer upgradeStrategy: properties: + incrementalUpgradeOptions: + properties: + gatewayClassName: + type: string + intervalSeconds: + format: int32 + type: integer + maxSurgePercent: + default: 100 + format: int32 + type: integer + stepSizePercent: + format: int32 + type: integer + required: + - gatewayClassName + - intervalSeconds + - stepSizePercent + type: object type: type: string type: object @@ -8269,6 +10047,9 @@ spec: type: string type: object type: object + lastTrafficMigratedTime: + format: date-time + type: string rayClusterName: type: string rayClusterStatus: @@ -8383,6 +10164,12 @@ spec: type: string type: object type: object + targetCapacity: + format: int32 + type: integer + trafficRoutedPercent: + format: int32 + type: integer type: object conditions: items: @@ -8420,9 +10207,6 @@ spec: - type type: object type: array - x-kubernetes-list-map-keys: - - type - x-kubernetes-list-type: map lastUpdateTime: format: date-time type: string @@ -8452,6 +10236,9 @@ spec: type: string type: object type: object + lastTrafficMigratedTime: + format: date-time + type: string rayClusterName: type: string rayClusterStatus: @@ -8566,6 +10353,12 @@ spec: type: string type: object type: object + targetCapacity: + format: int32 + type: integer + trafficRoutedPercent: + format: int32 + type: integer type: object serviceStatus: type: string diff --git a/ray-operator/config/rbac/role.yaml b/ray-operator/config/rbac/role.yaml index ba840f0c27f..3d0fc924b13 100644 --- a/ray-operator/config/rbac/role.yaml +++ 
b/ray-operator/config/rbac/role.yaml @@ -107,6 +107,19 @@ rules: - patch - update - watch +- apiGroups: + - gateway.networking.k8s.io + resources: + - gateways + - httproutes + verbs: + - create + - delete + - get + - list + - patch + - update + - watch - apiGroups: - networking.k8s.io resources: diff --git a/ray-operator/controllers/ray/common/association.go b/ray-operator/controllers/ray/common/association.go index 63eefa94bc4..1f2ce2a9270 100644 --- a/ray-operator/controllers/ray/common/association.go +++ b/ray-operator/controllers/ray/common/association.go @@ -203,3 +203,17 @@ func RayClusterNetworkResourcesOptions(instance *rayv1.RayCluster) AssociationOp }, } } + +func RayServiceGatewayNamespacedName(rayService *rayv1.RayService) types.NamespacedName { + return types.NamespacedName{ + Name: fmt.Sprintf("%s-%s", rayService.Name, "gateway"), + Namespace: rayService.Namespace, + } +} + +func RayServiceHTTPRouteNamespacedName(rayService *rayv1.RayService) types.NamespacedName { + return types.NamespacedName{ + Name: fmt.Sprintf("httproute-%s", rayService.Name), + Namespace: rayService.Namespace, + } +} diff --git a/ray-operator/controllers/ray/rayservice_controller.go b/ray-operator/controllers/ray/rayservice_controller.go index 7a1a50a36f6..673ed83b3f8 100644 --- a/ray-operator/controllers/ray/rayservice_controller.go +++ b/ray-operator/controllers/ray/rayservice_controller.go @@ -21,6 +21,7 @@ import ( "k8s.io/apimachinery/pkg/util/yaml" "k8s.io/client-go/tools/record" "k8s.io/utils/lru" + "k8s.io/utils/ptr" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/builder" "sigs.k8s.io/controller-runtime/pkg/client" @@ -28,6 +29,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/manager" "sigs.k8s.io/controller-runtime/pkg/predicate" "sigs.k8s.io/controller-runtime/pkg/reconcile" + gwv1 "sigs.k8s.io/gateway-api/apis/v1" rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1" "github.com/ray-project/kuberay/ray-operator/controllers/ray/common" @@ -90,6 +92,8 @@ func NewRayServiceReconciler(_ context.Context, mgr manager.Manager, provider ut // +kubebuilder:rbac:groups=core,resources=services/proxy,verbs=get;update;patch // +kubebuilder:rbac:groups=coordination.k8s.io,resources=leases,verbs=get;list;create;update // +kubebuilder:rbac:groups=core,resources=serviceaccounts,verbs=get;list;watch;create;delete +// +kubebuilder:rbac:groups="gateway.networking.k8s.io",resources=gateways,verbs=get;list;watch;create;update;patch;delete +// +kubebuilder:rbac:groups="gateway.networking.k8s.io",resources=httproutes,verbs=get;list;watch;create;update;patch;delete // +kubebuilder:rbac:groups="rbac.authorization.k8s.io",resources=roles,verbs=get;list;watch;create;delete;update // +kubebuilder:rbac:groups="rbac.authorization.k8s.io",resources=rolebindings,verbs=get;list;watch;create;delete @@ -142,10 +146,30 @@ func (r *RayServiceReconciler) Reconcile(ctx context.Context, request ctrl.Reque return ctrl.Result{RequeueAfter: ServiceDefaultRequeueDuration}, err } + // Check if IncrementalUpgrade is enabled, if so reconcile Gateway objects. + if utils.IsIncrementalUpgradeEnabled(&rayServiceInstance.Spec) && activeRayClusterInstance != nil { + // Creates a Gateway CR that points to the head services of + // the active and pending (if it exists) RayClusters. For incremental upgrades, + // the Gateway endpoint is used rather than the Serve service. 
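+ // Traffic is then split between the active and pending head services by the weighted HTTPRoute reconciled below.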
+ gateway, err := r.reconcileGateway(ctx, rayServiceInstance) + if err != nil { + return ctrl.Result{RequeueAfter: ServiceDefaultRequeueDuration}, err + } + rayServiceInstance.Spec.Gateway = gateway + // Create or update the HTTPRoute attached to this RayService's Gateway + httpRoute, err := r.reconcileHTTPRoute(ctx, rayServiceInstance) + if err != nil { + return ctrl.Result{RequeueAfter: ServiceDefaultRequeueDuration}, err + } + rayServiceInstance.Spec.HTTPRoute = httpRoute + } + // Reconcile serve applications for active and/or pending clusters // 1. If there is a pending cluster, reconcile serve applications for the pending cluster. // 2. If there are both active and pending clusters, reconcile serve applications for the pending cluster only. // 3. If there is no pending cluster, reconcile serve applications for the active cluster. + // 4. During an IncrementalUpgrade, reconcileServe will reconcile either the pending or active cluster based + // on total TargetCapacity. var isActiveClusterReady, isPendingClusterReady bool = false, false var activeClusterServeApplications, pendingClusterServeApplications map[string]rayv1.AppStatus = nil, nil if pendingRayClusterInstance != nil { @@ -278,6 +302,21 @@ func (r *RayServiceReconciler) calculateStatus(ctx context.Context, rayServiceIn } logger.Info("Preparing a new pending RayCluster instance by setting RayClusterName", "clusterName", rayServiceInstance.Status.PendingServiceStatus.RayClusterName) + + if utils.IsIncrementalUpgradeEnabled(&rayServiceInstance.Spec) { + // Set IncrementalUpgrade related Status fields for new pending RayCluster if enabled + if rayServiceInstance.Status.ActiveServiceStatus.RayClusterName == "" { + // If no Active RayCluster exists - default to starting with 100% TargetCapacity. + if rayServiceInstance.Status.ActiveServiceStatus.TargetCapacity == nil { + rayServiceInstance.Status.PendingServiceStatus.TargetCapacity = ptr.To(int32(100)) + } + } else if meta.IsStatusConditionTrue(rayServiceInstance.Status.Conditions, string(rayv1.UpgradeInProgress)) { + // Pending RayCluster during an upgrade should start with 0% TargetCapacity. 
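+ // Its TrafficRoutedPercent is later raised in StepSizePercent increments (every IntervalSeconds) by the HTTPRoute reconciliation.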
+ if rayServiceInstance.Status.PendingServiceStatus.TargetCapacity == nil { + rayServiceInstance.Status.PendingServiceStatus.TargetCapacity = ptr.To(int32(0)) + } + } + } } serveEndPoints := &corev1.Endpoints{} @@ -302,6 +341,7 @@ func (r *RayServiceReconciler) calculateStatus(ctx context.Context, rayServiceIn if meta.IsStatusConditionTrue(rayServiceInstance.Status.Conditions, string(rayv1.RayServiceReady)) { rayServiceInstance.Status.ServiceStatus = rayv1.Running } + return nil } @@ -392,8 +432,8 @@ func isZeroDowntimeUpgradeEnabled(ctx context.Context, upgradeStrategy *rayv1.Ra if upgradeStrategy != nil { upgradeType := upgradeStrategy.Type if upgradeType != nil { - if *upgradeType != rayv1.NewCluster { - logger.Info("Zero-downtime upgrade is disabled because UpgradeStrategy.Type is not set to NewCluster.") + if *upgradeType != rayv1.NewCluster && *upgradeType != rayv1.IncrementalUpgrade { + logger.Info("Zero-downtime upgrade is disabled because UpgradeStrategy.Type is not set to %s or %s.", string(rayv1.NewCluster), string(rayv1.IncrementalUpgrade)) return false } return true @@ -407,6 +447,282 @@ func isZeroDowntimeUpgradeEnabled(ctx context.Context, upgradeStrategy *rayv1.Ra return true } +func (r *RayServiceReconciler) createGateway(rayServiceInstance *rayv1.RayService) (*gwv1.Gateway, error) { + options := utils.GetRayServiceIncrementalUpgradeOptions(&rayServiceInstance.Spec) + if options == nil { + return nil, errstd.New("Missing RayService IncrementalUpgradeOptions during upgrade") + } + gatewayName := rayServiceInstance.Name + "-gateway" + + // Define the desired Gateway object + rayServiceGateway := &gwv1.Gateway{ + ObjectMeta: metav1.ObjectMeta{ + Name: gatewayName, + Namespace: rayServiceInstance.Namespace, + }, + Spec: gwv1.GatewaySpec{ + GatewayClassName: gwv1.ObjectName(options.GatewayClassName), + }, + } + + rayServiceGateway.Spec.Listeners = utils.GetGatewayListenersForRayService(rayServiceInstance) + + return rayServiceGateway, nil +} + +// `reconcileGateway` reconciles a Gateway resource for a RayService. The possible cases are: +// (1) Create a new Gateway instance. (2) Update the Gateway instance if RayService has updated. (3) Do nothing. +func (r *RayServiceReconciler) reconcileGateway(ctx context.Context, rayServiceInstance *rayv1.RayService) (*gwv1.Gateway, error) { + logger := ctrl.LoggerFrom(ctx) + var err error + + // Construct desired Gateway object for RayService + desiredGateway, err := r.createGateway(rayServiceInstance) + if err != nil { + logger.Error(err, "Failed to build Gateway object for Rayservice") + return nil, err + } + + // Check for existing RayService Gateway, create the desired Gateway if none is found + existingGateway := &gwv1.Gateway{} + if err := r.Get(ctx, common.RayServiceGatewayNamespacedName(rayServiceInstance), existingGateway); err != nil { + if errors.IsNotFound(err) { + // Set the ownership in order to do the garbage collection by k8s. 
+ if err := ctrl.SetControllerReference(rayServiceInstance, desiredGateway, r.Scheme); err != nil { + return nil, err + } + logger.Info("Creating a new Gateway instance", "Gateway Listeners", desiredGateway.Spec.Listeners) + if err := r.Create(ctx, desiredGateway); err != nil { + r.Recorder.Eventf(rayServiceInstance, corev1.EventTypeWarning, string(utils.FailedToCreateGateway), "Failed to create Gateway for RayService %s/%s: %v", desiredGateway.Namespace, desiredGateway.Name, err) + return nil, err + } + r.Recorder.Eventf(rayServiceInstance, corev1.EventTypeNormal, string(utils.UpdatedRayCluster), "Created Gateway for RayService %s/%s", desiredGateway.Namespace, desiredGateway.Name) + return desiredGateway, nil + } + return nil, err + } + + // If Gateway already exists, check if update is needed to reach desired state + if !reflect.DeepEqual(existingGateway.Spec, desiredGateway.Spec) { + logger.Info("Updating existing Gateway", "name", existingGateway.Name) + existingGateway.Spec = desiredGateway.Spec + if err := r.Update(ctx, existingGateway); err != nil { + r.Recorder.Eventf(rayServiceInstance, corev1.EventTypeWarning, string(utils.FailedToUpdateGateway), "Failed to update the Gateway %s/%s: %v", existingGateway.Namespace, existingGateway.Name, err) + } + r.Recorder.Eventf(rayServiceInstance, corev1.EventTypeNormal, string(utils.UpdatedGateway), "Updated the Gateway %s/%s", existingGateway.Namespace, existingGateway.Name) + } + + return existingGateway, nil +} + +// createHTTPRoute creates a desired HTTPRoute object based on a given RayService instance with +// weights based on TrafficRoutedPercent. +func (r *RayServiceReconciler) createHTTPRoute(ctx context.Context, rayServiceInstance *rayv1.RayService) (*gwv1.HTTPRoute, error) { + logger := ctrl.LoggerFrom(ctx) + + // Retrieve Gateway instance to attach this HTTPRoute to + gatewayInstance := &gwv1.Gateway{} + if err := r.Get(ctx, common.RayServiceGatewayNamespacedName(rayServiceInstance), gatewayInstance); err != nil { + return nil, err + } + + // Define the desired HTTPRoute name and basic object + httpRouteName := fmt.Sprintf("httproute-%s", rayServiceInstance.Name) + desiredHTTPRoute := &gwv1.HTTPRoute{ + ObjectMeta: metav1.ObjectMeta{ + Name: httpRouteName, + Namespace: rayServiceInstance.Namespace, + }, + Spec: gwv1.HTTPRouteSpec{ + CommonRouteSpec: gwv1.CommonRouteSpec{ + ParentRefs: []gwv1.ParentReference{ + { + Name: gwv1.ObjectName(gatewayInstance.Name), + Namespace: ptr.To(gwv1.Namespace(gatewayInstance.Namespace)), + }, + }, + }, + }, + } + + // Retrieve the active RayCluster + activeRayCluster, err := r.getRayClusterByNamespacedName(ctx, common.RayServiceActiveRayClusterNamespacedName(rayServiceInstance)) + if err != nil || activeRayCluster == nil || activeRayCluster.Status.Head.ServiceName == "" { + logger.Info("No active RayCluster, skipping HTTPRoute creation") + return nil, err + } + oldClusterHeadSvcName := activeRayCluster.Status.Head.ServiceName + oldHeadSvc := &corev1.Service{} + if err := r.Get(ctx, client.ObjectKey{Name: oldClusterHeadSvcName, Namespace: rayServiceInstance.Namespace}, oldHeadSvc); err != nil { + logger.Error(err, "Failed to retrieve active RayCluster head service") + return nil, err + } + + // Attempt to retrieve pending RayCluster + pendingRayCluster, err := r.getRayClusterByNamespacedName(ctx, common.RayServicePendingRayClusterNamespacedName(rayServiceInstance)) + hasPendingCluster := (err == nil && pendingRayCluster != nil && pendingRayCluster.Status.Head.ServiceName != "") + if err != nil && 
!errors.IsNotFound(err) { + logger.Info("Failed to retrieve pending RayCluster.") + } + + activeServiceStatus := rayServiceInstance.Status.ActiveServiceStatus + + var backendRefs []gwv1.HTTPBackendRef + + // Configure HTTPRoute to split traffic between active and pending clusters during an incremental upgrade + if hasPendingCluster { + newClusterHeadSvcName := pendingRayCluster.Status.Head.ServiceName + newHeadSvc := &corev1.Service{} + if err := r.Get(ctx, client.ObjectKey{Name: newClusterHeadSvcName, Namespace: rayServiceInstance.Namespace}, newHeadSvc); err != nil { + logger.Error(err, "Failed to retrieve pending RayCluster head service") + return nil, err + } + + options := utils.GetRayServiceIncrementalUpgradeOptions(&rayServiceInstance.Spec) + if options == nil { + return nil, errstd.New("Missing RayService IncrementalUpgradeOptions") + } + + // Retrieve TrafficRoutedPercent for old and upgraded RayClusters. + pendingServiceStatus := rayServiceInstance.Status.PendingServiceStatus + newClusterWeight := pendingServiceStatus.TrafficRoutedPercent + oldClusterWeight := activeServiceStatus.TrafficRoutedPercent + + // If IntervalSeconds has passed since LastTrafficMigratedTime, migrate + // StepSizePercent traffic to the pending cluster. + intervalSeconds := time.Duration(*options.IntervalSeconds) * time.Second + lastTrafficMigratedTime := pendingServiceStatus.LastTrafficMigratedTime + if (newClusterWeight != nil && oldClusterWeight != nil) && (lastTrafficMigratedTime == nil || time.Since(lastTrafficMigratedTime.Time) >= intervalSeconds) { + // Wait an initial iteration before migrating StepSizePercent. + if lastTrafficMigratedTime != nil { + logger.Info("Updating cluster weights by StepSizePercent each") + oldClusterWeight = ptr.To(max(*oldClusterWeight-*options.StepSizePercent, 0)) + newClusterWeight = ptr.To(min(*newClusterWeight+*options.StepSizePercent, 100)) + } + rayServiceInstance.Status.PendingServiceStatus.LastTrafficMigratedTime = &metav1.Time{Time: time.Now()} + rayServiceInstance.Status.ActiveServiceStatus.LastTrafficMigratedTime = &metav1.Time{Time: time.Now()} + } + + // Set weights for initial iteration. + if newClusterWeight == nil { + // Pending RayCluster should scale up from 0 TrafficRoutedPercent. + newClusterWeight = ptr.To(int32(0)) + } + if oldClusterWeight == nil { + // Active RayCluster should scale down from 100 TrafficRoutedPercent. + oldClusterWeight = ptr.To(int32(100)) + } + // HTTPRoute weights should never exceed current TargetCapacity for each cluster. 
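+		// For example, if the pending cluster's weight would advance to 40 while its
+		// TargetCapacity is still 30, the weight is clamped to 30 so that routed traffic
+		// never exceeds the serve capacity already provisioned on that cluster.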
+ newClusterTargetCapacity := pendingServiceStatus.TargetCapacity + oldClusterTargetCapacity := activeServiceStatus.TargetCapacity + if newClusterTargetCapacity != nil { + newClusterWeight = ptr.To(min(*newClusterWeight, *newClusterTargetCapacity)) + } + if oldClusterTargetCapacity != nil { + oldClusterWeight = ptr.To(min(*oldClusterWeight, *oldClusterTargetCapacity)) + } + + backendRefs = []gwv1.HTTPBackendRef{ + { + BackendRef: gwv1.BackendRef{ + BackendObjectReference: gwv1.BackendObjectReference{ + Name: gwv1.ObjectName(oldClusterHeadSvcName), + Namespace: ptr.To(gwv1.Namespace(rayServiceInstance.Namespace)), + Port: ptr.To(gwv1.PortNumber(8000)), // set to Serve port + }, + Weight: oldClusterWeight, + }, + }, + { + BackendRef: gwv1.BackendRef{ + BackendObjectReference: gwv1.BackendObjectReference{ + Name: gwv1.ObjectName(newClusterHeadSvcName), + Namespace: ptr.To(gwv1.Namespace(rayServiceInstance.Namespace)), + Port: ptr.To(gwv1.PortNumber(8000)), + }, + Weight: newClusterWeight, + }, + }, + } + logger.Info("Updating TrafficRoutedPercent to", "oldClusterWeight", oldClusterWeight, "newClusterWeight", newClusterWeight) + rayServiceInstance.Status.ActiveServiceStatus.TrafficRoutedPercent = oldClusterWeight + rayServiceInstance.Status.PendingServiceStatus.TrafficRoutedPercent = newClusterWeight + } else { + // No pending cluster — route 100% to active RayCluster + backendRefs = []gwv1.HTTPBackendRef{ + { + BackendRef: gwv1.BackendRef{ + BackendObjectReference: gwv1.BackendObjectReference{ + Name: gwv1.ObjectName(oldClusterHeadSvcName), + Namespace: ptr.To(gwv1.Namespace(rayServiceInstance.Namespace)), + Port: ptr.To(gwv1.PortNumber(8000)), + }, + Weight: ptr.To(int32(100)), + }, + }, + } + rayServiceInstance.Status.ActiveServiceStatus.TrafficRoutedPercent = ptr.To(int32(100)) + } + + desiredHTTPRoute.Spec.Rules = []gwv1.HTTPRouteRule{ + { + Matches: []gwv1.HTTPRouteMatch{ + { + Path: &gwv1.HTTPPathMatch{ + Type: ptr.To(gwv1.PathMatchPathPrefix), + Value: ptr.To("/"), + }, + }, + }, + BackendRefs: backendRefs, + }, + } + + return desiredHTTPRoute, nil +} + +// reconcileHTTPRoute reconciles a HTTPRoute resource for a RayService to route traffic during an IncrementalUpgrade. +func (r *RayServiceReconciler) reconcileHTTPRoute(ctx context.Context, rayServiceInstance *rayv1.RayService) (*gwv1.HTTPRoute, error) { + logger := ctrl.LoggerFrom(ctx) + var err error + + desiredHTTPRoute, err := r.createHTTPRoute(ctx, rayServiceInstance) + if err != nil { + logger.Error(err, "Failed to build HTTPRoute for RayService upgrade") + return nil, err + } + + // Check for existing HTTPRoute for RayService + existingHTTPRoute := &gwv1.HTTPRoute{} + if err := r.Get(ctx, common.RayServiceHTTPRouteNamespacedName(rayServiceInstance), existingHTTPRoute); err != nil { + if errors.IsNotFound(err) { + // Set the ownership in order to do the garbage collection by k8s. 
+			if err := ctrl.SetControllerReference(rayServiceInstance, desiredHTTPRoute, r.Scheme); err != nil {
+				return nil, err
+			}
+			if err = r.Create(ctx, desiredHTTPRoute); err != nil {
+				r.Recorder.Eventf(rayServiceInstance, corev1.EventTypeWarning, string(utils.FailedToCreateHTTPRoute), "Failed to create the HTTPRoute for RayService %s/%s: %v", desiredHTTPRoute.Namespace, desiredHTTPRoute.Name, err)
+				return nil, err
+			}
+			r.Recorder.Eventf(rayServiceInstance, corev1.EventTypeNormal, string(utils.UpdatedHTTPRoute), "Created HTTPRoute for RayService %s/%s", desiredHTTPRoute.Namespace, desiredHTTPRoute.Name)
+			return desiredHTTPRoute, nil
+		}
+		return nil, err
+	}
+
+	// If HTTPRoute already exists, check if update is needed
+	if !reflect.DeepEqual(existingHTTPRoute.Spec, desiredHTTPRoute.Spec) {
+		logger.Info("Updating existing HTTPRoute", "name", desiredHTTPRoute.Name)
+		existingHTTPRoute.Spec = desiredHTTPRoute.Spec
+		if err := r.Update(ctx, existingHTTPRoute); err != nil {
+			r.Recorder.Eventf(rayServiceInstance, corev1.EventTypeWarning, string(utils.FailedToUpdateHTTPRoute), "Failed to update the HTTPRoute %s/%s: %v", existingHTTPRoute.Namespace, existingHTTPRoute.Name, err)
+			return nil, err
+		}
+		r.Recorder.Eventf(rayServiceInstance, corev1.EventTypeNormal, string(utils.UpdatedHTTPRoute), "Updated the HTTPRoute %s/%s", existingHTTPRoute.Namespace, existingHTTPRoute.Name)
+	}
+
+	return existingHTTPRoute, nil
+}
+
 // `reconcileRayCluster` reconciles the active and pending Ray clusters. There are 4 possible cases:
 // (1) Create a new pending cluster. (2) Update the active cluster. (3) Update the pending cluster. (4) Do nothing.
 func (r *RayServiceReconciler) reconcileRayCluster(ctx context.Context, rayServiceInstance *rayv1.RayService) (*rayv1.RayCluster, *rayv1.RayCluster, error) {
@@ -767,6 +1083,151 @@ func (r *RayServiceReconciler) updateServeDeployment(ctx context.Context, raySer
 	return nil
 }
 
+// checkIfNeedIncrementalUpgradeUpdate returns whether the controller should adjust the target_capacity
+// of the Serve config associated with a RayCluster during an IncrementalUpgrade.
+func (r *RayServiceReconciler) checkIfNeedIncrementalUpgradeUpdate(ctx context.Context, rayServiceInstance *rayv1.RayService) (bool, string) {
+	activeRayServiceStatus := rayServiceInstance.Status.ActiveServiceStatus
+	pendingRayServiceStatus := rayServiceInstance.Status.PendingServiceStatus
+
+	if activeRayServiceStatus.RayClusterName == "" || pendingRayServiceStatus.RayClusterName == "" {
+		return false, "Both active and pending RayCluster instances required for incremental upgrade."
+	}
+
+	// Validate Gateway and HTTPRoute objects are ready
+	gatewayInstance := &gwv1.Gateway{}
+	if err := r.Get(ctx, common.RayServiceGatewayNamespacedName(rayServiceInstance), gatewayInstance); err != nil {
+		return false, "Failed to retrieve Gateway for RayService."
+	}
+	if !utils.IsGatewayReady(gatewayInstance) {
+		return false, "Gateway for RayService IncrementalUpgrade is not ready."
+	}
+
+	httpRouteInstance := &gwv1.HTTPRoute{}
+	if err := r.Get(ctx, common.RayServiceHTTPRouteNamespacedName(rayServiceInstance), httpRouteInstance); err != nil {
+		return false, "Failed to retrieve HTTPRoute for RayService."
+	}
+	if !utils.IsHTTPRouteReady(gatewayInstance, httpRouteInstance) {
+		return false, "HTTPRoute for RayService IncrementalUpgrade is not ready."
+	}
+
+	// Retrieve the current observed IncrementalUpgrade Status fields for each RayService.
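+	// A nil value means the upgrade status has not been initialized yet, so an update is required.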
+ if activeRayServiceStatus.TargetCapacity == nil || activeRayServiceStatus.TrafficRoutedPercent == nil { + return true, "Active RayServiceStatus missing TargetCapacity or TrafficRoutedPercent." + } + if pendingRayServiceStatus.TargetCapacity == nil || pendingRayServiceStatus.TrafficRoutedPercent == nil { + return true, "Pending RayServiceStatus missing TargetCapacity or TrafficRoutedPercent." + } + activeTargetCapacity := int(*activeRayServiceStatus.TargetCapacity) + pendingTargetCapacity := int(*pendingRayServiceStatus.TargetCapacity) + pendingTrafficRoutedPercent := int(*pendingRayServiceStatus.TrafficRoutedPercent) + + if pendingTargetCapacity < 100 || pendingTrafficRoutedPercent < 100 { + return true, "Pending RayCluster has not finished scaling up." + } else if activeTargetCapacity == 0 && pendingTargetCapacity == 100 { + return false, "All traffic has migrated to the upgraded cluster and IncrementalUpgrade is complete." + } + return true, "Active RayCluster TargetCapacity has not finished scaling down." +} + +// updateServeTargetCapacity reconcile the target_capacity of the ServeConfig for a given RayCluster during +// an IncrementalUpgrade while also updating the Status.TargetCapacity of the Active and Pending RayServices. +func (r *RayServiceReconciler) reconcileServeTargetCapacity(ctx context.Context, rayServiceInstance *rayv1.RayService, rayDashboardClient utils.RayDashboardClientInterface) error { + logger := ctrl.LoggerFrom(ctx) + logger.Info("reconcileServeTargetCapacity", "RayService", rayServiceInstance.Name) + + if !utils.IsIncrementalUpgradeEnabled(&rayServiceInstance.Spec) { + return nil + } + + activeRayServiceStatus := &rayServiceInstance.Status.ActiveServiceStatus + pendingRayServiceStatus := &rayServiceInstance.Status.PendingServiceStatus + + // Set initial TargetCapacity values if unset + if activeRayServiceStatus.TargetCapacity == nil { + activeRayServiceStatus.TargetCapacity = ptr.To(int32(100)) + } + if pendingRayServiceStatus.TargetCapacity == nil { + pendingRayServiceStatus.TargetCapacity = ptr.To(int32(0)) + } + + // Retrieve the current observed Status fields for IncrementalUpgrade + activeTargetCapacity := *activeRayServiceStatus.TargetCapacity + pendingTargetCapacity := *pendingRayServiceStatus.TargetCapacity + pendingTrafficRoutedPercent := *pendingRayServiceStatus.TrafficRoutedPercent + + // Defer updating the target_capacity until traffic weights are updated + if pendingTargetCapacity != pendingTrafficRoutedPercent { + logger.Info("Traffic is currently being migrated to pending cluster", "RayCluster", pendingRayServiceStatus.RayClusterName, "TargetCapacity", pendingTargetCapacity, "TrafficRoutedPercent", pendingTrafficRoutedPercent) + return nil + } + + // Retrieve MaxSurgePercent - the maximum amount to change TargetCapacity by + options := utils.GetRayServiceIncrementalUpgradeOptions(&rayServiceInstance.Spec) + if options == nil { + return errstd.New("Missing RayService IncrementalUpgradeOptions during upgrade") + } + maxSurgePercent := *options.MaxSurgePercent + + // There are two cases: + // 1. The total target_capacity is greater than 100. This means the pending RayCluster has + // scaled up traffic and the active RayCluster can be scaled down by MaxSurgePercent. + // 2. The total target_capacity is equal to 100. This means the pending RayCluster can + // increase its target_capacity by MaxSurgePercent. 
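+	// For example, with MaxSurgePercent=20 the capacities progress as
+	// (active=100, pending=0) -> (100, 20) -> (80, 20) -> (80, 40) -> ... -> (0, 100),
+	// waiting for the HTTPRoute traffic weights to catch up between each step.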
+ var clusterName string + var goalTargetCapacity int32 + if activeTargetCapacity+pendingTargetCapacity > int32(100) { + // Scale down the Active RayCluster TargetCapacity on this iteration. + goalTargetCapacity = max(int32(0), activeTargetCapacity-maxSurgePercent) + clusterName = activeRayServiceStatus.RayClusterName + activeRayServiceStatus.TargetCapacity = ptr.To(goalTargetCapacity) + logger.Info("Setting target_capacity for active Raycluster", "Raycluster", clusterName, "target_capacity", goalTargetCapacity) + } else { + // Scale up the Pending RayCluster TargetCapacity on this iteration. + goalTargetCapacity = min(int32(100), pendingTargetCapacity+maxSurgePercent) + clusterName = pendingRayServiceStatus.RayClusterName + pendingRayServiceStatus.TargetCapacity = ptr.To(goalTargetCapacity) + logger.Info("Setting target_capacity for pending Raycluster", "Raycluster", clusterName, "target_capacity", goalTargetCapacity) + } + + // Retrieve cached ServeConfig from last reconciliation for cluster to update + cachedConfig := r.getServeConfigFromCache(rayServiceInstance, clusterName) + if cachedConfig == "" { + cachedConfig = rayServiceInstance.Spec.ServeConfigV2 + } + logger.Info("Retrieving ServeConfig", "cached", cachedConfig, "ServeConfigV2", rayServiceInstance.Spec.ServeConfigV2) + serveConfig := make(map[string]interface{}) + if err := yaml.Unmarshal([]byte(cachedConfig), &serveConfig); err != nil { + return err + } + + // Check if ServeConfig requires update + if currentTargetCapacity, ok := serveConfig["target_capacity"].(float64); ok { + if int32(currentTargetCapacity) == goalTargetCapacity { + // No update required, return early + return nil + } + } + + // Otherwise, update the target_capacity for the cached ServeConfig + serveConfig["target_capacity"] = goalTargetCapacity + configJson, err := json.Marshal(serveConfig) + if err != nil { + return fmt.Errorf("failed to marshal converted serve config into bytes: %w", err) + } + logger.Info("reconcileServeTargetCapacity", "MULTI_APP json config", string(configJson)) + if err := rayDashboardClient.UpdateDeployments(ctx, configJson); err != nil { + err = fmt.Errorf( + "fail to create / update target_capacity for Serve applications. err: %w", err) + return err + } + + // Only update the target_capacity of one RayCluster at a time. + r.cacheServeConfig(rayServiceInstance, clusterName) + logger.Info("reconcileServeTargetCapacity", "message", "Cached Serve config for Ray cluster with the key", "rayClusterName", clusterName) + + return nil +} + // `getAndCheckServeStatus` gets Serve applications' and deployments' statuses and check whether the // Serve applications are ready to serve incoming traffic or not. 
It returns three values: // @@ -965,6 +1426,22 @@ func (r *RayServiceReconciler) reconcileServe(ctx context.Context, rayServiceIns } r.Recorder.Eventf(rayServiceInstance, corev1.EventTypeNormal, string(utils.UpdatedServeApplications), "Updated serve applications to the RayCluster %s/%s", rayClusterInstance.Namespace, rayClusterInstance.Name) } + if utils.IsIncrementalUpgradeEnabled(&rayServiceInstance.Spec) && meta.IsStatusConditionTrue(rayServiceInstance.Status.Conditions, string(rayv1.UpgradeInProgress)) { + incrementalUpgradeUpdate, reason := r.checkIfNeedIncrementalUpgradeUpdate(ctx, rayServiceInstance) + logger.Info("checkIfNeedIncrementalUpgradeUpdate", "incrementalUpgradeUpdate", incrementalUpgradeUpdate, "reason", reason) + if incrementalUpgradeUpdate { + if err := r.reconcileServeTargetCapacity(ctx, rayServiceInstance, rayDashboardClient); err != nil { + r.Recorder.Eventf(rayServiceInstance, corev1.EventTypeWarning, string(utils.FailedToUpdateTargetCapacity), "Failed to update target_capacity of serve applications to the RayService %s/%s: %v", rayServiceInstance.Namespace, rayServiceInstance.Name, err) + return false, serveApplications, err + } + r.Recorder.Eventf(rayServiceInstance, corev1.EventTypeNormal, string(utils.UpdatedServeTargetCapacity), + "Updated target_capacity of serve applications to the RayService %s/%s", rayServiceInstance.Namespace, rayServiceInstance.Name) + + // Don't switch to the pending RayCluster until IncrementalUpgrade is complete. + return false, serveApplications, nil + } + } + return isReady, serveApplications, nil } diff --git a/ray-operator/controllers/ray/rayservice_controller_unit_test.go b/ray-operator/controllers/ray/rayservice_controller_unit_test.go index 638af6b26fb..1b2b4679061 100644 --- a/ray-operator/controllers/ray/rayservice_controller_unit_test.go +++ b/ray-operator/controllers/ray/rayservice_controller_unit_test.go @@ -17,9 +17,11 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/client-go/tools/record" + "k8s.io/utils/lru" "k8s.io/utils/ptr" "sigs.k8s.io/controller-runtime/pkg/client" clientFake "sigs.k8s.io/controller-runtime/pkg/client/fake" + gwv1 "sigs.k8s.io/gateway-api/apis/v1" rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1" "github.com/ray-project/kuberay/ray-operator/controllers/ray/common" @@ -27,6 +29,7 @@ import ( "github.com/ray-project/kuberay/ray-operator/controllers/ray/utils/dashboardclient" utiltypes "github.com/ray-project/kuberay/ray-operator/controllers/ray/utils/types" "github.com/ray-project/kuberay/ray-operator/pkg/client/clientset/versioned/scheme" + "github.com/ray-project/kuberay/ray-operator/pkg/features" "github.com/ray-project/kuberay/ray-operator/test/support" ) @@ -1319,3 +1322,681 @@ func TestRayClusterDeletionDelaySeconds(t *testing.T) { }) } } + +// Helper function to create a RayService object undergoing an incremental upgrade. 
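+// The active cluster's status reports routedPercent of traffic and the pending cluster reports
+// the remainder (100 - routedPercent), so tests can model an upgrade at any intermediate point.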
+func makeIncrementalUpgradeRayService( + withOptions bool, + gatewayClassName string, + stepSizePercent *int32, + intervalSeconds *int32, + routedPercent *int32, + lastTrafficMigratedTime *metav1.Time, +) *rayv1.RayService { + spec := rayv1.RayServiceSpec{ + ServeService: &corev1.Service{ + ObjectMeta: metav1.ObjectMeta{ + Name: "serve-service", + Namespace: "test-ns", + }, + Spec: corev1.ServiceSpec{ + Ports: []corev1.ServicePort{ + { + Name: "http", + Port: 8000, + }, + }, + }, + }, + } + if withOptions { + spec.UpgradeStrategy = &rayv1.RayServiceUpgradeStrategy{ + Type: ptr.To(rayv1.IncrementalUpgrade), + IncrementalUpgradeOptions: &rayv1.IncrementalUpgradeOptions{ + GatewayClassName: gatewayClassName, + StepSizePercent: stepSizePercent, + IntervalSeconds: intervalSeconds, + }, + } + } + + return &rayv1.RayService{ + ObjectMeta: metav1.ObjectMeta{ + Name: "incremental-ray-service", + Namespace: "test-ns", + }, + Spec: spec, + Status: rayv1.RayServiceStatuses{ + ActiveServiceStatus: rayv1.RayServiceStatus{ + RayClusterName: "active-ray-cluster", + RayClusterStatus: rayv1.RayClusterStatus{ + Head: rayv1.HeadInfo{ServiceName: "active-service"}, + }, + TrafficRoutedPercent: routedPercent, + LastTrafficMigratedTime: lastTrafficMigratedTime, + }, + PendingServiceStatus: rayv1.RayServiceStatus{ + RayClusterName: "pending-ray-cluster", + RayClusterStatus: rayv1.RayClusterStatus{ + Head: rayv1.HeadInfo{ServiceName: "pending-service"}, + }, + TrafficRoutedPercent: ptr.To(int32(100) - *routedPercent), + LastTrafficMigratedTime: lastTrafficMigratedTime, + }, + }, + } +} + +func TestCreateGateway(t *testing.T) { + serveService := &corev1.Service{ + ObjectMeta: metav1.ObjectMeta{ + Name: "serve-service", + Namespace: "test-ns", + }, + Spec: corev1.ServiceSpec{ + Ports: []corev1.ServicePort{ + { + Port: 8000, + }, + }, + }, + } + newScheme := runtime.NewScheme() + _ = corev1.AddToScheme(newScheme) + + fakeClient := clientFake.NewClientBuilder().WithScheme(newScheme).WithRuntimeObjects(serveService).Build() + reconciler := &RayServiceReconciler{ + Client: fakeClient, + } + + tests := []struct { + rayService *rayv1.RayService + name string + expectedGatewayName string + expectedClass string + expectedListeners int + expectErr bool + }{ + { + name: "valid gateway creation", + expectedGatewayName: "incremental-ray-service-gateway", + rayService: makeIncrementalUpgradeRayService(true, "gateway-class", ptr.To(int32(50)), ptr.To(int32(10)), ptr.To(int32(80)), &metav1.Time{Time: time.Now()}), + expectErr: false, + expectedClass: "gateway-class", + expectedListeners: 1, + }, + { + name: "missing IncrementalUpgradeOptions", + rayService: makeIncrementalUpgradeRayService(false, "gateway-class", ptr.To(int32(0)), ptr.To(int32(0)), ptr.To(int32(0)), &metav1.Time{Time: time.Now()}), + expectErr: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + gw, err := reconciler.createGateway(tt.rayService) + if tt.expectErr { + require.Error(t, err) + assert.Nil(t, gw) + } else { + require.NoError(t, err) + require.NotNil(t, gw) + assert.Equal(t, tt.expectedGatewayName, gw.Name) + assert.Equal(t, tt.rayService.Namespace, gw.Namespace) + assert.Equal(t, gwv1.ObjectName(tt.expectedClass), gw.Spec.GatewayClassName) + assert.Len(t, gw.Spec.Listeners, tt.expectedListeners) + } + }) + } +} + +func TestCreateHTTPRoute(t *testing.T) { + // Create re-used runtime objects for test cases + activeService := &corev1.Service{ + ObjectMeta: metav1.ObjectMeta{ + Name: "active-service", + Namespace: 
"test-ns", + }, + } + pendingService := &corev1.Service{ + ObjectMeta: metav1.ObjectMeta{ + Name: "pending-service", + Namespace: "test-ns", + }, + } + activeCluster := &rayv1.RayCluster{ + ObjectMeta: metav1.ObjectMeta{ + Name: "active-ray-cluster", + Namespace: "test-ns", + }, + Status: rayv1.RayClusterStatus{ + Head: rayv1.HeadInfo{ + ServiceName: "active-service", + }, + }, + } + pendingCluster := &rayv1.RayCluster{ + ObjectMeta: metav1.ObjectMeta{ + Name: "pending-ray-cluster", + Namespace: "test-ns", + }, + Status: rayv1.RayClusterStatus{ + Head: rayv1.HeadInfo{ + ServiceName: "pending-service", + }, + }, + } + gateway := &gwv1.Gateway{ + ObjectMeta: metav1.ObjectMeta{ + Name: "incremental-ray-service-gateway", + Namespace: "test-ns", + }, + } + + tests := []struct { + name string + rayService *rayv1.RayService + runtimeObjects []runtime.Object + routedPercent int32 + expectError bool + }{ + { + name: "valid HTTPRoute creation", + routedPercent: int32(80), + rayService: makeIncrementalUpgradeRayService(true, "gateway-class", ptr.To(int32(50)), ptr.To(int32(1000)), ptr.To(int32(80)), &metav1.Time{Time: time.Now()}), + runtimeObjects: []runtime.Object{activeService, pendingService, pendingCluster, activeCluster, gateway}, + expectError: false, + }, + { + name: "missing IncrementalUpgradeOptions", + routedPercent: int32(50), + rayService: makeIncrementalUpgradeRayService(false, "gateway-class", ptr.To(int32(50)), ptr.To(int32(120)), ptr.To(int32(50)), &metav1.Time{Time: time.Now()}), + runtimeObjects: []runtime.Object{activeService, pendingService, pendingCluster, activeCluster, gateway}, + expectError: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + newScheme := runtime.NewScheme() + _ = corev1.AddToScheme(newScheme) + _ = rayv1.AddToScheme(newScheme) + _ = gwv1.AddToScheme(newScheme) + + fakeClient := clientFake.NewClientBuilder().WithScheme(newScheme).WithRuntimeObjects(tt.runtimeObjects...).Build() + reconciler := RayServiceReconciler{ + Client: fakeClient, + Scheme: newScheme, + Recorder: record.NewFakeRecorder(1), + } + ctx := context.TODO() + + route, err := reconciler.createHTTPRoute(ctx, tt.rayService) + if tt.expectError { + require.Error(t, err) + assert.Nil(t, route) + } else { + require.NoError(t, err) + require.NotNil(t, route) + + assert.Equal(t, "httproute-incremental-ray-service", route.Name) + assert.Equal(t, "test-ns", route.Namespace) + + require.Len(t, route.Spec.Rules, 1) + rule := route.Spec.Rules[0] + require.Len(t, rule.BackendRefs, 2) + + assert.Equal(t, gwv1.ObjectName("active-service"), rule.BackendRefs[0].BackendRef.Name) + assert.Equal(t, gwv1.ObjectName("pending-service"), rule.BackendRefs[1].BackendRef.Name) + + assert.Equal(t, tt.routedPercent, *rule.BackendRefs[0].Weight) + assert.Equal(t, int32(100)-tt.routedPercent, *rule.BackendRefs[1].Weight) + } + }) + } +} + +func TestReconcileHTTPRoute(t *testing.T) { + newScheme := runtime.NewScheme() + _ = rayv1.AddToScheme(newScheme) + _ = corev1.AddToScheme(newScheme) + _ = gwv1.AddToScheme(newScheme) + + ctx := context.TODO() + namespace := "test-ns" + + // Create runtime objects for RayService + activeService := &corev1.Service{ + ObjectMeta: metav1.ObjectMeta{ + Name: "active-service", + Namespace: "test-ns", + }, + } + pendingService := &corev1.Service{ + ObjectMeta: metav1.ObjectMeta{ + Name: "pending-service", + Namespace: "test-ns", + }, + } + activeCluster := &rayv1.RayCluster{ + ObjectMeta: metav1.ObjectMeta{ + Name: "active-ray-cluster", + Namespace: "test-ns", 
+ }, + Status: rayv1.RayClusterStatus{ + Head: rayv1.HeadInfo{ + ServiceName: "active-service", + }, + }, + } + pendingCluster := &rayv1.RayCluster{ + ObjectMeta: metav1.ObjectMeta{ + Name: "pending-ray-cluster", + Namespace: "test-ns", + }, + Status: rayv1.RayClusterStatus{ + Head: rayv1.HeadInfo{ + ServiceName: "pending-service", + }, + }, + } + gateway := &gwv1.Gateway{ + ObjectMeta: metav1.ObjectMeta{ + Name: "incremental-ray-service-gateway", + Namespace: namespace, + }, + } + + // Prepare RayService instance + rayService := makeIncrementalUpgradeRayService(true, "test-gateway", ptr.To(int32(20)), ptr.To(int32(30)), ptr.To(int32(80)), ptr.To(metav1.Time{Time: time.Now()})) + runtimeObjects := []runtime.Object{rayService, activeService, pendingService, activeCluster, pendingCluster, gateway} + + fakeClient := clientFake.NewClientBuilder().WithScheme(newScheme).WithRuntimeObjects(runtimeObjects...).Build() + reconciler := RayServiceReconciler{ + Client: fakeClient, + Scheme: newScheme, + Recorder: record.NewFakeRecorder(10), + } + + tests := []struct { + name string + setupHTTPRoute func(r *RayServiceReconciler, rs *rayv1.RayService) *gwv1.HTTPRoute + expectedRouteName string + expectedWeight int32 + }{ + { + name: "creates new HTTPRoute if Spec.HTTPRoute is nil", + setupHTTPRoute: func(_ *RayServiceReconciler, rs *rayv1.RayService) *gwv1.HTTPRoute { + rs.Spec.HTTPRoute = nil + return nil + }, + expectedRouteName: "httproute-incremental-ray-service", + expectedWeight: 80, + }, + { + name: "updates existing HTTPRoute if spec differs", + setupHTTPRoute: func(r *RayServiceReconciler, rs *rayv1.RayService) *gwv1.HTTPRoute { + desired, err := r.createHTTPRoute(ctx, rs) + require.NoError(t, err) + + // Modify weight to trigger update + existing := desired.DeepCopy() + existing.Spec.Rules[0].BackendRefs[0].Weight = ptr.To(int32(5)) + rs.Spec.HTTPRoute = existing + return desired + }, + expectedRouteName: "httproute-incremental-ray-service", + expectedWeight: 80, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + tt.setupHTTPRoute(&reconciler, rayService) + + route, err := reconciler.reconcileHTTPRoute(ctx, rayService) + require.NoError(t, err) + require.NotNil(t, route) + assert.Equal(t, tt.expectedRouteName, route.Name) + assert.Equal(t, namespace, route.Namespace) + + // Check updated weights match expected + assert.Equal(t, tt.expectedWeight, *route.Spec.Rules[0].BackendRefs[0].Weight) + assert.Equal(t, 100-tt.expectedWeight, *route.Spec.Rules[0].BackendRefs[1].Weight) + + // Check ParentRef refers to the expected Gateway + parent := route.Spec.ParentRefs[0] + assert.Equal(t, gwv1.ObjectName("incremental-ray-service-gateway"), parent.Name) + assert.Equal(t, ptr.To(gwv1.Namespace("test-ns")), parent.Namespace) + }) + } +} + +func TestReconcileGateway(t *testing.T) { + newScheme := runtime.NewScheme() + _ = rayv1.AddToScheme(newScheme) + _ = corev1.AddToScheme(newScheme) + _ = gwv1.AddToScheme(newScheme) + + ctx := context.TODO() + namespace := "test-ns" + + // Prepare RayService instance + rayService := makeIncrementalUpgradeRayService( + true, + "gateway-class", + ptr.To(int32(20)), + ptr.To(int32(30)), + ptr.To(int32(80)), + ptr.To(metav1.Time{Time: time.Now()}), + ) + + runtimeObjects := []runtime.Object{ + rayService, + rayService.Spec.ServeService, + } + + fakeClient := clientFake.NewClientBuilder(). + WithScheme(newScheme). + WithRuntimeObjects(runtimeObjects...). 
+ Build() + + reconciler := RayServiceReconciler{ + Client: fakeClient, + Scheme: newScheme, + Recorder: record.NewFakeRecorder(10), + } + + tests := []struct { + name string + setupGateway func(r *RayServiceReconciler, rs *rayv1.RayService) *gwv1.Gateway + expectedGatewayName string + expectedClass string + expectedNumListeners int + }{ + { + name: "creates new Gateway if Spec.Gateway is missing during incremental upgrade", + setupGateway: func(_ *RayServiceReconciler, rs *rayv1.RayService) *gwv1.Gateway { + rs.Spec.Gateway = nil + return nil + }, + expectedGatewayName: "incremental-ray-service-gateway", + expectedClass: "gateway-class", + expectedNumListeners: 1, + }, + { + name: "update existing Gateway if desired Gateway spec differs", + setupGateway: func(r *RayServiceReconciler, rs *rayv1.RayService) *gwv1.Gateway { + desired, err := r.createGateway(rs) + require.NoError(t, err) + + existing := desired.DeepCopy() + existing.Spec.GatewayClassName = "some-other-class" + rs.Spec.Gateway = existing + return existing + }, + expectedGatewayName: "incremental-ray-service-gateway", + expectedClass: "gateway-class", + expectedNumListeners: 1, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + tt.setupGateway(&reconciler, rayService) + + gw, err := reconciler.reconcileGateway(ctx, rayService) + require.NoError(t, err) + require.NotNil(t, gw) + + assert.Equal(t, tt.expectedGatewayName, gw.Name) + assert.Equal(t, namespace, gw.Namespace) + assert.Equal(t, gwv1.ObjectName(tt.expectedClass), gw.Spec.GatewayClassName) + assert.Len(t, gw.Spec.Listeners, tt.expectedNumListeners) + }) + } +} + +func TestReconcileServeTargetCapacity(t *testing.T) { + features.SetFeatureGateDuringTest(t, features.RayServiceIncrementalUpgrade, true) + tests := []struct { + name string + updatedCluster string + activeCapacity int32 + pendingCapacity int32 + routedPercent int32 + maxSurgePercent int32 + expectedActiveCapacity int32 + expectedPendingCapacity int32 + }{ + { + name: "Scale up pending RayCluster when total TargetCapacity < 100", + activeCapacity: 70, + pendingCapacity: 10, + routedPercent: 10, + maxSurgePercent: 20, + expectedActiveCapacity: 70, + expectedPendingCapacity: 30, + updatedCluster: "pending", + }, + { + name: "Scale down active RayCluster when total TargetCapacity > 100", + activeCapacity: 80, + pendingCapacity: 30, + routedPercent: 30, + maxSurgePercent: 20, + expectedActiveCapacity: 60, + expectedPendingCapacity: 30, + updatedCluster: "active", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + ctx := context.TODO() + rayService := &rayv1.RayService{ + Spec: rayv1.RayServiceSpec{ + UpgradeStrategy: &rayv1.RayServiceUpgradeStrategy{ + Type: ptr.To(rayv1.IncrementalUpgrade), + IncrementalUpgradeOptions: &rayv1.IncrementalUpgradeOptions{ + MaxSurgePercent: ptr.To(tt.maxSurgePercent), + }, + }, + ServeConfigV2: `{"target_capacity": 0}`, + }, + Status: rayv1.RayServiceStatuses{ + ActiveServiceStatus: rayv1.RayServiceStatus{ + RayClusterName: "active", + TargetCapacity: ptr.To(tt.activeCapacity), + }, + PendingServiceStatus: rayv1.RayServiceStatus{ + RayClusterName: "pending", + TargetCapacity: ptr.To(tt.pendingCapacity), + TrafficRoutedPercent: ptr.To(tt.routedPercent), + }, + }, + } + + fakeDashboard := &utils.FakeRayDashboardClient{} + reconciler := &RayServiceReconciler{ + ServeConfigs: lru.New(10), // empty initial cache + } + err := reconciler.reconcileServeTargetCapacity(ctx, rayService, fakeDashboard) + require.NoError(t, err) + 
require.NotEmpty(t, fakeDashboard.LastUpdatedConfig) + + if tt.updatedCluster == "active" { + assert.Equal(t, tt.expectedActiveCapacity, *rayService.Status.ActiveServiceStatus.TargetCapacity) + assert.Equal(t, tt.pendingCapacity, *rayService.Status.PendingServiceStatus.TargetCapacity) + expectedServeConfig := `{"target_capacity":` + strconv.Itoa(int(tt.expectedActiveCapacity)) + `}` + assert.JSONEq(t, expectedServeConfig, string(fakeDashboard.LastUpdatedConfig)) + } else { + assert.Equal(t, tt.expectedPendingCapacity, *rayService.Status.PendingServiceStatus.TargetCapacity) + assert.Equal(t, tt.activeCapacity, *rayService.Status.ActiveServiceStatus.TargetCapacity) + expectedServeConfig := `{"target_capacity":` + strconv.Itoa(int(tt.expectedPendingCapacity)) + `}` + assert.JSONEq(t, expectedServeConfig, string(fakeDashboard.LastUpdatedConfig)) + } + }) + } +} + +// MakeGateway is a helper function to return an Gateway object +func makeGateway(name, namespace string, isReady bool) *gwv1.Gateway { + status := metav1.ConditionFalse + if isReady { + status = metav1.ConditionTrue + } + return &gwv1.Gateway{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: namespace, + }, + Status: gwv1.GatewayStatus{ + Conditions: []metav1.Condition{ + { + Type: string(gwv1.GatewayConditionAccepted), + Status: status, + }, + }, + }, + } +} + +// MakeHTTPRoute is a helper function to return an HTTPRoute object +func makeHTTPRoute(name, namespace string, isReady bool) *gwv1.HTTPRoute { + status := metav1.ConditionFalse + if isReady { + status = metav1.ConditionTrue + } + return &gwv1.HTTPRoute{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: namespace, + }, + Status: gwv1.HTTPRouteStatus{ + RouteStatus: gwv1.RouteStatus{ + Parents: []gwv1.RouteParentStatus{ + { + ParentRef: gwv1.ParentReference{ + Name: gwv1.ObjectName("test-rayservice-gateway"), + Namespace: ptr.To(gwv1.Namespace(namespace)), + }, + Conditions: []metav1.Condition{ + { + Type: string(gwv1.RouteConditionAccepted), + Status: status, + }, + { + Type: string(gwv1.RouteConditionResolvedRefs), + Status: status, + }, + }, + }, + }, + }, + }, + } +} + +func TestCheckIfNeedIncrementalUpgradeUpdate(t *testing.T) { + rayServiceName := "test-rayservice" + gatewayName := fmt.Sprintf("%s-%s", rayServiceName, "gateway") + httpRouteName := fmt.Sprintf("%s-%s", "httproute", rayServiceName) + namespace := "test-ns" + + tests := []struct { + name string + expectedReason string + runtimeObjects []runtime.Object + activeStatus rayv1.RayServiceStatus + pendingStatus rayv1.RayServiceStatus + expectedNeedsUpdate bool + }{ + { + name: "Missing RayClusterNames", + expectedNeedsUpdate: false, + expectedReason: "Both active and pending RayCluster instances required for incremental upgrade.", + }, + { + name: "Gateway not ready", + activeStatus: rayv1.RayServiceStatus{RayClusterName: "active"}, + pendingStatus: rayv1.RayServiceStatus{RayClusterName: "pending"}, + runtimeObjects: []runtime.Object{ + makeGateway(gatewayName, namespace, false), makeHTTPRoute(httpRouteName, namespace, true), + }, + expectedNeedsUpdate: false, + expectedReason: "Gateway for RayService IncrementalUpgrade is not ready.", + }, + { + name: "HTTPRoute not ready", + activeStatus: rayv1.RayServiceStatus{RayClusterName: "active"}, + pendingStatus: rayv1.RayServiceStatus{RayClusterName: "pending"}, + runtimeObjects: []runtime.Object{ + makeGateway(gatewayName, namespace, true), makeHTTPRoute(httpRouteName, namespace, false), + }, + expectedNeedsUpdate: false, + expectedReason: 
"HTTPRoute for RayService IncrementalUpgrade is not ready.", + }, + { + name: "Incremental upgrade is complete", + activeStatus: rayv1.RayServiceStatus{ + RayClusterName: "active", + TargetCapacity: ptr.To(int32(0)), + TrafficRoutedPercent: ptr.To(int32(0)), + }, + pendingStatus: rayv1.RayServiceStatus{ + RayClusterName: "pending", + TargetCapacity: ptr.To(int32(100)), + TrafficRoutedPercent: ptr.To(int32(100)), + }, + runtimeObjects: []runtime.Object{ + makeGateway(gatewayName, namespace, true), makeHTTPRoute(httpRouteName, namespace, true), + }, + expectedNeedsUpdate: false, + expectedReason: "All traffic has migrated to the upgraded cluster and IncrementalUpgrade is complete.", + }, + { + name: "Pending RayCluster is still incrementally scaling", + activeStatus: rayv1.RayServiceStatus{ + RayClusterName: "active", + TargetCapacity: ptr.To(int32(70)), + TrafficRoutedPercent: ptr.To(int32(70)), + }, + pendingStatus: rayv1.RayServiceStatus{ + RayClusterName: "pending", + TargetCapacity: ptr.To(int32(30)), + TrafficRoutedPercent: ptr.To(int32(30)), + }, + runtimeObjects: []runtime.Object{ + makeGateway(gatewayName, namespace, true), makeHTTPRoute(httpRouteName, namespace, true), + }, + expectedNeedsUpdate: true, + expectedReason: "Pending RayCluster has not finished scaling up.", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + newScheme := runtime.NewScheme() + _ = corev1.AddToScheme(newScheme) + _ = gwv1.AddToScheme(newScheme) + fakeClient := clientFake.NewClientBuilder().WithScheme(newScheme).WithRuntimeObjects(tt.runtimeObjects...).Build() + // Initialize RayService reconciler. + ctx := context.TODO() + r := RayServiceReconciler{ + Client: fakeClient, + Recorder: &record.FakeRecorder{}, + Scheme: scheme.Scheme, + } + rayService := &rayv1.RayService{ + ObjectMeta: metav1.ObjectMeta{Name: rayServiceName, Namespace: namespace}, + Status: rayv1.RayServiceStatuses{ + ActiveServiceStatus: tt.activeStatus, + PendingServiceStatus: tt.pendingStatus, + }, + } + needsUpdate, reason := r.checkIfNeedIncrementalUpgradeUpdate(ctx, rayService) + assert.Equal(t, tt.expectedNeedsUpdate, needsUpdate) + assert.Equal(t, tt.expectedReason, reason) + }) + } +} diff --git a/ray-operator/controllers/ray/utils/consistency.go b/ray-operator/controllers/ray/utils/consistency.go index 2c2ba0fe616..929e07d6658 100644 --- a/ray-operator/controllers/ray/utils/consistency.go +++ b/ray-operator/controllers/ray/utils/consistency.go @@ -29,6 +29,21 @@ func InconsistentRayClusterStatus(oldStatus rayv1.RayClusterStatus, newStatus ra if !reflect.DeepEqual(oldStatus.Conditions, newStatus.Conditions) { return true } + if features.Enabled(features.RayServiceIncrementalUpgrade) { + // Also check for changes in IncrementalUpgrade related Status fields. 
+ if oldStatus.TrafficRoutedPercent != newStatus.TrafficRoutedPercent { + logger.Info("inconsistentRayServiceStatus RayService updated TrafficRoutedPercent", "old TrafficRoutedPercent", oldStatus.TrafficRoutedPercent, "new TrafficRoutedPercent", newStatus.TrafficRoutedPercent) + return true + } + if oldStatus.TargetCapacity != newStatus.TargetCapacity { + logger.Info("inconsistentRayServiceStatus RayService updated TargetCapacity", "old TargetCapacity", oldStatus.TargetCapacity, "new TargetCapacity", newStatus.TargetCapacity) + return true + } + if oldStatus.LastTrafficMigratedTime != newStatus.LastTrafficMigratedTime { + logger.Info("inconsistentRayServiceStatus RayService updated LastTrafficMigratedTime", "old LastTrafficMigratedTime", oldStatus.LastTrafficMigratedTime, "new LastTrafficMigratedTime", newStatus.LastTrafficMigratedTime) + return true + } + } return false } diff --git a/ray-operator/controllers/ray/utils/constant.go b/ray-operator/controllers/ray/utils/constant.go index 8ed43a813f6..ccdc967de86 100644 --- a/ray-operator/controllers/ray/utils/constant.go +++ b/ray-operator/controllers/ray/utils/constant.go @@ -319,9 +319,17 @@ const ( InvalidRayServiceSpec K8sEventType = "InvalidRayServiceSpec" InvalidRayServiceMetadata K8sEventType = "InvalidRayServiceMetadata" UpdatedHeadPodServeLabel K8sEventType = "UpdatedHeadPodServeLabel" + UpdatedGateway K8sEventType = "UpdatedGateway" + UpdatedHTTPRoute K8sEventType = "UpdatedHTTPRoute" UpdatedServeApplications K8sEventType = "UpdatedServeApplications" + UpdatedServeTargetCapacity K8sEventType = "UpdatedServeTargetCapacity" FailedToUpdateHeadPodServeLabel K8sEventType = "FailedToUpdateHeadPodServeLabel" FailedToUpdateServeApplications K8sEventType = "FailedToUpdateServeApplications" + FailedToUpdateTargetCapacity K8sEventType = "FailedToUpdateTargetCapacity" + FailedToCreateGateway K8sEventType = "FailedToCreateGateway" + FailedToUpdateGateway K8sEventType = "FailedToUpdateGateway" + FailedToCreateHTTPRoute K8sEventType = "FailedToCreateHTTPRoute" + FailedToUpdateHTTPRoute K8sEventType = "FailedToUpdateHTTPRoute" // Generic Pod event list DeletedPod K8sEventType = "DeletedPod" diff --git a/ray-operator/controllers/ray/utils/fake_serve_httpclient.go b/ray-operator/controllers/ray/utils/fake_serve_httpclient.go index 21a3fdb91be..1bf0588c403 100644 --- a/ray-operator/controllers/ray/utils/fake_serve_httpclient.go +++ b/ray-operator/controllers/ray/utils/fake_serve_httpclient.go @@ -12,9 +12,10 @@ import ( ) type FakeRayDashboardClient struct { - multiAppStatuses map[string]*utiltypes.ServeApplicationStatus - GetJobInfoMock atomic.Pointer[func(context.Context, string) (*utiltypes.RayJobInfo, error)] - serveDetails utiltypes.ServeDetails + multiAppStatuses map[string]*utiltypes.ServeApplicationStatus + GetJobInfoMock atomic.Pointer[func(context.Context, string) (*utiltypes.RayJobInfo, error)] + serveDetails utiltypes.ServeDetails + LastUpdatedConfig []byte } var _ dashboardclient.RayDashboardClientInterface = (*FakeRayDashboardClient)(nil) @@ -22,7 +23,8 @@ var _ dashboardclient.RayDashboardClientInterface = (*FakeRayDashboardClient)(ni func (r *FakeRayDashboardClient) InitClient(_ *http.Client, _ string) { } -func (r *FakeRayDashboardClient) UpdateDeployments(_ context.Context, _ []byte) error { +func (r *FakeRayDashboardClient) UpdateDeployments(_ context.Context, configJson []byte) error { + r.LastUpdatedConfig = configJson fmt.Print("UpdateDeployments fake succeeds.") return nil } diff --git 
a/ray-operator/controllers/ray/utils/util.go b/ray-operator/controllers/ray/utils/util.go
index 3bb63f79189..7d083a0e426 100644
--- a/ray-operator/controllers/ray/utils/util.go
+++ b/ray-operator/controllers/ray/utils/util.go
@@ -24,9 +24,11 @@ import (
 	ctrl "sigs.k8s.io/controller-runtime"
 	"sigs.k8s.io/controller-runtime/pkg/client"
 	"sigs.k8s.io/controller-runtime/pkg/manager"
+	gwv1 "sigs.k8s.io/gateway-api/apis/v1"
 
 	rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1"
 	"github.com/ray-project/kuberay/ray-operator/controllers/ray/utils/dashboardclient"
+	"github.com/ray-project/kuberay/ray-operator/pkg/features"
 )
 
 const (
@@ -675,6 +677,83 @@ func GetRayClusterNameFromService(svc *corev1.Service) string {
 	return svc.Spec.Selector[RayClusterLabelKey]
 }
 
+func IsGatewayReady(gatewayInstance *gwv1.Gateway) bool {
+	if gatewayInstance == nil {
+		return false
+	}
+	for _, condition := range gatewayInstance.Status.Conditions {
+		if condition.Type == string(gwv1.GatewayConditionAccepted) && condition.Status == metav1.ConditionTrue {
+			return true
+		}
+	}
+
+	// If no Accepted condition is found, the Gateway is not ready yet
+	return false
+}
+
+// IsHTTPRouteReady returns whether the HTTPRoute associated with a given Gateway has a ready condition
+func IsHTTPRouteReady(gatewayInstance *gwv1.Gateway, httpRouteInstance *gwv1.HTTPRoute) bool {
+	if httpRouteInstance == nil {
+		return false
+	}
+	for _, parent := range httpRouteInstance.Status.Parents {
+		if parent.ParentRef.Name != gwv1.ObjectName(gatewayInstance.Name) {
+			continue
+		}
+		if parent.ParentRef.Namespace != nil && *parent.ParentRef.Namespace != gwv1.Namespace(gatewayInstance.Namespace) {
+			continue
+		}
+		hasAccepted := false
+		hasResolved := false
+
+		for _, condition := range parent.Conditions {
+			switch gwv1.RouteConditionType(condition.Type) {
+			case gwv1.RouteConditionAccepted:
+				if condition.Status == metav1.ConditionTrue {
+					hasAccepted = true
+				}
+			case gwv1.RouteConditionResolvedRefs:
+				if condition.Status == metav1.ConditionTrue {
+					hasResolved = true
+				}
+			}
+		}
+		if hasAccepted && hasResolved {
+			return true
+		}
+	}
+	return false
+}
+
+func IsIncrementalUpgradeEnabled(spec *rayv1.RayServiceSpec) bool {
+	if !features.Enabled(features.RayServiceIncrementalUpgrade) {
+		return false
+	}
+	return spec != nil && spec.UpgradeStrategy != nil && spec.UpgradeStrategy.Type != nil &&
+		*spec.UpgradeStrategy.Type == rayv1.IncrementalUpgrade
+}
+
+func GetRayServiceIncrementalUpgradeOptions(spec *rayv1.RayServiceSpec) *rayv1.IncrementalUpgradeOptions {
+	if spec != nil && spec.UpgradeStrategy != nil {
+		return spec.UpgradeStrategy.IncrementalUpgradeOptions
+	}
+	return nil
+}
+
+// GetGatewayListenersForRayService is a helper function that returns the Gateway Listeners for a RayService
+func GetGatewayListenersForRayService(rayServiceInstance *rayv1.RayService) []gwv1.Listener {
+	listeners := make([]gwv1.Listener, 0, 1)
+	listenerName := fmt.Sprintf("%s-listener", rayServiceInstance.Name)
+	listener := gwv1.Listener{
+		Name:     gwv1.SectionName(listenerName),
+		Protocol: gwv1.HTTPProtocolType,        // only support HTTP
+		Port:     gwv1.PortNumber(int32(80)),   // hardcoded to 80 for now
+	}
+	listeners = append(listeners, listener)
+
+	return listeners
+}
+
 // Check where we are running.
We are trying to distinguish here whether // this is vanilla kubernetes cluster or Openshift func GetClusterType() bool { diff --git a/ray-operator/controllers/ray/utils/util_test.go b/ray-operator/controllers/ray/utils/util_test.go index 851e37af3ea..bf762bfe42b 100644 --- a/ray-operator/controllers/ray/utils/util_test.go +++ b/ray-operator/controllers/ray/utils/util_test.go @@ -12,9 +12,11 @@ import ( "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/utils/ptr" + gwv1 "sigs.k8s.io/gateway-api/apis/v1" rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1" "github.com/ray-project/kuberay/ray-operator/controllers/ray/utils/dashboardclient" + "github.com/ray-project/kuberay/ray-operator/pkg/features" ) func TestGetClusterDomainName(t *testing.T) { @@ -1248,6 +1250,294 @@ func TestCalculateResources(t *testing.T) { } } +// helper function to return a Gateway object with GatewayStatus Conditions for testing. +func makeGatewayWithCondition(accepted bool) *gwv1.Gateway { + var conditions []metav1.Condition + if accepted { + conditions = []metav1.Condition{ + { + Type: string(gwv1.GatewayConditionAccepted), + Status: metav1.ConditionTrue, + }, + } + } + return &gwv1.Gateway{ + Status: gwv1.GatewayStatus{ + Conditions: conditions, + }, + } +} + +func TestIsGatewayReady(t *testing.T) { + tests := []struct { + gateway *gwv1.Gateway + name string + expected bool + }{ + { + name: "missing Gateway instance", + gateway: nil, + expected: false, + }, + { + name: "Gateway created but missing accepted condition", + gateway: makeGatewayWithCondition(false), + expected: false, + }, + { + name: "Gateway created with accepted condition", + gateway: makeGatewayWithCondition(true), + expected: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + assert.Equal(t, tt.expected, IsGatewayReady(tt.gateway)) + }) + } +} + +// helper function to return a HTTPRoute with HTTPRouteStatus for testing +func makeHTTPRouteWithParentRef( + parentRefName string, + namespace string, + accepted bool, + resolvedRefs bool, +) *gwv1.HTTPRoute { + var acceptedStatus, resolvedRefsStatus metav1.ConditionStatus + if accepted { + acceptedStatus = metav1.ConditionTrue + } else { + acceptedStatus = metav1.ConditionFalse + } + if resolvedRefs { + resolvedRefsStatus = metav1.ConditionTrue + } else { + resolvedRefsStatus = metav1.ConditionFalse + } + + return &gwv1.HTTPRoute{ + Status: gwv1.HTTPRouteStatus{ + RouteStatus: gwv1.RouteStatus{ + Parents: []gwv1.RouteParentStatus{ + { + ParentRef: gwv1.ParentReference{ + Name: gwv1.ObjectName(parentRefName), + Namespace: ptr.To(gwv1.Namespace(namespace)), + }, + Conditions: []metav1.Condition{ + { + Type: string(gwv1.RouteConditionAccepted), + Status: acceptedStatus, + }, + { + Type: string(gwv1.RouteConditionResolvedRefs), + Status: resolvedRefsStatus, + }, + }, + }, + }, + }, + }, + } +} + +func TestIsHTTPRouteReady(t *testing.T) { + gateway := &gwv1.Gateway{ + ObjectMeta: metav1.ObjectMeta{Name: "test-gateway", Namespace: "test-ns"}, + } + + tests := []struct { + httpRoute *gwv1.HTTPRoute + name string + expected bool + }{ + { + name: "missing HTTPRoute", + httpRoute: nil, + expected: false, + }, + { + name: "ParentRef does not match", + httpRoute: makeHTTPRouteWithParentRef("not-a-match", "other-test-ns", true, true), + expected: false, + }, + { + name: "matching ParentRef with Accepted condition but without ResolvedRefs", + httpRoute: makeHTTPRouteWithParentRef("test-gateway", "test-ns", true, false), + 
expected: false, + }, + { + name: "matching ParentRef with ResolvedRefs but without Accepted", + httpRoute: makeHTTPRouteWithParentRef("test-gateway", "test-ns", false, true), + expected: false, + }, + { + name: "ready HTTPRoute with all required conditions", + httpRoute: makeHTTPRouteWithParentRef("test-gateway", "test-ns", true, true), + expected: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + assert.Equal(t, tt.expected, IsHTTPRouteReady(gateway, tt.httpRoute)) + }) + } +} + +func TestIsIncrementalUpgradeEnabled(t *testing.T) { + tests := []struct { + spec *rayv1.RayServiceSpec + name string + featureEnabled bool + expected bool + }{ + { + name: "missing UpgradeStrategy Type", + spec: &rayv1.RayServiceSpec{}, + featureEnabled: true, + expected: false, + }, + { + name: "UpgradeStrategy Type is IncrementalUpgrade but feature disabled", + spec: &rayv1.RayServiceSpec{ + UpgradeStrategy: &rayv1.RayServiceUpgradeStrategy{ + Type: ptr.To(rayv1.IncrementalUpgrade), + }, + }, + featureEnabled: false, + expected: false, + }, + { + name: "UpgradeStrategy Type is IncrementalUpgrade and feature enabled", + spec: &rayv1.RayServiceSpec{ + UpgradeStrategy: &rayv1.RayServiceUpgradeStrategy{ + Type: ptr.To(rayv1.IncrementalUpgrade), + }, + }, + featureEnabled: true, + expected: true, + }, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + features.SetFeatureGateDuringTest(t, features.RayServiceIncrementalUpgrade, tc.featureEnabled) + assert.Equal(t, tc.expected, IsIncrementalUpgradeEnabled(tc.spec)) + }) + } +} + +func TestGetRayServiceIncrementalUpgradeOptions(t *testing.T) { + upgradeOptions := &rayv1.IncrementalUpgradeOptions{GatewayClassName: "gateway-class"} + + tests := []struct { + rayServiceSpec *rayv1.RayServiceSpec + expectedOptions *rayv1.IncrementalUpgradeOptions + name string + }{ + { + name: "RayServiceSpec is nil, return nil IncrementalUpgradeOptions", + rayServiceSpec: nil, + expectedOptions: nil, + }, + { + name: "UpgradeStrategy is nil, return nil IncrementalUpgradeOptions", + rayServiceSpec: &rayv1.RayServiceSpec{}, + expectedOptions: nil, + }, + { + name: "Valid IncrementalUpgradeOptions", + rayServiceSpec: &rayv1.RayServiceSpec{ + UpgradeStrategy: &rayv1.RayServiceUpgradeStrategy{ + IncrementalUpgradeOptions: upgradeOptions, + }, + }, + expectedOptions: upgradeOptions, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + actualOptions := GetRayServiceIncrementalUpgradeOptions(tt.rayServiceSpec) + assert.Equal(t, tt.expectedOptions, actualOptions) + }) + } +} + +// func TestGetGatewayListenersForServeService(t *testing.T) { +// tests := []struct { +// name string +// serveService *corev1.Service +// expectedListeners []gwv1.Listener +// }{ +// { +// name: "Return listeners for empty Serve Service", +// serveService: &corev1.Service{}, +// expectedListeners: []gwv1.Listener{}, +// }, +// { +// name: "Return listener for valid Serve Service with single ports", +// serveService: &corev1.Service{ +// ObjectMeta: metav1.ObjectMeta{ +// Name: "serve-service", +// }, +// Spec: corev1.ServiceSpec{ +// Ports: []corev1.ServicePort{{Port: 8000}}, +// }, +// }, +// expectedListeners: []gwv1.Listener{ +// { +// Name: "serve-service-listener", +// Protocol: gwv1.HTTPProtocolType, +// Port: 8000, +// }, +// }, +// }, +// { +// name: "Return listeners for valid Serve Service with multiple ports", +// serveService: &corev1.Service{ +// ObjectMeta: metav1.ObjectMeta{ +// Name: "serve-service", +// }, +// Spec: 
corev1.ServiceSpec{ +// Ports: []corev1.ServicePort{ +// { +// Name: "default-port", +// Port: 8000, +// }, +// { +// Name: "some-other-port", +// Port: 8500, +// }, +// }, +// }, +// }, +// expectedListeners: []gwv1.Listener{ +// { +// Name: "serve-service-default-port-listener", +// Protocol: gwv1.HTTPProtocolType, +// Port: 8000, +// }, +// { +// Name: "serve-service-some-other-port-listener", +// Protocol: gwv1.HTTPProtocolType, +// Port: 8500, +// }, +// }, +// }, +// } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + listeners := GetGatewayListenersForServeService(tt.serveService) + assert.Equal(t, tt.expectedListeners, listeners) + }) + } +} + func TestGetContainerCommand(t *testing.T) { tests := []struct { name string diff --git a/ray-operator/controllers/ray/utils/validation.go b/ray-operator/controllers/ray/utils/validation.go index 7e9097dd846..828b679ee70 100644 --- a/ray-operator/controllers/ray/utils/validation.go +++ b/ray-operator/controllers/ray/utils/validation.go @@ -306,12 +306,13 @@ func ValidateRayServiceSpec(rayService *rayv1.RayService) error { return fmt.Errorf("spec.rayClusterConfig.headGroupSpec.headService.metadata.name should not be set") } - // only NewCluster and None are valid upgradeType + // only IncrementalUpgrade, NewCluster, and None are valid upgradeType if rayService.Spec.UpgradeStrategy != nil && rayService.Spec.UpgradeStrategy.Type != nil && *rayService.Spec.UpgradeStrategy.Type != rayv1.None && - *rayService.Spec.UpgradeStrategy.Type != rayv1.NewCluster { - return fmt.Errorf("Spec.UpgradeStrategy.Type value %s is invalid, valid options are %s or %s", *rayService.Spec.UpgradeStrategy.Type, rayv1.NewCluster, rayv1.None) + *rayService.Spec.UpgradeStrategy.Type != rayv1.NewCluster && + *rayService.Spec.UpgradeStrategy.Type != rayv1.IncrementalUpgrade { + return fmt.Errorf("Spec.UpgradeStrategy.Type value %s is invalid, valid options are %s, %s, or %s", *rayService.Spec.UpgradeStrategy.Type, rayv1.IncrementalUpgrade, rayv1.NewCluster, rayv1.None) } if rayService.Spec.RayClusterDeletionDelaySeconds != nil && @@ -319,6 +320,40 @@ func ValidateRayServiceSpec(rayService *rayv1.RayService) error { return fmt.Errorf("Spec.RayClusterDeletionDelaySeconds should be a non-negative integer, got %d", *rayService.Spec.RayClusterDeletionDelaySeconds) } + // If type is IncrementalUpgrade, validate the IncrementalUpgradeOptions + if IsIncrementalUpgradeEnabled(&rayService.Spec) { + return ValidateIncrementalUpgradeOptions(rayService) + } + + return nil +} + +func ValidateIncrementalUpgradeOptions(rayService *rayv1.RayService) error { + if !IsAutoscalingEnabled(&rayService.Spec.RayClusterSpec) { + return fmt.Errorf("Ray Autoscaler is required for IncrementalUpgrade") + } + + options := rayService.Spec.UpgradeStrategy.IncrementalUpgradeOptions + if options == nil { + return fmt.Errorf("IncrementalUpgradeOptions are required for IncrementalUpgrade") + } + + if options.MaxSurgePercent != nil && (*options.MaxSurgePercent < 0 || *options.MaxSurgePercent > 100) { + return fmt.Errorf("maxSurgePercent must be between 0 and 100") + } + + if options.StepSizePercent == nil || *options.StepSizePercent < 0 || *options.StepSizePercent > 100 { + return fmt.Errorf("stepSizePercent must be between 0 and 100") + } + + if options.IntervalSeconds == nil || *options.IntervalSeconds <= 0 { + return fmt.Errorf("intervalSeconds must be greater than 0") + } + + if options.GatewayClassName == "" { + return fmt.Errorf("gatewayClassName is required for IncrementalUpgrade") + 
} + return nil } diff --git a/ray-operator/controllers/ray/utils/validation_test.go b/ray-operator/controllers/ray/utils/validation_test.go index dbee9e612e7..4cefdda28b8 100644 --- a/ray-operator/controllers/ray/utils/validation_test.go +++ b/ray-operator/controllers/ray/utils/validation_test.go @@ -1664,3 +1664,108 @@ func createBasicRayClusterSpec() *rayv1.RayClusterSpec { }, } } + +func TestValidateIncrementalUpgradeOptions(t *testing.T) { + tests := []struct { + maxSurgePercent *int32 + stepSizePercent *int32 + intervalSeconds *int32 + name string + gatewayClassName string + spec rayv1.RayServiceSpec + enableAutoscaling bool + expectError bool + }{ + { + name: "valid config", + maxSurgePercent: ptr.To(int32(50)), + stepSizePercent: ptr.To(int32(50)), + intervalSeconds: ptr.To(int32(10)), + gatewayClassName: "istio", + enableAutoscaling: true, + expectError: false, + }, + { + name: "missing autoscaler", + stepSizePercent: ptr.To(int32(50)), + intervalSeconds: ptr.To(int32(10)), + gatewayClassName: "istio", + enableAutoscaling: false, + expectError: true, + }, + { + name: "missing options", + enableAutoscaling: true, + expectError: true, + }, + { + name: "invalid MaxSurgePercent", + maxSurgePercent: ptr.To(int32(200)), + stepSizePercent: ptr.To(int32(50)), + intervalSeconds: ptr.To(int32(10)), + gatewayClassName: "istio", + enableAutoscaling: true, + expectError: true, + }, + { + name: "missing StepSizePercent", + intervalSeconds: ptr.To(int32(10)), + gatewayClassName: "istio", + enableAutoscaling: true, + expectError: true, + }, + { + name: "invalid IntervalSeconds", + stepSizePercent: ptr.To(int32(50)), + intervalSeconds: ptr.To(int32(0)), + gatewayClassName: "istio", + enableAutoscaling: true, + expectError: true, + }, + { + name: "missing GatewayClassName", + stepSizePercent: ptr.To(int32(50)), + intervalSeconds: ptr.To(int32(10)), + enableAutoscaling: true, + expectError: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + var upgradeStrategy *rayv1.RayServiceUpgradeStrategy + if tt.maxSurgePercent != nil || tt.stepSizePercent != nil || tt.intervalSeconds != nil || tt.gatewayClassName != "" { + upgradeStrategy = &rayv1.RayServiceUpgradeStrategy{ + Type: ptr.To(rayv1.IncrementalUpgrade), + IncrementalUpgradeOptions: &rayv1.IncrementalUpgradeOptions{ + MaxSurgePercent: tt.maxSurgePercent, + StepSizePercent: tt.stepSizePercent, + IntervalSeconds: tt.intervalSeconds, + GatewayClassName: tt.gatewayClassName, + }, + } + } else if tt.expectError { + upgradeStrategy = &rayv1.RayServiceUpgradeStrategy{ + Type: ptr.To(rayv1.IncrementalUpgrade), + } + } + + rayClusterSpec := *createBasicRayClusterSpec() + rayClusterSpec.EnableInTreeAutoscaling = ptr.To(tt.enableAutoscaling) + + rayService := &rayv1.RayService{ + Spec: rayv1.RayServiceSpec{ + RayClusterSpec: rayClusterSpec, + UpgradeStrategy: upgradeStrategy, + }, + } + + err := ValidateIncrementalUpgradeOptions(rayService) + if tt.expectError { + require.Error(t, err, tt.name) + } else { + require.NoError(t, err, tt.name) + } + }) + } +} diff --git a/ray-operator/main.go b/ray-operator/main.go index 5666a438733..897cc7f58e2 100644 --- a/ray-operator/main.go +++ b/ray-operator/main.go @@ -27,6 +27,7 @@ import ( k8szap "sigs.k8s.io/controller-runtime/pkg/log/zap" ctrlmetrics "sigs.k8s.io/controller-runtime/pkg/metrics" metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server" + gwv1 "sigs.k8s.io/gateway-api/apis/v1" configapi "github.com/ray-project/kuberay/ray-operator/apis/config/v1alpha1" rayv1 
"github.com/ray-project/kuberay/ray-operator/apis/ray/v1" @@ -191,6 +192,10 @@ func main() { } features.LogFeatureGates(setupLog) + if features.Enabled(features.RayClusterStatusConditions) { + utilruntime.Must(gwv1.AddToScheme(scheme)) + } + // Manager options options := ctrl.Options{ Cache: cache.Options{ diff --git a/ray-operator/pkg/client/applyconfiguration/ray/v1/incrementalupgradeoptions.go b/ray-operator/pkg/client/applyconfiguration/ray/v1/incrementalupgradeoptions.go new file mode 100644 index 00000000000..a736a964cdb --- /dev/null +++ b/ray-operator/pkg/client/applyconfiguration/ray/v1/incrementalupgradeoptions.go @@ -0,0 +1,50 @@ +// Code generated by applyconfiguration-gen. DO NOT EDIT. + +package v1 + +// IncrementalUpgradeOptionsApplyConfiguration represents a declarative configuration of the IncrementalUpgradeOptions type for use +// with apply. +type IncrementalUpgradeOptionsApplyConfiguration struct { + MaxSurgePercent *int32 `json:"maxSurgePercent,omitempty"` + StepSizePercent *int32 `json:"stepSizePercent,omitempty"` + IntervalSeconds *int32 `json:"intervalSeconds,omitempty"` + GatewayClassName *string `json:"gatewayClassName,omitempty"` +} + +// IncrementalUpgradeOptionsApplyConfiguration constructs a declarative configuration of the IncrementalUpgradeOptions type for use with +// apply. +func IncrementalUpgradeOptions() *IncrementalUpgradeOptionsApplyConfiguration { + return &IncrementalUpgradeOptionsApplyConfiguration{} +} + +// WithMaxSurgePercent sets the MaxSurgePercent field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the MaxSurgePercent field is set to the value of the last call. +func (b *IncrementalUpgradeOptionsApplyConfiguration) WithMaxSurgePercent(value int32) *IncrementalUpgradeOptionsApplyConfiguration { + b.MaxSurgePercent = &value + return b +} + +// WithStepSizePercent sets the StepSizePercent field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the StepSizePercent field is set to the value of the last call. +func (b *IncrementalUpgradeOptionsApplyConfiguration) WithStepSizePercent(value int32) *IncrementalUpgradeOptionsApplyConfiguration { + b.StepSizePercent = &value + return b +} + +// WithIntervalSeconds sets the IntervalSeconds field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the IntervalSeconds field is set to the value of the last call. +func (b *IncrementalUpgradeOptionsApplyConfiguration) WithIntervalSeconds(value int32) *IncrementalUpgradeOptionsApplyConfiguration { + b.IntervalSeconds = &value + return b +} + +// WithGatewayClassName sets the GatewayClassName field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the GatewayClassName field is set to the value of the last call. 
+func (b *IncrementalUpgradeOptionsApplyConfiguration) WithGatewayClassName(value string) *IncrementalUpgradeOptionsApplyConfiguration { + b.GatewayClassName = &value + return b +} diff --git a/ray-operator/pkg/client/applyconfiguration/ray/v1/rayservicespec.go b/ray-operator/pkg/client/applyconfiguration/ray/v1/rayservicespec.go index faef5106dfe..9426baa9b12 100644 --- a/ray-operator/pkg/client/applyconfiguration/ray/v1/rayservicespec.go +++ b/ray-operator/pkg/client/applyconfiguration/ray/v1/rayservicespec.go @@ -4,6 +4,7 @@ package v1 import ( corev1 "k8s.io/api/core/v1" + apisv1 "sigs.k8s.io/gateway-api/apis/v1" ) // RayServiceSpecApplyConfiguration represents a declarative configuration of the RayServiceSpec type for use @@ -13,6 +14,8 @@ type RayServiceSpecApplyConfiguration struct { ServiceUnhealthySecondThreshold *int32 `json:"serviceUnhealthySecondThreshold,omitempty"` DeploymentUnhealthySecondThreshold *int32 `json:"deploymentUnhealthySecondThreshold,omitempty"` ServeService *corev1.Service `json:"serveService,omitempty"` + Gateway *apisv1.Gateway `json:"gateway,omitempty"` + HTTPRoute *apisv1.HTTPRoute `json:"httpRoute,omitempty"` UpgradeStrategy *RayServiceUpgradeStrategyApplyConfiguration `json:"upgradeStrategy,omitempty"` ServeConfigV2 *string `json:"serveConfigV2,omitempty"` RayClusterSpec *RayClusterSpecApplyConfiguration `json:"rayClusterConfig,omitempty"` @@ -57,6 +60,22 @@ func (b *RayServiceSpecApplyConfiguration) WithServeService(value corev1.Service return b } +// WithGateway sets the Gateway field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the Gateway field is set to the value of the last call. +func (b *RayServiceSpecApplyConfiguration) WithGateway(value apisv1.Gateway) *RayServiceSpecApplyConfiguration { + b.Gateway = &value + return b +} + +// WithHTTPRoute sets the HTTPRoute field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the HTTPRoute field is set to the value of the last call. +func (b *RayServiceSpecApplyConfiguration) WithHTTPRoute(value apisv1.HTTPRoute) *RayServiceSpecApplyConfiguration { + b.HTTPRoute = &value + return b +} + // WithUpgradeStrategy sets the UpgradeStrategy field in the declarative configuration to the given value // and returns the receiver, so that objects can be built by chaining "With" function invocations. // If called multiple times, the UpgradeStrategy field is set to the value of the last call. diff --git a/ray-operator/pkg/client/applyconfiguration/ray/v1/rayservicestatus.go b/ray-operator/pkg/client/applyconfiguration/ray/v1/rayservicestatus.go index b0fcd8032bb..2d7f2984cef 100644 --- a/ray-operator/pkg/client/applyconfiguration/ray/v1/rayservicestatus.go +++ b/ray-operator/pkg/client/applyconfiguration/ray/v1/rayservicestatus.go @@ -2,12 +2,19 @@ package v1 +import ( + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + // RayServiceStatusApplyConfiguration represents a declarative configuration of the RayServiceStatus type for use // with apply. 
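The status apply configuration that follows carries the new upgrade-progress fields (TargetCapacity, TrafficRoutedPercent, LastTrafficMigratedTime). A hedged sketch of how a client could poll them while an incremental upgrade is running; the kubeconfig loading, namespace, and RayService name are assumptions for illustration only.

```go
package main

import (
	"context"
	"fmt"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/tools/clientcmd"

	rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1"
	rayclient "github.com/ray-project/kuberay/ray-operator/pkg/client/clientset/versioned"
)

func main() {
	cfg, err := clientcmd.BuildConfigFromFlags("", clientcmd.RecommendedHomeFile)
	if err != nil {
		panic(err)
	}
	client, err := rayclient.NewForConfig(cfg)
	if err != nil {
		panic(err)
	}

	rs, err := client.RayV1().RayServices("default").Get(context.TODO(), "my-rayservice", metav1.GetOptions{})
	if err != nil {
		panic(err)
	}

	// During an incremental upgrade, traffic and capacity shift step by step
	// from the active cluster to the pending (upgraded) cluster.
	report := func(label string, s rayv1.RayServiceStatus) {
		if s.TrafficRoutedPercent != nil && s.TargetCapacity != nil {
			fmt.Printf("%s cluster %s: traffic=%d%%, capacity=%d%%\n",
				label, s.RayClusterName, *s.TrafficRoutedPercent, *s.TargetCapacity)
		}
	}
	report("active", rs.Status.ActiveServiceStatus)
	report("pending", rs.Status.PendingServiceStatus)
}
```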
type RayServiceStatusApplyConfiguration struct { - Applications map[string]AppStatusApplyConfiguration `json:"applicationStatuses,omitempty"` - RayClusterName *string `json:"rayClusterName,omitempty"` - RayClusterStatus *RayClusterStatusApplyConfiguration `json:"rayClusterStatus,omitempty"` + Applications map[string]AppStatusApplyConfiguration `json:"applicationStatuses,omitempty"` + TargetCapacity *int32 `json:"targetCapacity,omitempty"` + TrafficRoutedPercent *int32 `json:"trafficRoutedPercent,omitempty"` + LastTrafficMigratedTime *metav1.Time `json:"lastTrafficMigratedTime,omitempty"` + RayClusterName *string `json:"rayClusterName,omitempty"` + RayClusterStatus *RayClusterStatusApplyConfiguration `json:"rayClusterStatus,omitempty"` } // RayServiceStatusApplyConfiguration constructs a declarative configuration of the RayServiceStatus type for use with @@ -30,6 +37,30 @@ func (b *RayServiceStatusApplyConfiguration) WithApplications(entries map[string return b } +// WithTargetCapacity sets the TargetCapacity field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the TargetCapacity field is set to the value of the last call. +func (b *RayServiceStatusApplyConfiguration) WithTargetCapacity(value int32) *RayServiceStatusApplyConfiguration { + b.TargetCapacity = &value + return b +} + +// WithTrafficRoutedPercent sets the TrafficRoutedPercent field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the TrafficRoutedPercent field is set to the value of the last call. +func (b *RayServiceStatusApplyConfiguration) WithTrafficRoutedPercent(value int32) *RayServiceStatusApplyConfiguration { + b.TrafficRoutedPercent = &value + return b +} + +// WithLastTrafficMigratedTime sets the LastTrafficMigratedTime field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the LastTrafficMigratedTime field is set to the value of the last call. +func (b *RayServiceStatusApplyConfiguration) WithLastTrafficMigratedTime(value metav1.Time) *RayServiceStatusApplyConfiguration { + b.LastTrafficMigratedTime = &value + return b +} + // WithRayClusterName sets the RayClusterName field in the declarative configuration to the given value // and returns the receiver, so that objects can be built by chaining "With" function invocations. // If called multiple times, the RayClusterName field is set to the value of the last call. diff --git a/ray-operator/pkg/client/applyconfiguration/ray/v1/rayservicestatuses.go b/ray-operator/pkg/client/applyconfiguration/ray/v1/rayservicestatuses.go index 7d0da98387f..fa38154c22c 100644 --- a/ray-operator/pkg/client/applyconfiguration/ray/v1/rayservicestatuses.go +++ b/ray-operator/pkg/client/applyconfiguration/ray/v1/rayservicestatuses.go @@ -4,20 +4,20 @@ package v1 import ( rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1" - apismetav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - metav1 "k8s.io/client-go/applyconfigurations/meta/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + applyconfigurationsmetav1 "k8s.io/client-go/applyconfigurations/meta/v1" ) // RayServiceStatusesApplyConfiguration represents a declarative configuration of the RayServiceStatuses type for use // with apply. 
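TrafficRoutedPercent above is what the controller is expected to translate into Gateway API route weights when it reconciles the HTTPRoute. A rough sketch of such a weighted route, assuming each cluster is reachable through its serve service on port 8000; the object, service, and gateway names are placeholders, not names produced by this patch.

```go
package main

import (
	"fmt"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/utils/ptr"
	gwv1 "sigs.k8s.io/gateway-api/apis/v1"
)

func main() {
	trafficRoutedPercent := int32(20) // pending cluster's current share

	backend := func(svc string, weight int32) gwv1.HTTPBackendRef {
		return gwv1.HTTPBackendRef{
			BackendRef: gwv1.BackendRef{
				BackendObjectReference: gwv1.BackendObjectReference{
					Name: gwv1.ObjectName(svc),
					Port: ptr.To(gwv1.PortNumber(8000)),
				},
				Weight: ptr.To(weight),
			},
		}
	}

	route := gwv1.HTTPRoute{
		ObjectMeta: metav1.ObjectMeta{Name: "my-rayservice-httproute", Namespace: "default"},
		Spec: gwv1.HTTPRouteSpec{
			CommonRouteSpec: gwv1.CommonRouteSpec{
				ParentRefs: []gwv1.ParentReference{{Name: "my-rayservice-gateway"}},
			},
			Rules: []gwv1.HTTPRouteRule{{
				BackendRefs: []gwv1.HTTPBackendRef{
					// The active cluster keeps the remainder of the traffic.
					backend("active-cluster-serve-svc", 100-trafficRoutedPercent),
					// The pending (upgraded) cluster receives TrafficRoutedPercent.
					backend("pending-cluster-serve-svc", trafficRoutedPercent),
				},
			}},
		},
	}

	fmt.Println(route.Name, *route.Spec.Rules[0].BackendRefs[1].Weight)
}
```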
type RayServiceStatusesApplyConfiguration struct { - Conditions []metav1.ConditionApplyConfiguration `json:"conditions,omitempty"` - LastUpdateTime *apismetav1.Time `json:"lastUpdateTime,omitempty"` - ServiceStatus *rayv1.ServiceStatus `json:"serviceStatus,omitempty"` - ActiveServiceStatus *RayServiceStatusApplyConfiguration `json:"activeServiceStatus,omitempty"` - PendingServiceStatus *RayServiceStatusApplyConfiguration `json:"pendingServiceStatus,omitempty"` - NumServeEndpoints *int32 `json:"numServeEndpoints,omitempty"` - ObservedGeneration *int64 `json:"observedGeneration,omitempty"` + LastUpdateTime *metav1.Time `json:"lastUpdateTime,omitempty"` + ServiceStatus *rayv1.ServiceStatus `json:"serviceStatus,omitempty"` + Conditions []applyconfigurationsmetav1.ConditionApplyConfiguration `json:"conditions,omitempty"` + ActiveServiceStatus *RayServiceStatusApplyConfiguration `json:"activeServiceStatus,omitempty"` + PendingServiceStatus *RayServiceStatusApplyConfiguration `json:"pendingServiceStatus,omitempty"` + ObservedGeneration *int64 `json:"observedGeneration,omitempty"` + NumServeEndpoints *int32 `json:"numServeEndpoints,omitempty"` } // RayServiceStatusesApplyConfiguration constructs a declarative configuration of the RayServiceStatuses type for use with @@ -26,23 +26,10 @@ func RayServiceStatuses() *RayServiceStatusesApplyConfiguration { return &RayServiceStatusesApplyConfiguration{} } -// WithConditions adds the given value to the Conditions field in the declarative configuration -// and returns the receiver, so that objects can be build by chaining "With" function invocations. -// If called multiple times, values provided by each call will be appended to the Conditions field. -func (b *RayServiceStatusesApplyConfiguration) WithConditions(values ...*metav1.ConditionApplyConfiguration) *RayServiceStatusesApplyConfiguration { - for i := range values { - if values[i] == nil { - panic("nil value passed to WithConditions") - } - b.Conditions = append(b.Conditions, *values[i]) - } - return b -} - // WithLastUpdateTime sets the LastUpdateTime field in the declarative configuration to the given value // and returns the receiver, so that objects can be built by chaining "With" function invocations. // If called multiple times, the LastUpdateTime field is set to the value of the last call. -func (b *RayServiceStatusesApplyConfiguration) WithLastUpdateTime(value apismetav1.Time) *RayServiceStatusesApplyConfiguration { +func (b *RayServiceStatusesApplyConfiguration) WithLastUpdateTime(value metav1.Time) *RayServiceStatusesApplyConfiguration { b.LastUpdateTime = &value return b } @@ -55,6 +42,19 @@ func (b *RayServiceStatusesApplyConfiguration) WithServiceStatus(value rayv1.Ser return b } +// WithConditions adds the given value to the Conditions field in the declarative configuration +// and returns the receiver, so that objects can be build by chaining "With" function invocations. +// If called multiple times, values provided by each call will be appended to the Conditions field. 
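For a concrete picture of what these status builders would report mid-upgrade, here is a small sketch of the active/pending split at a hypothetical 20% step; the RayServiceStatus() constructor is assumed to follow the same generated convention as RayServiceStatuses() above, and the numbers are illustrative, not controller output.

```go
package main

import (
	"fmt"

	rayv1ac "github.com/ray-project/kuberay/ray-operator/pkg/client/applyconfiguration/ray/v1"
)

func main() {
	// Hypothetical snapshot: the pending cluster serves 20% of traffic with
	// 20% target capacity, while the active cluster still handles the rest.
	statuses := rayv1ac.RayServiceStatuses().
		WithActiveServiceStatus(rayv1ac.RayServiceStatus().
			WithTargetCapacity(80).
			WithTrafficRoutedPercent(80)).
		WithPendingServiceStatus(rayv1ac.RayServiceStatus().
			WithTargetCapacity(20).
			WithTrafficRoutedPercent(20))

	fmt.Println(*statuses.ActiveServiceStatus.TrafficRoutedPercent,
		*statuses.PendingServiceStatus.TrafficRoutedPercent)
}
```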
+func (b *RayServiceStatusesApplyConfiguration) WithConditions(values ...*applyconfigurationsmetav1.ConditionApplyConfiguration) *RayServiceStatusesApplyConfiguration { + for i := range values { + if values[i] == nil { + panic("nil value passed to WithConditions") + } + b.Conditions = append(b.Conditions, *values[i]) + } + return b +} + // WithActiveServiceStatus sets the ActiveServiceStatus field in the declarative configuration to the given value // and returns the receiver, so that objects can be built by chaining "With" function invocations. // If called multiple times, the ActiveServiceStatus field is set to the value of the last call. @@ -71,14 +71,6 @@ func (b *RayServiceStatusesApplyConfiguration) WithPendingServiceStatus(value *R return b } -// WithNumServeEndpoints sets the NumServeEndpoints field in the declarative configuration to the given value -// and returns the receiver, so that objects can be built by chaining "With" function invocations. -// If called multiple times, the NumServeEndpoints field is set to the value of the last call. -func (b *RayServiceStatusesApplyConfiguration) WithNumServeEndpoints(value int32) *RayServiceStatusesApplyConfiguration { - b.NumServeEndpoints = &value - return b -} - // WithObservedGeneration sets the ObservedGeneration field in the declarative configuration to the given value // and returns the receiver, so that objects can be built by chaining "With" function invocations. // If called multiple times, the ObservedGeneration field is set to the value of the last call. @@ -86,3 +78,11 @@ func (b *RayServiceStatusesApplyConfiguration) WithObservedGeneration(value int6 b.ObservedGeneration = &value return b } + +// WithNumServeEndpoints sets the NumServeEndpoints field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the NumServeEndpoints field is set to the value of the last call. +func (b *RayServiceStatusesApplyConfiguration) WithNumServeEndpoints(value int32) *RayServiceStatusesApplyConfiguration { + b.NumServeEndpoints = &value + return b +} diff --git a/ray-operator/pkg/client/applyconfiguration/ray/v1/rayserviceupgradestrategy.go b/ray-operator/pkg/client/applyconfiguration/ray/v1/rayserviceupgradestrategy.go index 361a98f6ac9..0a190883bff 100644 --- a/ray-operator/pkg/client/applyconfiguration/ray/v1/rayserviceupgradestrategy.go +++ b/ray-operator/pkg/client/applyconfiguration/ray/v1/rayserviceupgradestrategy.go @@ -9,7 +9,8 @@ import ( // RayServiceUpgradeStrategyApplyConfiguration represents a declarative configuration of the RayServiceUpgradeStrategy type for use // with apply. type RayServiceUpgradeStrategyApplyConfiguration struct { - Type *rayv1.RayServiceUpgradeType `json:"type,omitempty"` + Type *rayv1.RayServiceUpgradeType `json:"type,omitempty"` + IncrementalUpgradeOptions *IncrementalUpgradeOptionsApplyConfiguration `json:"incrementalUpgradeOptions,omitempty"` } // RayServiceUpgradeStrategyApplyConfiguration constructs a declarative configuration of the RayServiceUpgradeStrategy type for use with @@ -25,3 +26,11 @@ func (b *RayServiceUpgradeStrategyApplyConfiguration) WithType(value rayv1.RaySe b.Type = &value return b } + +// WithIncrementalUpgradeOptions sets the IncrementalUpgradeOptions field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. 
+// If called multiple times, the IncrementalUpgradeOptions field is set to the value of the last call. +func (b *RayServiceUpgradeStrategyApplyConfiguration) WithIncrementalUpgradeOptions(value *IncrementalUpgradeOptionsApplyConfiguration) *RayServiceUpgradeStrategyApplyConfiguration { + b.IncrementalUpgradeOptions = value + return b +} diff --git a/ray-operator/pkg/client/applyconfiguration/utils.go b/ray-operator/pkg/client/applyconfiguration/utils.go index 050733b0c5e..62ba25bb755 100644 --- a/ray-operator/pkg/client/applyconfiguration/utils.go +++ b/ray-operator/pkg/client/applyconfiguration/utils.go @@ -34,6 +34,8 @@ func ForKind(kind schema.GroupVersionKind) interface{} { return &rayv1.HeadGroupSpecApplyConfiguration{} case v1.SchemeGroupVersion.WithKind("HeadInfo"): return &rayv1.HeadInfoApplyConfiguration{} + case v1.SchemeGroupVersion.WithKind("IncrementalUpgradeOptions"): + return &rayv1.IncrementalUpgradeOptionsApplyConfiguration{} case v1.SchemeGroupVersion.WithKind("RayCluster"): return &rayv1.RayClusterApplyConfiguration{} case v1.SchemeGroupVersion.WithKind("RayClusterSpec"): diff --git a/ray-operator/pkg/features/features.go b/ray-operator/pkg/features/features.go index 2abea2ffbbb..ce5734cee0a 100644 --- a/ray-operator/pkg/features/features.go +++ b/ray-operator/pkg/features/features.go @@ -24,6 +24,13 @@ const ( // // Enables new deletion policy API in RayJob RayJobDeletionPolicy featuregate.Feature = "RayJobDeletionPolicy" + + // owner: @ryanaoleary + // rep: N/A + // alpha: v1.0 + // + // Enabled incremental upgrades for RayService zero-downtime upgrades. + RayServiceIncrementalUpgrade featuregate.Feature = "RayServiceIncrementalUpgrade" ) func init() { @@ -31,8 +38,9 @@ func init() { } var defaultFeatureGates = map[featuregate.Feature]featuregate.FeatureSpec{ - RayClusterStatusConditions: {Default: true, PreRelease: featuregate.Beta}, - RayJobDeletionPolicy: {Default: false, PreRelease: featuregate.Alpha}, + RayClusterStatusConditions: {Default: true, PreRelease: featuregate.Beta}, + RayJobDeletionPolicy: {Default: false, PreRelease: featuregate.Alpha}, + RayServiceIncrementalUpgrade: {Default: false, PreRelease: featuregate.Alpha}, } // SetFeatureGateDuringTest is a helper method to override feature gates in tests. diff --git a/ray-operator/test/e2eincrementalupgrade/rayservice_incremental_upgrade_test.go b/ray-operator/test/e2eincrementalupgrade/rayservice_incremental_upgrade_test.go new file mode 100644 index 00000000000..6bfefa1f18f --- /dev/null +++ b/ray-operator/test/e2eincrementalupgrade/rayservice_incremental_upgrade_test.go @@ -0,0 +1,157 @@ +package e2eincrementalupgrade + +import ( + "fmt" + "strings" + "testing" + + . "github.com/onsi/gomega" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/utils/ptr" + + rayv1ac "github.com/ray-project/kuberay/ray-operator/pkg/client/applyconfiguration/ray/v1" + "github.com/ray-project/kuberay/ray-operator/pkg/features" + "github.com/ray-project/kuberay/ray-operator/test/sampleyaml" + . 
"github.com/ray-project/kuberay/ray-operator/test/support" +) + +// helper function to get RayCluster head service external IP to use to poll the RayService +func GetHeadServiceExternalIP(t *testing.T, clusterName, namespace string) (string, error) { + test := With(t) + + svc, err := test.Client().Core().CoreV1().Services(namespace).Get(test.Ctx(), clusterName+"-head-svc", metav1.GetOptions{}) + if err != nil { + return "", err + } + if len(svc.Status.LoadBalancer.Ingress) == 0 { + return "", fmt.Errorf("no ingress for service %s", svc.Name) + } + return svc.Status.LoadBalancer.Ingress[0].IP, nil +} + +func TestRayServiceIncrementalUpgrade(t *testing.T) { + features.SetFeatureGateDuringTest(t, features.RayServiceIncrementalUpgrade, true) + + test := With(t) + g := NewWithT(t) + + namespace := test.NewTestNamespace() + rayServiceName := "incremental-rayservice" + + // Create a RayService with IncrementalUpgrade enabled + stepSize := ptr.To(int32(20)) + interval := ptr.To(int32(30)) + maxSurge := ptr.To(int32(10)) + + rayServiceAC := rayv1ac.RayService(rayServiceName, namespace.Name). + WithSpec(IncrementalUpgradeRayServiceApplyConfiguration(stepSize, interval, maxSurge)) + rayService, err := test.Client().Ray().RayV1().RayServices(namespace.Name).Apply(test.Ctx(), rayServiceAC, TestApplyOptions) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(rayService).NotTo(BeNil()) + + LogWithTimestamp(test.T(), "Waiting for RayService %s/%s to be ready", rayService.Namespace, rayService.Name) + g.Eventually(RayService(test, rayService.Namespace, rayService.Name), TestTimeoutMedium). + Should(WithTransform(IsRayServiceReady, BeTrue())) + + rayService, err = GetRayService(test, namespace.Name, rayServiceName) + g.Expect(err).NotTo(HaveOccurred()) + + // Validate Gateway and HTTPRoute objects have been created for incremental upgrade. 
+ gateway, err := GetGateway(test, namespace.Name, fmt.Sprintf("%s-%s", rayServiceName, "gateway")) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(gateway).NotTo(BeNil()) + + httpRoute, err := GetHTTPRoute(test, namespace.Name, fmt.Sprintf("%s-%s", "httproute", rayServiceName)) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(httpRoute).NotTo(BeNil()) + + // Create curl pod to test traffic routing through Gateway to RayService + curlPodName := "curl-pod" + curlContainerName := "curl-container" + curlPod, err := CreateCurlPod(test, curlPodName, curlContainerName, namespace.Name) + g.Expect(err).NotTo(HaveOccurred()) + + g.Eventually(func(g Gomega) *corev1.Pod { + updatedPod, err := test.Client().Core().CoreV1().Pods(curlPod.Namespace).Get(test.Ctx(), curlPod.Name, metav1.GetOptions{}) + g.Expect(err).NotTo(HaveOccurred()) + return updatedPod + }, TestTimeoutShort).Should(WithTransform(sampleyaml.IsPodRunningAndReady, BeTrue())) + + // Get the Gateway endpoint to send requests to + gatewayIP := GetGatewayIP(gateway) + g.Expect(gatewayIP).NotTo(BeEmpty()) + + LogWithTimestamp(test.T(), "Verifying RayService is serving traffic") + stdout, _ := CurlRayServiceGateway(test, gatewayIP, curlPod, curlContainerName, "/fruit", `["MANGO", 2]`) + g.Expect(stdout.String()).To(Equal("6")) + stdout, _ = CurlRayServiceGateway(test, gatewayIP, curlPod, curlContainerName, "/calc", `["MUL", 3]`) + g.Expect(stdout.String()).To(Equal("15 pizzas please!")) + + // Trigger incremental upgrade by updating RayService serve config + rayService, err = GetRayService(test, namespace.Name, rayService.Name) + g.Expect(err).NotTo(HaveOccurred()) + + serveConfig := rayService.Spec.ServeConfigV2 + serveConfig = strings.Replace(serveConfig, "price: 3", "price: 4", -1) + serveConfig = strings.Replace(serveConfig, "factor: 5", "factor: 3", -1) + rayService.Spec.ServeConfigV2 = serveConfig + _, err = test.Client().Ray().RayV1().RayServices(namespace.Name).Update( + test.Ctx(), + rayService, + metav1.UpdateOptions{}, + ) + g.Expect(err).NotTo(HaveOccurred()) + + // Check that upgrade steps incrementally with traffic/capacity split between clusters + g.Eventually(func(g Gomega) { + rayService, err := GetRayService(test, namespace.Name, rayServiceName) + g.Expect(err).NotTo(HaveOccurred()) + + g.Expect(rayService.Status.PendingServiceStatus).NotTo(BeNil()) + g.Expect(rayService.Status.PendingServiceStatus.TrafficRoutedPercent).NotTo(BeNil()) + g.Expect(rayService.Status.PendingServiceStatus.TargetCapacity).NotTo(BeNil()) + g.Expect(rayService.Status.ActiveServiceStatus).NotTo(BeNil()) + g.Expect(rayService.Status.ActiveServiceStatus.TrafficRoutedPercent).NotTo(BeNil()) + g.Expect(rayService.Status.ActiveServiceStatus.TargetCapacity).NotTo(BeNil()) + + for _, val := range []int32{ + *rayService.Status.PendingServiceStatus.TrafficRoutedPercent, + *rayService.Status.ActiveServiceStatus.TrafficRoutedPercent, + *rayService.Status.PendingServiceStatus.TargetCapacity, + *rayService.Status.ActiveServiceStatus.TargetCapacity, + } { + g.Expect(val).To(BeNumerically(">", 0)) + g.Expect(val).To(BeNumerically("<", 100)) + } + }, TestTimeoutMedium).Should(Succeed()) + + // Validate that traffic is split across old and new clusters of the RayService + g.Eventually(func(g Gomega) { + rayService, err := GetRayService(test, namespace.Name, rayServiceName) + g.Expect(err).NotTo(HaveOccurred()) + + activeSvcName := rayService.Status.ActiveServiceStatus.RayClusterStatus.Head.ServiceName + pendingSvcName := 
rayService.Status.PendingServiceStatus.RayClusterStatus.Head.ServiceName + + activeResp, _ := CurlRayServiceHeadService( + test, activeSvcName, rayService, curlPod, curlContainerName, "/fruit", `["MANGO", 2]`) + pendingResp, _ := CurlRayServiceHeadService( + test, pendingSvcName, rayService, curlPod, curlContainerName, "/fruit", `["MANGO", 2]`) + + // Both clusters should still be serving traffic during the split + g.Expect(activeResp.String()).To(Equal("6")) + g.Expect(pendingResp.String()).To(Equal("6")) + }, TestTimeoutMedium).Should(Succeed()) + + // Validate incremental upgrade completes + g.Eventually(func(g Gomega) { + rayService, err := GetRayService(test, namespace.Name, rayServiceName) + g.Expect(err).NotTo(HaveOccurred()) + + g.Expect(rayService.Status.PendingServiceStatus.TrafficRoutedPercent).To(Equal(ptr.To(int32(100)))) + g.Expect(rayService.Status.ActiveServiceStatus.TrafficRoutedPercent).To(Equal(ptr.To(int32(0)))) + g.Expect(rayService.Status.PendingServiceStatus.TargetCapacity).To(Equal(ptr.To(int32(100)))) + g.Expect(rayService.Status.ActiveServiceStatus.TargetCapacity).To(Equal(ptr.To(int32(0)))) + }, TestTimeoutMedium).Should(Succeed()) +} diff --git a/ray-operator/test/e2eincrementalupgrade/support.go b/ray-operator/test/e2eincrementalupgrade/support.go new file mode 100644 index 00000000000..ba09a6822a3 --- /dev/null +++ b/ray-operator/test/e2eincrementalupgrade/support.go @@ -0,0 +1,88 @@ +package e2eincrementalupgrade + +import ( + "bytes" + "fmt" + + corev1 "k8s.io/api/core/v1" + "k8s.io/utils/ptr" + gwv1 "sigs.k8s.io/gateway-api/apis/v1" + + rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1" + rayv1ac "github.com/ray-project/kuberay/ray-operator/pkg/client/applyconfiguration/ray/v1" + e2eRayService "github.com/ray-project/kuberay/ray-operator/test/e2erayservice" + . "github.com/ray-project/kuberay/ray-operator/test/support" +) + +func CurlRayServiceHeadService( + t Test, + headSvcName string, + rayService *rayv1.RayService, + curlPod *corev1.Pod, + curlPodContainerName, + rayServicePath, + body string, +) (bytes.Buffer, bytes.Buffer) { + cmd := []string{ + "curl", + "-X", "POST", + "-H", "Content-Type: application/json", + fmt.Sprintf("%s.%s.svc.cluster.local:8000%s", headSvcName, rayService.Namespace, rayServicePath), + "-d", body, + } + + return ExecPodCmd(t, curlPod, curlPodContainerName, cmd) +} + +func CurlRayServiceGateway( + t Test, + gatewayIP string, + curlPod *corev1.Pod, + curlPodContainerName, + rayServicePath, + body string, +) (bytes.Buffer, bytes.Buffer) { + cmd := []string{ + "curl", + "-X", "POST", + "-H", "Content-Type: application/json", + fmt.Sprintf("%s:80%s", gatewayIP, rayServicePath), + "-d", body, + } + + return ExecPodCmd(t, curlPod, curlPodContainerName, cmd) +} + +func IncrementalUpgradeRayServiceApplyConfiguration( + stepSizePercent, intervalSeconds, maxSurgePercent *int32, +) *rayv1ac.RayServiceSpecApplyConfiguration { + spec := e2eRayService.RayServiceSampleYamlApplyConfiguration() + + spec.RayClusterSpec.EnableInTreeAutoscaling = ptr.To(true) + spec.WithUpgradeStrategy(rayv1ac.RayServiceUpgradeStrategy(). + WithType(rayv1.IncrementalUpgrade). + WithIncrementalUpgradeOptions( + rayv1ac.IncrementalUpgradeOptions(). + WithGatewayClassName("istio"). + WithStepSizePercent(*stepSizePercent). + WithIntervalSeconds(*intervalSeconds). 
+ WithMaxSurgePercent(*maxSurgePercent), + ), + ) + + return spec +} + +// GetGatewayIP retrieves the external IP for a Gateway object +func GetGatewayIP(gateway *gwv1.Gateway) string { + if gateway == nil { + return "" + } + for _, addr := range gateway.Status.Addresses { + if addr.Type == nil || *addr.Type == gwv1.IPAddressType { + return addr.Value + } + } + + return "" +} diff --git a/ray-operator/test/support/client.go b/ray-operator/test/support/client.go index 2e313483966..4925184d46b 100644 --- a/ray-operator/test/support/client.go +++ b/ray-operator/test/support/client.go @@ -8,6 +8,7 @@ import ( _ "k8s.io/client-go/plugin/pkg/client/auth" "k8s.io/client-go/rest" "k8s.io/client-go/tools/clientcmd" + gatewayclient "sigs.k8s.io/gateway-api/pkg/client/clientset/versioned" rayclient "github.com/ray-project/kuberay/ray-operator/pkg/client/clientset/versioned" ) @@ -17,6 +18,7 @@ type Client interface { Ray() rayclient.Interface Dynamic() dynamic.Interface Config() rest.Config + Gateway() gatewayclient.Interface } type testClient struct { @@ -24,6 +26,7 @@ type testClient struct { ray rayclient.Interface dynamic dynamic.Interface config rest.Config + gateway gatewayclient.Interface } var _ Client = (*testClient)(nil) @@ -44,6 +47,10 @@ func (t *testClient) Config() rest.Config { return t.config } +func (t *testClient) Gateway() gatewayclient.Interface { + return t.gateway +} + func newTestClient() (Client, error) { cfg, err := clientcmd.NewNonInteractiveDeferredLoadingClientConfig( clientcmd.NewDefaultClientConfigLoadingRules(), @@ -68,10 +75,16 @@ func newTestClient() (Client, error) { return nil, err } + gatewayClient, err := gatewayclient.NewForConfig(cfg) + if err != nil { + return nil, err + } + return &testClient{ core: kubeClient, ray: rayClient, dynamic: dynamicClient, config: *cfg, + gateway: gatewayClient, }, nil } diff --git a/ray-operator/test/support/ray.go b/ray-operator/test/support/ray.go index ffea3c75d87..0b5c525abcf 100644 --- a/ray-operator/test/support/ray.go +++ b/ray-operator/test/support/ray.go @@ -9,6 +9,7 @@ import ( corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/meta" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + gwv1 "sigs.k8s.io/gateway-api/apis/v1" rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1" "github.com/ray-project/kuberay/ray-operator/controllers/ray/common" @@ -226,3 +227,23 @@ func GetRayClusterWorkerGroupReplicaSum(cluster *rayv1.RayCluster) int32 { } return replicas } + +func GetHTTPRoute(t Test, namespace, name string) (*gwv1.HTTPRoute, error) { + return t.Client().Gateway().GatewayV1().HTTPRoutes(namespace).Get(t.Ctx(), name, metav1.GetOptions{}) +} + +func HTTPRoute(t Test, namespace, name string) func() (*gwv1.HTTPRoute, error) { + return func() (*gwv1.HTTPRoute, error) { + return GetHTTPRoute(t, namespace, name) + } +} + +func GetGateway(t Test, namespace, name string) (*gwv1.Gateway, error) { + return t.Client().Gateway().GatewayV1().Gateways(namespace).Get(t.Ctx(), name, metav1.GetOptions{}) +} + +func Gateway(t Test, namespace, name string) func() (*gwv1.Gateway, error) { + return func() (*gwv1.Gateway, error) { + return GetGateway(t, namespace, name) + } +} From 6e880ffc1d26b2b0986311c2c24bfdf2a3ca78a9 Mon Sep 17 00:00:00 2001 From: Ryan O'Leary Date: Wed, 4 Jun 2025 03:17:04 +0000 Subject: [PATCH 02/56] Fix some tests and create Gateway for pending cluster Signed-off-by: Ryan O'Leary --- docs/reference/api.md | 4 +- .../crds/ray.io_rayservices.yaml | 1759 +---------------- 
ray-operator/apis/ray/v1/rayservice_types.go | 11 +- .../apis/ray/v1/zz_generated.deepcopy.go | 11 - .../config/crd/bases/ray.io_rayservices.yaml | 1759 +---------------- .../controllers/ray/common/association.go | 16 +- .../controllers/ray/rayservice_controller.go | 45 +- .../ray/rayservice_controller_unit_test.go | 158 +- ray-operator/controllers/ray/utils/util.go | 12 +- .../controllers/ray/utils/util_test.go | 34 +- .../ray/v1/rayservicespec.go | 37 +- .../rayservice_incremental_upgrade_test.go | 14 +- .../test/e2eincrementalupgrade/support.go | 124 +- 13 files changed, 321 insertions(+), 3663 deletions(-) diff --git a/docs/reference/api.md b/docs/reference/api.md index b367e562ea6..2706d2f2a87 100644 --- a/docs/reference/api.md +++ b/docs/reference/api.md @@ -374,10 +374,10 @@ _Appears in:_ | `serviceUnhealthySecondThreshold` _integer_ | Deprecated: This field is not used anymore. ref: https://github.com/ray-project/kuberay/issues/1685 | | | | `deploymentUnhealthySecondThreshold` _integer_ | Deprecated: This field is not used anymore. ref: https://github.com/ray-project/kuberay/issues/1685 | | | | `serveService` _[Service](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#service-v1-core)_ | ServeService is the Kubernetes service for head node and worker nodes who have healthy http proxy to serve traffics. | | | -| `gateway` _[Gateway](#gateway)_ | Gateway is the Gateway object for the RayService to serve traffics during an IncrementalUpgrade. | | | -| `httpRoute` _[HTTPRoute](#httproute)_ | HTTPRoute is the HTTPRoute object for the RayService to split traffics during an IncrementalUpgrade. | | | | `upgradeStrategy` _[RayServiceUpgradeStrategy](#rayserviceupgradestrategy)_ | UpgradeStrategy defines the scaling policy used when upgrading the RayService. | | | | `serveConfigV2` _string_ | Important: Run "make" to regenerate code after modifying this file
Defines the applications and deployments to deploy, should be a YAML multi-line scalar string. | | | +| `gateway` _string_ | Gateway is the name of the Gateway object for the RayService to serve traffics during an IncrementalUpgrade. | | | +| `httpRoute` _string_ | HTTPRoute is the name of the HTTPRoute object for the RayService to split traffics during an IncrementalUpgrade. | | | | `rayClusterConfig` _[RayClusterSpec](#rayclusterspec)_ | | | | | `excludeHeadPodFromServeSvc` _boolean_ | If the field is set to true, the value of the label `ray.io/serve` on the head Pod should always be false.
Therefore, the head Pod's endpoint will not be added to the Kubernetes Serve service. | | | diff --git a/helm-chart/kuberay-operator/crds/ray.io_rayservices.yaml b/helm-chart/kuberay-operator/crds/ray.io_rayservices.yaml index 02e449d4726..73e38364781 100644 --- a/helm-chart/kuberay-operator/crds/ray.io_rayservices.yaml +++ b/helm-chart/kuberay-operator/crds/ray.io_rayservices.yaml @@ -41,1764 +41,9 @@ spec: excludeHeadPodFromServeSvc: type: boolean gateway: - properties: - apiVersion: - type: string - kind: - type: string - metadata: - properties: - annotations: - additionalProperties: - type: string - type: object - finalizers: - items: - type: string - type: array - labels: - additionalProperties: - type: string - type: object - name: - type: string - namespace: - type: string - type: object - spec: - properties: - addresses: - items: - properties: - type: - default: IPAddress - maxLength: 253 - minLength: 1 - pattern: ^Hostname|IPAddress|NamedAddress|[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*\/[A-Za-z0-9\/\-._~%!$&'()*+,;=:]+$ - type: string - value: - maxLength: 253 - minLength: 1 - type: string - required: - - value - type: object - x-kubernetes-validations: - - message: Hostname value must only contain valid characters - (matching ^(\*\.)?[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$) - rule: 'self.type == ''Hostname'' ? self.value.matches(r"""^(\*\.)?[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$"""): - true' - maxItems: 16 - type: array - x-kubernetes-validations: - - message: IPAddress values must be unique - rule: 'self.all(a1, a1.type == ''IPAddress'' ? self.exists_one(a2, - a2.type == a1.type && a2.value == a1.value) : true )' - - message: Hostname values must be unique - rule: 'self.all(a1, a1.type == ''Hostname'' ? self.exists_one(a2, - a2.type == a1.type && a2.value == a1.value) : true )' - backendTLS: - properties: - clientCertificateRef: - properties: - group: - default: "" - maxLength: 253 - pattern: ^$|^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ - type: string - kind: - default: Secret - maxLength: 63 - minLength: 1 - pattern: ^[a-zA-Z]([-a-zA-Z0-9]*[a-zA-Z0-9])?$ - type: string - name: - maxLength: 253 - minLength: 1 - type: string - namespace: - maxLength: 63 - minLength: 1 - pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?$ - type: string - required: - - name - type: object - type: object - gatewayClassName: - maxLength: 253 - minLength: 1 - type: string - infrastructure: - properties: - annotations: - additionalProperties: - maxLength: 4096 - minLength: 0 - type: string - maxProperties: 8 - type: object - x-kubernetes-validations: - - message: Annotation keys must be in the form of an optional - DNS subdomain prefix followed by a required name segment - of up to 63 characters. - rule: self.all(key, key.matches(r"""^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?([A-Za-z0-9][-A-Za-z0-9_.]{0,61})?[A-Za-z0-9]$""")) - - message: If specified, the annotation key's prefix must - be a DNS subdomain not longer than 253 characters - in total. - rule: self.all(key, key.split("/")[0].size() < 253) - labels: - additionalProperties: - maxLength: 63 - minLength: 0 - pattern: ^(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])?$ - type: string - maxProperties: 8 - type: object - x-kubernetes-validations: - - message: Label keys must be in the form of an optional - DNS subdomain prefix followed by a required name segment - of up to 63 characters. 
- rule: self.all(key, key.matches(r"""^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?([A-Za-z0-9][-A-Za-z0-9_.]{0,61})?[A-Za-z0-9]$""")) - - message: If specified, the label key's prefix must be - a DNS subdomain not longer than 253 characters in - total. - rule: self.all(key, key.split("/")[0].size() < 253) - parametersRef: - properties: - group: - maxLength: 253 - pattern: ^$|^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ - type: string - kind: - maxLength: 63 - minLength: 1 - pattern: ^[a-zA-Z]([-a-zA-Z0-9]*[a-zA-Z0-9])?$ - type: string - name: - maxLength: 253 - minLength: 1 - type: string - required: - - group - - kind - - name - type: object - type: object - listeners: - items: - properties: - allowedRoutes: - default: - namespaces: - from: Same - properties: - kinds: - items: - properties: - group: - default: gateway.networking.k8s.io - maxLength: 253 - pattern: ^$|^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ - type: string - kind: - maxLength: 63 - minLength: 1 - pattern: ^[a-zA-Z]([-a-zA-Z0-9]*[a-zA-Z0-9])?$ - type: string - required: - - kind - type: object - maxItems: 8 - type: array - namespaces: - default: - from: Same - properties: - from: - default: Same - enum: - - All - - Selector - - Same - type: string - selector: - properties: - matchExpressions: - items: - properties: - key: - type: string - operator: - type: string - values: - items: - type: string - type: array - x-kubernetes-list-type: atomic - required: - - key - - operator - type: object - type: array - x-kubernetes-list-type: atomic - matchLabels: - additionalProperties: - type: string - type: object - type: object - x-kubernetes-map-type: atomic - type: object - type: object - hostname: - maxLength: 253 - minLength: 1 - pattern: ^(\*\.)?[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ - type: string - name: - maxLength: 253 - minLength: 1 - pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ - type: string - port: - format: int32 - maximum: 65535 - minimum: 1 - type: integer - protocol: - maxLength: 255 - minLength: 1 - pattern: ^[a-zA-Z0-9]([-a-zA-Z0-9]*[a-zA-Z0-9])?$|[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*\/[A-Za-z0-9]+$ - type: string - tls: - properties: - certificateRefs: - items: - properties: - group: - default: "" - maxLength: 253 - pattern: ^$|^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ - type: string - kind: - default: Secret - maxLength: 63 - minLength: 1 - pattern: ^[a-zA-Z]([-a-zA-Z0-9]*[a-zA-Z0-9])?$ - type: string - name: - maxLength: 253 - minLength: 1 - type: string - namespace: - maxLength: 63 - minLength: 1 - pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?$ - type: string - required: - - name - type: object - maxItems: 64 - type: array - frontendValidation: - properties: - caCertificateRefs: - items: - properties: - group: - maxLength: 253 - pattern: ^$|^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ - type: string - kind: - maxLength: 63 - minLength: 1 - pattern: ^[a-zA-Z]([-a-zA-Z0-9]*[a-zA-Z0-9])?$ - type: string - name: - maxLength: 253 - minLength: 1 - type: string - namespace: - maxLength: 63 - minLength: 1 - pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?$ - type: string - required: - - group - - kind - - name - type: object - maxItems: 8 - minItems: 1 - type: array - type: object - mode: - default: Terminate - enum: - - Terminate - - Passthrough - type: string - options: - additionalProperties: - maxLength: 4096 - minLength: 0 - type: 
string - maxProperties: 16 - type: object - type: object - x-kubernetes-validations: - - message: certificateRefs or options must be specified - when mode is Terminate - rule: 'self.mode == ''Terminate'' ? size(self.certificateRefs) - > 0 || size(self.options) > 0 : true' - required: - - name - - port - - protocol - type: object - maxItems: 64 - minItems: 1 - type: array - x-kubernetes-list-map-keys: - - name - x-kubernetes-list-type: map - x-kubernetes-validations: - - message: tls must not be specified for protocols ['HTTP', - 'TCP', 'UDP'] - rule: 'self.all(l, l.protocol in [''HTTP'', ''TCP'', ''UDP''] - ? !has(l.tls) : true)' - - message: tls mode must be Terminate for protocol HTTPS - rule: 'self.all(l, (l.protocol == ''HTTPS'' && has(l.tls)) - ? (l.tls.mode == '''' || l.tls.mode == ''Terminate'') - : true)' - - message: hostname must not be specified for protocols ['TCP', - 'UDP'] - rule: 'self.all(l, l.protocol in [''TCP'', ''UDP''] ? (!has(l.hostname) - || l.hostname == '''') : true)' - - message: Listener name must be unique within the Gateway - rule: self.all(l1, self.exists_one(l2, l1.name == l2.name)) - - message: Combination of port, protocol and hostname must - be unique for each listener - rule: 'self.all(l1, self.exists_one(l2, l1.port == l2.port - && l1.protocol == l2.protocol && (has(l1.hostname) && - has(l2.hostname) ? l1.hostname == l2.hostname : !has(l1.hostname) - && !has(l2.hostname))))' - required: - - gatewayClassName - - listeners - type: object - status: - default: - conditions: - - lastTransitionTime: "1970-01-01T00:00:00Z" - message: Waiting for controller - reason: Pending - status: Unknown - type: Accepted - - lastTransitionTime: "1970-01-01T00:00:00Z" - message: Waiting for controller - reason: Pending - status: Unknown - type: Programmed - properties: - addresses: - items: - properties: - type: - default: IPAddress - maxLength: 253 - minLength: 1 - pattern: ^Hostname|IPAddress|NamedAddress|[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*\/[A-Za-z0-9\/\-._~%!$&'()*+,;=:]+$ - type: string - value: - maxLength: 253 - minLength: 1 - type: string - required: - - value - type: object - x-kubernetes-validations: - - message: Hostname value must only contain valid characters - (matching ^(\*\.)?[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$) - rule: 'self.type == ''Hostname'' ? 
self.value.matches(r"""^(\*\.)?[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$"""): - true' - maxItems: 16 - type: array - conditions: - default: - - lastTransitionTime: "1970-01-01T00:00:00Z" - message: Waiting for controller - reason: Pending - status: Unknown - type: Accepted - - lastTransitionTime: "1970-01-01T00:00:00Z" - message: Waiting for controller - reason: Pending - status: Unknown - type: Programmed - items: - properties: - lastTransitionTime: - format: date-time - type: string - message: - maxLength: 32768 - type: string - observedGeneration: - format: int64 - minimum: 0 - type: integer - reason: - maxLength: 1024 - minLength: 1 - pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ - type: string - status: - enum: - - "True" - - "False" - - Unknown - type: string - type: - maxLength: 316 - pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ - type: string - required: - - lastTransitionTime - - message - - reason - - status - - type - type: object - maxItems: 8 - type: array - x-kubernetes-list-map-keys: - - type - x-kubernetes-list-type: map - listeners: - items: - properties: - attachedRoutes: - format: int32 - type: integer - conditions: - items: - properties: - lastTransitionTime: - format: date-time - type: string - message: - maxLength: 32768 - type: string - observedGeneration: - format: int64 - minimum: 0 - type: integer - reason: - maxLength: 1024 - minLength: 1 - pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ - type: string - status: - enum: - - "True" - - "False" - - Unknown - type: string - type: - maxLength: 316 - pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ - type: string - required: - - lastTransitionTime - - message - - reason - - status - - type - type: object - maxItems: 8 - type: array - x-kubernetes-list-map-keys: - - type - x-kubernetes-list-type: map - name: - maxLength: 253 - minLength: 1 - pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ - type: string - supportedKinds: - items: - properties: - group: - default: gateway.networking.k8s.io - maxLength: 253 - pattern: ^$|^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ - type: string - kind: - maxLength: 63 - minLength: 1 - pattern: ^[a-zA-Z]([-a-zA-Z0-9]*[a-zA-Z0-9])?$ - type: string - required: - - kind - type: object - maxItems: 8 - type: array - required: - - attachedRoutes - - conditions - - name - - supportedKinds - type: object - maxItems: 64 - type: array - x-kubernetes-list-map-keys: - - name - x-kubernetes-list-type: map - type: object - required: - - spec - type: object + type: string httpRoute: - properties: - apiVersion: - type: string - kind: - type: string - metadata: - properties: - annotations: - additionalProperties: - type: string - type: object - finalizers: - items: - type: string - type: array - labels: - additionalProperties: - type: string - type: object - name: - type: string - namespace: - type: string - type: object - spec: - properties: - hostnames: - items: - maxLength: 253 - minLength: 1 - pattern: ^(\*\.)?[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ - type: string - maxItems: 16 - type: array - parentRefs: - items: - properties: - group: - default: gateway.networking.k8s.io - maxLength: 253 - pattern: ^$|^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ - type: string - kind: - default: Gateway - maxLength: 63 - minLength: 1 - pattern: 
^[a-zA-Z]([-a-zA-Z0-9]*[a-zA-Z0-9])?$ - type: string - name: - maxLength: 253 - minLength: 1 - type: string - namespace: - maxLength: 63 - minLength: 1 - pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?$ - type: string - port: - format: int32 - maximum: 65535 - minimum: 1 - type: integer - sectionName: - maxLength: 253 - minLength: 1 - pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ - type: string - required: - - name - type: object - maxItems: 32 - type: array - rules: - default: - - matches: - - path: - type: PathPrefix - value: / - items: - properties: - backendRefs: - items: - properties: - filters: - items: - properties: - extensionRef: - properties: - group: - maxLength: 253 - pattern: ^$|^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ - type: string - kind: - maxLength: 63 - minLength: 1 - pattern: ^[a-zA-Z]([-a-zA-Z0-9]*[a-zA-Z0-9])?$ - type: string - name: - maxLength: 253 - minLength: 1 - type: string - required: - - group - - kind - - name - type: object - requestHeaderModifier: - properties: - add: - items: - properties: - name: - maxLength: 256 - minLength: 1 - pattern: ^[A-Za-z0-9!#$%&'*+\-.^_\x60|~]+$ - type: string - value: - maxLength: 4096 - minLength: 1 - type: string - required: - - name - - value - type: object - maxItems: 16 - type: array - x-kubernetes-list-map-keys: - - name - x-kubernetes-list-type: map - remove: - items: - type: string - maxItems: 16 - type: array - x-kubernetes-list-type: set - set: - items: - properties: - name: - maxLength: 256 - minLength: 1 - pattern: ^[A-Za-z0-9!#$%&'*+\-.^_\x60|~]+$ - type: string - value: - maxLength: 4096 - minLength: 1 - type: string - required: - - name - - value - type: object - maxItems: 16 - type: array - x-kubernetes-list-map-keys: - - name - x-kubernetes-list-type: map - type: object - requestMirror: - properties: - backendRef: - properties: - group: - default: "" - maxLength: 253 - pattern: ^$|^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ - type: string - kind: - default: Service - maxLength: 63 - minLength: 1 - pattern: ^[a-zA-Z]([-a-zA-Z0-9]*[a-zA-Z0-9])?$ - type: string - name: - maxLength: 253 - minLength: 1 - type: string - namespace: - maxLength: 63 - minLength: 1 - pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?$ - type: string - port: - format: int32 - maximum: 65535 - minimum: 1 - type: integer - required: - - name - type: object - x-kubernetes-validations: - - message: Must have port for Service - reference - rule: '(size(self.group) == 0 && self.kind - == ''Service'') ? 
has(self.port) - : true' - fraction: - properties: - denominator: - default: 100 - format: int32 - minimum: 1 - type: integer - numerator: - format: int32 - minimum: 0 - type: integer - required: - - numerator - type: object - x-kubernetes-validations: - - message: numerator must be less than - or equal to denominator - rule: self.numerator <= self.denominator - percent: - format: int32 - maximum: 100 - minimum: 0 - type: integer - required: - - backendRef - type: object - requestRedirect: - properties: - hostname: - maxLength: 253 - minLength: 1 - pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ - type: string - path: - properties: - replaceFullPath: - maxLength: 1024 - type: string - replacePrefixMatch: - maxLength: 1024 - type: string - type: - enum: - - ReplaceFullPath - - ReplacePrefixMatch - type: string - required: - - type - type: object - x-kubernetes-validations: - - message: replaceFullPath must be specified - when type is set to 'ReplaceFullPath' - rule: 'self.type == ''ReplaceFullPath'' - ? has(self.replaceFullPath) : true' - - message: type must be 'ReplaceFullPath' - when replaceFullPath is set - rule: 'has(self.replaceFullPath) ? - self.type == ''ReplaceFullPath'' - : true' - - message: replacePrefixMatch must be - specified when type is set to 'ReplacePrefixMatch' - rule: 'self.type == ''ReplacePrefixMatch'' - ? has(self.replacePrefixMatch) : - true' - - message: type must be 'ReplacePrefixMatch' - when replacePrefixMatch is set - rule: 'has(self.replacePrefixMatch) - ? self.type == ''ReplacePrefixMatch'' - : true' - port: - format: int32 - maximum: 65535 - minimum: 1 - type: integer - scheme: - enum: - - http - - https - type: string - statusCode: - default: 302 - enum: - - 301 - - 302 - type: integer - type: object - responseHeaderModifier: - properties: - add: - items: - properties: - name: - maxLength: 256 - minLength: 1 - pattern: ^[A-Za-z0-9!#$%&'*+\-.^_\x60|~]+$ - type: string - value: - maxLength: 4096 - minLength: 1 - type: string - required: - - name - - value - type: object - maxItems: 16 - type: array - x-kubernetes-list-map-keys: - - name - x-kubernetes-list-type: map - remove: - items: - type: string - maxItems: 16 - type: array - x-kubernetes-list-type: set - set: - items: - properties: - name: - maxLength: 256 - minLength: 1 - pattern: ^[A-Za-z0-9!#$%&'*+\-.^_\x60|~]+$ - type: string - value: - maxLength: 4096 - minLength: 1 - type: string - required: - - name - - value - type: object - maxItems: 16 - type: array - x-kubernetes-list-map-keys: - - name - x-kubernetes-list-type: map - type: object - type: - enum: - - RequestHeaderModifier - - ResponseHeaderModifier - - RequestMirror - - RequestRedirect - - URLRewrite - - ExtensionRef - type: string - urlRewrite: - properties: - hostname: - maxLength: 253 - minLength: 1 - pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ - type: string - path: - properties: - replaceFullPath: - maxLength: 1024 - type: string - replacePrefixMatch: - maxLength: 1024 - type: string - type: - enum: - - ReplaceFullPath - - ReplacePrefixMatch - type: string - required: - - type - type: object - x-kubernetes-validations: - - message: replaceFullPath must be specified - when type is set to 'ReplaceFullPath' - rule: 'self.type == ''ReplaceFullPath'' - ? has(self.replaceFullPath) : true' - - message: type must be 'ReplaceFullPath' - when replaceFullPath is set - rule: 'has(self.replaceFullPath) ? 
- self.type == ''ReplaceFullPath'' - : true' - - message: replacePrefixMatch must be - specified when type is set to 'ReplacePrefixMatch' - rule: 'self.type == ''ReplacePrefixMatch'' - ? has(self.replacePrefixMatch) : - true' - - message: type must be 'ReplacePrefixMatch' - when replacePrefixMatch is set - rule: 'has(self.replacePrefixMatch) - ? self.type == ''ReplacePrefixMatch'' - : true' - type: object - required: - - type - type: object - x-kubernetes-validations: - - message: filter.requestHeaderModifier must - be nil if the filter.type is not RequestHeaderModifier - rule: '!(has(self.requestHeaderModifier) && - self.type != ''RequestHeaderModifier'')' - - message: filter.requestHeaderModifier must - be specified for RequestHeaderModifier filter.type - rule: '!(!has(self.requestHeaderModifier) - && self.type == ''RequestHeaderModifier'')' - - message: filter.responseHeaderModifier must - be nil if the filter.type is not ResponseHeaderModifier - rule: '!(has(self.responseHeaderModifier) - && self.type != ''ResponseHeaderModifier'')' - - message: filter.responseHeaderModifier must - be specified for ResponseHeaderModifier - filter.type - rule: '!(!has(self.responseHeaderModifier) - && self.type == ''ResponseHeaderModifier'')' - - message: filter.requestMirror must be nil - if the filter.type is not RequestMirror - rule: '!(has(self.requestMirror) && self.type - != ''RequestMirror'')' - - message: filter.requestMirror must be specified - for RequestMirror filter.type - rule: '!(!has(self.requestMirror) && self.type - == ''RequestMirror'')' - - message: filter.requestRedirect must be nil - if the filter.type is not RequestRedirect - rule: '!(has(self.requestRedirect) && self.type - != ''RequestRedirect'')' - - message: filter.requestRedirect must be specified - for RequestRedirect filter.type - rule: '!(!has(self.requestRedirect) && self.type - == ''RequestRedirect'')' - - message: filter.urlRewrite must be nil if - the filter.type is not URLRewrite - rule: '!(has(self.urlRewrite) && self.type - != ''URLRewrite'')' - - message: filter.urlRewrite must be specified - for URLRewrite filter.type - rule: '!(!has(self.urlRewrite) && self.type - == ''URLRewrite'')' - - message: filter.extensionRef must be nil if - the filter.type is not ExtensionRef - rule: '!(has(self.extensionRef) && self.type - != ''ExtensionRef'')' - - message: filter.extensionRef must be specified - for ExtensionRef filter.type - rule: '!(!has(self.extensionRef) && self.type - == ''ExtensionRef'')' - maxItems: 16 - type: array - x-kubernetes-validations: - - message: May specify either httpRouteFilterRequestRedirect - or httpRouteFilterRequestRewrite, but not - both - rule: '!(self.exists(f, f.type == ''RequestRedirect'') - && self.exists(f, f.type == ''URLRewrite''))' - - message: May specify either httpRouteFilterRequestRedirect - or httpRouteFilterRequestRewrite, but not - both - rule: '!(self.exists(f, f.type == ''RequestRedirect'') - && self.exists(f, f.type == ''URLRewrite''))' - - message: RequestHeaderModifier filter cannot - be repeated - rule: self.filter(f, f.type == 'RequestHeaderModifier').size() - <= 1 - - message: ResponseHeaderModifier filter cannot - be repeated - rule: self.filter(f, f.type == 'ResponseHeaderModifier').size() - <= 1 - - message: RequestRedirect filter cannot be repeated - rule: self.filter(f, f.type == 'RequestRedirect').size() - <= 1 - - message: URLRewrite filter cannot be repeated - rule: self.filter(f, f.type == 'URLRewrite').size() - <= 1 - group: - default: "" - maxLength: 253 - 
pattern: ^$|^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ - type: string - kind: - default: Service - maxLength: 63 - minLength: 1 - pattern: ^[a-zA-Z]([-a-zA-Z0-9]*[a-zA-Z0-9])?$ - type: string - name: - maxLength: 253 - minLength: 1 - type: string - namespace: - maxLength: 63 - minLength: 1 - pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?$ - type: string - port: - format: int32 - maximum: 65535 - minimum: 1 - type: integer - weight: - default: 1 - format: int32 - maximum: 1000000 - minimum: 0 - type: integer - required: - - name - type: object - x-kubernetes-validations: - - message: Must have port for Service reference - rule: '(size(self.group) == 0 && self.kind == ''Service'') - ? has(self.port) : true' - maxItems: 16 - type: array - filters: - items: - properties: - extensionRef: - properties: - group: - maxLength: 253 - pattern: ^$|^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ - type: string - kind: - maxLength: 63 - minLength: 1 - pattern: ^[a-zA-Z]([-a-zA-Z0-9]*[a-zA-Z0-9])?$ - type: string - name: - maxLength: 253 - minLength: 1 - type: string - required: - - group - - kind - - name - type: object - requestHeaderModifier: - properties: - add: - items: - properties: - name: - maxLength: 256 - minLength: 1 - pattern: ^[A-Za-z0-9!#$%&'*+\-.^_\x60|~]+$ - type: string - value: - maxLength: 4096 - minLength: 1 - type: string - required: - - name - - value - type: object - maxItems: 16 - type: array - x-kubernetes-list-map-keys: - - name - x-kubernetes-list-type: map - remove: - items: - type: string - maxItems: 16 - type: array - x-kubernetes-list-type: set - set: - items: - properties: - name: - maxLength: 256 - minLength: 1 - pattern: ^[A-Za-z0-9!#$%&'*+\-.^_\x60|~]+$ - type: string - value: - maxLength: 4096 - minLength: 1 - type: string - required: - - name - - value - type: object - maxItems: 16 - type: array - x-kubernetes-list-map-keys: - - name - x-kubernetes-list-type: map - type: object - requestMirror: - properties: - backendRef: - properties: - group: - default: "" - maxLength: 253 - pattern: ^$|^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ - type: string - kind: - default: Service - maxLength: 63 - minLength: 1 - pattern: ^[a-zA-Z]([-a-zA-Z0-9]*[a-zA-Z0-9])?$ - type: string - name: - maxLength: 253 - minLength: 1 - type: string - namespace: - maxLength: 63 - minLength: 1 - pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?$ - type: string - port: - format: int32 - maximum: 65535 - minimum: 1 - type: integer - required: - - name - type: object - x-kubernetes-validations: - - message: Must have port for Service reference - rule: '(size(self.group) == 0 && self.kind - == ''Service'') ? 
has(self.port) : true' - fraction: - properties: - denominator: - default: 100 - format: int32 - minimum: 1 - type: integer - numerator: - format: int32 - minimum: 0 - type: integer - required: - - numerator - type: object - x-kubernetes-validations: - - message: numerator must be less than or - equal to denominator - rule: self.numerator <= self.denominator - percent: - format: int32 - maximum: 100 - minimum: 0 - type: integer - required: - - backendRef - type: object - requestRedirect: - properties: - hostname: - maxLength: 253 - minLength: 1 - pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ - type: string - path: - properties: - replaceFullPath: - maxLength: 1024 - type: string - replacePrefixMatch: - maxLength: 1024 - type: string - type: - enum: - - ReplaceFullPath - - ReplacePrefixMatch - type: string - required: - - type - type: object - x-kubernetes-validations: - - message: replaceFullPath must be specified - when type is set to 'ReplaceFullPath' - rule: 'self.type == ''ReplaceFullPath'' - ? has(self.replaceFullPath) : true' - - message: type must be 'ReplaceFullPath' - when replaceFullPath is set - rule: 'has(self.replaceFullPath) ? self.type - == ''ReplaceFullPath'' : true' - - message: replacePrefixMatch must be specified - when type is set to 'ReplacePrefixMatch' - rule: 'self.type == ''ReplacePrefixMatch'' - ? has(self.replacePrefixMatch) : true' - - message: type must be 'ReplacePrefixMatch' - when replacePrefixMatch is set - rule: 'has(self.replacePrefixMatch) ? self.type - == ''ReplacePrefixMatch'' : true' - port: - format: int32 - maximum: 65535 - minimum: 1 - type: integer - scheme: - enum: - - http - - https - type: string - statusCode: - default: 302 - enum: - - 301 - - 302 - type: integer - type: object - responseHeaderModifier: - properties: - add: - items: - properties: - name: - maxLength: 256 - minLength: 1 - pattern: ^[A-Za-z0-9!#$%&'*+\-.^_\x60|~]+$ - type: string - value: - maxLength: 4096 - minLength: 1 - type: string - required: - - name - - value - type: object - maxItems: 16 - type: array - x-kubernetes-list-map-keys: - - name - x-kubernetes-list-type: map - remove: - items: - type: string - maxItems: 16 - type: array - x-kubernetes-list-type: set - set: - items: - properties: - name: - maxLength: 256 - minLength: 1 - pattern: ^[A-Za-z0-9!#$%&'*+\-.^_\x60|~]+$ - type: string - value: - maxLength: 4096 - minLength: 1 - type: string - required: - - name - - value - type: object - maxItems: 16 - type: array - x-kubernetes-list-map-keys: - - name - x-kubernetes-list-type: map - type: object - type: - enum: - - RequestHeaderModifier - - ResponseHeaderModifier - - RequestMirror - - RequestRedirect - - URLRewrite - - ExtensionRef - type: string - urlRewrite: - properties: - hostname: - maxLength: 253 - minLength: 1 - pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ - type: string - path: - properties: - replaceFullPath: - maxLength: 1024 - type: string - replacePrefixMatch: - maxLength: 1024 - type: string - type: - enum: - - ReplaceFullPath - - ReplacePrefixMatch - type: string - required: - - type - type: object - x-kubernetes-validations: - - message: replaceFullPath must be specified - when type is set to 'ReplaceFullPath' - rule: 'self.type == ''ReplaceFullPath'' - ? has(self.replaceFullPath) : true' - - message: type must be 'ReplaceFullPath' - when replaceFullPath is set - rule: 'has(self.replaceFullPath) ? 
self.type - == ''ReplaceFullPath'' : true' - - message: replacePrefixMatch must be specified - when type is set to 'ReplacePrefixMatch' - rule: 'self.type == ''ReplacePrefixMatch'' - ? has(self.replacePrefixMatch) : true' - - message: type must be 'ReplacePrefixMatch' - when replacePrefixMatch is set - rule: 'has(self.replacePrefixMatch) ? self.type - == ''ReplacePrefixMatch'' : true' - type: object - required: - - type - type: object - x-kubernetes-validations: - - message: filter.requestHeaderModifier must be nil - if the filter.type is not RequestHeaderModifier - rule: '!(has(self.requestHeaderModifier) && self.type - != ''RequestHeaderModifier'')' - - message: filter.requestHeaderModifier must be specified - for RequestHeaderModifier filter.type - rule: '!(!has(self.requestHeaderModifier) && self.type - == ''RequestHeaderModifier'')' - - message: filter.responseHeaderModifier must be nil - if the filter.type is not ResponseHeaderModifier - rule: '!(has(self.responseHeaderModifier) && self.type - != ''ResponseHeaderModifier'')' - - message: filter.responseHeaderModifier must be specified - for ResponseHeaderModifier filter.type - rule: '!(!has(self.responseHeaderModifier) && self.type - == ''ResponseHeaderModifier'')' - - message: filter.requestMirror must be nil if the - filter.type is not RequestMirror - rule: '!(has(self.requestMirror) && self.type != - ''RequestMirror'')' - - message: filter.requestMirror must be specified - for RequestMirror filter.type - rule: '!(!has(self.requestMirror) && self.type == - ''RequestMirror'')' - - message: filter.requestRedirect must be nil if the - filter.type is not RequestRedirect - rule: '!(has(self.requestRedirect) && self.type - != ''RequestRedirect'')' - - message: filter.requestRedirect must be specified - for RequestRedirect filter.type - rule: '!(!has(self.requestRedirect) && self.type - == ''RequestRedirect'')' - - message: filter.urlRewrite must be nil if the filter.type - is not URLRewrite - rule: '!(has(self.urlRewrite) && self.type != ''URLRewrite'')' - - message: filter.urlRewrite must be specified for - URLRewrite filter.type - rule: '!(!has(self.urlRewrite) && self.type == ''URLRewrite'')' - - message: filter.extensionRef must be nil if the - filter.type is not ExtensionRef - rule: '!(has(self.extensionRef) && self.type != - ''ExtensionRef'')' - - message: filter.extensionRef must be specified for - ExtensionRef filter.type - rule: '!(!has(self.extensionRef) && self.type == - ''ExtensionRef'')' - maxItems: 16 - type: array - x-kubernetes-validations: - - message: May specify either httpRouteFilterRequestRedirect - or httpRouteFilterRequestRewrite, but not both - rule: '!(self.exists(f, f.type == ''RequestRedirect'') - && self.exists(f, f.type == ''URLRewrite''))' - - message: RequestHeaderModifier filter cannot be repeated - rule: self.filter(f, f.type == 'RequestHeaderModifier').size() - <= 1 - - message: ResponseHeaderModifier filter cannot be repeated - rule: self.filter(f, f.type == 'ResponseHeaderModifier').size() - <= 1 - - message: RequestRedirect filter cannot be repeated - rule: self.filter(f, f.type == 'RequestRedirect').size() - <= 1 - - message: URLRewrite filter cannot be repeated - rule: self.filter(f, f.type == 'URLRewrite').size() - <= 1 - matches: - default: - - path: - type: PathPrefix - value: / - items: - properties: - headers: - items: - properties: - name: - maxLength: 256 - minLength: 1 - pattern: ^[A-Za-z0-9!#$%&'*+\-.^_\x60|~]+$ - type: string - type: - default: Exact - enum: - - Exact - - RegularExpression 
- type: string - value: - maxLength: 4096 - minLength: 1 - type: string - required: - - name - - value - type: object - maxItems: 16 - type: array - x-kubernetes-list-map-keys: - - name - x-kubernetes-list-type: map - method: - enum: - - GET - - HEAD - - POST - - PUT - - DELETE - - CONNECT - - OPTIONS - - TRACE - - PATCH - type: string - path: - default: - type: PathPrefix - value: / - properties: - type: - default: PathPrefix - enum: - - Exact - - PathPrefix - - RegularExpression - type: string - value: - default: / - maxLength: 1024 - type: string - type: object - x-kubernetes-validations: - - message: value must be an absolute path and - start with '/' when type one of ['Exact', - 'PathPrefix'] - rule: '(self.type in [''Exact'',''PathPrefix'']) - ? self.value.startsWith(''/'') : true' - - message: must not contain '//' when type one - of ['Exact', 'PathPrefix'] - rule: '(self.type in [''Exact'',''PathPrefix'']) - ? !self.value.contains(''//'') : true' - - message: must not contain '/./' when type one - of ['Exact', 'PathPrefix'] - rule: '(self.type in [''Exact'',''PathPrefix'']) - ? !self.value.contains(''/./'') : true' - - message: must not contain '/../' when type one - of ['Exact', 'PathPrefix'] - rule: '(self.type in [''Exact'',''PathPrefix'']) - ? !self.value.contains(''/../'') : true' - - message: must not contain '%2f' when type one - of ['Exact', 'PathPrefix'] - rule: '(self.type in [''Exact'',''PathPrefix'']) - ? !self.value.contains(''%2f'') : true' - - message: must not contain '%2F' when type one - of ['Exact', 'PathPrefix'] - rule: '(self.type in [''Exact'',''PathPrefix'']) - ? !self.value.contains(''%2F'') : true' - - message: must not contain '#' when type one - of ['Exact', 'PathPrefix'] - rule: '(self.type in [''Exact'',''PathPrefix'']) - ? !self.value.contains(''#'') : true' - - message: must not end with '/..' when type one - of ['Exact', 'PathPrefix'] - rule: '(self.type in [''Exact'',''PathPrefix'']) - ? !self.value.endsWith(''/..'') : true' - - message: must not end with '/.' when type one - of ['Exact', 'PathPrefix'] - rule: '(self.type in [''Exact'',''PathPrefix'']) - ? !self.value.endsWith(''/.'') : true' - - message: type must be one of ['Exact', 'PathPrefix', - 'RegularExpression'] - rule: self.type in ['Exact','PathPrefix'] || - self.type == 'RegularExpression' - - message: must only contain valid characters - (matching ^(?:[-A-Za-z0-9/._~!$&'()*+,;=:@]|[%][0-9a-fA-F]{2})+$) - for types ['Exact', 'PathPrefix'] - rule: '(self.type in [''Exact'',''PathPrefix'']) - ? 
self.value.matches(r"""^(?:[-A-Za-z0-9/._~!$&''()*+,;=:@]|[%][0-9a-fA-F]{2})+$""") - : true' - queryParams: - items: - properties: - name: - maxLength: 256 - minLength: 1 - pattern: ^[A-Za-z0-9!#$%&'*+\-.^_\x60|~]+$ - type: string - type: - default: Exact - enum: - - Exact - - RegularExpression - type: string - value: - maxLength: 1024 - minLength: 1 - type: string - required: - - name - - value - type: object - maxItems: 16 - type: array - x-kubernetes-list-map-keys: - - name - x-kubernetes-list-type: map - type: object - maxItems: 64 - type: array - name: - maxLength: 253 - minLength: 1 - pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ - type: string - retry: - properties: - attempts: - type: integer - backoff: - pattern: ^([0-9]{1,5}(h|m|s|ms)){1,4}$ - type: string - codes: - items: - maximum: 599 - minimum: 400 - type: integer - type: array - type: object - sessionPersistence: - properties: - absoluteTimeout: - pattern: ^([0-9]{1,5}(h|m|s|ms)){1,4}$ - type: string - cookieConfig: - properties: - lifetimeType: - default: Session - enum: - - Permanent - - Session - type: string - type: object - idleTimeout: - pattern: ^([0-9]{1,5}(h|m|s|ms)){1,4}$ - type: string - sessionName: - maxLength: 128 - type: string - type: - default: Cookie - enum: - - Cookie - - Header - type: string - type: object - x-kubernetes-validations: - - message: AbsoluteTimeout must be specified when cookie - lifetimeType is Permanent - rule: '!has(self.cookieConfig) || !has(self.cookieConfig.lifetimeType) - || self.cookieConfig.lifetimeType != ''Permanent'' - || has(self.absoluteTimeout)' - timeouts: - properties: - backendRequest: - pattern: ^([0-9]{1,5}(h|m|s|ms)){1,4}$ - type: string - request: - pattern: ^([0-9]{1,5}(h|m|s|ms)){1,4}$ - type: string - type: object - x-kubernetes-validations: - - message: backendRequest timeout cannot be longer than - request timeout - rule: '!(has(self.request) && has(self.backendRequest) - && duration(self.request) != duration(''0s'') && - duration(self.backendRequest) > duration(self.request))' - type: object - x-kubernetes-validations: - - message: RequestRedirect filter must not be used together - with backendRefs - rule: '(has(self.backendRefs) && size(self.backendRefs) - > 0) ? (!has(self.filters) || self.filters.all(f, !has(f.requestRedirect))): - true' - - message: When using RequestRedirect filter with path.replacePrefixMatch, - exactly one PathPrefix match must be specified - rule: '(has(self.filters) && self.filters.exists_one(f, - has(f.requestRedirect) && has(f.requestRedirect.path) - && f.requestRedirect.path.type == ''ReplacePrefixMatch'' - && has(f.requestRedirect.path.replacePrefixMatch))) - ? ((size(self.matches) != 1 || !has(self.matches[0].path) - || self.matches[0].path.type != ''PathPrefix'') ? false - : true) : true' - - message: When using URLRewrite filter with path.replacePrefixMatch, - exactly one PathPrefix match must be specified - rule: '(has(self.filters) && self.filters.exists_one(f, - has(f.urlRewrite) && has(f.urlRewrite.path) && f.urlRewrite.path.type - == ''ReplacePrefixMatch'' && has(f.urlRewrite.path.replacePrefixMatch))) - ? ((size(self.matches) != 1 || !has(self.matches[0].path) - || self.matches[0].path.type != ''PathPrefix'') ? 
false - : true) : true' - - message: Within backendRefs, when using RequestRedirect - filter with path.replacePrefixMatch, exactly one PathPrefix - match must be specified - rule: '(has(self.backendRefs) && self.backendRefs.exists_one(b, - (has(b.filters) && b.filters.exists_one(f, has(f.requestRedirect) - && has(f.requestRedirect.path) && f.requestRedirect.path.type - == ''ReplacePrefixMatch'' && has(f.requestRedirect.path.replacePrefixMatch))) - )) ? ((size(self.matches) != 1 || !has(self.matches[0].path) - || self.matches[0].path.type != ''PathPrefix'') ? false - : true) : true' - - message: Within backendRefs, When using URLRewrite filter - with path.replacePrefixMatch, exactly one PathPrefix - match must be specified - rule: '(has(self.backendRefs) && self.backendRefs.exists_one(b, - (has(b.filters) && b.filters.exists_one(f, has(f.urlRewrite) - && has(f.urlRewrite.path) && f.urlRewrite.path.type - == ''ReplacePrefixMatch'' && has(f.urlRewrite.path.replacePrefixMatch))) - )) ? ((size(self.matches) != 1 || !has(self.matches[0].path) - || self.matches[0].path.type != ''PathPrefix'') ? false - : true) : true' - maxItems: 16 - type: array - x-kubernetes-validations: - - message: While 16 rules and 64 matches per rule are allowed, - the total number of matches across all rules in a route - must be less than 128 - rule: '(self.size() > 0 ? self[0].matches.size() : 0) + - (self.size() > 1 ? self[1].matches.size() : 0) + (self.size() - > 2 ? self[2].matches.size() : 0) + (self.size() > 3 ? - self[3].matches.size() : 0) + (self.size() > 4 ? self[4].matches.size() - : 0) + (self.size() > 5 ? self[5].matches.size() : 0) - + (self.size() > 6 ? self[6].matches.size() : 0) + (self.size() - > 7 ? self[7].matches.size() : 0) + (self.size() > 8 ? - self[8].matches.size() : 0) + (self.size() > 9 ? self[9].matches.size() - : 0) + (self.size() > 10 ? self[10].matches.size() : 0) - + (self.size() > 11 ? self[11].matches.size() : 0) + (self.size() - > 12 ? self[12].matches.size() : 0) + (self.size() > 13 - ? self[13].matches.size() : 0) + (self.size() > 14 ? self[14].matches.size() - : 0) + (self.size() > 15 ? 
self[15].matches.size() : 0) - <= 128' - type: object - status: - properties: - parents: - items: - properties: - conditions: - items: - properties: - lastTransitionTime: - format: date-time - type: string - message: - maxLength: 32768 - type: string - observedGeneration: - format: int64 - minimum: 0 - type: integer - reason: - maxLength: 1024 - minLength: 1 - pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ - type: string - status: - enum: - - "True" - - "False" - - Unknown - type: string - type: - maxLength: 316 - pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ - type: string - required: - - lastTransitionTime - - message - - reason - - status - - type - type: object - maxItems: 8 - minItems: 1 - type: array - x-kubernetes-list-map-keys: - - type - x-kubernetes-list-type: map - controllerName: - maxLength: 253 - minLength: 1 - pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*\/[A-Za-z0-9\/\-._~%!$&'()*+,;=:]+$ - type: string - parentRef: - properties: - group: - default: gateway.networking.k8s.io - maxLength: 253 - pattern: ^$|^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ - type: string - kind: - default: Gateway - maxLength: 63 - minLength: 1 - pattern: ^[a-zA-Z]([-a-zA-Z0-9]*[a-zA-Z0-9])?$ - type: string - name: - maxLength: 253 - minLength: 1 - type: string - namespace: - maxLength: 63 - minLength: 1 - pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?$ - type: string - port: - format: int32 - maximum: 65535 - minimum: 1 - type: integer - sectionName: - maxLength: 253 - minLength: 1 - pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ - type: string - required: - - name - type: object - required: - - controllerName - - parentRef - type: object - maxItems: 32 - type: array - required: - - parents - type: object - required: - - spec - type: object + type: string rayClusterConfig: properties: autoscalerOptions: diff --git a/ray-operator/apis/ray/v1/rayservice_types.go b/ray-operator/apis/ray/v1/rayservice_types.go index 224b7096be8..09d812833b0 100644 --- a/ray-operator/apis/ray/v1/rayservice_types.go +++ b/ray-operator/apis/ray/v1/rayservice_types.go @@ -3,7 +3,6 @@ package v1 import ( corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - gwv1 "sigs.k8s.io/gateway-api/apis/v1" ) // EDIT THIS FILE! THIS IS SCAFFOLDING FOR YOU TO OWN! @@ -98,17 +97,17 @@ type RayServiceSpec struct { // ServeService is the Kubernetes service for head node and worker nodes who have healthy http proxy to serve traffics. // +optional ServeService *corev1.Service `json:"serveService,omitempty"` - // Gateway is the Gateway object for the RayService to serve traffics during an IncrementalUpgrade. - Gateway *gwv1.Gateway `json:"gateway,omitempty"` - // HTTPRoute is the HTTPRoute object for the RayService to split traffics during an IncrementalUpgrade. - HTTPRoute *gwv1.HTTPRoute `json:"httpRoute,omitempty"` // UpgradeStrategy defines the scaling policy used when upgrading the RayService. // +optional UpgradeStrategy *RayServiceUpgradeStrategy `json:"upgradeStrategy,omitempty"` // Important: Run "make" to regenerate code after modifying this file // Defines the applications and deployments to deploy, should be a YAML multi-line scalar string. 
// +optional - ServeConfigV2 string `json:"serveConfigV2,omitempty"` + ServeConfigV2 string `json:"serveConfigV2,omitempty"` + // Gateway is the name of the Gateway object for the RayService to serve traffics during an IncrementalUpgrade. + Gateway string `json:"gateway,omitempty"` + // HTTPRoute is the name of the HTTPRoute object for the RayService to split traffics during an IncrementalUpgrade. + HTTPRoute string `json:"httpRoute,omitempty"` RayClusterSpec RayClusterSpec `json:"rayClusterConfig"` // If the field is set to true, the value of the label `ray.io/serve` on the head Pod should always be false. // Therefore, the head Pod's endpoint will not be added to the Kubernetes Serve service. diff --git a/ray-operator/apis/ray/v1/zz_generated.deepcopy.go b/ray-operator/apis/ray/v1/zz_generated.deepcopy.go index 8200a01f43f..a2ce6c8e1d8 100644 --- a/ray-operator/apis/ray/v1/zz_generated.deepcopy.go +++ b/ray-operator/apis/ray/v1/zz_generated.deepcopy.go @@ -8,7 +8,6 @@ import ( corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" runtime "k8s.io/apimachinery/pkg/runtime" - apisv1 "sigs.k8s.io/gateway-api/apis/v1" ) // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. @@ -724,16 +723,6 @@ func (in *RayServiceSpec) DeepCopyInto(out *RayServiceSpec) { *out = new(corev1.Service) (*in).DeepCopyInto(*out) } - if in.Gateway != nil { - in, out := &in.Gateway, &out.Gateway - *out = new(apisv1.Gateway) - (*in).DeepCopyInto(*out) - } - if in.HTTPRoute != nil { - in, out := &in.HTTPRoute, &out.HTTPRoute - *out = new(apisv1.HTTPRoute) - (*in).DeepCopyInto(*out) - } if in.UpgradeStrategy != nil { in, out := &in.UpgradeStrategy, &out.UpgradeStrategy *out = new(RayServiceUpgradeStrategy) diff --git a/ray-operator/config/crd/bases/ray.io_rayservices.yaml b/ray-operator/config/crd/bases/ray.io_rayservices.yaml index 02e449d4726..73e38364781 100644 --- a/ray-operator/config/crd/bases/ray.io_rayservices.yaml +++ b/ray-operator/config/crd/bases/ray.io_rayservices.yaml @@ -41,1764 +41,9 @@ spec: excludeHeadPodFromServeSvc: type: boolean gateway: - properties: - apiVersion: - type: string - kind: - type: string - metadata: - properties: - annotations: - additionalProperties: - type: string - type: object - finalizers: - items: - type: string - type: array - labels: - additionalProperties: - type: string - type: object - name: - type: string - namespace: - type: string - type: object - spec: - properties: - addresses: - items: - properties: - type: - default: IPAddress - maxLength: 253 - minLength: 1 - pattern: ^Hostname|IPAddress|NamedAddress|[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*\/[A-Za-z0-9\/\-._~%!$&'()*+,;=:]+$ - type: string - value: - maxLength: 253 - minLength: 1 - type: string - required: - - value - type: object - x-kubernetes-validations: - - message: Hostname value must only contain valid characters - (matching ^(\*\.)?[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$) - rule: 'self.type == ''Hostname'' ? self.value.matches(r"""^(\*\.)?[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$"""): - true' - maxItems: 16 - type: array - x-kubernetes-validations: - - message: IPAddress values must be unique - rule: 'self.all(a1, a1.type == ''IPAddress'' ? self.exists_one(a2, - a2.type == a1.type && a2.value == a1.value) : true )' - - message: Hostname values must be unique - rule: 'self.all(a1, a1.type == ''Hostname'' ? 
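
To make the API change above concrete: after this patch, spec.gateway and spec.httpRoute are plain names referring to Gateway API objects, rather than embedded gwv1.Gateway / gwv1.HTTPRoute objects (which is why the large generated Gateway and HTTPRoute schemas are removed from the RayService CRD in these hunks). Below is a minimal sketch of how the new string-typed fields might be populated from Go; the object names, the serve config, and the "IncrementalUpgrade" strategy value are illustrative assumptions, not values taken from this patch.

// Illustrative sketch only: a RayServiceSpec referencing Gateway API objects
// by name, as enabled by the string-typed Gateway and HTTPRoute fields added
// above. Names and values below are assumptions for illustration.
package example

import (
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

	rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1"
)

func exampleRayService() *rayv1.RayService {
	// Assumed strategy value for illustration; see the patch for the actual constant.
	upgradeType := rayv1.RayServiceUpgradeType("IncrementalUpgrade")
	return &rayv1.RayService{
		ObjectMeta: metav1.ObjectMeta{Name: "example-rayservice", Namespace: "default"},
		Spec: rayv1.RayServiceSpec{
			Gateway:   "example-rayservice-gateway",   // name of the Gateway serving traffic during the upgrade
			HTTPRoute: "example-rayservice-httproute", // name of the HTTPRoute splitting traffic during the upgrade
			UpgradeStrategy: &rayv1.RayServiceUpgradeStrategy{
				Type: &upgradeType,
			},
			ServeConfigV2:  "applications: []\n",   // a YAML multi-line scalar in practice
			RayClusterSpec: rayv1.RayClusterSpec{}, // cluster details omitted for brevity
		},
	}
}
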
self.exists_one(a2, - a2.type == a1.type && a2.value == a1.value) : true )' - backendTLS: - properties: - clientCertificateRef: - properties: - group: - default: "" - maxLength: 253 - pattern: ^$|^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ - type: string - kind: - default: Secret - maxLength: 63 - minLength: 1 - pattern: ^[a-zA-Z]([-a-zA-Z0-9]*[a-zA-Z0-9])?$ - type: string - name: - maxLength: 253 - minLength: 1 - type: string - namespace: - maxLength: 63 - minLength: 1 - pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?$ - type: string - required: - - name - type: object - type: object - gatewayClassName: - maxLength: 253 - minLength: 1 - type: string - infrastructure: - properties: - annotations: - additionalProperties: - maxLength: 4096 - minLength: 0 - type: string - maxProperties: 8 - type: object - x-kubernetes-validations: - - message: Annotation keys must be in the form of an optional - DNS subdomain prefix followed by a required name segment - of up to 63 characters. - rule: self.all(key, key.matches(r"""^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?([A-Za-z0-9][-A-Za-z0-9_.]{0,61})?[A-Za-z0-9]$""")) - - message: If specified, the annotation key's prefix must - be a DNS subdomain not longer than 253 characters - in total. - rule: self.all(key, key.split("/")[0].size() < 253) - labels: - additionalProperties: - maxLength: 63 - minLength: 0 - pattern: ^(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])?$ - type: string - maxProperties: 8 - type: object - x-kubernetes-validations: - - message: Label keys must be in the form of an optional - DNS subdomain prefix followed by a required name segment - of up to 63 characters. - rule: self.all(key, key.matches(r"""^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?([A-Za-z0-9][-A-Za-z0-9_.]{0,61})?[A-Za-z0-9]$""")) - - message: If specified, the label key's prefix must be - a DNS subdomain not longer than 253 characters in - total. 
- rule: self.all(key, key.split("/")[0].size() < 253) - parametersRef: - properties: - group: - maxLength: 253 - pattern: ^$|^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ - type: string - kind: - maxLength: 63 - minLength: 1 - pattern: ^[a-zA-Z]([-a-zA-Z0-9]*[a-zA-Z0-9])?$ - type: string - name: - maxLength: 253 - minLength: 1 - type: string - required: - - group - - kind - - name - type: object - type: object - listeners: - items: - properties: - allowedRoutes: - default: - namespaces: - from: Same - properties: - kinds: - items: - properties: - group: - default: gateway.networking.k8s.io - maxLength: 253 - pattern: ^$|^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ - type: string - kind: - maxLength: 63 - minLength: 1 - pattern: ^[a-zA-Z]([-a-zA-Z0-9]*[a-zA-Z0-9])?$ - type: string - required: - - kind - type: object - maxItems: 8 - type: array - namespaces: - default: - from: Same - properties: - from: - default: Same - enum: - - All - - Selector - - Same - type: string - selector: - properties: - matchExpressions: - items: - properties: - key: - type: string - operator: - type: string - values: - items: - type: string - type: array - x-kubernetes-list-type: atomic - required: - - key - - operator - type: object - type: array - x-kubernetes-list-type: atomic - matchLabels: - additionalProperties: - type: string - type: object - type: object - x-kubernetes-map-type: atomic - type: object - type: object - hostname: - maxLength: 253 - minLength: 1 - pattern: ^(\*\.)?[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ - type: string - name: - maxLength: 253 - minLength: 1 - pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ - type: string - port: - format: int32 - maximum: 65535 - minimum: 1 - type: integer - protocol: - maxLength: 255 - minLength: 1 - pattern: ^[a-zA-Z0-9]([-a-zA-Z0-9]*[a-zA-Z0-9])?$|[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*\/[A-Za-z0-9]+$ - type: string - tls: - properties: - certificateRefs: - items: - properties: - group: - default: "" - maxLength: 253 - pattern: ^$|^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ - type: string - kind: - default: Secret - maxLength: 63 - minLength: 1 - pattern: ^[a-zA-Z]([-a-zA-Z0-9]*[a-zA-Z0-9])?$ - type: string - name: - maxLength: 253 - minLength: 1 - type: string - namespace: - maxLength: 63 - minLength: 1 - pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?$ - type: string - required: - - name - type: object - maxItems: 64 - type: array - frontendValidation: - properties: - caCertificateRefs: - items: - properties: - group: - maxLength: 253 - pattern: ^$|^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ - type: string - kind: - maxLength: 63 - minLength: 1 - pattern: ^[a-zA-Z]([-a-zA-Z0-9]*[a-zA-Z0-9])?$ - type: string - name: - maxLength: 253 - minLength: 1 - type: string - namespace: - maxLength: 63 - minLength: 1 - pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?$ - type: string - required: - - group - - kind - - name - type: object - maxItems: 8 - minItems: 1 - type: array - type: object - mode: - default: Terminate - enum: - - Terminate - - Passthrough - type: string - options: - additionalProperties: - maxLength: 4096 - minLength: 0 - type: string - maxProperties: 16 - type: object - type: object - x-kubernetes-validations: - - message: certificateRefs or options must be specified - when mode is Terminate - rule: 'self.mode == ''Terminate'' ? 
size(self.certificateRefs) - > 0 || size(self.options) > 0 : true' - required: - - name - - port - - protocol - type: object - maxItems: 64 - minItems: 1 - type: array - x-kubernetes-list-map-keys: - - name - x-kubernetes-list-type: map - x-kubernetes-validations: - - message: tls must not be specified for protocols ['HTTP', - 'TCP', 'UDP'] - rule: 'self.all(l, l.protocol in [''HTTP'', ''TCP'', ''UDP''] - ? !has(l.tls) : true)' - - message: tls mode must be Terminate for protocol HTTPS - rule: 'self.all(l, (l.protocol == ''HTTPS'' && has(l.tls)) - ? (l.tls.mode == '''' || l.tls.mode == ''Terminate'') - : true)' - - message: hostname must not be specified for protocols ['TCP', - 'UDP'] - rule: 'self.all(l, l.protocol in [''TCP'', ''UDP''] ? (!has(l.hostname) - || l.hostname == '''') : true)' - - message: Listener name must be unique within the Gateway - rule: self.all(l1, self.exists_one(l2, l1.name == l2.name)) - - message: Combination of port, protocol and hostname must - be unique for each listener - rule: 'self.all(l1, self.exists_one(l2, l1.port == l2.port - && l1.protocol == l2.protocol && (has(l1.hostname) && - has(l2.hostname) ? l1.hostname == l2.hostname : !has(l1.hostname) - && !has(l2.hostname))))' - required: - - gatewayClassName - - listeners - type: object - status: - default: - conditions: - - lastTransitionTime: "1970-01-01T00:00:00Z" - message: Waiting for controller - reason: Pending - status: Unknown - type: Accepted - - lastTransitionTime: "1970-01-01T00:00:00Z" - message: Waiting for controller - reason: Pending - status: Unknown - type: Programmed - properties: - addresses: - items: - properties: - type: - default: IPAddress - maxLength: 253 - minLength: 1 - pattern: ^Hostname|IPAddress|NamedAddress|[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*\/[A-Za-z0-9\/\-._~%!$&'()*+,;=:]+$ - type: string - value: - maxLength: 253 - minLength: 1 - type: string - required: - - value - type: object - x-kubernetes-validations: - - message: Hostname value must only contain valid characters - (matching ^(\*\.)?[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$) - rule: 'self.type == ''Hostname'' ? 
self.value.matches(r"""^(\*\.)?[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$"""): - true' - maxItems: 16 - type: array - conditions: - default: - - lastTransitionTime: "1970-01-01T00:00:00Z" - message: Waiting for controller - reason: Pending - status: Unknown - type: Accepted - - lastTransitionTime: "1970-01-01T00:00:00Z" - message: Waiting for controller - reason: Pending - status: Unknown - type: Programmed - items: - properties: - lastTransitionTime: - format: date-time - type: string - message: - maxLength: 32768 - type: string - observedGeneration: - format: int64 - minimum: 0 - type: integer - reason: - maxLength: 1024 - minLength: 1 - pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ - type: string - status: - enum: - - "True" - - "False" - - Unknown - type: string - type: - maxLength: 316 - pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ - type: string - required: - - lastTransitionTime - - message - - reason - - status - - type - type: object - maxItems: 8 - type: array - x-kubernetes-list-map-keys: - - type - x-kubernetes-list-type: map - listeners: - items: - properties: - attachedRoutes: - format: int32 - type: integer - conditions: - items: - properties: - lastTransitionTime: - format: date-time - type: string - message: - maxLength: 32768 - type: string - observedGeneration: - format: int64 - minimum: 0 - type: integer - reason: - maxLength: 1024 - minLength: 1 - pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ - type: string - status: - enum: - - "True" - - "False" - - Unknown - type: string - type: - maxLength: 316 - pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ - type: string - required: - - lastTransitionTime - - message - - reason - - status - - type - type: object - maxItems: 8 - type: array - x-kubernetes-list-map-keys: - - type - x-kubernetes-list-type: map - name: - maxLength: 253 - minLength: 1 - pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ - type: string - supportedKinds: - items: - properties: - group: - default: gateway.networking.k8s.io - maxLength: 253 - pattern: ^$|^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ - type: string - kind: - maxLength: 63 - minLength: 1 - pattern: ^[a-zA-Z]([-a-zA-Z0-9]*[a-zA-Z0-9])?$ - type: string - required: - - kind - type: object - maxItems: 8 - type: array - required: - - attachedRoutes - - conditions - - name - - supportedKinds - type: object - maxItems: 64 - type: array - x-kubernetes-list-map-keys: - - name - x-kubernetes-list-type: map - type: object - required: - - spec - type: object + type: string httpRoute: - properties: - apiVersion: - type: string - kind: - type: string - metadata: - properties: - annotations: - additionalProperties: - type: string - type: object - finalizers: - items: - type: string - type: array - labels: - additionalProperties: - type: string - type: object - name: - type: string - namespace: - type: string - type: object - spec: - properties: - hostnames: - items: - maxLength: 253 - minLength: 1 - pattern: ^(\*\.)?[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ - type: string - maxItems: 16 - type: array - parentRefs: - items: - properties: - group: - default: gateway.networking.k8s.io - maxLength: 253 - pattern: ^$|^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ - type: string - kind: - default: Gateway - maxLength: 63 - minLength: 1 - pattern: 
^[a-zA-Z]([-a-zA-Z0-9]*[a-zA-Z0-9])?$ - type: string - name: - maxLength: 253 - minLength: 1 - type: string - namespace: - maxLength: 63 - minLength: 1 - pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?$ - type: string - port: - format: int32 - maximum: 65535 - minimum: 1 - type: integer - sectionName: - maxLength: 253 - minLength: 1 - pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ - type: string - required: - - name - type: object - maxItems: 32 - type: array - rules: - default: - - matches: - - path: - type: PathPrefix - value: / - items: - properties: - backendRefs: - items: - properties: - filters: - items: - properties: - extensionRef: - properties: - group: - maxLength: 253 - pattern: ^$|^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ - type: string - kind: - maxLength: 63 - minLength: 1 - pattern: ^[a-zA-Z]([-a-zA-Z0-9]*[a-zA-Z0-9])?$ - type: string - name: - maxLength: 253 - minLength: 1 - type: string - required: - - group - - kind - - name - type: object - requestHeaderModifier: - properties: - add: - items: - properties: - name: - maxLength: 256 - minLength: 1 - pattern: ^[A-Za-z0-9!#$%&'*+\-.^_\x60|~]+$ - type: string - value: - maxLength: 4096 - minLength: 1 - type: string - required: - - name - - value - type: object - maxItems: 16 - type: array - x-kubernetes-list-map-keys: - - name - x-kubernetes-list-type: map - remove: - items: - type: string - maxItems: 16 - type: array - x-kubernetes-list-type: set - set: - items: - properties: - name: - maxLength: 256 - minLength: 1 - pattern: ^[A-Za-z0-9!#$%&'*+\-.^_\x60|~]+$ - type: string - value: - maxLength: 4096 - minLength: 1 - type: string - required: - - name - - value - type: object - maxItems: 16 - type: array - x-kubernetes-list-map-keys: - - name - x-kubernetes-list-type: map - type: object - requestMirror: - properties: - backendRef: - properties: - group: - default: "" - maxLength: 253 - pattern: ^$|^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ - type: string - kind: - default: Service - maxLength: 63 - minLength: 1 - pattern: ^[a-zA-Z]([-a-zA-Z0-9]*[a-zA-Z0-9])?$ - type: string - name: - maxLength: 253 - minLength: 1 - type: string - namespace: - maxLength: 63 - minLength: 1 - pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?$ - type: string - port: - format: int32 - maximum: 65535 - minimum: 1 - type: integer - required: - - name - type: object - x-kubernetes-validations: - - message: Must have port for Service - reference - rule: '(size(self.group) == 0 && self.kind - == ''Service'') ? 
has(self.port) - : true' - fraction: - properties: - denominator: - default: 100 - format: int32 - minimum: 1 - type: integer - numerator: - format: int32 - minimum: 0 - type: integer - required: - - numerator - type: object - x-kubernetes-validations: - - message: numerator must be less than - or equal to denominator - rule: self.numerator <= self.denominator - percent: - format: int32 - maximum: 100 - minimum: 0 - type: integer - required: - - backendRef - type: object - requestRedirect: - properties: - hostname: - maxLength: 253 - minLength: 1 - pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ - type: string - path: - properties: - replaceFullPath: - maxLength: 1024 - type: string - replacePrefixMatch: - maxLength: 1024 - type: string - type: - enum: - - ReplaceFullPath - - ReplacePrefixMatch - type: string - required: - - type - type: object - x-kubernetes-validations: - - message: replaceFullPath must be specified - when type is set to 'ReplaceFullPath' - rule: 'self.type == ''ReplaceFullPath'' - ? has(self.replaceFullPath) : true' - - message: type must be 'ReplaceFullPath' - when replaceFullPath is set - rule: 'has(self.replaceFullPath) ? - self.type == ''ReplaceFullPath'' - : true' - - message: replacePrefixMatch must be - specified when type is set to 'ReplacePrefixMatch' - rule: 'self.type == ''ReplacePrefixMatch'' - ? has(self.replacePrefixMatch) : - true' - - message: type must be 'ReplacePrefixMatch' - when replacePrefixMatch is set - rule: 'has(self.replacePrefixMatch) - ? self.type == ''ReplacePrefixMatch'' - : true' - port: - format: int32 - maximum: 65535 - minimum: 1 - type: integer - scheme: - enum: - - http - - https - type: string - statusCode: - default: 302 - enum: - - 301 - - 302 - type: integer - type: object - responseHeaderModifier: - properties: - add: - items: - properties: - name: - maxLength: 256 - minLength: 1 - pattern: ^[A-Za-z0-9!#$%&'*+\-.^_\x60|~]+$ - type: string - value: - maxLength: 4096 - minLength: 1 - type: string - required: - - name - - value - type: object - maxItems: 16 - type: array - x-kubernetes-list-map-keys: - - name - x-kubernetes-list-type: map - remove: - items: - type: string - maxItems: 16 - type: array - x-kubernetes-list-type: set - set: - items: - properties: - name: - maxLength: 256 - minLength: 1 - pattern: ^[A-Za-z0-9!#$%&'*+\-.^_\x60|~]+$ - type: string - value: - maxLength: 4096 - minLength: 1 - type: string - required: - - name - - value - type: object - maxItems: 16 - type: array - x-kubernetes-list-map-keys: - - name - x-kubernetes-list-type: map - type: object - type: - enum: - - RequestHeaderModifier - - ResponseHeaderModifier - - RequestMirror - - RequestRedirect - - URLRewrite - - ExtensionRef - type: string - urlRewrite: - properties: - hostname: - maxLength: 253 - minLength: 1 - pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ - type: string - path: - properties: - replaceFullPath: - maxLength: 1024 - type: string - replacePrefixMatch: - maxLength: 1024 - type: string - type: - enum: - - ReplaceFullPath - - ReplacePrefixMatch - type: string - required: - - type - type: object - x-kubernetes-validations: - - message: replaceFullPath must be specified - when type is set to 'ReplaceFullPath' - rule: 'self.type == ''ReplaceFullPath'' - ? has(self.replaceFullPath) : true' - - message: type must be 'ReplaceFullPath' - when replaceFullPath is set - rule: 'has(self.replaceFullPath) ? 
- self.type == ''ReplaceFullPath'' - : true' - - message: replacePrefixMatch must be - specified when type is set to 'ReplacePrefixMatch' - rule: 'self.type == ''ReplacePrefixMatch'' - ? has(self.replacePrefixMatch) : - true' - - message: type must be 'ReplacePrefixMatch' - when replacePrefixMatch is set - rule: 'has(self.replacePrefixMatch) - ? self.type == ''ReplacePrefixMatch'' - : true' - type: object - required: - - type - type: object - x-kubernetes-validations: - - message: filter.requestHeaderModifier must - be nil if the filter.type is not RequestHeaderModifier - rule: '!(has(self.requestHeaderModifier) && - self.type != ''RequestHeaderModifier'')' - - message: filter.requestHeaderModifier must - be specified for RequestHeaderModifier filter.type - rule: '!(!has(self.requestHeaderModifier) - && self.type == ''RequestHeaderModifier'')' - - message: filter.responseHeaderModifier must - be nil if the filter.type is not ResponseHeaderModifier - rule: '!(has(self.responseHeaderModifier) - && self.type != ''ResponseHeaderModifier'')' - - message: filter.responseHeaderModifier must - be specified for ResponseHeaderModifier - filter.type - rule: '!(!has(self.responseHeaderModifier) - && self.type == ''ResponseHeaderModifier'')' - - message: filter.requestMirror must be nil - if the filter.type is not RequestMirror - rule: '!(has(self.requestMirror) && self.type - != ''RequestMirror'')' - - message: filter.requestMirror must be specified - for RequestMirror filter.type - rule: '!(!has(self.requestMirror) && self.type - == ''RequestMirror'')' - - message: filter.requestRedirect must be nil - if the filter.type is not RequestRedirect - rule: '!(has(self.requestRedirect) && self.type - != ''RequestRedirect'')' - - message: filter.requestRedirect must be specified - for RequestRedirect filter.type - rule: '!(!has(self.requestRedirect) && self.type - == ''RequestRedirect'')' - - message: filter.urlRewrite must be nil if - the filter.type is not URLRewrite - rule: '!(has(self.urlRewrite) && self.type - != ''URLRewrite'')' - - message: filter.urlRewrite must be specified - for URLRewrite filter.type - rule: '!(!has(self.urlRewrite) && self.type - == ''URLRewrite'')' - - message: filter.extensionRef must be nil if - the filter.type is not ExtensionRef - rule: '!(has(self.extensionRef) && self.type - != ''ExtensionRef'')' - - message: filter.extensionRef must be specified - for ExtensionRef filter.type - rule: '!(!has(self.extensionRef) && self.type - == ''ExtensionRef'')' - maxItems: 16 - type: array - x-kubernetes-validations: - - message: May specify either httpRouteFilterRequestRedirect - or httpRouteFilterRequestRewrite, but not - both - rule: '!(self.exists(f, f.type == ''RequestRedirect'') - && self.exists(f, f.type == ''URLRewrite''))' - - message: May specify either httpRouteFilterRequestRedirect - or httpRouteFilterRequestRewrite, but not - both - rule: '!(self.exists(f, f.type == ''RequestRedirect'') - && self.exists(f, f.type == ''URLRewrite''))' - - message: RequestHeaderModifier filter cannot - be repeated - rule: self.filter(f, f.type == 'RequestHeaderModifier').size() - <= 1 - - message: ResponseHeaderModifier filter cannot - be repeated - rule: self.filter(f, f.type == 'ResponseHeaderModifier').size() - <= 1 - - message: RequestRedirect filter cannot be repeated - rule: self.filter(f, f.type == 'RequestRedirect').size() - <= 1 - - message: URLRewrite filter cannot be repeated - rule: self.filter(f, f.type == 'URLRewrite').size() - <= 1 - group: - default: "" - maxLength: 253 - 
pattern: ^$|^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ - type: string - kind: - default: Service - maxLength: 63 - minLength: 1 - pattern: ^[a-zA-Z]([-a-zA-Z0-9]*[a-zA-Z0-9])?$ - type: string - name: - maxLength: 253 - minLength: 1 - type: string - namespace: - maxLength: 63 - minLength: 1 - pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?$ - type: string - port: - format: int32 - maximum: 65535 - minimum: 1 - type: integer - weight: - default: 1 - format: int32 - maximum: 1000000 - minimum: 0 - type: integer - required: - - name - type: object - x-kubernetes-validations: - - message: Must have port for Service reference - rule: '(size(self.group) == 0 && self.kind == ''Service'') - ? has(self.port) : true' - maxItems: 16 - type: array - filters: - items: - properties: - extensionRef: - properties: - group: - maxLength: 253 - pattern: ^$|^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ - type: string - kind: - maxLength: 63 - minLength: 1 - pattern: ^[a-zA-Z]([-a-zA-Z0-9]*[a-zA-Z0-9])?$ - type: string - name: - maxLength: 253 - minLength: 1 - type: string - required: - - group - - kind - - name - type: object - requestHeaderModifier: - properties: - add: - items: - properties: - name: - maxLength: 256 - minLength: 1 - pattern: ^[A-Za-z0-9!#$%&'*+\-.^_\x60|~]+$ - type: string - value: - maxLength: 4096 - minLength: 1 - type: string - required: - - name - - value - type: object - maxItems: 16 - type: array - x-kubernetes-list-map-keys: - - name - x-kubernetes-list-type: map - remove: - items: - type: string - maxItems: 16 - type: array - x-kubernetes-list-type: set - set: - items: - properties: - name: - maxLength: 256 - minLength: 1 - pattern: ^[A-Za-z0-9!#$%&'*+\-.^_\x60|~]+$ - type: string - value: - maxLength: 4096 - minLength: 1 - type: string - required: - - name - - value - type: object - maxItems: 16 - type: array - x-kubernetes-list-map-keys: - - name - x-kubernetes-list-type: map - type: object - requestMirror: - properties: - backendRef: - properties: - group: - default: "" - maxLength: 253 - pattern: ^$|^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ - type: string - kind: - default: Service - maxLength: 63 - minLength: 1 - pattern: ^[a-zA-Z]([-a-zA-Z0-9]*[a-zA-Z0-9])?$ - type: string - name: - maxLength: 253 - minLength: 1 - type: string - namespace: - maxLength: 63 - minLength: 1 - pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?$ - type: string - port: - format: int32 - maximum: 65535 - minimum: 1 - type: integer - required: - - name - type: object - x-kubernetes-validations: - - message: Must have port for Service reference - rule: '(size(self.group) == 0 && self.kind - == ''Service'') ? 
has(self.port) : true' - fraction: - properties: - denominator: - default: 100 - format: int32 - minimum: 1 - type: integer - numerator: - format: int32 - minimum: 0 - type: integer - required: - - numerator - type: object - x-kubernetes-validations: - - message: numerator must be less than or - equal to denominator - rule: self.numerator <= self.denominator - percent: - format: int32 - maximum: 100 - minimum: 0 - type: integer - required: - - backendRef - type: object - requestRedirect: - properties: - hostname: - maxLength: 253 - minLength: 1 - pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ - type: string - path: - properties: - replaceFullPath: - maxLength: 1024 - type: string - replacePrefixMatch: - maxLength: 1024 - type: string - type: - enum: - - ReplaceFullPath - - ReplacePrefixMatch - type: string - required: - - type - type: object - x-kubernetes-validations: - - message: replaceFullPath must be specified - when type is set to 'ReplaceFullPath' - rule: 'self.type == ''ReplaceFullPath'' - ? has(self.replaceFullPath) : true' - - message: type must be 'ReplaceFullPath' - when replaceFullPath is set - rule: 'has(self.replaceFullPath) ? self.type - == ''ReplaceFullPath'' : true' - - message: replacePrefixMatch must be specified - when type is set to 'ReplacePrefixMatch' - rule: 'self.type == ''ReplacePrefixMatch'' - ? has(self.replacePrefixMatch) : true' - - message: type must be 'ReplacePrefixMatch' - when replacePrefixMatch is set - rule: 'has(self.replacePrefixMatch) ? self.type - == ''ReplacePrefixMatch'' : true' - port: - format: int32 - maximum: 65535 - minimum: 1 - type: integer - scheme: - enum: - - http - - https - type: string - statusCode: - default: 302 - enum: - - 301 - - 302 - type: integer - type: object - responseHeaderModifier: - properties: - add: - items: - properties: - name: - maxLength: 256 - minLength: 1 - pattern: ^[A-Za-z0-9!#$%&'*+\-.^_\x60|~]+$ - type: string - value: - maxLength: 4096 - minLength: 1 - type: string - required: - - name - - value - type: object - maxItems: 16 - type: array - x-kubernetes-list-map-keys: - - name - x-kubernetes-list-type: map - remove: - items: - type: string - maxItems: 16 - type: array - x-kubernetes-list-type: set - set: - items: - properties: - name: - maxLength: 256 - minLength: 1 - pattern: ^[A-Za-z0-9!#$%&'*+\-.^_\x60|~]+$ - type: string - value: - maxLength: 4096 - minLength: 1 - type: string - required: - - name - - value - type: object - maxItems: 16 - type: array - x-kubernetes-list-map-keys: - - name - x-kubernetes-list-type: map - type: object - type: - enum: - - RequestHeaderModifier - - ResponseHeaderModifier - - RequestMirror - - RequestRedirect - - URLRewrite - - ExtensionRef - type: string - urlRewrite: - properties: - hostname: - maxLength: 253 - minLength: 1 - pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ - type: string - path: - properties: - replaceFullPath: - maxLength: 1024 - type: string - replacePrefixMatch: - maxLength: 1024 - type: string - type: - enum: - - ReplaceFullPath - - ReplacePrefixMatch - type: string - required: - - type - type: object - x-kubernetes-validations: - - message: replaceFullPath must be specified - when type is set to 'ReplaceFullPath' - rule: 'self.type == ''ReplaceFullPath'' - ? has(self.replaceFullPath) : true' - - message: type must be 'ReplaceFullPath' - when replaceFullPath is set - rule: 'has(self.replaceFullPath) ? 
self.type - == ''ReplaceFullPath'' : true' - - message: replacePrefixMatch must be specified - when type is set to 'ReplacePrefixMatch' - rule: 'self.type == ''ReplacePrefixMatch'' - ? has(self.replacePrefixMatch) : true' - - message: type must be 'ReplacePrefixMatch' - when replacePrefixMatch is set - rule: 'has(self.replacePrefixMatch) ? self.type - == ''ReplacePrefixMatch'' : true' - type: object - required: - - type - type: object - x-kubernetes-validations: - - message: filter.requestHeaderModifier must be nil - if the filter.type is not RequestHeaderModifier - rule: '!(has(self.requestHeaderModifier) && self.type - != ''RequestHeaderModifier'')' - - message: filter.requestHeaderModifier must be specified - for RequestHeaderModifier filter.type - rule: '!(!has(self.requestHeaderModifier) && self.type - == ''RequestHeaderModifier'')' - - message: filter.responseHeaderModifier must be nil - if the filter.type is not ResponseHeaderModifier - rule: '!(has(self.responseHeaderModifier) && self.type - != ''ResponseHeaderModifier'')' - - message: filter.responseHeaderModifier must be specified - for ResponseHeaderModifier filter.type - rule: '!(!has(self.responseHeaderModifier) && self.type - == ''ResponseHeaderModifier'')' - - message: filter.requestMirror must be nil if the - filter.type is not RequestMirror - rule: '!(has(self.requestMirror) && self.type != - ''RequestMirror'')' - - message: filter.requestMirror must be specified - for RequestMirror filter.type - rule: '!(!has(self.requestMirror) && self.type == - ''RequestMirror'')' - - message: filter.requestRedirect must be nil if the - filter.type is not RequestRedirect - rule: '!(has(self.requestRedirect) && self.type - != ''RequestRedirect'')' - - message: filter.requestRedirect must be specified - for RequestRedirect filter.type - rule: '!(!has(self.requestRedirect) && self.type - == ''RequestRedirect'')' - - message: filter.urlRewrite must be nil if the filter.type - is not URLRewrite - rule: '!(has(self.urlRewrite) && self.type != ''URLRewrite'')' - - message: filter.urlRewrite must be specified for - URLRewrite filter.type - rule: '!(!has(self.urlRewrite) && self.type == ''URLRewrite'')' - - message: filter.extensionRef must be nil if the - filter.type is not ExtensionRef - rule: '!(has(self.extensionRef) && self.type != - ''ExtensionRef'')' - - message: filter.extensionRef must be specified for - ExtensionRef filter.type - rule: '!(!has(self.extensionRef) && self.type == - ''ExtensionRef'')' - maxItems: 16 - type: array - x-kubernetes-validations: - - message: May specify either httpRouteFilterRequestRedirect - or httpRouteFilterRequestRewrite, but not both - rule: '!(self.exists(f, f.type == ''RequestRedirect'') - && self.exists(f, f.type == ''URLRewrite''))' - - message: RequestHeaderModifier filter cannot be repeated - rule: self.filter(f, f.type == 'RequestHeaderModifier').size() - <= 1 - - message: ResponseHeaderModifier filter cannot be repeated - rule: self.filter(f, f.type == 'ResponseHeaderModifier').size() - <= 1 - - message: RequestRedirect filter cannot be repeated - rule: self.filter(f, f.type == 'RequestRedirect').size() - <= 1 - - message: URLRewrite filter cannot be repeated - rule: self.filter(f, f.type == 'URLRewrite').size() - <= 1 - matches: - default: - - path: - type: PathPrefix - value: / - items: - properties: - headers: - items: - properties: - name: - maxLength: 256 - minLength: 1 - pattern: ^[A-Za-z0-9!#$%&'*+\-.^_\x60|~]+$ - type: string - type: - default: Exact - enum: - - Exact - - RegularExpression 
- type: string - value: - maxLength: 4096 - minLength: 1 - type: string - required: - - name - - value - type: object - maxItems: 16 - type: array - x-kubernetes-list-map-keys: - - name - x-kubernetes-list-type: map - method: - enum: - - GET - - HEAD - - POST - - PUT - - DELETE - - CONNECT - - OPTIONS - - TRACE - - PATCH - type: string - path: - default: - type: PathPrefix - value: / - properties: - type: - default: PathPrefix - enum: - - Exact - - PathPrefix - - RegularExpression - type: string - value: - default: / - maxLength: 1024 - type: string - type: object - x-kubernetes-validations: - - message: value must be an absolute path and - start with '/' when type one of ['Exact', - 'PathPrefix'] - rule: '(self.type in [''Exact'',''PathPrefix'']) - ? self.value.startsWith(''/'') : true' - - message: must not contain '//' when type one - of ['Exact', 'PathPrefix'] - rule: '(self.type in [''Exact'',''PathPrefix'']) - ? !self.value.contains(''//'') : true' - - message: must not contain '/./' when type one - of ['Exact', 'PathPrefix'] - rule: '(self.type in [''Exact'',''PathPrefix'']) - ? !self.value.contains(''/./'') : true' - - message: must not contain '/../' when type one - of ['Exact', 'PathPrefix'] - rule: '(self.type in [''Exact'',''PathPrefix'']) - ? !self.value.contains(''/../'') : true' - - message: must not contain '%2f' when type one - of ['Exact', 'PathPrefix'] - rule: '(self.type in [''Exact'',''PathPrefix'']) - ? !self.value.contains(''%2f'') : true' - - message: must not contain '%2F' when type one - of ['Exact', 'PathPrefix'] - rule: '(self.type in [''Exact'',''PathPrefix'']) - ? !self.value.contains(''%2F'') : true' - - message: must not contain '#' when type one - of ['Exact', 'PathPrefix'] - rule: '(self.type in [''Exact'',''PathPrefix'']) - ? !self.value.contains(''#'') : true' - - message: must not end with '/..' when type one - of ['Exact', 'PathPrefix'] - rule: '(self.type in [''Exact'',''PathPrefix'']) - ? !self.value.endsWith(''/..'') : true' - - message: must not end with '/.' when type one - of ['Exact', 'PathPrefix'] - rule: '(self.type in [''Exact'',''PathPrefix'']) - ? !self.value.endsWith(''/.'') : true' - - message: type must be one of ['Exact', 'PathPrefix', - 'RegularExpression'] - rule: self.type in ['Exact','PathPrefix'] || - self.type == 'RegularExpression' - - message: must only contain valid characters - (matching ^(?:[-A-Za-z0-9/._~!$&'()*+,;=:@]|[%][0-9a-fA-F]{2})+$) - for types ['Exact', 'PathPrefix'] - rule: '(self.type in [''Exact'',''PathPrefix'']) - ? 
self.value.matches(r"""^(?:[-A-Za-z0-9/._~!$&''()*+,;=:@]|[%][0-9a-fA-F]{2})+$""") - : true' - queryParams: - items: - properties: - name: - maxLength: 256 - minLength: 1 - pattern: ^[A-Za-z0-9!#$%&'*+\-.^_\x60|~]+$ - type: string - type: - default: Exact - enum: - - Exact - - RegularExpression - type: string - value: - maxLength: 1024 - minLength: 1 - type: string - required: - - name - - value - type: object - maxItems: 16 - type: array - x-kubernetes-list-map-keys: - - name - x-kubernetes-list-type: map - type: object - maxItems: 64 - type: array - name: - maxLength: 253 - minLength: 1 - pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ - type: string - retry: - properties: - attempts: - type: integer - backoff: - pattern: ^([0-9]{1,5}(h|m|s|ms)){1,4}$ - type: string - codes: - items: - maximum: 599 - minimum: 400 - type: integer - type: array - type: object - sessionPersistence: - properties: - absoluteTimeout: - pattern: ^([0-9]{1,5}(h|m|s|ms)){1,4}$ - type: string - cookieConfig: - properties: - lifetimeType: - default: Session - enum: - - Permanent - - Session - type: string - type: object - idleTimeout: - pattern: ^([0-9]{1,5}(h|m|s|ms)){1,4}$ - type: string - sessionName: - maxLength: 128 - type: string - type: - default: Cookie - enum: - - Cookie - - Header - type: string - type: object - x-kubernetes-validations: - - message: AbsoluteTimeout must be specified when cookie - lifetimeType is Permanent - rule: '!has(self.cookieConfig) || !has(self.cookieConfig.lifetimeType) - || self.cookieConfig.lifetimeType != ''Permanent'' - || has(self.absoluteTimeout)' - timeouts: - properties: - backendRequest: - pattern: ^([0-9]{1,5}(h|m|s|ms)){1,4}$ - type: string - request: - pattern: ^([0-9]{1,5}(h|m|s|ms)){1,4}$ - type: string - type: object - x-kubernetes-validations: - - message: backendRequest timeout cannot be longer than - request timeout - rule: '!(has(self.request) && has(self.backendRequest) - && duration(self.request) != duration(''0s'') && - duration(self.backendRequest) > duration(self.request))' - type: object - x-kubernetes-validations: - - message: RequestRedirect filter must not be used together - with backendRefs - rule: '(has(self.backendRefs) && size(self.backendRefs) - > 0) ? (!has(self.filters) || self.filters.all(f, !has(f.requestRedirect))): - true' - - message: When using RequestRedirect filter with path.replacePrefixMatch, - exactly one PathPrefix match must be specified - rule: '(has(self.filters) && self.filters.exists_one(f, - has(f.requestRedirect) && has(f.requestRedirect.path) - && f.requestRedirect.path.type == ''ReplacePrefixMatch'' - && has(f.requestRedirect.path.replacePrefixMatch))) - ? ((size(self.matches) != 1 || !has(self.matches[0].path) - || self.matches[0].path.type != ''PathPrefix'') ? false - : true) : true' - - message: When using URLRewrite filter with path.replacePrefixMatch, - exactly one PathPrefix match must be specified - rule: '(has(self.filters) && self.filters.exists_one(f, - has(f.urlRewrite) && has(f.urlRewrite.path) && f.urlRewrite.path.type - == ''ReplacePrefixMatch'' && has(f.urlRewrite.path.replacePrefixMatch))) - ? ((size(self.matches) != 1 || !has(self.matches[0].path) - || self.matches[0].path.type != ''PathPrefix'') ? 
false - : true) : true' - - message: Within backendRefs, when using RequestRedirect - filter with path.replacePrefixMatch, exactly one PathPrefix - match must be specified - rule: '(has(self.backendRefs) && self.backendRefs.exists_one(b, - (has(b.filters) && b.filters.exists_one(f, has(f.requestRedirect) - && has(f.requestRedirect.path) && f.requestRedirect.path.type - == ''ReplacePrefixMatch'' && has(f.requestRedirect.path.replacePrefixMatch))) - )) ? ((size(self.matches) != 1 || !has(self.matches[0].path) - || self.matches[0].path.type != ''PathPrefix'') ? false - : true) : true' - - message: Within backendRefs, When using URLRewrite filter - with path.replacePrefixMatch, exactly one PathPrefix - match must be specified - rule: '(has(self.backendRefs) && self.backendRefs.exists_one(b, - (has(b.filters) && b.filters.exists_one(f, has(f.urlRewrite) - && has(f.urlRewrite.path) && f.urlRewrite.path.type - == ''ReplacePrefixMatch'' && has(f.urlRewrite.path.replacePrefixMatch))) - )) ? ((size(self.matches) != 1 || !has(self.matches[0].path) - || self.matches[0].path.type != ''PathPrefix'') ? false - : true) : true' - maxItems: 16 - type: array - x-kubernetes-validations: - - message: While 16 rules and 64 matches per rule are allowed, - the total number of matches across all rules in a route - must be less than 128 - rule: '(self.size() > 0 ? self[0].matches.size() : 0) + - (self.size() > 1 ? self[1].matches.size() : 0) + (self.size() - > 2 ? self[2].matches.size() : 0) + (self.size() > 3 ? - self[3].matches.size() : 0) + (self.size() > 4 ? self[4].matches.size() - : 0) + (self.size() > 5 ? self[5].matches.size() : 0) - + (self.size() > 6 ? self[6].matches.size() : 0) + (self.size() - > 7 ? self[7].matches.size() : 0) + (self.size() > 8 ? - self[8].matches.size() : 0) + (self.size() > 9 ? self[9].matches.size() - : 0) + (self.size() > 10 ? self[10].matches.size() : 0) - + (self.size() > 11 ? self[11].matches.size() : 0) + (self.size() - > 12 ? self[12].matches.size() : 0) + (self.size() > 13 - ? self[13].matches.size() : 0) + (self.size() > 14 ? self[14].matches.size() - : 0) + (self.size() > 15 ? 
self[15].matches.size() : 0) - <= 128' - type: object - status: - properties: - parents: - items: - properties: - conditions: - items: - properties: - lastTransitionTime: - format: date-time - type: string - message: - maxLength: 32768 - type: string - observedGeneration: - format: int64 - minimum: 0 - type: integer - reason: - maxLength: 1024 - minLength: 1 - pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ - type: string - status: - enum: - - "True" - - "False" - - Unknown - type: string - type: - maxLength: 316 - pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ - type: string - required: - - lastTransitionTime - - message - - reason - - status - - type - type: object - maxItems: 8 - minItems: 1 - type: array - x-kubernetes-list-map-keys: - - type - x-kubernetes-list-type: map - controllerName: - maxLength: 253 - minLength: 1 - pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*\/[A-Za-z0-9\/\-._~%!$&'()*+,;=:]+$ - type: string - parentRef: - properties: - group: - default: gateway.networking.k8s.io - maxLength: 253 - pattern: ^$|^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ - type: string - kind: - default: Gateway - maxLength: 63 - minLength: 1 - pattern: ^[a-zA-Z]([-a-zA-Z0-9]*[a-zA-Z0-9])?$ - type: string - name: - maxLength: 253 - minLength: 1 - type: string - namespace: - maxLength: 63 - minLength: 1 - pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?$ - type: string - port: - format: int32 - maximum: 65535 - minimum: 1 - type: integer - sectionName: - maxLength: 253 - minLength: 1 - pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ - type: string - required: - - name - type: object - required: - - controllerName - - parentRef - type: object - maxItems: 32 - type: array - required: - - parents - type: object - required: - - spec - type: object + type: string rayClusterConfig: properties: autoscalerOptions: diff --git a/ray-operator/controllers/ray/common/association.go b/ray-operator/controllers/ray/common/association.go index 1f2ce2a9270..7b60a29152a 100644 --- a/ray-operator/controllers/ray/common/association.go +++ b/ray-operator/controllers/ray/common/association.go @@ -205,15 +205,27 @@ func RayClusterNetworkResourcesOptions(instance *rayv1.RayCluster) AssociationOp } func RayServiceGatewayNamespacedName(rayService *rayv1.RayService) types.NamespacedName { + var gatewayName string + if rayService.Spec.Gateway != "" { + gatewayName = rayService.Spec.Gateway + } else { + gatewayName = fmt.Sprintf("%s-gateway", rayService.Name) + } return types.NamespacedName{ - Name: fmt.Sprintf("%s-%s", rayService.Name, "gateway"), + Name: gatewayName, Namespace: rayService.Namespace, } } func RayServiceHTTPRouteNamespacedName(rayService *rayv1.RayService) types.NamespacedName { + var httpRouteName string + if rayService.Spec.HTTPRoute != "" { + httpRouteName = rayService.Spec.HTTPRoute + } else { + httpRouteName = fmt.Sprintf("httproute-%s", rayService.Name) + } return types.NamespacedName{ - Name: fmt.Sprintf("httproute-%s", rayService.Name), + Name: httpRouteName, Namespace: rayService.Namespace, } } diff --git a/ray-operator/controllers/ray/rayservice_controller.go b/ray-operator/controllers/ray/rayservice_controller.go index 673ed83b3f8..b8d380f71e2 100644 --- a/ray-operator/controllers/ray/rayservice_controller.go +++ b/ray-operator/controllers/ray/rayservice_controller.go @@ -147,21 +147,25 @@ func (r *RayServiceReconciler) Reconcile(ctx context.Context, 
request ctrl.Reque } // Check if IncrementalUpgrade is enabled, if so reconcile Gateway objects. - if utils.IsIncrementalUpgradeEnabled(&rayServiceInstance.Spec) && activeRayClusterInstance != nil { + if utils.IsIncrementalUpgradeEnabled(&rayServiceInstance.Spec) { // Creates a Gateway CR that points to the head services of // the active and pending (if it exists) RayClusters. For incremental upgrades, // the Gateway endpoint is used rather than the Serve service. gateway, err := r.reconcileGateway(ctx, rayServiceInstance) if err != nil { - return ctrl.Result{RequeueAfter: ServiceDefaultRequeueDuration}, err + return ctrl.Result{RequeueAfter: ServiceDefaultRequeueDuration}, client.IgnoreNotFound(err) + } + if gateway != nil { + rayServiceInstance.Spec.Gateway = gateway.Name } - rayServiceInstance.Spec.Gateway = gateway // Create or update the HTTPRoute attached to this RayService's Gateway httpRoute, err := r.reconcileHTTPRoute(ctx, rayServiceInstance) if err != nil { - return ctrl.Result{RequeueAfter: ServiceDefaultRequeueDuration}, err + return ctrl.Result{RequeueAfter: ServiceDefaultRequeueDuration}, client.IgnoreNotFound(err) + } + if httpRoute != nil { + rayServiceInstance.Spec.HTTPRoute = httpRoute.Name } - rayServiceInstance.Spec.HTTPRoute = httpRoute } // Reconcile serve applications for active and/or pending clusters @@ -452,7 +456,13 @@ func (r *RayServiceReconciler) createGateway(rayServiceInstance *rayv1.RayServic if options == nil { return nil, errstd.New("Missing RayService IncrementalUpgradeOptions during upgrade") } - gatewayName := rayServiceInstance.Name + "-gateway" + + var gatewayName string + if rayServiceInstance.Spec.Gateway != "" { + gatewayName = rayServiceInstance.Spec.Gateway + } else { + gatewayName = rayServiceInstance.Name + "-gateway" + } // Define the desired Gateway object rayServiceGateway := &gwv1.Gateway{ @@ -482,6 +492,10 @@ func (r *RayServiceReconciler) reconcileGateway(ctx context.Context, rayServiceI logger.Error(err, "Failed to build Gateway object for Rayservice") return nil, err } + if desiredGateway == nil { + logger.Info("Skipping Gateway reconciliation: desired Gateway is nil") + return nil, nil + } // Check for existing RayService Gateway, create the desired Gateway if none is found existingGateway := &gwv1.Gateway{} @@ -527,7 +541,12 @@ func (r *RayServiceReconciler) createHTTPRoute(ctx context.Context, rayServiceIn } // Define the desired HTTPRoute name and basic object - httpRouteName := fmt.Sprintf("httproute-%s", rayServiceInstance.Name) + var httpRouteName string + if rayServiceInstance.Spec.HTTPRoute != "" { + httpRouteName = rayServiceInstance.Spec.HTTPRoute + } else { + httpRouteName = fmt.Sprintf("httproute-%s", rayServiceInstance.Name) + } desiredHTTPRoute := &gwv1.HTTPRoute{ ObjectMeta: metav1.ObjectMeta{ Name: httpRouteName, @@ -547,10 +566,14 @@ func (r *RayServiceReconciler) createHTTPRoute(ctx context.Context, rayServiceIn // Retrieve the active RayCluster activeRayCluster, err := r.getRayClusterByNamespacedName(ctx, common.RayServiceActiveRayClusterNamespacedName(rayServiceInstance)) - if err != nil || activeRayCluster == nil || activeRayCluster.Status.Head.ServiceName == "" { - logger.Info("No active RayCluster, skipping HTTPRoute creation") + if err != nil && !errors.IsNotFound(err) { + logger.Error(err, "Failed to retrieve active RayCluster") return nil, err } + if activeRayCluster == nil || activeRayCluster.Status.Head.ServiceName == "" { + logger.Info("Active RayCluster not found, skipping HTTPRoute creation.") + return 
nil, nil + } oldClusterHeadSvcName := activeRayCluster.Status.Head.ServiceName oldHeadSvc := &corev1.Service{} if err := r.Get(ctx, client.ObjectKey{Name: oldClusterHeadSvcName, Namespace: rayServiceInstance.Namespace}, oldHeadSvc); err != nil { @@ -691,6 +714,10 @@ func (r *RayServiceReconciler) reconcileHTTPRoute(ctx context.Context, rayServic logger.Error(err, "Failed to build HTTPRoute for RayService upgrade") return nil, err } + if desiredHTTPRoute == nil { + logger.Info("Skipping HTTPRoute reconciliation: desired HTTPRoute is nil") + return nil, nil + } // Check for existing HTTPRoute for RayService existingHTTPRoute := &gwv1.HTTPRoute{} diff --git a/ray-operator/controllers/ray/rayservice_controller_unit_test.go b/ray-operator/controllers/ray/rayservice_controller_unit_test.go index 1b2b4679061..ca3908f3d25 100644 --- a/ray-operator/controllers/ray/rayservice_controller_unit_test.go +++ b/ray-operator/controllers/ray/rayservice_controller_unit_test.go @@ -1564,23 +1564,31 @@ func TestReconcileHTTPRoute(t *testing.T) { ctx := context.TODO() namespace := "test-ns" - // Create runtime objects for RayService + rayService := makeIncrementalUpgradeRayService( + true, + "incremental-ray-service-gateway", + ptr.To(int32(20)), + ptr.To(int32(30)), + ptr.To(int32(80)), + ptr.To(metav1.Now()), + ) + activeService := &corev1.Service{ ObjectMeta: metav1.ObjectMeta{ Name: "active-service", - Namespace: "test-ns", + Namespace: namespace, }, } pendingService := &corev1.Service{ ObjectMeta: metav1.ObjectMeta{ Name: "pending-service", - Namespace: "test-ns", + Namespace: namespace, }, } activeCluster := &rayv1.RayCluster{ ObjectMeta: metav1.ObjectMeta{ Name: "active-ray-cluster", - Namespace: "test-ns", + Namespace: namespace, }, Status: rayv1.RayClusterStatus{ Head: rayv1.HeadInfo{ @@ -1591,7 +1599,7 @@ func TestReconcileHTTPRoute(t *testing.T) { pendingCluster := &rayv1.RayCluster{ ObjectMeta: metav1.ObjectMeta{ Name: "pending-ray-cluster", - Namespace: "test-ns", + Namespace: namespace, }, Status: rayv1.RayClusterStatus{ Head: rayv1.HeadInfo{ @@ -1606,43 +1614,62 @@ func TestReconcileHTTPRoute(t *testing.T) { }, } - // Prepare RayService instance - rayService := makeIncrementalUpgradeRayService(true, "test-gateway", ptr.To(int32(20)), ptr.To(int32(30)), ptr.To(int32(80)), ptr.To(metav1.Time{Time: time.Now()})) - runtimeObjects := []runtime.Object{rayService, activeService, pendingService, activeCluster, pendingCluster, gateway} - - fakeClient := clientFake.NewClientBuilder().WithScheme(newScheme).WithRuntimeObjects(runtimeObjects...).Build() - reconciler := RayServiceReconciler{ - Client: fakeClient, - Scheme: newScheme, - Recorder: record.NewFakeRecorder(10), + // Pre-existing HTTPRoute with incorrect weights + existingHTTPRoute := makeHTTPRoute(fmt.Sprintf("httproute-%s", rayService.Name), namespace, true) + existingHTTPRoute.Spec = gwv1.HTTPRouteSpec{ + CommonRouteSpec: gwv1.CommonRouteSpec{ + ParentRefs: []gwv1.ParentReference{{ + Name: gwv1.ObjectName("incremental-ray-service-gateway"), + Namespace: ptr.To(gwv1.Namespace(namespace)), + }}, + }, + Rules: []gwv1.HTTPRouteRule{{ + BackendRefs: []gwv1.HTTPBackendRef{ + { + BackendRef: gwv1.BackendRef{ + BackendObjectReference: gwv1.BackendObjectReference{ + Name: "active-service", + Namespace: ptr.To(gwv1.Namespace(namespace)), + Port: ptr.To(gwv1.PortNumber(8000)), + }, + Weight: ptr.To(int32(5)), + }, + }, + { + BackendRef: gwv1.BackendRef{ + BackendObjectReference: gwv1.BackendObjectReference{ + Name: "pending-service", + Namespace: 
ptr.To(gwv1.Namespace(namespace)), + Port: ptr.To(gwv1.PortNumber(8000)), + }, + Weight: ptr.To(int32(95)), + }, + }, + }, + }}, } tests := []struct { name string - setupHTTPRoute func(r *RayServiceReconciler, rs *rayv1.RayService) *gwv1.HTTPRoute expectedRouteName string + runtimeObjects []runtime.Object expectedWeight int32 }{ { - name: "creates new HTTPRoute if Spec.HTTPRoute is nil", - setupHTTPRoute: func(_ *RayServiceReconciler, rs *rayv1.RayService) *gwv1.HTTPRoute { - rs.Spec.HTTPRoute = nil - return nil + name: "creates new HTTPRoute if not present", + runtimeObjects: []runtime.Object{ + rayService, activeService, pendingService, + activeCluster, pendingCluster, gateway, }, expectedRouteName: "httproute-incremental-ray-service", expectedWeight: 80, }, { - name: "updates existing HTTPRoute if spec differs", - setupHTTPRoute: func(r *RayServiceReconciler, rs *rayv1.RayService) *gwv1.HTTPRoute { - desired, err := r.createHTTPRoute(ctx, rs) - require.NoError(t, err) - - // Modify weight to trigger update - existing := desired.DeepCopy() - existing.Spec.Rules[0].BackendRefs[0].Weight = ptr.To(int32(5)) - rs.Spec.HTTPRoute = existing - return desired + name: "updates HTTPRoute if spec differs", + runtimeObjects: []runtime.Object{ + rayService, activeService, pendingService, + activeCluster, pendingCluster, gateway, + existingHTTPRoute, }, expectedRouteName: "httproute-incremental-ray-service", expectedWeight: 80, @@ -1651,22 +1678,31 @@ func TestReconcileHTTPRoute(t *testing.T) { for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - tt.setupHTTPRoute(&reconciler, rayService) + fakeClient := clientFake.NewClientBuilder(). + WithScheme(newScheme). + WithRuntimeObjects(tt.runtimeObjects...). + Build() + + reconciler := RayServiceReconciler{ + Client: fakeClient, + Scheme: newScheme, + Recorder: record.NewFakeRecorder(10), + } route, err := reconciler.reconcileHTTPRoute(ctx, rayService) require.NoError(t, err) require.NotNil(t, route) + assert.Equal(t, tt.expectedRouteName, route.Name) assert.Equal(t, namespace, route.Namespace) - // Check updated weights match expected + require.Len(t, route.Spec.Rules[0].BackendRefs, 2) assert.Equal(t, tt.expectedWeight, *route.Spec.Rules[0].BackendRefs[0].Weight) assert.Equal(t, 100-tt.expectedWeight, *route.Spec.Rules[0].BackendRefs[1].Weight) - // Check ParentRef refers to the expected Gateway parent := route.Spec.ParentRefs[0] assert.Equal(t, gwv1.ObjectName("incremental-ray-service-gateway"), parent.Name) - assert.Equal(t, ptr.To(gwv1.Namespace("test-ns")), parent.Namespace) + assert.Equal(t, ptr.To(gwv1.Namespace(namespace)), parent.Namespace) }) } } @@ -1680,60 +1716,33 @@ func TestReconcileGateway(t *testing.T) { ctx := context.TODO() namespace := "test-ns" - // Prepare RayService instance rayService := makeIncrementalUpgradeRayService( true, "gateway-class", ptr.To(int32(20)), ptr.To(int32(30)), ptr.To(int32(80)), - ptr.To(metav1.Time{Time: time.Now()}), + ptr.To(metav1.Now()), ) - - runtimeObjects := []runtime.Object{ - rayService, - rayService.Spec.ServeService, - } - - fakeClient := clientFake.NewClientBuilder(). - WithScheme(newScheme). - WithRuntimeObjects(runtimeObjects...). 
- Build() - - reconciler := RayServiceReconciler{ - Client: fakeClient, - Scheme: newScheme, - Recorder: record.NewFakeRecorder(10), - } + gateway := makeGateway(fmt.Sprintf("%s-gateway", rayService.Name), rayService.Namespace, true) tests := []struct { name string - setupGateway func(r *RayServiceReconciler, rs *rayv1.RayService) *gwv1.Gateway expectedGatewayName string expectedClass string + runtimeObjects []runtime.Object expectedNumListeners int }{ { - name: "creates new Gateway if Spec.Gateway is missing during incremental upgrade", - setupGateway: func(_ *RayServiceReconciler, rs *rayv1.RayService) *gwv1.Gateway { - rs.Spec.Gateway = nil - return nil - }, + name: "creates new Gateway if missing", + runtimeObjects: []runtime.Object{rayService}, expectedGatewayName: "incremental-ray-service-gateway", expectedClass: "gateway-class", expectedNumListeners: 1, }, { - name: "update existing Gateway if desired Gateway spec differs", - setupGateway: func(r *RayServiceReconciler, rs *rayv1.RayService) *gwv1.Gateway { - desired, err := r.createGateway(rs) - require.NoError(t, err) - - existing := desired.DeepCopy() - existing.Spec.GatewayClassName = "some-other-class" - rs.Spec.Gateway = existing - return existing - }, + name: "updates Gateway if spec differs", + runtimeObjects: []runtime.Object{rayService, gateway}, expectedGatewayName: "incremental-ray-service-gateway", expectedClass: "gateway-class", expectedNumListeners: 1, @@ -1742,7 +1751,16 @@ func TestReconcileGateway(t *testing.T) { for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - tt.setupGateway(&reconciler, rayService) + fakeClient := clientFake.NewClientBuilder(). + WithScheme(newScheme). + WithRuntimeObjects(tt.runtimeObjects...). + Build() + + reconciler := RayServiceReconciler{ + Client: fakeClient, + Scheme: newScheme, + Recorder: record.NewFakeRecorder(10), + } gw, err := reconciler.reconcileGateway(ctx, rayService) require.NoError(t, err) @@ -1856,6 +1874,10 @@ func makeGateway(name, namespace string, isReady bool) *gwv1.Gateway { Type: string(gwv1.GatewayConditionAccepted), Status: status, }, + { + Type: string(gwv1.GatewayConditionProgrammed), + Status: status, + }, }, }, } diff --git a/ray-operator/controllers/ray/utils/util.go b/ray-operator/controllers/ray/utils/util.go index 7d083a0e426..825515d5582 100644 --- a/ray-operator/controllers/ray/utils/util.go +++ b/ray-operator/controllers/ray/utils/util.go @@ -681,14 +681,20 @@ func IsGatewayReady(gatewayInstance *gwv1.Gateway) bool { if gatewayInstance == nil { return false } + hasAccepted := false + hasProgrammed := false + for _, condition := range gatewayInstance.Status.Conditions { if condition.Type == string(gwv1.GatewayConditionAccepted) && condition.Status == metav1.ConditionTrue { - return true + hasAccepted = true + } + if condition.Type == string(gwv1.GatewayConditionProgrammed) && condition.Status == metav1.ConditionTrue { + hasProgrammed = true } } - // If no accepted condition found then it is not ready yet - return false + // If no ready condition found return false + return hasAccepted && hasProgrammed } // IsHTTPRouteReady returns whether the HTTPRoute associated with a given Gateway has a ready condition diff --git a/ray-operator/controllers/ray/utils/util_test.go b/ray-operator/controllers/ray/utils/util_test.go index bf762bfe42b..c6e47944207 100644 --- a/ray-operator/controllers/ray/utils/util_test.go +++ b/ray-operator/controllers/ray/utils/util_test.go @@ -1251,16 +1251,23 @@ func TestCalculateResources(t *testing.T) { } // helper 
function to return a Gateway object with GatewayStatus Conditions for testing. -func makeGatewayWithCondition(accepted bool) *gwv1.Gateway { +func makeGatewayWithCondition(accepted bool, programmed bool) *gwv1.Gateway { var conditions []metav1.Condition + if accepted { - conditions = []metav1.Condition{ - { - Type: string(gwv1.GatewayConditionAccepted), - Status: metav1.ConditionTrue, - }, - } + conditions = append(conditions, metav1.Condition{ + Type: string(gwv1.GatewayConditionAccepted), + Status: metav1.ConditionTrue, + }) } + + if programmed { + conditions = append(conditions, metav1.Condition{ + Type: string(gwv1.GatewayConditionProgrammed), + Status: metav1.ConditionTrue, + }) + } + return &gwv1.Gateway{ Status: gwv1.GatewayStatus{ Conditions: conditions, @@ -1280,13 +1287,18 @@ func TestIsGatewayReady(t *testing.T) { expected: false, }, { - name: "Gateway created but missing accepted condition", - gateway: makeGatewayWithCondition(false), + name: "Gateway created with Programmed condition only", + gateway: makeGatewayWithCondition(false, true), + expected: false, + }, + { + name: "Gateway created with Accepted condition only", + gateway: makeGatewayWithCondition(true, false), expected: false, }, { - name: "Gateway created with accepted condition", - gateway: makeGatewayWithCondition(true), + name: "Gateway created with both Accepted and Programmed conditions", + gateway: makeGatewayWithCondition(true, true), expected: true, }, } diff --git a/ray-operator/pkg/client/applyconfiguration/ray/v1/rayservicespec.go b/ray-operator/pkg/client/applyconfiguration/ray/v1/rayservicespec.go index 9426baa9b12..ad31b98af96 100644 --- a/ray-operator/pkg/client/applyconfiguration/ray/v1/rayservicespec.go +++ b/ray-operator/pkg/client/applyconfiguration/ray/v1/rayservicespec.go @@ -4,7 +4,6 @@ package v1 import ( corev1 "k8s.io/api/core/v1" - apisv1 "sigs.k8s.io/gateway-api/apis/v1" ) // RayServiceSpecApplyConfiguration represents a declarative configuration of the RayServiceSpec type for use @@ -14,10 +13,10 @@ type RayServiceSpecApplyConfiguration struct { ServiceUnhealthySecondThreshold *int32 `json:"serviceUnhealthySecondThreshold,omitempty"` DeploymentUnhealthySecondThreshold *int32 `json:"deploymentUnhealthySecondThreshold,omitempty"` ServeService *corev1.Service `json:"serveService,omitempty"` - Gateway *apisv1.Gateway `json:"gateway,omitempty"` - HTTPRoute *apisv1.HTTPRoute `json:"httpRoute,omitempty"` UpgradeStrategy *RayServiceUpgradeStrategyApplyConfiguration `json:"upgradeStrategy,omitempty"` ServeConfigV2 *string `json:"serveConfigV2,omitempty"` + Gateway *string `json:"gateway,omitempty"` + HTTPRoute *string `json:"httpRoute,omitempty"` RayClusterSpec *RayClusterSpecApplyConfiguration `json:"rayClusterConfig,omitempty"` ExcludeHeadPodFromServeSvc *bool `json:"excludeHeadPodFromServeSvc,omitempty"` } @@ -60,22 +59,6 @@ func (b *RayServiceSpecApplyConfiguration) WithServeService(value corev1.Service return b } -// WithGateway sets the Gateway field in the declarative configuration to the given value -// and returns the receiver, so that objects can be built by chaining "With" function invocations. -// If called multiple times, the Gateway field is set to the value of the last call. 
-func (b *RayServiceSpecApplyConfiguration) WithGateway(value apisv1.Gateway) *RayServiceSpecApplyConfiguration { - b.Gateway = &value - return b -} - -// WithHTTPRoute sets the HTTPRoute field in the declarative configuration to the given value -// and returns the receiver, so that objects can be built by chaining "With" function invocations. -// If called multiple times, the HTTPRoute field is set to the value of the last call. -func (b *RayServiceSpecApplyConfiguration) WithHTTPRoute(value apisv1.HTTPRoute) *RayServiceSpecApplyConfiguration { - b.HTTPRoute = &value - return b -} - // WithUpgradeStrategy sets the UpgradeStrategy field in the declarative configuration to the given value // and returns the receiver, so that objects can be built by chaining "With" function invocations. // If called multiple times, the UpgradeStrategy field is set to the value of the last call. @@ -92,6 +75,22 @@ func (b *RayServiceSpecApplyConfiguration) WithServeConfigV2(value string) *RayS return b } +// WithGateway sets the Gateway field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the Gateway field is set to the value of the last call. +func (b *RayServiceSpecApplyConfiguration) WithGateway(value string) *RayServiceSpecApplyConfiguration { + b.Gateway = &value + return b +} + +// WithHTTPRoute sets the HTTPRoute field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the HTTPRoute field is set to the value of the last call. +func (b *RayServiceSpecApplyConfiguration) WithHTTPRoute(value string) *RayServiceSpecApplyConfiguration { + b.HTTPRoute = &value + return b +} + // WithRayClusterSpec sets the RayClusterSpec field in the declarative configuration to the given value // and returns the receiver, so that objects can be built by chaining "With" function invocations. // If called multiple times, the RayClusterSpec field is set to the value of the last call. diff --git a/ray-operator/test/e2eincrementalupgrade/rayservice_incremental_upgrade_test.go b/ray-operator/test/e2eincrementalupgrade/rayservice_incremental_upgrade_test.go index 6bfefa1f18f..435e9868328 100644 --- a/ray-operator/test/e2eincrementalupgrade/rayservice_incremental_upgrade_test.go +++ b/ray-operator/test/e2eincrementalupgrade/rayservice_incremental_upgrade_test.go @@ -10,6 +10,7 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/utils/ptr" + "github.com/ray-project/kuberay/ray-operator/controllers/ray/utils" rayv1ac "github.com/ray-project/kuberay/ray-operator/pkg/client/applyconfiguration/ray/v1" "github.com/ray-project/kuberay/ray-operator/pkg/features" "github.com/ray-project/kuberay/ray-operator/test/sampleyaml" @@ -58,13 +59,22 @@ func TestRayServiceIncrementalUpgrade(t *testing.T) { g.Expect(err).NotTo(HaveOccurred()) // Validate Gateway and HTTPRoute objects have been created for incremental upgrade. + gatewayName := fmt.Sprintf("%s-%s", rayServiceName, "gateway") + LogWithTimestamp(test.T(), "Waiting for Gateway %s/%s to be ready", rayService.Namespace, gatewayName) + g.Eventually(Gateway(test, rayService.Namespace, gatewayName), TestTimeoutMedium). 
+ Should(WithTransform(utils.IsGatewayReady, BeTrue())) + gateway, err := GetGateway(test, namespace.Name, fmt.Sprintf("%s-%s", rayServiceName, "gateway")) g.Expect(err).NotTo(HaveOccurred()) g.Expect(gateway).NotTo(BeNil()) + httpRouteName := fmt.Sprintf("%s-%s", "httproute", rayServiceName) + LogWithTimestamp(test.T(), "Waiting for HTTPRoute %s/%s to be ready", rayService.Namespace, httpRouteName) + g.Eventually(HTTPRoute(test, rayService.Namespace, httpRouteName), TestTimeoutMedium). + Should(Not(BeNil())) httpRoute, err := GetHTTPRoute(test, namespace.Name, fmt.Sprintf("%s-%s", "httproute", rayServiceName)) g.Expect(err).NotTo(HaveOccurred()) - g.Expect(httpRoute).NotTo(BeNil()) + g.Expect(utils.IsHTTPRouteReady(gateway, httpRoute)).To(BeTrue()) // Create curl pod to test traffic routing through Gateway to RayService curlPodName := "curl-pod" @@ -72,6 +82,7 @@ func TestRayServiceIncrementalUpgrade(t *testing.T) { curlPod, err := CreateCurlPod(test, curlPodName, curlContainerName, namespace.Name) g.Expect(err).NotTo(HaveOccurred()) + LogWithTimestamp(test.T(), "Waiting for Curl Pod %s to be ready", curlPodName) g.Eventually(func(g Gomega) *corev1.Pod { updatedPod, err := test.Client().Core().CoreV1().Pods(curlPod.Namespace).Get(test.Ctx(), curlPod.Name, metav1.GetOptions{}) g.Expect(err).NotTo(HaveOccurred()) @@ -104,6 +115,7 @@ func TestRayServiceIncrementalUpgrade(t *testing.T) { g.Expect(err).NotTo(HaveOccurred()) // Check that upgrade steps incrementally with traffic/capacity split between clusters + LogWithTimestamp(test.T(), "Validating gradual traffic migration during IncrementalUpgrade") g.Eventually(func(g Gomega) { rayService, err := GetRayService(test, namespace.Name, rayServiceName) g.Expect(err).NotTo(HaveOccurred()) diff --git a/ray-operator/test/e2eincrementalupgrade/support.go b/ray-operator/test/e2eincrementalupgrade/support.go index ba09a6822a3..b35bdb83926 100644 --- a/ray-operator/test/e2eincrementalupgrade/support.go +++ b/ray-operator/test/e2eincrementalupgrade/support.go @@ -5,12 +5,13 @@ import ( "fmt" corev1 "k8s.io/api/core/v1" - "k8s.io/utils/ptr" + "k8s.io/apimachinery/pkg/api/resource" + corev1ac "k8s.io/client-go/applyconfigurations/core/v1" gwv1 "sigs.k8s.io/gateway-api/apis/v1" rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1" + "github.com/ray-project/kuberay/ray-operator/controllers/ray/utils" rayv1ac "github.com/ray-project/kuberay/ray-operator/pkg/client/applyconfiguration/ray/v1" - e2eRayService "github.com/ray-project/kuberay/ray-operator/test/e2erayservice" . "github.com/ray-project/kuberay/ray-operator/test/support" ) @@ -56,21 +57,110 @@ func CurlRayServiceGateway( func IncrementalUpgradeRayServiceApplyConfiguration( stepSizePercent, intervalSeconds, maxSurgePercent *int32, ) *rayv1ac.RayServiceSpecApplyConfiguration { - spec := e2eRayService.RayServiceSampleYamlApplyConfiguration() - - spec.RayClusterSpec.EnableInTreeAutoscaling = ptr.To(true) - spec.WithUpgradeStrategy(rayv1ac.RayServiceUpgradeStrategy(). - WithType(rayv1.IncrementalUpgrade). - WithIncrementalUpgradeOptions( - rayv1ac.IncrementalUpgradeOptions(). - WithGatewayClassName("istio"). - WithStepSizePercent(*stepSizePercent). - WithIntervalSeconds(*intervalSeconds). - WithMaxSurgePercent(*maxSurgePercent), - ), - ) - - return spec + return rayv1ac.RayServiceSpec(). + WithUpgradeStrategy(rayv1ac.RayServiceUpgradeStrategy(). + WithType(rayv1.IncrementalUpgrade). + WithIncrementalUpgradeOptions( + rayv1ac.IncrementalUpgradeOptions(). + WithGatewayClassName("istio"). 
+ WithStepSizePercent(*stepSizePercent). + WithIntervalSeconds(*intervalSeconds). + WithMaxSurgePercent(*maxSurgePercent), + )). + WithServeConfigV2(`applications: + - name: fruit_app + import_path: fruit.deployment_graph + route_prefix: /fruit + runtime_env: + working_dir: "https://github.com/ray-project/test_dag/archive/78b4a5da38796123d9f9ffff59bab2792a043e95.zip" + deployments: + - name: MangoStand + num_replicas: 1 + user_config: + price: 3 + ray_actor_options: + num_cpus: 0.1 + - name: OrangeStand + num_replicas: 1 + user_config: + price: 2 + ray_actor_options: + num_cpus: 0.1 + - name: FruitMarket + num_replicas: 1 + ray_actor_options: + num_cpus: 0.1 + - name: math_app + import_path: conditional_dag.serve_dag + route_prefix: /calc + runtime_env: + working_dir: "https://github.com/ray-project/test_dag/archive/78b4a5da38796123d9f9ffff59bab2792a043e95.zip" + deployments: + - name: Adder + num_replicas: 1 + user_config: + increment: 3 + ray_actor_options: + num_cpus: 0.1 + - name: Multiplier + num_replicas: 1 + user_config: + factor: 5 + ray_actor_options: + num_cpus: 0.1 + - name: Router + ray_actor_options: + num_cpus: 0.1 + num_replicas: 1`). + WithRayClusterSpec(rayv1ac.RayClusterSpec(). + WithRayVersion(GetRayVersion()). + WithEnableInTreeAutoscaling(true). + WithHeadGroupSpec(rayv1ac.HeadGroupSpec(). + WithRayStartParams(map[string]string{"dashboard-host": "0.0.0.0"}). + WithTemplate(corev1ac.PodTemplateSpec(). + WithSpec(corev1ac.PodSpec(). + WithRestartPolicy(corev1.RestartPolicyNever). + WithContainers(corev1ac.Container(). + WithName("ray-head"). + WithImage(GetRayImage()). + WithEnv(corev1ac.EnvVar().WithName(utils.RAY_ENABLE_AUTOSCALER_V2).WithValue("1")). + WithPorts( + corev1ac.ContainerPort().WithName(utils.GcsServerPortName).WithContainerPort(utils.DefaultGcsServerPort), + corev1ac.ContainerPort().WithName(utils.ServingPortName).WithContainerPort(utils.DefaultServingPort), + corev1ac.ContainerPort().WithName(utils.DashboardPortName).WithContainerPort(utils.DefaultDashboardPort), + corev1ac.ContainerPort().WithName(utils.ClientPortName).WithContainerPort(utils.DefaultClientPort), + ). + WithResources(corev1ac.ResourceRequirements(). + WithRequests(corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("2"), + corev1.ResourceMemory: resource.MustParse("3Gi"), + }). + WithLimits(corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("2"), + corev1.ResourceMemory: resource.MustParse("3Gi"), + })))))). + WithWorkerGroupSpecs(rayv1ac.WorkerGroupSpec(). + WithReplicas(0). + WithMinReplicas(0). + WithMaxReplicas(4). + WithRayStartParams(map[string]string{"num-cpus": "1"}). + WithGroupName("small-group"). + WithTemplate(corev1ac.PodTemplateSpec(). + WithSpec(corev1ac.PodSpec(). + WithRestartPolicy(corev1.RestartPolicyNever). + WithContainers(corev1ac.Container(). + WithName("ray-worker"). + WithImage(GetRayImage()). + WithResources(corev1ac.ResourceRequirements(). + WithRequests(corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("300m"), + corev1.ResourceMemory: resource.MustParse("1G"), + }). 
+ WithLimits(corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("500m"), + corev1.ResourceMemory: resource.MustParse("1G"), + })))))), + ) } // GetGatewayIP retrieves the external IP for a Gateway object From d6781f8c2f5b8277f21ad1df56c248c46dda5c6d Mon Sep 17 00:00:00 2001 From: Ryan O'Leary Date: Wed, 4 Jun 2025 03:46:04 +0000 Subject: [PATCH 03/56] Fix merge errors Signed-off-by: Ryan O'Leary --- go.mod | 7 +- go.sum | 13 ++-- .../controllers/ray/utils/util_test.go | 71 ------------------- ray-operator/go.mod | 25 +++---- ray-operator/go.sum | 46 +++++------- 5 files changed, 42 insertions(+), 120 deletions(-) diff --git a/go.mod b/go.mod index 472e6d593df..79ffcd37522 100644 --- a/go.mod +++ b/go.mod @@ -36,6 +36,9 @@ require ( k8s.io/kubectl v0.33.1 k8s.io/utils v0.0.0-20250502105355-0f33e8f1c979 sigs.k8s.io/controller-runtime v0.21.0 + sigs.k8s.io/gateway-api v1.3.0 + sigs.k8s.io/structured-merge-diff/v4 v4.7.0 + sigs.k8s.io/controller-runtime v0.21.0 sigs.k8s.io/yaml v1.4.0 ) @@ -73,7 +76,7 @@ require ( github.com/liggitt/tabwriter v0.0.0-20181228230101-89fcab3d43de // indirect github.com/mailru/easyjson v0.9.0 // indirect github.com/mattn/go-colorable v0.1.13 // indirect - github.com/mattn/go-isatty v0.0.19 // indirect + github.com/mattn/go-isatty v0.0.20 // indirect github.com/mitchellh/go-wordwrap v1.0.1 // indirect github.com/moby/spdystream v0.5.0 // indirect github.com/moby/term v0.5.0 // indirect @@ -97,7 +100,7 @@ require ( go.uber.org/zap v1.27.0 // indirect golang.org/x/net v0.38.0 // indirect golang.org/x/oauth2 v0.27.0 // indirect - golang.org/x/sync v0.12.0 // indirect + golang.org/x/sync v0.13.0 // indirect golang.org/x/sys v0.32.0 // indirect golang.org/x/term v0.30.0 // indirect golang.org/x/text v0.23.0 // indirect diff --git a/go.sum b/go.sum index dddab9f7e86..7b4c83540b9 100644 --- a/go.sum +++ b/go.sum @@ -139,7 +139,6 @@ github.com/mailru/easyjson v0.9.0/go.mod h1:1+xMtQp2MRNVL/V1bOzuP3aP8VNwRW55fQUt github.com/mattn/go-colorable v0.1.13 h1:fFA4WZxdEF4tXPZVKMLwD8oUnCTTo08duU7wxecdEvA= github.com/mattn/go-colorable v0.1.13/go.mod h1:7S9/ev0klgBDR4GtXTXX8a3vIGJpMovkB8vQcUbaXHg= github.com/mattn/go-isatty v0.0.16/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM= -github.com/mattn/go-isatty v0.0.19 h1:JITubQf0MOLdlGRuRq+jtsDlekdYPia9ZFsB8h/APPA= github.com/mattn/go-isatty v0.0.19/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y= github.com/mitchellh/go-wordwrap v1.0.1 h1:TLuKupo69TCn6TQSyGxwI1EblZZEsQ0vMlAFQflz0v0= github.com/mitchellh/go-wordwrap v1.0.1/go.mod h1:R62XHJLzvMFRBbcrT7m7WgmE1eOyTSsCt+hzestvNj0= @@ -263,8 +262,8 @@ golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLL golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= golang.org/x/net v0.0.0-20210405180319-a5a99cb37ef4/go.mod h1:p54w0d4576C0XHj96bSt6lcn1PtDYWL6XObtHCRCNQM= -golang.org/x/net v0.38.0 h1:vRMAPTMaeGqVhG5QyLJHqNDwecKTomGeqbnfZyKlBI8= -golang.org/x/net v0.38.0/go.mod h1:ivrbrMbzFq5J41QOQh0siUuly180yBYtLp+CKbEaFx8= +golang.org/x/net v0.39.0 h1:ZCu7HMWDxpXpaiKdhzIfaltL9Lp31x/3fCP11bc6/fY= +golang.org/x/net v0.39.0/go.mod h1:X7NRbYVEA+ewNkCNyJ513WmMdQ3BineSwVtN2zD/d+E= golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= golang.org/x/oauth2 v0.27.0 h1:da9Vo7/tDv5RH/7nZDz1eMGS/q1Vv1N/7FCrBhI9I3M= golang.org/x/oauth2 v0.27.0/go.mod 
h1:onh5ek6nERTohokkhCD/y2cV4Do3fxFHFuAejCkRWT8= @@ -274,8 +273,8 @@ golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJ golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20210220032951-036812b2e83c/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.12.0 h1:MHc5BpPuC30uJk597Ri8TV3CNZcTLu6B6z4lJy+g6Jw= -golang.org/x/sync v0.12.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA= +golang.org/x/sync v0.13.0 h1:AauUjRAJ9OSnvULf/ARrrVywoJDy0YS2AwQ98I37610= +golang.org/x/sync v0.13.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA= golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= @@ -292,8 +291,8 @@ golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.32.0 h1:s77OFDvIQeibCmezSnk/q6iAfkdiQaJi4VzroCFrN20= golang.org/x/sys v0.32.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= -golang.org/x/term v0.30.0 h1:PQ39fJZ+mfadBm0y5WlL4vlM7Sx1Hgf13sMIY2+QS9Y= -golang.org/x/term v0.30.0/go.mod h1:NYYFdzHoI5wRh/h5tDMdMqCqPJZEuNqVR5xJLd/n67g= +golang.org/x/term v0.31.0 h1:erwDkOK1Msy6offm1mOgvspSkslFnIGsFnxOKoufg3o= +golang.org/x/term v0.31.0/go.mod h1:R4BeIy7D95HzImkxGkTW1UQTtP54tio2RyHz7PwK0aw= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.23.0 h1:D71I7dUrlY+VX0gQShAThNGHFxZ13dGLBHQLVl1mJlY= diff --git a/ray-operator/controllers/ray/utils/util_test.go b/ray-operator/controllers/ray/utils/util_test.go index c6e47944207..2d87c12ac46 100644 --- a/ray-operator/controllers/ray/utils/util_test.go +++ b/ray-operator/controllers/ray/utils/util_test.go @@ -1479,77 +1479,6 @@ func TestGetRayServiceIncrementalUpgradeOptions(t *testing.T) { } } -// func TestGetGatewayListenersForServeService(t *testing.T) { -// tests := []struct { -// name string -// serveService *corev1.Service -// expectedListeners []gwv1.Listener -// }{ -// { -// name: "Return listeners for empty Serve Service", -// serveService: &corev1.Service{}, -// expectedListeners: []gwv1.Listener{}, -// }, -// { -// name: "Return listener for valid Serve Service with single ports", -// serveService: &corev1.Service{ -// ObjectMeta: metav1.ObjectMeta{ -// Name: "serve-service", -// }, -// Spec: corev1.ServiceSpec{ -// Ports: []corev1.ServicePort{{Port: 8000}}, -// }, -// }, -// expectedListeners: []gwv1.Listener{ -// { -// Name: "serve-service-listener", -// Protocol: gwv1.HTTPProtocolType, -// Port: 8000, -// }, -// }, -// }, -// { -// name: "Return listeners for valid Serve Service with multiple ports", -// serveService: &corev1.Service{ -// ObjectMeta: metav1.ObjectMeta{ -// Name: "serve-service", -// }, -// Spec: corev1.ServiceSpec{ -// Ports: []corev1.ServicePort{ -// { -// Name: "default-port", -// Port: 8000, -// }, -// { -// Name: "some-other-port", -// Port: 8500, -// }, -// }, -// }, -// }, -// expectedListeners: []gwv1.Listener{ -// { -// 
Name: "serve-service-default-port-listener", -// Protocol: gwv1.HTTPProtocolType, -// Port: 8000, -// }, -// { -// Name: "serve-service-some-other-port-listener", -// Protocol: gwv1.HTTPProtocolType, -// Port: 8500, -// }, -// }, -// }, -// } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - listeners := GetGatewayListenersForServeService(tt.serveService) - assert.Equal(t, tt.expectedListeners, listeners) - }) - } -} - func TestGetContainerCommand(t *testing.T) { tests := []struct { name string diff --git a/ray-operator/go.mod b/ray-operator/go.mod index 94d155da29f..78f3870ae24 100644 --- a/ray-operator/go.mod +++ b/ray-operator/go.mod @@ -4,22 +4,21 @@ go 1.24.0 require ( github.com/Masterminds/semver/v3 v3.3.1 + github.com/coder/websocket v1.8.13 github.com/go-logr/logr v1.4.3 github.com/go-logr/zapr v1.3.0 - github.com/google/go-cmp v0.7.0 github.com/jarcoal/httpmock v1.4.0 github.com/onsi/ginkgo/v2 v2.23.4 github.com/onsi/gomega v1.37.0 github.com/openshift/api v0.0.0-20250602203052-b29811a290c7 github.com/orcaman/concurrent-map/v2 v2.0.1 - github.com/pkg/errors v0.9.1 github.com/prometheus/client_golang v1.22.0 + github.com/spf13/pflag v1.0.6 github.com/stretchr/testify v1.10.0 go.uber.org/mock v0.5.2 go.uber.org/zap v1.27.0 gopkg.in/natefinch/lumberjack.v2 v2.2.1 k8s.io/api v0.33.1 - k8s.io/apiextensions-apiserver v0.33.1 k8s.io/apimachinery v0.33.1 k8s.io/apiserver v0.33.1 k8s.io/client-go v0.33.1 @@ -28,6 +27,7 @@ require ( k8s.io/klog/v2 v2.130.1 k8s.io/utils v0.0.0-20250502105355-0f33e8f1c979 sigs.k8s.io/controller-runtime v0.21.0 + sigs.k8s.io/gateway-api v1.3.0 sigs.k8s.io/scheduler-plugins v0.31.8 sigs.k8s.io/structured-merge-diff/v4 v4.7.0 sigs.k8s.io/yaml v1.4.0 @@ -38,19 +38,19 @@ require ( github.com/beorn7/perks v1.0.1 // indirect github.com/blang/semver/v4 v4.0.0 // indirect github.com/cespare/xxhash/v2 v2.3.0 // indirect - github.com/coder/websocket v1.8.13 // indirect github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect - github.com/emicklei/go-restful/v3 v3.11.0 // indirect + github.com/emicklei/go-restful/v3 v3.12.0 // indirect github.com/evanphx/json-patch/v5 v5.9.11 // indirect github.com/fsnotify/fsnotify v1.7.0 // indirect github.com/fxamacker/cbor/v2 v2.7.0 // indirect github.com/go-openapi/jsonpointer v0.21.0 // indirect - github.com/go-openapi/jsonreference v0.20.2 // indirect + github.com/go-openapi/jsonreference v0.21.0 // indirect github.com/go-openapi/swag v0.23.0 // indirect github.com/go-task/slim-sprig/v3 v3.0.0 // indirect github.com/gogo/protobuf v1.3.2 // indirect github.com/google/btree v1.1.3 // indirect github.com/google/gnostic-models v0.6.9 // indirect + github.com/google/go-cmp v0.7.0 // indirect github.com/google/pprof v0.0.0-20250403155104-27863c87afa6 // indirect github.com/google/uuid v1.6.0 // indirect github.com/gorilla/websocket v1.5.4-0.20250319132907-e064f32e3674 // indirect @@ -62,11 +62,11 @@ require ( github.com/modern-go/reflect2 v1.0.2 // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f // indirect + github.com/pkg/errors v0.9.1 // indirect github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect github.com/prometheus/client_model v0.6.1 // indirect github.com/prometheus/common v0.62.0 // indirect github.com/prometheus/procfs v0.15.1 // indirect - github.com/spf13/pflag v1.0.5 // indirect github.com/stretchr/objx v0.5.2 // indirect github.com/x448/float16 v0.8.4 // 
indirect go.opentelemetry.io/otel v1.33.0 // indirect @@ -74,19 +74,20 @@ require ( go.uber.org/automaxprocs v1.6.0 // indirect go.uber.org/multierr v1.11.0 // indirect golang.org/x/mod v0.24.0 // indirect - golang.org/x/net v0.38.0 // indirect + golang.org/x/net v0.39.0 // indirect golang.org/x/oauth2 v0.27.0 // indirect - golang.org/x/sync v0.12.0 // indirect + golang.org/x/sync v0.13.0 // indirect golang.org/x/sys v0.32.0 // indirect - golang.org/x/term v0.30.0 // indirect - golang.org/x/text v0.23.0 // indirect + golang.org/x/term v0.31.0 // indirect + golang.org/x/text v0.24.0 // indirect golang.org/x/time v0.9.0 // indirect golang.org/x/tools v0.31.0 // indirect gomodules.xyz/jsonpatch/v2 v2.4.0 // indirect - google.golang.org/protobuf v1.36.5 // indirect + google.golang.org/protobuf v1.36.6 // indirect gopkg.in/evanphx/json-patch.v4 v4.12.0 // indirect gopkg.in/inf.v0 v0.9.1 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect + k8s.io/apiextensions-apiserver v0.33.1 // indirect k8s.io/gengo/v2 v2.0.0-20250207200755-1244d31929d7 // indirect k8s.io/kube-openapi v0.0.0-20250318190949-c8a335a9a2ff // indirect sigs.k8s.io/json v0.0.0-20241010143419-9aa6b5e7a4b3 // indirect diff --git a/ray-operator/go.sum b/ray-operator/go.sum index 6d6e0b27493..2d1825ab836 100644 --- a/ray-operator/go.sum +++ b/ray-operator/go.sum @@ -10,13 +10,12 @@ github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UF github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= github.com/coder/websocket v1.8.13 h1:f3QZdXy7uGVz+4uCJy2nTZyM0yTBj8yANEHhqlXZ9FE= github.com/coder/websocket v1.8.13/go.mod h1:LNVeNrXQZfe5qhS9ALED3uA+l5pPqvwXg3CKoDBB2gs= -github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM= github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= -github.com/emicklei/go-restful/v3 v3.11.0 h1:rAQeMHw1c7zTmncogyy8VvRZwtkmkZ4FxERmMY4rD+g= -github.com/emicklei/go-restful/v3 v3.11.0/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc= +github.com/emicklei/go-restful/v3 v3.12.0 h1:y2DdzBAURM29NFF94q6RaY4vjIH1rtwDapwQtU84iWk= +github.com/emicklei/go-restful/v3 v3.12.0/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc= github.com/evanphx/json-patch v5.6.0+incompatible h1:jBYDEEiFBPxA0v50tFdvOzQQTCvpL6mnFh5mB2/l16U= github.com/evanphx/json-patch v5.6.0+incompatible/go.mod h1:50XU6AFN0ol/bzJsmQLiYLvXMP4fmwYFNcr97nuDLSk= github.com/evanphx/json-patch/v5 v5.9.11 h1:/8HVnzMq13/3x9TPvjG08wUGqBTmZBsCWzjTM0wiaDU= @@ -29,12 +28,10 @@ github.com/go-logr/logr v1.4.3 h1:CjnDlHq8ikf6E492q6eKboGOC0T8CDaOvkHCIg8idEI= github.com/go-logr/logr v1.4.3/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= github.com/go-logr/zapr v1.3.0 h1:XGdV8XW8zdwFiwOA2Dryh1gj2KRQyOOoNmBy4EplIcQ= github.com/go-logr/zapr v1.3.0/go.mod h1:YKepepNBd1u/oyhd/yQmtjVXmm9uML4IXUgMOwR8/Gg= -github.com/go-openapi/jsonpointer v0.19.6/go.mod h1:osyAmYz/mB/C3I+WsTTSgw1ONzaLJoLCyoi6/zppojs= github.com/go-openapi/jsonpointer v0.21.0 h1:YgdVicSA9vH5RiHs9TZW5oyafXZFc6+2Vc1rr/O9oNQ= github.com/go-openapi/jsonpointer v0.21.0/go.mod h1:IUyH9l/+uyhIYQ/PXVA41Rexl+kOkAPDdXEYns6fzUY= -github.com/go-openapi/jsonreference v0.20.2 
h1:3sVjiK66+uXK/6oQ8xgcRKcFgQ5KXa2KvnJRumpMGbE= -github.com/go-openapi/jsonreference v0.20.2/go.mod h1:Bl1zwGIM8/wsvqjsOQLJ/SH+En5Ap4rVB5KVcIDZG2k= -github.com/go-openapi/swag v0.22.3/go.mod h1:UzaqsxGiab7freDnrUUra0MwWfN/q7tE4j+VcZ0yl14= +github.com/go-openapi/jsonreference v0.21.0 h1:Rs+Y7hSXT83Jacb7kFyjn4ijOuVGSvOdF2+tg1TRrwQ= +github.com/go-openapi/jsonreference v0.21.0/go.mod h1:LmZmgsrTkVg9LG4EaHeY8cBDslNPMo06cago5JNLkm4= github.com/go-openapi/swag v0.23.0 h1:vsEVJDUo2hPJ2tu0/Xc+4noaxyEffXNIs3cOULZ+GrE= github.com/go-openapi/swag v0.23.0/go.mod h1:esZ8ITTYEsH1V2trKHjAN8Ai7xHb8RV+YSZ577vPjgQ= github.com/go-task/slim-sprig/v3 v3.0.0 h1:sUs3vkvUymDpBKi3qH1YSqBQk9+9D/8M2mN1vB6EwHI= @@ -67,11 +64,8 @@ github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= github.com/klauspost/compress v1.18.0 h1:c/Cqfb0r+Yi+JtIEq73FWXVkRonBlf0CRNYc8Zttxdo= github.com/klauspost/compress v1.18.0/go.mod h1:2Pp+KzxcywXVXMr50+X0Q/Lsb43OQHYWRCY2AiWywWQ= -github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI= github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= -github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= -github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc= @@ -116,17 +110,12 @@ github.com/prometheus/procfs v0.15.1 h1:YagwOFzUgYfKKHX6Dr+sHT7km/hxC76UB0leargg github.com/prometheus/procfs v0.15.1/go.mod h1:fB45yRUv8NstnjriLhBQLuOUt+WW4BsoGhij/e3PBqk= github.com/rogpeppe/go-internal v1.13.1 h1:KvO1DLK/DRN07sQ1LQKScxyZJuNnedQ5/wKSR38lUII= github.com/rogpeppe/go-internal v1.13.1/go.mod h1:uMEvuHeurkdAXX61udpOXGD/AzZDWNMNyH2VO9fmH0o= -github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA= -github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= +github.com/spf13/pflag v1.0.6 h1:jFzHGLGAlb3ruxLB8MhbI6A8+AQX/2eW4qeyNZXNp2o= +github.com/spf13/pflag v1.0.6/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= -github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= -github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= github.com/stretchr/objx v0.5.2 h1:xuMeJ0Sdp5ZMRXx/aWO6RZxdr3beISkG5/G/aIRr3pY= github.com/stretchr/objx v0.5.2/go.mod h1:FRsXN1f5AsAjCGJKqEizvkpNtU+EGNCLh3NxZ/8L+MA= github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= -github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= -github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= -github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA= github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= github.com/x448/float16 v0.8.4 h1:qLwI1I70+NjRFUR3zs1JPUCgaCXSh3SW62uAKT1mSBM= @@ -158,26 +147,26 @@ golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod 
h1:t9HGtf8HONx5eT2rtn golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= -golang.org/x/net v0.38.0 h1:vRMAPTMaeGqVhG5QyLJHqNDwecKTomGeqbnfZyKlBI8= -golang.org/x/net v0.38.0/go.mod h1:ivrbrMbzFq5J41QOQh0siUuly180yBYtLp+CKbEaFx8= +golang.org/x/net v0.39.0 h1:ZCu7HMWDxpXpaiKdhzIfaltL9Lp31x/3fCP11bc6/fY= +golang.org/x/net v0.39.0/go.mod h1:X7NRbYVEA+ewNkCNyJ513WmMdQ3BineSwVtN2zD/d+E= golang.org/x/oauth2 v0.27.0 h1:da9Vo7/tDv5RH/7nZDz1eMGS/q1Vv1N/7FCrBhI9I3M= golang.org/x/oauth2 v0.27.0/go.mod h1:onh5ek6nERTohokkhCD/y2cV4Do3fxFHFuAejCkRWT8= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.12.0 h1:MHc5BpPuC30uJk597Ri8TV3CNZcTLu6B6z4lJy+g6Jw= -golang.org/x/sync v0.12.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA= +golang.org/x/sync v0.13.0 h1:AauUjRAJ9OSnvULf/ARrrVywoJDy0YS2AwQ98I37610= +golang.org/x/sync v0.13.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.32.0 h1:s77OFDvIQeibCmezSnk/q6iAfkdiQaJi4VzroCFrN20= golang.org/x/sys v0.32.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= -golang.org/x/term v0.30.0 h1:PQ39fJZ+mfadBm0y5WlL4vlM7Sx1Hgf13sMIY2+QS9Y= -golang.org/x/term v0.30.0/go.mod h1:NYYFdzHoI5wRh/h5tDMdMqCqPJZEuNqVR5xJLd/n67g= +golang.org/x/term v0.31.0 h1:erwDkOK1Msy6offm1mOgvspSkslFnIGsFnxOKoufg3o= +golang.org/x/term v0.31.0/go.mod h1:R4BeIy7D95HzImkxGkTW1UQTtP54tio2RyHz7PwK0aw= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= -golang.org/x/text v0.23.0 h1:D71I7dUrlY+VX0gQShAThNGHFxZ13dGLBHQLVl1mJlY= -golang.org/x/text v0.23.0/go.mod h1:/BLNzu4aZCJ1+kcD0DNRotWKage4q2rGVAg4o22unh4= +golang.org/x/text v0.24.0 h1:dd5Bzh4yt5KYA8f9CJHCP4FB4D51c2c6JvN37xJJkJ0= +golang.org/x/text v0.24.0/go.mod h1:L8rBsPeo2pSS+xqN0d5u2ikmjtmoJbDBT1b7nHvFCdU= golang.org/x/time v0.9.0 h1:EsRrnYcQiGH+5FfbgvV4AP7qEZstoyrHB0DzarOQ4ZY= golang.org/x/time v0.9.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= @@ -192,8 +181,8 @@ golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8T golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= gomodules.xyz/jsonpatch/v2 v2.4.0 h1:Ci3iUJyx9UeRx7CeFN8ARgGbkESwJK+KB9lLcWxY/Zw= gomodules.xyz/jsonpatch/v2 v2.4.0/go.mod h1:AH3dM2RI6uoBZxn3LVrfvJ3E0/9dG4cSrbuBJT4moAY= -google.golang.org/protobuf v1.36.5 h1:tPhr+woSbjfYvY6/GPufUoYizxw1cF/yFoxJ2fmpwlM= -google.golang.org/protobuf v1.36.5/go.mod h1:9fA7Ob0pmnwhb644+1+CVWFRbNajQ6iRojtC/QF5bRE= 
+google.golang.org/protobuf v1.36.6 h1:z1NpPI8ku2WgiWnf+t9wTPsn6eP1L7ksHUlkfLvd9xY= +google.golang.org/protobuf v1.36.6/go.mod h1:jduwjTPXsFjZGTmRluh+L6NjiWu7pchiJ2/5YcXBHnY= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= @@ -203,7 +192,6 @@ gopkg.in/inf.v0 v0.9.1 h1:73M5CoZyi3ZLMOyDlQh031Cx6N9NDJ2Vvfl76EDAgDc= gopkg.in/inf.v0 v0.9.1/go.mod h1:cWUDdTG/fYaXco+Dcufb5Vnc6Gp2YChqWtbxRZE0mXw= gopkg.in/natefinch/lumberjack.v2 v2.2.1 h1:bBRl1b0OH9s/DuPhuXpNl+VtCaJXFZ5/uEFST95x9zc= gopkg.in/natefinch/lumberjack.v2 v2.2.1/go.mod h1:YD8tP3GAjkrDg1eZH7EGmyESg/lsYskCTPBJVb9jqSc= -gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= k8s.io/api v0.33.1 h1:tA6Cf3bHnLIrUK4IqEgb2v++/GYUtqiu9sRVk3iBXyw= @@ -230,6 +218,8 @@ k8s.io/utils v0.0.0-20250502105355-0f33e8f1c979 h1:jgJW5IePPXLGB8e/1wvd0Ich9QE97 k8s.io/utils v0.0.0-20250502105355-0f33e8f1c979/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= sigs.k8s.io/controller-runtime v0.21.0 h1:CYfjpEuicjUecRk+KAeyYh+ouUBn4llGyDYytIGcJS8= sigs.k8s.io/controller-runtime v0.21.0/go.mod h1:OSg14+F65eWqIu4DceX7k/+QRAbTTvxeQSNSOQpukWM= +sigs.k8s.io/gateway-api v1.3.0 h1:q6okN+/UKDATola4JY7zXzx40WO4VISk7i9DIfOvr9M= +sigs.k8s.io/gateway-api v1.3.0/go.mod h1:d8NV8nJbaRbEKem+5IuxkL8gJGOZ+FJ+NvOIltV8gDk= sigs.k8s.io/json v0.0.0-20241010143419-9aa6b5e7a4b3 h1:/Rv+M11QRah1itp8VhT6HoVx1Ray9eB4DBr+K+/sCJ8= sigs.k8s.io/json v0.0.0-20241010143419-9aa6b5e7a4b3/go.mod h1:18nIHnGi6636UCz6m8i4DhaJ65T6EruyzmoQqI2BVDo= sigs.k8s.io/randfill v0.0.0-20250304075658-069ef1bbf016/go.mod h1:XeLlZ/jmk4i1HRopwe7/aU3H5n1zNUcX6TM94b3QxOY= From 21bc32ab9913215dcd99d74ae6cd680676096e63 Mon Sep 17 00:00:00 2001 From: Ryan O'Leary Date: Wed, 4 Jun 2025 05:23:34 +0000 Subject: [PATCH 04/56] Manually sync rbac for gateway Signed-off-by: Ryan O'Leary --- helm-chart/kuberay-operator/templates/_helpers.tpl | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/helm-chart/kuberay-operator/templates/_helpers.tpl b/helm-chart/kuberay-operator/templates/_helpers.tpl index 5d14510a61b..a827c92ad4b 100644 --- a/helm-chart/kuberay-operator/templates/_helpers.tpl +++ b/helm-chart/kuberay-operator/templates/_helpers.tpl @@ -222,6 +222,19 @@ rules: - patch - update - watch +- apiGroups: + - gateway.networking.k8s.io + resources: + - gateways + - httproutes + verbs: + - create + - delete + - get + - list + - patch + - update + - watch - apiGroups: - networking.k8s.io resources: From f66b3a3f672e2cd1c34186e051a8a6ab1ad63f8c Mon Sep 17 00:00:00 2001 From: Ryan O'Leary Date: Wed, 4 Jun 2025 12:39:00 +0000 Subject: [PATCH 05/56] Fix bugs and e2e test Signed-off-by: Ryan O'Leary --- ray-operator/apis/ray/v1/rayservice_types.go | 59 ++++++-- .../config/crd/bases/ray.io_rayservices.yaml | 3 + .../controllers/ray/rayservice_controller.go | 140 +++++++++++++++++- .../ray/rayservice_controller_unit_test.go | 11 +- .../controllers/ray/utils/consistency.go | 14 +- ray-operator/main.go | 2 +- .../rayservice_incremental_upgrade_test.go | 129 ++++++++++------ .../test/e2eincrementalupgrade/support.go | 25 +--- 8 files changed, 281 insertions(+), 
102 deletions(-) diff --git a/ray-operator/apis/ray/v1/rayservice_types.go b/ray-operator/apis/ray/v1/rayservice_types.go index 09d812833b0..2f8fdab4f3b 100644 --- a/ray-operator/apis/ray/v1/rayservice_types.go +++ b/ray-operator/apis/ray/v1/rayservice_types.go @@ -117,22 +117,50 @@ type RayServiceSpec struct { // RayServiceStatuses defines the observed state of RayService type RayServiceStatuses struct { - LastUpdateTime *metav1.Time `json:"lastUpdateTime,omitempty"` - ServiceStatus ServiceStatus `json:"serviceStatus,omitempty"` - Conditions []metav1.Condition `json:"conditions,omitempty" patchStrategy:"merge" patchMergeKey:"type"` - ActiveServiceStatus RayServiceStatus `json:"activeServiceStatus,omitempty"` - PendingServiceStatus RayServiceStatus `json:"pendingServiceStatus,omitempty"` - ObservedGeneration int64 `json:"observedGeneration,omitempty"` - NumServeEndpoints int32 `json:"numServeEndpoints,omitempty"` + // Represents the latest available observations of a RayService's current state. + // +patchMergeKey=type + // +patchStrategy=merge + // +listType=map + // +listMapKey=type + // +optional + Conditions []metav1.Condition `json:"conditions,omitempty" patchStrategy:"merge" patchMergeKey:"type"` + // LastUpdateTime represents the timestamp when the RayService status was last updated. + // +optional + LastUpdateTime *metav1.Time `json:"lastUpdateTime,omitempty"` + // Deprecated: `ServiceStatus` is deprecated - use `Conditions` instead. `Running` means the RayService is ready to + // serve requests. An empty `ServiceStatus` means the RayService is not ready to serve requests. The definition of + // `ServiceStatus` is equivalent to the `RayServiceReady` condition. + // +optional + ServiceStatus ServiceStatus `json:"serviceStatus,omitempty"` + // +optional + ActiveServiceStatus RayServiceStatus `json:"activeServiceStatus,omitempty"` + // Pending Service Status indicates a RayCluster will be created or is being created. + // +optional + PendingServiceStatus RayServiceStatus `json:"pendingServiceStatus,omitempty"` + // NumServeEndpoints indicates the number of Ray Pods that are actively serving or have been selected by the serve service. + // Ray Pods without a proxy actor or those that are unhealthy will not be counted. + // +optional + NumServeEndpoints int32 `json:"numServeEndpoints,omitempty"` + // observedGeneration is the most recent generation observed for this RayService. It corresponds to the + // RayService's generation, which is updated on mutation by the API Server. 
+ // +optional + ObservedGeneration int64 `json:"observedGeneration,omitempty"` } type RayServiceStatus struct { - Applications map[string]AppStatus `json:"applicationStatuses,omitempty"` - TargetCapacity *int32 `json:"targetCapacity,omitempty"` - TrafficRoutedPercent *int32 `json:"trafficRoutedPercent,omitempty"` - LastTrafficMigratedTime *metav1.Time `json:"lastTrafficMigratedTime,omitempty"` - RayClusterName string `json:"rayClusterName,omitempty"` - RayClusterStatus RayClusterStatus `json:"rayClusterStatus,omitempty"` + // Important: Run "make" to regenerate code after modifying this file + // +optional + Applications map[string]AppStatus `json:"applicationStatuses,omitempty"` + // +optional + TargetCapacity *int32 `json:"targetCapacity,omitempty"` + // +optional + TrafficRoutedPercent *int32 `json:"trafficRoutedPercent,omitempty"` + // +optional + LastTrafficMigratedTime *metav1.Time `json:"lastTrafficMigratedTime,omitempty"` + // +optional + RayClusterName string `json:"rayClusterName,omitempty"` + // +optional + RayClusterStatus RayClusterStatus `json:"rayClusterStatus,omitempty"` } type AppStatus struct { @@ -184,8 +212,9 @@ const ( type RayService struct { metav1.TypeMeta `json:",inline"` metav1.ObjectMeta `json:"metadata,omitempty"` - Spec RayServiceSpec `json:"spec,omitempty"` - Status RayServiceStatuses `json:"status,omitempty"` + Spec RayServiceSpec `json:"spec,omitempty"` + // +optional + Status RayServiceStatuses `json:"status,omitempty"` } //+kubebuilder:object:root=true diff --git a/ray-operator/config/crd/bases/ray.io_rayservices.yaml b/ray-operator/config/crd/bases/ray.io_rayservices.yaml index 73e38364781..9cd7c7cc10b 100644 --- a/ray-operator/config/crd/bases/ray.io_rayservices.yaml +++ b/ray-operator/config/crd/bases/ray.io_rayservices.yaml @@ -8452,6 +8452,9 @@ spec: - type type: object type: array + x-kubernetes-list-map-keys: + - type + x-kubernetes-list-type: map lastUpdateTime: format: date-time type: string diff --git a/ray-operator/controllers/ray/rayservice_controller.go b/ray-operator/controllers/ray/rayservice_controller.go index b8d380f71e2..e8c055bdef2 100644 --- a/ray-operator/controllers/ray/rayservice_controller.go +++ b/ray-operator/controllers/ray/rayservice_controller.go @@ -190,6 +190,11 @@ func (r *RayServiceReconciler) Reconcile(ctx context.Context, request ctrl.Reque if isActiveClusterReady, activeClusterServeApplications, err = r.reconcileServe(ctx, rayServiceInstance, activeRayClusterInstance); err != nil { return ctrl.Result{RequeueAfter: ServiceDefaultRequeueDuration}, err } + } else if activeRayClusterInstance != nil && utils.IsIncrementalUpgradeEnabled(&rayServiceInstance.Spec) { + logger.Info("Reconciling the Serve applications for active cluster during IncrementalUpgrade", "clusterName", activeRayClusterInstance.Name) + if isActiveClusterReady, activeClusterServeApplications, err = r.reconcileServe(ctx, rayServiceInstance, activeRayClusterInstance); err != nil { + return ctrl.Result{RequeueAfter: ServiceDefaultRequeueDuration}, err + } } // Reconcile K8s services and make sure it points to the correct RayCluster. @@ -389,6 +394,118 @@ func setCondition(rayServiceInstance *rayv1.RayService, conditionType rayv1.RayS meta.SetStatusCondition(&rayServiceInstance.Status.Conditions, condition) } +// Checks whether the old and new RayServiceStatus are inconsistent by comparing different fields. +// The RayClusterStatus field is only for observability in RayService CR, and changes to it will not trigger the status update. 
+func inconsistentRayServiceStatus(ctx context.Context, oldStatus rayv1.RayServiceStatus, newStatus rayv1.RayServiceStatus) bool { + logger := ctrl.LoggerFrom(ctx) + if oldStatus.RayClusterName != newStatus.RayClusterName { + logger.Info("inconsistentRayServiceStatus RayService RayClusterName", "oldRayClusterName", oldStatus.RayClusterName, "newRayClusterName", newStatus.RayClusterName) + return true + } + + if len(oldStatus.Applications) != len(newStatus.Applications) { + return true + } + + var ok bool + for appName, newAppStatus := range newStatus.Applications { + var oldAppStatus rayv1.AppStatus + if oldAppStatus, ok = oldStatus.Applications[appName]; !ok { + logger.Info("inconsistentRayServiceStatus RayService new application found", "appName", appName) + return true + } + + if oldAppStatus.Status != newAppStatus.Status { + logger.Info("inconsistentRayServiceStatus RayService application status changed", "appName", appName, "oldStatus", oldAppStatus.Status, "newStatus", newAppStatus.Status) + return true + } else if oldAppStatus.Message != newAppStatus.Message { + logger.Info("inconsistentRayServiceStatus RayService application status message changed", "appName", appName, "oldStatus", oldAppStatus.Message, "newStatus", newAppStatus.Message) + return true + } + + if len(oldAppStatus.Deployments) != len(newAppStatus.Deployments) { + return true + } + + for deploymentName, newDeploymentStatus := range newAppStatus.Deployments { + var oldDeploymentStatus rayv1.ServeDeploymentStatus + if oldDeploymentStatus, ok = oldAppStatus.Deployments[deploymentName]; !ok { + logger.Info("inconsistentRayServiceStatus RayService new deployment found in application", "deploymentName", deploymentName, "appName", appName) + return true + } + + if oldDeploymentStatus.Status != newDeploymentStatus.Status { + logger.Info("inconsistentRayServiceStatus RayService DeploymentStatus changed", "oldDeploymentStatus", oldDeploymentStatus.Status, "newDeploymentStatus", newDeploymentStatus.Status) + return true + } else if oldDeploymentStatus.Message != newDeploymentStatus.Message { + logger.Info("inconsistentRayServiceStatus RayService deployment status message changed", "oldDeploymentStatus", oldDeploymentStatus.Message, "newDeploymentStatus", newDeploymentStatus.Message) + return true + } + } + } + + if features.Enabled(features.RayServiceIncrementalUpgrade) { + // Also check for changes in IncrementalUpgrade related Status fields. 
+ if (oldStatus.TrafficRoutedPercent == nil) != (newStatus.TrafficRoutedPercent == nil) || + (oldStatus.TrafficRoutedPercent != nil && newStatus.TrafficRoutedPercent != nil && + *oldStatus.TrafficRoutedPercent != *newStatus.TrafficRoutedPercent) { + logger.Info("inconsistentRayServiceStatus RayService updated TrafficRoutedPercent", + "old TrafficRoutedPercent", oldStatus.TrafficRoutedPercent, + "new TrafficRoutedPercent", newStatus.TrafficRoutedPercent) + return true + } + if (oldStatus.TargetCapacity == nil) != (newStatus.TargetCapacity == nil) || + (oldStatus.TargetCapacity != nil && newStatus.TargetCapacity != nil && + *oldStatus.TargetCapacity != *newStatus.TargetCapacity) { + logger.Info("inconsistentRayServiceStatus RayService updated TargetCapacity", + "old TargetCapacity", oldStatus.TargetCapacity, + "new TargetCapacity", newStatus.TargetCapacity) + return true + } + if (oldStatus.LastTrafficMigratedTime == nil) != (newStatus.LastTrafficMigratedTime == nil) || + (oldStatus.LastTrafficMigratedTime != nil && newStatus.LastTrafficMigratedTime != nil && + !oldStatus.LastTrafficMigratedTime.Equal(newStatus.LastTrafficMigratedTime)) { + logger.Info("inconsistentRayServiceStatus RayService updated LastTrafficMigratedTime", + "old LastTrafficMigratedTime", oldStatus.LastTrafficMigratedTime, + "new LastTrafficMigratedTime", newStatus.LastTrafficMigratedTime) + return true + } + } + + return false +} + +// Determine whether to update the status of the RayService instance. +func inconsistentRayServiceStatuses(ctx context.Context, oldStatus rayv1.RayServiceStatuses, newStatus rayv1.RayServiceStatuses) bool { + logger := ctrl.LoggerFrom(ctx) + if oldStatus.ServiceStatus != newStatus.ServiceStatus { + logger.Info("inconsistentRayServiceStatus RayService ServiceStatus changed", "oldServiceStatus", oldStatus.ServiceStatus, "newServiceStatus", newStatus.ServiceStatus) + return true + } + + if oldStatus.NumServeEndpoints != newStatus.NumServeEndpoints { + logger.Info("inconsistentRayServiceStatus RayService NumServeEndpoints changed", "oldNumServeEndpoints", oldStatus.NumServeEndpoints, "newNumServeEndpoints", newStatus.NumServeEndpoints) + return true + } + + if !reflect.DeepEqual(oldStatus.Conditions, newStatus.Conditions) { + logger.Info("inconsistentRayServiceStatus RayService Conditions changed") + return true + } + + if inconsistentRayServiceStatus(ctx, oldStatus.ActiveServiceStatus, newStatus.ActiveServiceStatus) { + logger.Info("inconsistentRayServiceStatus RayService ActiveServiceStatus changed") + return true + } + + if inconsistentRayServiceStatus(ctx, oldStatus.PendingServiceStatus, newStatus.PendingServiceStatus) { + logger.Info("inconsistentRayServiceStatus RayService PendingServiceStatus changed") + return true + } + + return false +} + // SetupWithManager sets up the controller with the Manager. func (r *RayServiceReconciler) SetupWithManager(mgr ctrl.Manager, reconcileConcurrency int) error { return ctrl.NewControllerManagedBy(mgr). @@ -1156,9 +1273,9 @@ func (r *RayServiceReconciler) checkIfNeedIncrementalUpgradeUpdate(ctx context.C return true, "Active RayCluster TargetCapacity has not finished scaling down." } -// updateServeTargetCapacity reconcile the target_capacity of the ServeConfig for a given RayCluster during +// reconcileServeTargetCapacity reconciles the target_capacity of the ServeConfig for a given RayCluster during // an IncrementalUpgrade while also updating the Status.TargetCapacity of the Active and Pending RayServices. 
-func (r *RayServiceReconciler) reconcileServeTargetCapacity(ctx context.Context, rayServiceInstance *rayv1.RayService, rayDashboardClient utils.RayDashboardClientInterface) error { +func (r *RayServiceReconciler) reconcileServeTargetCapacity(ctx context.Context, rayServiceInstance *rayv1.RayService, rayClusterInstance *rayv1.RayCluster, rayDashboardClient utils.RayDashboardClientInterface) error { logger := ctrl.LoggerFrom(ctx) logger.Info("reconcileServeTargetCapacity", "RayService", rayServiceInstance.Name) @@ -1200,18 +1317,26 @@ func (r *RayServiceReconciler) reconcileServeTargetCapacity(ctx context.Context, // scaled up traffic and the active RayCluster can be scaled down by MaxSurgePercent. // 2. The total target_capacity is equal to 100. This means the pending RayCluster can // increase its target_capacity by MaxSurgePercent. + // If the rayClusterInstance passed into this function is not the cluster to update based + // on the above conditions, we return without doing anything. var clusterName string var goalTargetCapacity int32 if activeTargetCapacity+pendingTargetCapacity > int32(100) { // Scale down the Active RayCluster TargetCapacity on this iteration. goalTargetCapacity = max(int32(0), activeTargetCapacity-maxSurgePercent) clusterName = activeRayServiceStatus.RayClusterName + if clusterName != rayClusterInstance.Name { + return nil + } activeRayServiceStatus.TargetCapacity = ptr.To(goalTargetCapacity) logger.Info("Setting target_capacity for active Raycluster", "Raycluster", clusterName, "target_capacity", goalTargetCapacity) } else { // Scale up the Pending RayCluster TargetCapacity on this iteration. goalTargetCapacity = min(int32(100), pendingTargetCapacity+maxSurgePercent) clusterName = pendingRayServiceStatus.RayClusterName + if clusterName != rayClusterInstance.Name { + return nil + } pendingRayServiceStatus.TargetCapacity = ptr.To(goalTargetCapacity) logger.Info("Setting target_capacity for pending Raycluster", "Raycluster", clusterName, "target_capacity", goalTargetCapacity) } @@ -1230,6 +1355,7 @@ func (r *RayServiceReconciler) reconcileServeTargetCapacity(ctx context.Context, // Check if ServeConfig requires update if currentTargetCapacity, ok := serveConfig["target_capacity"].(float64); ok { if int32(currentTargetCapacity) == goalTargetCapacity { + logger.Info("target_capacity already updated on RayCluster", "RayCluster", clusterName, "target_capacity", currentTargetCapacity) // No update required, return early return nil } @@ -1457,15 +1583,17 @@ func (r *RayServiceReconciler) reconcileServe(ctx context.Context, rayServiceIns incrementalUpgradeUpdate, reason := r.checkIfNeedIncrementalUpgradeUpdate(ctx, rayServiceInstance) logger.Info("checkIfNeedIncrementalUpgradeUpdate", "incrementalUpgradeUpdate", incrementalUpgradeUpdate, "reason", reason) if incrementalUpgradeUpdate { - if err := r.reconcileServeTargetCapacity(ctx, rayServiceInstance, rayDashboardClient); err != nil { - r.Recorder.Eventf(rayServiceInstance, corev1.EventTypeWarning, string(utils.FailedToUpdateTargetCapacity), "Failed to update target_capacity of serve applications to the RayService %s/%s: %v", rayServiceInstance.Namespace, rayServiceInstance.Name, err) + if err := r.reconcileServeTargetCapacity(ctx, rayServiceInstance, rayClusterInstance, rayDashboardClient); err != nil { + r.Recorder.Eventf(rayServiceInstance, corev1.EventTypeWarning, string(utils.FailedToUpdateTargetCapacity), "Failed to update target_capacity of serve applications to the RayCluster %s/%s: %v", rayClusterInstance.Namespace, 
rayClusterInstance.Name, err) return false, serveApplications, err } r.Recorder.Eventf(rayServiceInstance, corev1.EventTypeNormal, string(utils.UpdatedServeTargetCapacity), - "Updated target_capacity of serve applications to the RayService %s/%s", rayServiceInstance.Namespace, rayServiceInstance.Name) + "Updated target_capacity of serve applications to the RayCluster %s/%s", rayClusterInstance.Namespace, rayClusterInstance.Name) // Don't switch to the pending RayCluster until IncrementalUpgrade is complete. - return false, serveApplications, nil + if rayServiceInstance.Status.PendingServiceStatus.RayClusterName == rayClusterInstance.Name { + return false, serveApplications, nil + } } } diff --git a/ray-operator/controllers/ray/rayservice_controller_unit_test.go b/ray-operator/controllers/ray/rayservice_controller_unit_test.go index ca3908f3d25..f82a05f0576 100644 --- a/ray-operator/controllers/ray/rayservice_controller_unit_test.go +++ b/ray-operator/controllers/ray/rayservice_controller_unit_test.go @@ -1776,6 +1776,7 @@ func TestReconcileGateway(t *testing.T) { func TestReconcileServeTargetCapacity(t *testing.T) { features.SetFeatureGateDuringTest(t, features.RayServiceIncrementalUpgrade, true) + tests := []struct { name string updatedCluster string @@ -1834,11 +1835,19 @@ }, } + var rayCluster *rayv1.RayCluster + if tt.updatedCluster == "active" { + rayCluster = &rayv1.RayCluster{ObjectMeta: metav1.ObjectMeta{Name: "active"}} + } else { + rayCluster = &rayv1.RayCluster{ObjectMeta: metav1.ObjectMeta{Name: "pending"}} + } + fakeDashboard := &utils.FakeRayDashboardClient{} reconciler := &RayServiceReconciler{ ServeConfigs: lru.New(10), // empty initial cache } - err := reconciler.reconcileServeTargetCapacity(ctx, rayService, fakeDashboard) + + err := reconciler.reconcileServeTargetCapacity(ctx, rayService, rayCluster, fakeDashboard) require.NoError(t, err) require.NotEmpty(t, fakeDashboard.LastUpdatedConfig) diff --git a/ray-operator/controllers/ray/utils/consistency.go b/ray-operator/controllers/ray/utils/consistency.go index 929e07d6658..ba3d28824b0 100644 --- a/ray-operator/controllers/ray/utils/consistency.go +++ b/ray-operator/controllers/ray/utils/consistency.go @@ -4,6 +4,7 @@ import ( "reflect" rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1" + "github.com/ray-project/kuberay/ray-operator/pkg/features" ) // Checks whether the old and new RayClusterStatus are inconsistent by comparing different fields. If the only @@ -31,16 +32,9 @@ func InconsistentRayClusterStatus(oldStatus rayv1.RayClusterStatus, newStatus ra } if features.Enabled(features.RayServiceIncrementalUpgrade) { // Also check for changes in IncrementalUpgrade related Status fields.
- if oldStatus.TrafficRoutedPercent != newStatus.TrafficRoutedPercent { - logger.Info("inconsistentRayServiceStatus RayService updated TrafficRoutedPercent", "old TrafficRoutedPercent", oldStatus.TrafficRoutedPercent, "new TrafficRoutedPercent", newStatus.TrafficRoutedPercent) - return true - } - if oldStatus.TargetCapacity != newStatus.TargetCapacity { - logger.Info("inconsistentRayServiceStatus RayService updated TargetCapacity", "old TargetCapacity", oldStatus.TargetCapacity, "new TargetCapacity", newStatus.TargetCapacity) - return true - } - if oldStatus.LastTrafficMigratedTime != newStatus.LastTrafficMigratedTime { - logger.Info("inconsistentRayServiceStatus RayService updated LastTrafficMigratedTime", "old LastTrafficMigratedTime", oldStatus.LastTrafficMigratedTime, "new LastTrafficMigratedTime", newStatus.LastTrafficMigratedTime) + if oldStatus.TrafficRoutedPercent != newStatus.TrafficRoutedPercent || + oldStatus.TargetCapacity != newStatus.TargetCapacity || + oldStatus.LastTrafficMigratedTime != newStatus.LastTrafficMigratedTime { return true } } diff --git a/ray-operator/main.go b/ray-operator/main.go index 897cc7f58e2..ceba7d4772e 100644 --- a/ray-operator/main.go +++ b/ray-operator/main.go @@ -192,7 +192,7 @@ func main() { } features.LogFeatureGates(setupLog) - if features.Enabled(features.RayClusterStatusConditions) { + if features.Enabled(features.RayServiceIncrementalUpgrade) { utilruntime.Must(gwv1.AddToScheme(scheme)) } diff --git a/ray-operator/test/e2eincrementalupgrade/rayservice_incremental_upgrade_test.go b/ray-operator/test/e2eincrementalupgrade/rayservice_incremental_upgrade_test.go index 435e9868328..ce3bb1f1757 100644 --- a/ray-operator/test/e2eincrementalupgrade/rayservice_incremental_upgrade_test.go +++ b/ray-operator/test/e2eincrementalupgrade/rayservice_incremental_upgrade_test.go @@ -4,9 +4,11 @@ import ( "fmt" "strings" "testing" + "time" . "github.com/onsi/gomega" corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/utils/ptr" @@ -41,9 +43,9 @@ func TestRayServiceIncrementalUpgrade(t *testing.T) { rayServiceName := "incremental-rayservice" // Create a RayService with IncrementalUpgrade enabled - stepSize := ptr.To(int32(20)) - interval := ptr.To(int32(30)) - maxSurge := ptr.To(int32(10)) + stepSize := ptr.To(int32(25)) + interval := ptr.To(int32(10)) + maxSurge := ptr.To(int32(50)) rayServiceAC := rayv1ac.RayService(rayServiceName, namespace.Name). 
WithSpec(IncrementalUpgradeRayServiceApplyConfiguration(stepSize, interval, maxSurge)) @@ -99,10 +101,11 @@ func TestRayServiceIncrementalUpgrade(t *testing.T) { stdout, _ = CurlRayServiceGateway(test, gatewayIP, curlPod, curlContainerName, "/calc", `["MUL", 3]`) g.Expect(stdout.String()).To(Equal("15 pizzas please!")) - // Trigger incremental upgrade by updating RayService serve config + // Trigger incremental upgrade by updating RayService serve config and RayCluster spec rayService, err = GetRayService(test, namespace.Name, rayService.Name) g.Expect(err).NotTo(HaveOccurred()) + rayService.Spec.RayClusterSpec.WorkerGroupSpecs[0].Template.Spec.Containers[0].Resources.Requests["CPU"] = resource.MustParse("500m") serveConfig := rayService.Spec.ServeConfigV2 serveConfig = strings.Replace(serveConfig, "price: 3", "price: 4", -1) serveConfig = strings.Replace(serveConfig, "factor: 5", "factor: 3", -1) @@ -114,56 +117,88 @@ func TestRayServiceIncrementalUpgrade(t *testing.T) { ) g.Expect(err).NotTo(HaveOccurred()) - // Check that upgrade steps incrementally with traffic/capacity split between clusters - LogWithTimestamp(test.T(), "Validating gradual traffic migration during IncrementalUpgrade") - g.Eventually(func(g Gomega) { - rayService, err := GetRayService(test, namespace.Name, rayServiceName) - g.Expect(err).NotTo(HaveOccurred()) + LogWithTimestamp(test.T(), "Waiting for RayService %s/%s UpgradeInProgress condition to be true", rayService.Namespace, rayService.Name) + g.Eventually(RayService(test, rayService.Namespace, rayService.Name), TestTimeoutShort).Should(WithTransform(IsRayServiceUpgrading, BeTrue())) - g.Expect(rayService.Status.PendingServiceStatus).NotTo(BeNil()) - g.Expect(rayService.Status.PendingServiceStatus.TrafficRoutedPercent).NotTo(BeNil()) - g.Expect(rayService.Status.PendingServiceStatus.TargetCapacity).NotTo(BeNil()) - g.Expect(rayService.Status.ActiveServiceStatus).NotTo(BeNil()) - g.Expect(rayService.Status.ActiveServiceStatus.TrafficRoutedPercent).NotTo(BeNil()) - g.Expect(rayService.Status.ActiveServiceStatus.TargetCapacity).NotTo(BeNil()) - - for _, val := range []int32{ - *rayService.Status.PendingServiceStatus.TrafficRoutedPercent, - *rayService.Status.ActiveServiceStatus.TrafficRoutedPercent, - *rayService.Status.PendingServiceStatus.TargetCapacity, - *rayService.Status.ActiveServiceStatus.TargetCapacity, - } { - g.Expect(val).To(BeNumerically(">", 0)) - g.Expect(val).To(BeNumerically("<", 100)) - } - }, TestTimeoutMedium).Should(Succeed()) + LogWithTimestamp(test.T(), "Validating stepwise traffic and capacity migration") + stepSizeVal := *stepSize + intervalVal := *interval + maxSurgeVal := *maxSurge + + var lastPendingCapacity, lastPendingTraffic, lastActiveCapacity, lastActiveTraffic int32 - // Validate that traffic is split across old and new clusters of the RayService - g.Eventually(func(g Gomega) { + // Validate expected behavior during IncrementalUpgrade + for { + // Wait IntervalSeconds in between updates + time.Sleep(time.Duration(intervalVal) * time.Second) + + // Fetch updated RayService rayService, err := GetRayService(test, namespace.Name, rayServiceName) g.Expect(err).NotTo(HaveOccurred()) - activeSvcName := rayService.Status.ActiveServiceStatus.RayClusterStatus.Head.ServiceName - pendingSvcName := rayService.Status.PendingServiceStatus.RayClusterStatus.Head.ServiceName + pending := rayService.Status.PendingServiceStatus + active := rayService.Status.ActiveServiceStatus - activeResp, _ := CurlRayServiceHeadService( - test, activeSvcName, 
rayService, curlPod, curlContainerName, "/fruit", `["MANGO", 2]`) - pendingResp, _ := CurlRayServiceHeadService( - test, pendingSvcName, rayService, curlPod, curlContainerName, "/fruit", `["MANGO", 2]`) + if pending.RayClusterName == "" { + // No pending cluster - upgrade has completed + break + } - // Both clusters should still be serving traffic during the split - g.Expect(activeResp.String()).To(Equal("6")) - g.Expect(pendingResp.String()).To(Equal("6")) - }, TestTimeoutMedium).Should(Succeed()) + // Incremental Upgrade related status fields should be set + g.Expect(pending.TrafficRoutedPercent).NotTo(BeNil()) + g.Expect(pending.TargetCapacity).NotTo(BeNil()) + g.Expect(active.TrafficRoutedPercent).NotTo(BeNil()) + g.Expect(active.TargetCapacity).NotTo(BeNil()) + + pendingTraffic := *pending.TrafficRoutedPercent + pendingCapacity := *pending.TargetCapacity + activeTraffic := *active.TrafficRoutedPercent + activeCapacity := *active.TargetCapacity + + LogWithTimestamp(test.T(), "pendingTraffic: %d, pendingCapacity: %d, activeTraffic: %d, activeCapacity: %d", pendingTraffic, pendingCapacity, activeTraffic, activeCapacity) + + // Initial iteration - set weights + if pendingTraffic == 0 && pendingCapacity == 0 && activeTraffic == 100 && activeCapacity == 100 { + lastPendingCapacity = pendingCapacity + lastPendingTraffic = pendingTraffic + lastActiveCapacity = activeCapacity + lastActiveTraffic = activeTraffic + continue + } - // Validate incremental upgrade completes - g.Eventually(func(g Gomega) { - rayService, err := GetRayService(test, namespace.Name, rayServiceName) - g.Expect(err).NotTo(HaveOccurred()) + // Validate that pending TargetCapacity increases by MaxSurgePercent + if pendingCapacity > lastPendingCapacity { + g.Expect(pendingCapacity - lastPendingCapacity).To(Equal(maxSurgeVal)) + lastPendingCapacity = pendingCapacity + } + + // Incremental traffic migration steps + if pendingTraffic < pendingCapacity { + if lastPendingTraffic != 0 { + g.Expect(pendingTraffic - lastPendingTraffic).To(Equal(stepSizeVal)) + g.Expect(lastActiveTraffic - activeTraffic).To(Equal(stepSizeVal)) + } + lastPendingTraffic = pendingTraffic + lastActiveTraffic = activeTraffic + continue + } + + // Once pending TrafficRoutedPercent equals TargetCapacity, active + // TargetCapacity can be reduced by MaxSurgePercent. 
+ if pendingTraffic == pendingCapacity && activeCapacity > 0 { + rayService, err = GetRayService(test, namespace.Name, rayServiceName) + g.Expect(err).NotTo(HaveOccurred()) + newActiveCapacity := *rayService.Status.ActiveServiceStatus.TargetCapacity + g.Expect(lastActiveCapacity - newActiveCapacity).To(Equal(maxSurgeVal)) + lastActiveCapacity = newActiveCapacity + continue + } + } + // Check that RayService completed upgrade + LogWithTimestamp(test.T(), "Waiting for RayService %s/%s UpgradeInProgress condition to be false", rayService.Namespace, rayService.Name) + g.Eventually(RayService(test, rayService.Namespace, rayService.Name), TestTimeoutShort).Should(WithTransform(IsRayServiceUpgrading, BeFalse())) - g.Expect(rayService.Status.PendingServiceStatus.TrafficRoutedPercent).To(Equal(ptr.To(int32(100)))) - g.Expect(rayService.Status.ActiveServiceStatus.TrafficRoutedPercent).To(Equal(ptr.To(int32(0)))) - g.Expect(rayService.Status.PendingServiceStatus.TargetCapacity).To(Equal(ptr.To(int32(100)))) - g.Expect(rayService.Status.ActiveServiceStatus.TargetCapacity).To(Equal(ptr.To(int32(0)))) - }, TestTimeoutMedium).Should(Succeed()) + LogWithTimestamp(test.T(), "Verifying RayService uses updated ServeConfig after upgrade completes") + stdout, _ = CurlRayServiceGateway(test, gatewayIP, curlPod, curlContainerName, "/fruit", `["MANGO", 2]`) + g.Expect(stdout.String()).To(Equal("8")) } diff --git a/ray-operator/test/e2eincrementalupgrade/support.go b/ray-operator/test/e2eincrementalupgrade/support.go index b35bdb83926..8cd59fb0df4 100644 --- a/ray-operator/test/e2eincrementalupgrade/support.go +++ b/ray-operator/test/e2eincrementalupgrade/support.go @@ -15,26 +15,6 @@ import ( . "github.com/ray-project/kuberay/ray-operator/test/support" ) -func CurlRayServiceHeadService( - t Test, - headSvcName string, - rayService *rayv1.RayService, - curlPod *corev1.Pod, - curlPodContainerName, - rayServicePath, - body string, -) (bytes.Buffer, bytes.Buffer) { - cmd := []string{ - "curl", - "-X", "POST", - "-H", "Content-Type: application/json", - fmt.Sprintf("%s.%s.svc.cluster.local:8000%s", headSvcName, rayService.Namespace, rayServicePath), - "-d", body, - } - - return ExecPodCmd(t, curlPod, curlPodContainerName, cmd) -} - func CurlRayServiceGateway( t Test, gatewayIP string, @@ -45,6 +25,7 @@ func CurlRayServiceGateway( ) (bytes.Buffer, bytes.Buffer) { cmd := []string{ "curl", + "--max-time", "10", "-X", "POST", "-H", "Content-Type: application/json", fmt.Sprintf("%s:80%s", gatewayIP, rayServicePath), @@ -140,8 +121,8 @@ func IncrementalUpgradeRayServiceApplyConfiguration( corev1.ResourceMemory: resource.MustParse("3Gi"), })))))). WithWorkerGroupSpecs(rayv1ac.WorkerGroupSpec(). - WithReplicas(0). - WithMinReplicas(0). + WithReplicas(1). + WithMinReplicas(1). WithMaxReplicas(4). WithRayStartParams(map[string]string{"num-cpus": "1"}). WithGroupName("small-group"). From 0c2e82cb2ebf23e241da214b674ee72a7fd570b8 Mon Sep 17 00:00:00 2001 From: Ryan O'Leary Date: Wed, 4 Jun 2025 12:41:16 +0000 Subject: [PATCH 06/56] Add Makefile command Signed-off-by: Ryan O'Leary --- ray-operator/Makefile | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/ray-operator/Makefile b/ray-operator/Makefile index 83a2b6f5c4e..1ef2ad7e6db 100644 --- a/ray-operator/Makefile +++ b/ray-operator/Makefile @@ -80,7 +80,11 @@ test-e2e-rayservice: manifests fmt vet ## Run e2e RayService tests. go test -timeout 30m -v $(WHAT) test-e2e-upgrade: WHAT ?= ./test/e2eupgrade -test-e2e-upgrade: manifests fmt vet ## Run e2e tests. 
+test-e2e-upgrade: manifests fmt vet ## Run e2e operator upgrade tests. + go test -timeout 30m -v $(WHAT) + +test-e2e-incremental-upgrade: WHAT ?= ./test/e2eincrementalupgrade +test-e2e-incremental-upgrade: manifests fmt vet ## Run e2e RayService incremental upgrade tests. go test -timeout 30m -v $(WHAT) test-e2e-rayjob-submitter: WHAT ?= ./test/e2erayjobsubmitter From f72e6e8729f6cb31ecbe7310ce8ce9795f562518 Mon Sep 17 00:00:00 2001 From: Ryan O'Leary Date: Wed, 4 Jun 2025 12:44:52 +0000 Subject: [PATCH 07/56] Run 'make sync' Signed-off-by: Ryan O'Leary --- .../crds/ray.io_rayservices.yaml | 3 + .../ray/v1/rayservicestatuses.go | 62 +++++++++---------- 2 files changed, 34 insertions(+), 31 deletions(-) diff --git a/helm-chart/kuberay-operator/crds/ray.io_rayservices.yaml b/helm-chart/kuberay-operator/crds/ray.io_rayservices.yaml index 73e38364781..9cd7c7cc10b 100644 --- a/helm-chart/kuberay-operator/crds/ray.io_rayservices.yaml +++ b/helm-chart/kuberay-operator/crds/ray.io_rayservices.yaml @@ -8452,6 +8452,9 @@ spec: - type type: object type: array + x-kubernetes-list-map-keys: + - type + x-kubernetes-list-type: map lastUpdateTime: format: date-time type: string diff --git a/ray-operator/pkg/client/applyconfiguration/ray/v1/rayservicestatuses.go b/ray-operator/pkg/client/applyconfiguration/ray/v1/rayservicestatuses.go index fa38154c22c..7d0da98387f 100644 --- a/ray-operator/pkg/client/applyconfiguration/ray/v1/rayservicestatuses.go +++ b/ray-operator/pkg/client/applyconfiguration/ray/v1/rayservicestatuses.go @@ -4,20 +4,20 @@ package v1 import ( rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - applyconfigurationsmetav1 "k8s.io/client-go/applyconfigurations/meta/v1" + apismetav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + metav1 "k8s.io/client-go/applyconfigurations/meta/v1" ) // RayServiceStatusesApplyConfiguration represents a declarative configuration of the RayServiceStatuses type for use // with apply. 
type RayServiceStatusesApplyConfiguration struct { - LastUpdateTime *metav1.Time `json:"lastUpdateTime,omitempty"` - ServiceStatus *rayv1.ServiceStatus `json:"serviceStatus,omitempty"` - Conditions []applyconfigurationsmetav1.ConditionApplyConfiguration `json:"conditions,omitempty"` - ActiveServiceStatus *RayServiceStatusApplyConfiguration `json:"activeServiceStatus,omitempty"` - PendingServiceStatus *RayServiceStatusApplyConfiguration `json:"pendingServiceStatus,omitempty"` - ObservedGeneration *int64 `json:"observedGeneration,omitempty"` - NumServeEndpoints *int32 `json:"numServeEndpoints,omitempty"` + Conditions []metav1.ConditionApplyConfiguration `json:"conditions,omitempty"` + LastUpdateTime *apismetav1.Time `json:"lastUpdateTime,omitempty"` + ServiceStatus *rayv1.ServiceStatus `json:"serviceStatus,omitempty"` + ActiveServiceStatus *RayServiceStatusApplyConfiguration `json:"activeServiceStatus,omitempty"` + PendingServiceStatus *RayServiceStatusApplyConfiguration `json:"pendingServiceStatus,omitempty"` + NumServeEndpoints *int32 `json:"numServeEndpoints,omitempty"` + ObservedGeneration *int64 `json:"observedGeneration,omitempty"` } // RayServiceStatusesApplyConfiguration constructs a declarative configuration of the RayServiceStatuses type for use with @@ -26,10 +26,23 @@ func RayServiceStatuses() *RayServiceStatusesApplyConfiguration { return &RayServiceStatusesApplyConfiguration{} } +// WithConditions adds the given value to the Conditions field in the declarative configuration +// and returns the receiver, so that objects can be build by chaining "With" function invocations. +// If called multiple times, values provided by each call will be appended to the Conditions field. +func (b *RayServiceStatusesApplyConfiguration) WithConditions(values ...*metav1.ConditionApplyConfiguration) *RayServiceStatusesApplyConfiguration { + for i := range values { + if values[i] == nil { + panic("nil value passed to WithConditions") + } + b.Conditions = append(b.Conditions, *values[i]) + } + return b +} + // WithLastUpdateTime sets the LastUpdateTime field in the declarative configuration to the given value // and returns the receiver, so that objects can be built by chaining "With" function invocations. // If called multiple times, the LastUpdateTime field is set to the value of the last call. -func (b *RayServiceStatusesApplyConfiguration) WithLastUpdateTime(value metav1.Time) *RayServiceStatusesApplyConfiguration { +func (b *RayServiceStatusesApplyConfiguration) WithLastUpdateTime(value apismetav1.Time) *RayServiceStatusesApplyConfiguration { b.LastUpdateTime = &value return b } @@ -42,19 +55,6 @@ func (b *RayServiceStatusesApplyConfiguration) WithServiceStatus(value rayv1.Ser return b } -// WithConditions adds the given value to the Conditions field in the declarative configuration -// and returns the receiver, so that objects can be build by chaining "With" function invocations. -// If called multiple times, values provided by each call will be appended to the Conditions field. 
-func (b *RayServiceStatusesApplyConfiguration) WithConditions(values ...*applyconfigurationsmetav1.ConditionApplyConfiguration) *RayServiceStatusesApplyConfiguration { - for i := range values { - if values[i] == nil { - panic("nil value passed to WithConditions") - } - b.Conditions = append(b.Conditions, *values[i]) - } - return b -} - // WithActiveServiceStatus sets the ActiveServiceStatus field in the declarative configuration to the given value // and returns the receiver, so that objects can be built by chaining "With" function invocations. // If called multiple times, the ActiveServiceStatus field is set to the value of the last call. @@ -71,14 +71,6 @@ func (b *RayServiceStatusesApplyConfiguration) WithPendingServiceStatus(value *R return b } -// WithObservedGeneration sets the ObservedGeneration field in the declarative configuration to the given value -// and returns the receiver, so that objects can be built by chaining "With" function invocations. -// If called multiple times, the ObservedGeneration field is set to the value of the last call. -func (b *RayServiceStatusesApplyConfiguration) WithObservedGeneration(value int64) *RayServiceStatusesApplyConfiguration { - b.ObservedGeneration = &value - return b -} - // WithNumServeEndpoints sets the NumServeEndpoints field in the declarative configuration to the given value // and returns the receiver, so that objects can be built by chaining "With" function invocations. // If called multiple times, the NumServeEndpoints field is set to the value of the last call. @@ -86,3 +78,11 @@ func (b *RayServiceStatusesApplyConfiguration) WithNumServeEndpoints(value int32 b.NumServeEndpoints = &value return b } + +// WithObservedGeneration sets the ObservedGeneration field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the ObservedGeneration field is set to the value of the last call. +func (b *RayServiceStatusesApplyConfiguration) WithObservedGeneration(value int64) *RayServiceStatusesApplyConfiguration { + b.ObservedGeneration = &value + return b +} From c25a3fe46648d4372db1adda17735316ee335b66 Mon Sep 17 00:00:00 2001 From: Ryan O'Leary Date: Wed, 4 Jun 2025 12:47:59 +0000 Subject: [PATCH 08/56] Run 'make generate' Signed-off-by: Ryan O'Leary --- ray-operator/apis/ray/v1/zz_generated.deepcopy.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ray-operator/apis/ray/v1/zz_generated.deepcopy.go b/ray-operator/apis/ray/v1/zz_generated.deepcopy.go index a2ce6c8e1d8..80deee9db07 100644 --- a/ray-operator/apis/ray/v1/zz_generated.deepcopy.go +++ b/ray-operator/apis/ray/v1/zz_generated.deepcopy.go @@ -781,10 +781,6 @@ func (in *RayServiceStatus) DeepCopy() *RayServiceStatus { // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
func (in *RayServiceStatuses) DeepCopyInto(out *RayServiceStatuses) { *out = *in - if in.LastUpdateTime != nil { - in, out := &in.LastUpdateTime, &out.LastUpdateTime - *out = (*in).DeepCopy() - } if in.Conditions != nil { in, out := &in.Conditions, &out.Conditions *out = make([]metav1.Condition, len(*in)) @@ -792,6 +788,10 @@ func (in *RayServiceStatuses) DeepCopyInto(out *RayServiceStatuses) { (*in)[i].DeepCopyInto(&(*out)[i]) } } + if in.LastUpdateTime != nil { + in, out := &in.LastUpdateTime, &out.LastUpdateTime + *out = (*in).DeepCopy() + } in.ActiveServiceStatus.DeepCopyInto(&out.ActiveServiceStatus) in.PendingServiceStatus.DeepCopyInto(&out.PendingServiceStatus) } From 5bd9ac1507f36d08f0352702eb8b70cf680bf00f Mon Sep 17 00:00:00 2001 From: Ryan O'Leary Date: Wed, 4 Jun 2025 19:22:22 +0000 Subject: [PATCH 09/56] Fix comments Signed-off-by: Ryan O'Leary --- ray-operator/apis/ray/v1/rayservice_types.go | 3 ++ .../controllers/ray/common/association.go | 2 ++ .../controllers/ray/rayservice_controller.go | 12 +++++-- ray-operator/controllers/ray/utils/util.go | 34 +++++++++++++++++++ 4 files changed, 48 insertions(+), 3 deletions(-) diff --git a/ray-operator/apis/ray/v1/rayservice_types.go b/ray-operator/apis/ray/v1/rayservice_types.go index 2f8fdab4f3b..6616c71c443 100644 --- a/ray-operator/apis/ray/v1/rayservice_types.go +++ b/ray-operator/apis/ray/v1/rayservice_types.go @@ -78,6 +78,7 @@ type RayServiceUpgradeStrategy struct { // +optional Type *RayServiceUpgradeType `json:"type,omitempty"` // IncrementalUpgradeOptions defines the behavior of an IncrementalUpgrade. + // RayServiceIncrementalUpgrade feature gate must be enabled to set IncrementalUpgradeOptions. IncrementalUpgradeOptions *IncrementalUpgradeOptions `json:"incrementalUpgradeOptions,omitempty"` } @@ -105,8 +106,10 @@ type RayServiceSpec struct { // +optional ServeConfigV2 string `json:"serveConfigV2,omitempty"` // Gateway is the name of the Gateway object for the RayService to serve traffics during an IncrementalUpgrade. + // RayServiceIncrementalUpgrade feature gate must be enabled to set the Gateway name. Gateway string `json:"gateway,omitempty"` // HTTPRoute is the name of the HTTPRoute object for the RayService to split traffics during an IncrementalUpgrade. + // RayServiceIncrementalUpgrade feature gate must be enabled to set the HTTPRoute name. HTTPRoute string `json:"httpRoute,omitempty"` RayClusterSpec RayClusterSpec `json:"rayClusterConfig"` // If the field is set to true, the value of the label `ray.io/serve` on the head Pod should always be false.
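The comments added above tie the new `gateway`, `httpRoute`, and `incrementalUpgradeOptions` fields to the RayServiceIncrementalUpgrade feature gate. As a minimal sketch only (not part of this patch series), the check below shows how that requirement could be enforced before any Gateway or HTTPRoute objects are created; the helper name and error messages are hypothetical, while the spec fields and the `features.Enabled(features.RayServiceIncrementalUpgrade)` call are taken from the surrounding diffs, and `UpgradeStrategy` is assumed to be the pointer field on `RayServiceSpec`.

```go
package validation

import (
	"fmt"

	rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1"
	"github.com/ray-project/kuberay/ray-operator/pkg/features"
)

// validateIncrementalUpgradeFields is a hypothetical helper (not from this patch):
// it rejects incremental-upgrade-only spec fields when the feature gate is disabled.
func validateIncrementalUpgradeFields(spec *rayv1.RayServiceSpec) error {
	if features.Enabled(features.RayServiceIncrementalUpgrade) {
		// Gate enabled: Gateway, HTTPRoute, and IncrementalUpgradeOptions may be set.
		return nil
	}
	if spec.Gateway != "" || spec.HTTPRoute != "" {
		return fmt.Errorf("spec.gateway and spec.httpRoute require the RayServiceIncrementalUpgrade feature gate")
	}
	if spec.UpgradeStrategy != nil && spec.UpgradeStrategy.IncrementalUpgradeOptions != nil {
		return fmt.Errorf("spec.upgradeStrategy.incrementalUpgradeOptions requires the RayServiceIncrementalUpgrade feature gate")
	}
	return nil
}
```

A caller would typically run such a check during admission or early in reconciliation, so that a spec using the gated fields fails fast instead of partially creating Gateway API resources.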
diff --git a/ray-operator/controllers/ray/common/association.go b/ray-operator/controllers/ray/common/association.go index 7b60a29152a..50714cbe608 100644 --- a/ray-operator/controllers/ray/common/association.go +++ b/ray-operator/controllers/ray/common/association.go @@ -211,6 +211,7 @@ func RayServiceGatewayNamespacedName(rayService *rayv1.RayService) types.Namespa } else { gatewayName = fmt.Sprintf("%s-gateway", rayService.Name) } + gatewayName = utils.CheckGatewayName(gatewayName) return types.NamespacedName{ Name: gatewayName, Namespace: rayService.Namespace, @@ -224,6 +225,7 @@ func RayServiceHTTPRouteNamespacedName(rayService *rayv1.RayService) types.Names } else { httpRouteName = fmt.Sprintf("httproute-%s", rayService.Name) } + httpRouteName = utils.CheckHTTPRouteName(httpRouteName) return types.NamespacedName{ Name: httpRouteName, Namespace: rayService.Namespace, diff --git a/ray-operator/controllers/ray/rayservice_controller.go b/ray-operator/controllers/ray/rayservice_controller.go index e8c055bdef2..55a9847c91c 100644 --- a/ray-operator/controllers/ray/rayservice_controller.go +++ b/ray-operator/controllers/ray/rayservice_controller.go @@ -553,8 +553,13 @@ func isZeroDowntimeUpgradeEnabled(ctx context.Context, upgradeStrategy *rayv1.Ra if upgradeStrategy != nil { upgradeType := upgradeStrategy.Type if upgradeType != nil { - if *upgradeType != rayv1.NewCluster && *upgradeType != rayv1.IncrementalUpgrade { - logger.Info("Zero-downtime upgrade is disabled because UpgradeStrategy.Type is not set to %s or %s.", string(rayv1.NewCluster), string(rayv1.IncrementalUpgrade)) + if features.Enabled(features.RayServiceIncrementalUpgrade) { + if *upgradeType != rayv1.NewCluster && *upgradeType != rayv1.IncrementalUpgrade { + logger.Info("Zero-downtime upgrade is disabled because UpgradeStrategy.Type is not set to %s or %s.", string(rayv1.NewCluster), string(rayv1.IncrementalUpgrade)) + return false + } + } else if *upgradeType != rayv1.NewCluster { + logger.Info("Zero-downtime upgrade is disabled because UpgradeStrategy.Type is not set to NewCluster.") return false } return true @@ -580,7 +585,7 @@ func (r *RayServiceReconciler) createGateway(rayServiceInstance *rayv1.RayServic } else { gatewayName = rayServiceInstance.Name + "-gateway" } - + gatewayName = utils.CheckGatewayName(gatewayName) // Define the desired Gateway object rayServiceGateway := &gwv1.Gateway{ ObjectMeta: metav1.ObjectMeta{ @@ -664,6 +669,7 @@ func (r *RayServiceReconciler) createHTTPRoute(ctx context.Context, rayServiceIn } else { httpRouteName = fmt.Sprintf("httproute-%s", rayServiceInstance.Name) } + httpRouteName = utils.CheckHTTPRouteName(httpRouteName) desiredHTTPRoute := &gwv1.HTTPRoute{ ObjectMeta: metav1.ObjectMeta{ Name: httpRouteName, diff --git a/ray-operator/controllers/ray/utils/util.go b/ray-operator/controllers/ray/utils/util.go index 825515d5582..95765b208b9 100644 --- a/ray-operator/controllers/ray/utils/util.go +++ b/ray-operator/controllers/ray/utils/util.go @@ -211,6 +211,40 @@ func CheckName(s string) string { return s } +func CheckGatewayName(name string) string { + const maxLength = 63 + + if len(name) > maxLength { + offset := len(name) - maxLength + fmt.Printf("Gateway name too long (len = %d), shortening by offset = %d", len(name), offset) + name = name[offset:] + } + + // Cannot start with a digit or punctuation + if len(name) > 0 && (unicode.IsDigit(rune(name[0])) || unicode.IsPunct(rune(name[0]))) { + name = "g" + name[1:] + } + + return name +} + +func CheckHTTPRouteName(name string) string 
{ + const maxLength = 63 + + if len(name) > maxLength { + offset := len(name) - maxLength + fmt.Printf("HTTPRoute name too long (len = %d), shortening by offset = %d", len(name), offset) + name = name[offset:] + } + + // Cannot start with a digit or punctuation + if len(name) > 0 && (unicode.IsDigit(rune(name[0])) || unicode.IsPunct(rune(name[0]))) { + name = "h" + name[1:] + } + + return name +} + // TrimJobName uses CheckLabel to trim Kubernetes job to constrains func TrimJobName(jobName string) string { return CheckLabel(jobName) From 9551928fead5a154c3063396710ecb730252b0ed Mon Sep 17 00:00:00 2001 From: Ryan O'Leary Date: Wed, 4 Jun 2025 19:28:42 +0000 Subject: [PATCH 10/56] Run 'make api-docs' Signed-off-by: Ryan O'Leary --- docs/reference/api.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/reference/api.md b/docs/reference/api.md index 2706d2f2a87..2039c6e5e32 100644 --- a/docs/reference/api.md +++ b/docs/reference/api.md @@ -376,8 +376,8 @@ _Appears in:_ | `serveService` _[Service](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#service-v1-core)_ | ServeService is the Kubernetes service for head node and worker nodes who have healthy http proxy to serve traffics. | | | | `upgradeStrategy` _[RayServiceUpgradeStrategy](#rayserviceupgradestrategy)_ | UpgradeStrategy defines the scaling policy used when upgrading the RayService. | | | | `serveConfigV2` _string_ | Important: Run "make" to regenerate code after modifying this file
Defines the applications and deployments to deploy, should be a YAML multi-line scalar string. | | | -| `gateway` _string_ | Gateway is the name of the Gateway object for the RayService to serve traffics during an IncrementalUpgrade. | | | -| `httpRoute` _string_ | HTTPRoute is the name of the HTTPRoute object for the RayService to split traffics during an IncrementalUpgrade. | | | +| `gateway` _string_ | Gateway is the name of the Gateway object for the RayService to serve traffics during an IncrementalUpgrade.
RayServiceIncrementalUpgrade feature gate must be enabled to set the Gateway name. | | | +| `httpRoute` _string_ | HTTPRoute is the name of the HTTPRoute object for the RayService to split traffics during an IncrementalUpgrade.
RayServiceIncrementalUpgrade feature gate must be enabled to set the HTTPRoute name. | | | | `rayClusterConfig` _[RayClusterSpec](#rayclusterspec)_ | | | | | `excludeHeadPodFromServeSvc` _boolean_ | If the field is set to true, the value of the label `ray.io/serve` on the head Pod should always be false.
Therefore, the head Pod's endpoint will not be added to the Kubernetes Serve service. | | | @@ -398,7 +398,7 @@ _Appears in:_ | Field | Description | Default | Validation | | --- | --- | --- | --- | | `type` _[RayServiceUpgradeType](#rayserviceupgradetype)_ | Type represents the strategy used when upgrading the RayService. Currently supports `NewCluster` and `None`. | | | -| `incrementalUpgradeOptions` _[IncrementalUpgradeOptions](#incrementalupgradeoptions)_ | IncrementalUpgradeOptions defines the behavior of an IncrementalUpgrade. | | | +| `incrementalUpgradeOptions` _[IncrementalUpgradeOptions](#incrementalupgradeoptions)_ | IncrementalUpgradeOptions defines the behavior of an IncrementalUpgrade.
RayServiceIncrementalUpgrade feature gate must be enabled to set IncrementalUpgradeOptions. | | | #### RayServiceUpgradeType From 14e73c5a16ce1eeab7857c99c93e3665b6937bbf Mon Sep 17 00:00:00 2001 From: Ryan O'Leary Date: Tue, 16 Sep 2025 10:28:04 +0000 Subject: [PATCH 11/56] Fix tests after merge conflicts Signed-off-by: Ryan O'Leary --- go.mod | 10 +- go.sum | 8 +- .../controllers/ray/rayservice_controller.go | 115 +----------------- .../controllers/ray/utils/consistency.go | 17 +-- .../rayservice_incremental_upgrade_test.go | 5 +- 5 files changed, 23 insertions(+), 132 deletions(-) diff --git a/go.mod b/go.mod index 79ffcd37522..e93dc132eda 100644 --- a/go.mod +++ b/go.mod @@ -36,9 +36,6 @@ require ( k8s.io/kubectl v0.33.1 k8s.io/utils v0.0.0-20250502105355-0f33e8f1c979 sigs.k8s.io/controller-runtime v0.21.0 - sigs.k8s.io/gateway-api v1.3.0 - sigs.k8s.io/structured-merge-diff/v4 v4.7.0 - sigs.k8s.io/controller-runtime v0.21.0 sigs.k8s.io/yaml v1.4.0 ) @@ -98,12 +95,12 @@ require ( go.uber.org/automaxprocs v1.6.0 // indirect go.uber.org/multierr v1.11.0 // indirect go.uber.org/zap v1.27.0 // indirect - golang.org/x/net v0.38.0 // indirect + golang.org/x/net v0.39.0 // indirect golang.org/x/oauth2 v0.27.0 // indirect golang.org/x/sync v0.13.0 // indirect golang.org/x/sys v0.32.0 // indirect - golang.org/x/term v0.30.0 // indirect - golang.org/x/text v0.23.0 // indirect + golang.org/x/term v0.31.0 // indirect + golang.org/x/text v0.24.0 // indirect golang.org/x/time v0.10.0 // indirect golang.org/x/tools v0.31.0 // indirect gomodules.xyz/jsonpatch/v2 v2.4.0 // indirect @@ -115,6 +112,7 @@ require ( k8s.io/component-base v0.33.1 // indirect k8s.io/component-helpers v0.33.1 // indirect k8s.io/kube-openapi v0.0.0-20250318190949-c8a335a9a2ff // indirect + sigs.k8s.io/gateway-api v1.3.0 // indirect sigs.k8s.io/json v0.0.0-20241014173422-cfa47c3a1cc8 // indirect sigs.k8s.io/kustomize/api v0.19.0 // indirect sigs.k8s.io/kustomize/kyaml v0.19.0 // indirect diff --git a/go.sum b/go.sum index 7b4c83540b9..22e4f1113d9 100644 --- a/go.sum +++ b/go.sum @@ -140,6 +140,8 @@ github.com/mattn/go-colorable v0.1.13 h1:fFA4WZxdEF4tXPZVKMLwD8oUnCTTo08duU7wxec github.com/mattn/go-colorable v0.1.13/go.mod h1:7S9/ev0klgBDR4GtXTXX8a3vIGJpMovkB8vQcUbaXHg= github.com/mattn/go-isatty v0.0.16/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM= github.com/mattn/go-isatty v0.0.19/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y= +github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY= +github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y= github.com/mitchellh/go-wordwrap v1.0.1 h1:TLuKupo69TCn6TQSyGxwI1EblZZEsQ0vMlAFQflz0v0= github.com/mitchellh/go-wordwrap v1.0.1/go.mod h1:R62XHJLzvMFRBbcrT7m7WgmE1eOyTSsCt+hzestvNj0= github.com/moby/spdystream v0.5.0 h1:7r0J1Si3QO/kjRitvSLVVFUjxMEb/YLj6S9FF62JBCU= @@ -295,8 +297,8 @@ golang.org/x/term v0.31.0 h1:erwDkOK1Msy6offm1mOgvspSkslFnIGsFnxOKoufg3o= golang.org/x/term v0.31.0/go.mod h1:R4BeIy7D95HzImkxGkTW1UQTtP54tio2RyHz7PwK0aw= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= -golang.org/x/text v0.23.0 h1:D71I7dUrlY+VX0gQShAThNGHFxZ13dGLBHQLVl1mJlY= -golang.org/x/text v0.23.0/go.mod h1:/BLNzu4aZCJ1+kcD0DNRotWKage4q2rGVAg4o22unh4= +golang.org/x/text v0.24.0 h1:dd5Bzh4yt5KYA8f9CJHCP4FB4D51c2c6JvN37xJJkJ0= +golang.org/x/text v0.24.0/go.mod h1:L8rBsPeo2pSS+xqN0d5u2ikmjtmoJbDBT1b7nHvFCdU= 
golang.org/x/time v0.10.0 h1:3usCWA8tQn0L8+hFJQNgzpWbd89begxN66o1Ojdn5L4= golang.org/x/time v0.10.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= @@ -379,6 +381,8 @@ k8s.io/utils v0.0.0-20250502105355-0f33e8f1c979 h1:jgJW5IePPXLGB8e/1wvd0Ich9QE97 k8s.io/utils v0.0.0-20250502105355-0f33e8f1c979/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= sigs.k8s.io/controller-runtime v0.21.0 h1:CYfjpEuicjUecRk+KAeyYh+ouUBn4llGyDYytIGcJS8= sigs.k8s.io/controller-runtime v0.21.0/go.mod h1:OSg14+F65eWqIu4DceX7k/+QRAbTTvxeQSNSOQpukWM= +sigs.k8s.io/gateway-api v1.3.0 h1:q6okN+/UKDATola4JY7zXzx40WO4VISk7i9DIfOvr9M= +sigs.k8s.io/gateway-api v1.3.0/go.mod h1:d8NV8nJbaRbEKem+5IuxkL8gJGOZ+FJ+NvOIltV8gDk= sigs.k8s.io/json v0.0.0-20241014173422-cfa47c3a1cc8 h1:gBQPwqORJ8d8/YNZWEjoZs7npUVDpVXUUOFfW6CgAqE= sigs.k8s.io/json v0.0.0-20241014173422-cfa47c3a1cc8/go.mod h1:mdzfpAEoE6DHQEN0uh9ZbOCuHbLK5wOm7dK4ctXE9Tg= sigs.k8s.io/kustomize/api v0.19.0 h1:F+2HB2mU1MSiR9Hp1NEgoU2q9ItNOaBJl0I4Dlus5SQ= diff --git a/ray-operator/controllers/ray/rayservice_controller.go b/ray-operator/controllers/ray/rayservice_controller.go index 55a9847c91c..77701ed8179 100644 --- a/ray-operator/controllers/ray/rayservice_controller.go +++ b/ray-operator/controllers/ray/rayservice_controller.go @@ -6,6 +6,7 @@ import ( "fmt" "math" "os" + "reflect" "strconv" "strings" "time" @@ -394,118 +395,6 @@ func setCondition(rayServiceInstance *rayv1.RayService, conditionType rayv1.RayS meta.SetStatusCondition(&rayServiceInstance.Status.Conditions, condition) } -// Checks whether the old and new RayServiceStatus are inconsistent by comparing different fields. -// The RayClusterStatus field is only for observability in RayService CR, and changes to it will not trigger the status update. 
-func inconsistentRayServiceStatus(ctx context.Context, oldStatus rayv1.RayServiceStatus, newStatus rayv1.RayServiceStatus) bool { - logger := ctrl.LoggerFrom(ctx) - if oldStatus.RayClusterName != newStatus.RayClusterName { - logger.Info("inconsistentRayServiceStatus RayService RayClusterName", "oldRayClusterName", oldStatus.RayClusterName, "newRayClusterName", newStatus.RayClusterName) - return true - } - - if len(oldStatus.Applications) != len(newStatus.Applications) { - return true - } - - var ok bool - for appName, newAppStatus := range newStatus.Applications { - var oldAppStatus rayv1.AppStatus - if oldAppStatus, ok = oldStatus.Applications[appName]; !ok { - logger.Info("inconsistentRayServiceStatus RayService new application found", "appName", appName) - return true - } - - if oldAppStatus.Status != newAppStatus.Status { - logger.Info("inconsistentRayServiceStatus RayService application status changed", "appName", appName, "oldStatus", oldAppStatus.Status, "newStatus", newAppStatus.Status) - return true - } else if oldAppStatus.Message != newAppStatus.Message { - logger.Info("inconsistentRayServiceStatus RayService application status message changed", "appName", appName, "oldStatus", oldAppStatus.Message, "newStatus", newAppStatus.Message) - return true - } - - if len(oldAppStatus.Deployments) != len(newAppStatus.Deployments) { - return true - } - - for deploymentName, newDeploymentStatus := range newAppStatus.Deployments { - var oldDeploymentStatus rayv1.ServeDeploymentStatus - if oldDeploymentStatus, ok = oldAppStatus.Deployments[deploymentName]; !ok { - logger.Info("inconsistentRayServiceStatus RayService new deployment found in application", "deploymentName", deploymentName, "appName", appName) - return true - } - - if oldDeploymentStatus.Status != newDeploymentStatus.Status { - logger.Info("inconsistentRayServiceStatus RayService DeploymentStatus changed", "oldDeploymentStatus", oldDeploymentStatus.Status, "newDeploymentStatus", newDeploymentStatus.Status) - return true - } else if oldDeploymentStatus.Message != newDeploymentStatus.Message { - logger.Info("inconsistentRayServiceStatus RayService deployment status message changed", "oldDeploymentStatus", oldDeploymentStatus.Message, "newDeploymentStatus", newDeploymentStatus.Message) - return true - } - } - } - - if features.Enabled(features.RayServiceIncrementalUpgrade) { - // Also check for changes in IncrementalUpgrade related Status fields. 
- if (oldStatus.TrafficRoutedPercent == nil) != (newStatus.TrafficRoutedPercent == nil) || - (oldStatus.TrafficRoutedPercent != nil && newStatus.TrafficRoutedPercent != nil && - *oldStatus.TrafficRoutedPercent != *newStatus.TrafficRoutedPercent) { - logger.Info("inconsistentRayServiceStatus RayService updated TrafficRoutedPercent", - "old TrafficRoutedPercent", oldStatus.TrafficRoutedPercent, - "new TrafficRoutedPercent", newStatus.TrafficRoutedPercent) - return true - } - if (oldStatus.TargetCapacity == nil) != (newStatus.TargetCapacity == nil) || - (oldStatus.TargetCapacity != nil && newStatus.TargetCapacity != nil && - *oldStatus.TargetCapacity != *newStatus.TargetCapacity) { - logger.Info("inconsistentRayServiceStatus RayService updated TargetCapacity", - "old TargetCapacity", oldStatus.TargetCapacity, - "new TargetCapacity", newStatus.TargetCapacity) - return true - } - if (oldStatus.LastTrafficMigratedTime == nil) != (newStatus.LastTrafficMigratedTime == nil) || - (oldStatus.LastTrafficMigratedTime != nil && newStatus.LastTrafficMigratedTime != nil && - !oldStatus.LastTrafficMigratedTime.Equal(newStatus.LastTrafficMigratedTime)) { - logger.Info("inconsistentRayServiceStatus RayService updated LastTrafficMigratedTime", - "old LastTrafficMigratedTime", oldStatus.LastTrafficMigratedTime, - "new LastTrafficMigratedTime", newStatus.LastTrafficMigratedTime) - return true - } - } - - return false -} - -// Determine whether to update the status of the RayService instance. -func inconsistentRayServiceStatuses(ctx context.Context, oldStatus rayv1.RayServiceStatuses, newStatus rayv1.RayServiceStatuses) bool { - logger := ctrl.LoggerFrom(ctx) - if oldStatus.ServiceStatus != newStatus.ServiceStatus { - logger.Info("inconsistentRayServiceStatus RayService ServiceStatus changed", "oldServiceStatus", oldStatus.ServiceStatus, "newServiceStatus", newStatus.ServiceStatus) - return true - } - - if oldStatus.NumServeEndpoints != newStatus.NumServeEndpoints { - logger.Info("inconsistentRayServiceStatus RayService NumServeEndpoints changed", "oldNumServeEndpoints", oldStatus.NumServeEndpoints, "newNumServeEndpoints", newStatus.NumServeEndpoints) - return true - } - - if !reflect.DeepEqual(oldStatus.Conditions, newStatus.Conditions) { - logger.Info("inconsistentRayServiceStatus RayService Conditions changed") - return true - } - - if inconsistentRayServiceStatus(ctx, oldStatus.ActiveServiceStatus, newStatus.ActiveServiceStatus) { - logger.Info("inconsistentRayServiceStatus RayService ActiveServiceStatus changed") - return true - } - - if inconsistentRayServiceStatus(ctx, oldStatus.PendingServiceStatus, newStatus.PendingServiceStatus) { - logger.Info("inconsistentRayServiceStatus RayService PendingServiceStatus changed") - return true - } - - return false -} - // SetupWithManager sets up the controller with the Manager. func (r *RayServiceReconciler) SetupWithManager(mgr ctrl.Manager, reconcileConcurrency int) error { return ctrl.NewControllerManagedBy(mgr). @@ -1281,7 +1170,7 @@ func (r *RayServiceReconciler) checkIfNeedIncrementalUpgradeUpdate(ctx context.C // reconcileServeTargetCapacity reconciles the target_capacity of the ServeConfig for a given RayCluster during // an IncrementalUpgrade while also updating the Status.TargetCapacity of the Active and Pending RayServices. 
-func (r *RayServiceReconciler) reconcileServeTargetCapacity(ctx context.Context, rayServiceInstance *rayv1.RayService, rayClusterInstance *rayv1.RayCluster, rayDashboardClient utils.RayDashboardClientInterface) error { +func (r *RayServiceReconciler) reconcileServeTargetCapacity(ctx context.Context, rayServiceInstance *rayv1.RayService, rayClusterInstance *rayv1.RayCluster, rayDashboardClient dashboardclient.RayDashboardClientInterface) error { logger := ctrl.LoggerFrom(ctx) logger.Info("reconcileServeTargetCapacity", "RayService", rayServiceInstance.Name) diff --git a/ray-operator/controllers/ray/utils/consistency.go b/ray-operator/controllers/ray/utils/consistency.go index ba3d28824b0..4d04e9f5e3d 100644 --- a/ray-operator/controllers/ray/utils/consistency.go +++ b/ray-operator/controllers/ray/utils/consistency.go @@ -30,14 +30,6 @@ func InconsistentRayClusterStatus(oldStatus rayv1.RayClusterStatus, newStatus ra if !reflect.DeepEqual(oldStatus.Conditions, newStatus.Conditions) { return true } - if features.Enabled(features.RayServiceIncrementalUpgrade) { - // Also check for changes in IncrementalUpgrade related Status fields. - if oldStatus.TrafficRoutedPercent != newStatus.TrafficRoutedPercent || - oldStatus.TargetCapacity != newStatus.TargetCapacity || - oldStatus.LastTrafficMigratedTime != newStatus.LastTrafficMigratedTime { - return true - } - } return false } @@ -83,6 +75,15 @@ func inconsistentRayServiceStatus(oldStatus rayv1.RayServiceStatus, newStatus ra } } + if features.Enabled(features.RayServiceIncrementalUpgrade) { + // Also check for changes in IncrementalUpgrade related Status fields. + if oldStatus.TrafficRoutedPercent != newStatus.TrafficRoutedPercent || + oldStatus.TargetCapacity != newStatus.TargetCapacity || + oldStatus.LastTrafficMigratedTime != newStatus.LastTrafficMigratedTime { + return true + } + } + return false } diff --git a/ray-operator/test/e2eincrementalupgrade/rayservice_incremental_upgrade_test.go b/ray-operator/test/e2eincrementalupgrade/rayservice_incremental_upgrade_test.go index ce3bb1f1757..8a4aa517cb5 100644 --- a/ray-operator/test/e2eincrementalupgrade/rayservice_incremental_upgrade_test.go +++ b/ray-operator/test/e2eincrementalupgrade/rayservice_incremental_upgrade_test.go @@ -15,7 +15,6 @@ import ( "github.com/ray-project/kuberay/ray-operator/controllers/ray/utils" rayv1ac "github.com/ray-project/kuberay/ray-operator/pkg/client/applyconfiguration/ray/v1" "github.com/ray-project/kuberay/ray-operator/pkg/features" - "github.com/ray-project/kuberay/ray-operator/test/sampleyaml" . 
"github.com/ray-project/kuberay/ray-operator/test/support" ) @@ -81,7 +80,7 @@ func TestRayServiceIncrementalUpgrade(t *testing.T) { // Create curl pod to test traffic routing through Gateway to RayService curlPodName := "curl-pod" curlContainerName := "curl-container" - curlPod, err := CreateCurlPod(test, curlPodName, curlContainerName, namespace.Name) + curlPod, err := CreateCurlPod(g, test, curlPodName, curlContainerName, namespace.Name) g.Expect(err).NotTo(HaveOccurred()) LogWithTimestamp(test.T(), "Waiting for Curl Pod %s to be ready", curlPodName) @@ -89,7 +88,7 @@ func TestRayServiceIncrementalUpgrade(t *testing.T) { updatedPod, err := test.Client().Core().CoreV1().Pods(curlPod.Namespace).Get(test.Ctx(), curlPod.Name, metav1.GetOptions{}) g.Expect(err).NotTo(HaveOccurred()) return updatedPod - }, TestTimeoutShort).Should(WithTransform(sampleyaml.IsPodRunningAndReady, BeTrue())) + }, TestTimeoutShort).Should(WithTransform(IsPodRunningAndReady, BeTrue())) // Get the Gateway endpoint to send requests to gatewayIP := GetGatewayIP(gateway) From 424d4a0074a12dd49f1d8a00023c734b06acbf94 Mon Sep 17 00:00:00 2001 From: Ryan O'Leary <113500783+ryanaoleary@users.noreply.github.com> Date: Mon, 22 Sep 2025 23:23:54 -0700 Subject: [PATCH 12/56] Update ray-operator/controllers/ray/rayservice_controller.go Co-authored-by: Han-Ju Chen (Future-Outlier) Signed-off-by: Ryan O'Leary <113500783+ryanaoleary@users.noreply.github.com> --- ray-operator/controllers/ray/rayservice_controller.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ray-operator/controllers/ray/rayservice_controller.go b/ray-operator/controllers/ray/rayservice_controller.go index 77701ed8179..f71bdadc006 100644 --- a/ray-operator/controllers/ray/rayservice_controller.go +++ b/ray-operator/controllers/ray/rayservice_controller.go @@ -1135,8 +1135,8 @@ func (r *RayServiceReconciler) checkIfNeedIncrementalUpgradeUpdate(ctx context.C // Validate Gateway and HTTPRoute objects are ready gatewayInstance := &gwv1.Gateway{} if err := r.Get(ctx, common.RayServiceGatewayNamespacedName(rayServiceInstance), gatewayInstance); err != nil { - return false, "Failed to retrieve Gateway for RayService." - } + return false, fmt.Errorf("Failed to retrieve Gateway for RayService: %w", err) +} if !utils.IsGatewayReady(gatewayInstance) { return false, "Gateway for RayService IncrementalUpgrade is not ready." 
} From 9d6070b7d1323a5f9b3441ad599e722345801402 Mon Sep 17 00:00:00 2001 From: Ryan O'Leary <113500783+ryanaoleary@users.noreply.github.com> Date: Tue, 23 Sep 2025 03:02:21 -0700 Subject: [PATCH 13/56] Update ray-operator/controllers/ray/rayservice_controller.go Co-authored-by: Han-Ju Chen (Future-Outlier) Signed-off-by: Ryan O'Leary <113500783+ryanaoleary@users.noreply.github.com> --- ray-operator/controllers/ray/rayservice_controller.go | 1 + 1 file changed, 1 insertion(+) diff --git a/ray-operator/controllers/ray/rayservice_controller.go b/ray-operator/controllers/ray/rayservice_controller.go index f71bdadc006..1ad1708de17 100644 --- a/ray-operator/controllers/ray/rayservice_controller.go +++ b/ray-operator/controllers/ray/rayservice_controller.go @@ -533,6 +533,7 @@ func (r *RayServiceReconciler) reconcileGateway(ctx context.Context, rayServiceI existingGateway.Spec = desiredGateway.Spec if err := r.Update(ctx, existingGateway); err != nil { r.Recorder.Eventf(rayServiceInstance, corev1.EventTypeWarning, string(utils.FailedToUpdateGateway), "Failed to update the Gateway %s/%s: %v", existingGateway.Namespace, existingGateway.Name, err) + return nil, err } r.Recorder.Eventf(rayServiceInstance, corev1.EventTypeNormal, string(utils.UpdatedGateway), "Updated the Gateway %s/%s", existingGateway.Namespace, existingGateway.Name) } From 5ceae5038eec5167dba6058f9dc902b655b41145 Mon Sep 17 00:00:00 2001 From: Ryan O'Leary <113500783+ryanaoleary@users.noreply.github.com> Date: Tue, 23 Sep 2025 03:03:06 -0700 Subject: [PATCH 14/56] Update ray-operator/controllers/ray/rayservice_controller.go Co-authored-by: Han-Ju Chen (Future-Outlier) Signed-off-by: Ryan O'Leary <113500783+ryanaoleary@users.noreply.github.com> --- ray-operator/controllers/ray/rayservice_controller.go | 1 + 1 file changed, 1 insertion(+) diff --git a/ray-operator/controllers/ray/rayservice_controller.go b/ray-operator/controllers/ray/rayservice_controller.go index 1ad1708de17..17ed5d42c2b 100644 --- a/ray-operator/controllers/ray/rayservice_controller.go +++ b/ray-operator/controllers/ray/rayservice_controller.go @@ -756,6 +756,7 @@ func (r *RayServiceReconciler) reconcileHTTPRoute(ctx context.Context, rayServic existingHTTPRoute.Spec = desiredHTTPRoute.Spec if err := r.Update(ctx, existingHTTPRoute); err != nil { r.Recorder.Eventf(rayServiceInstance, corev1.EventTypeWarning, string(utils.FailedToUpdateHTTPRoute), "Failed to update the HTTPRoute %s/%s: %v", existingHTTPRoute.Namespace, existingHTTPRoute.Name, err) + return nil, err } r.Recorder.Eventf(rayServiceInstance, corev1.EventTypeNormal, string(utils.UpdatedHTTPRoute), "Updated the HTTPRoute %s/%s", existingHTTPRoute.Namespace, existingHTTPRoute.Name) } From dc5018f09e0923a69a1a31e1a605328e8d35b709 Mon Sep 17 00:00:00 2001 From: Ryan O'Leary Date: Mon, 29 Sep 2025 12:12:47 +0000 Subject: [PATCH 15/56] Fix error return Signed-off-by: Ryan O'Leary --- ray-operator/controllers/ray/rayservice_controller.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ray-operator/controllers/ray/rayservice_controller.go b/ray-operator/controllers/ray/rayservice_controller.go index 17ed5d42c2b..91aa2de325f 100644 --- a/ray-operator/controllers/ray/rayservice_controller.go +++ b/ray-operator/controllers/ray/rayservice_controller.go @@ -1137,15 +1137,15 @@ func (r *RayServiceReconciler) checkIfNeedIncrementalUpgradeUpdate(ctx context.C // Validate Gateway and HTTPRoute objects are ready gatewayInstance := &gwv1.Gateway{} if err := r.Get(ctx, 
common.RayServiceGatewayNamespacedName(rayServiceInstance), gatewayInstance); err != nil { - return false, fmt.Errorf("Failed to retrieve Gateway for RayService: %w", err) -} + return false, fmt.Sprintf("Failed to retrieve Gateway for RayService: %v", err) + } if !utils.IsGatewayReady(gatewayInstance) { return false, "Gateway for RayService IncrementalUpgrade is not ready." } httpRouteInstance := &gwv1.HTTPRoute{} if err := r.Get(ctx, common.RayServiceHTTPRouteNamespacedName(rayServiceInstance), httpRouteInstance); err != nil { - return false, "Failed to retrieve HTTPRoute for RayService." + return false, fmt.Sprintf("Failed to retrieve HTTPRoute for RayService: %v", err) } if !utils.IsHTTPRouteReady(gatewayInstance, httpRouteInstance) { return false, "HTTPRoute for RayService IncrementalUpgrade is not ready." From e7af14b1a743ea826b5d07af6a6d250e52bdbfcb Mon Sep 17 00:00:00 2001 From: Ryan O'Leary Date: Mon, 29 Sep 2025 12:13:31 +0000 Subject: [PATCH 16/56] Add RayServiceIncrementalUpgrade feature gate option to helm Signed-off-by: Ryan O'Leary --- helm-chart/kuberay-operator/values.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/helm-chart/kuberay-operator/values.yaml b/helm-chart/kuberay-operator/values.yaml index 91cc9b0f578..78b6ac6e592 100644 --- a/helm-chart/kuberay-operator/values.yaml +++ b/helm-chart/kuberay-operator/values.yaml @@ -117,6 +117,8 @@ featureGates: enabled: true - name: RayJobDeletionPolicy enabled: false +- name: RayServiceIncrementalUpgrade + enabled: true # Configurations for KubeRay operator metrics. metrics: From bdcd4014cd74b827c6d0b3b85e78f1fcbc1dae00 Mon Sep 17 00:00:00 2001 From: Ryan O'Leary Date: Mon, 29 Sep 2025 12:31:20 +0000 Subject: [PATCH 17/56] Remove unnecessary perms Signed-off-by: Ryan O'Leary --- ray-operator/config/rbac/role.yaml | 3 --- ray-operator/controllers/ray/rayservice_controller.go | 4 ++-- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/ray-operator/config/rbac/role.yaml b/ray-operator/config/rbac/role.yaml index 3d0fc924b13..f3bc0b272ce 100644 --- a/ray-operator/config/rbac/role.yaml +++ b/ray-operator/config/rbac/role.yaml @@ -116,10 +116,7 @@ rules: - create - delete - get - - list - - patch - update - - watch - apiGroups: - networking.k8s.io resources: diff --git a/ray-operator/controllers/ray/rayservice_controller.go b/ray-operator/controllers/ray/rayservice_controller.go index 91aa2de325f..a9aa5d882f1 100644 --- a/ray-operator/controllers/ray/rayservice_controller.go +++ b/ray-operator/controllers/ray/rayservice_controller.go @@ -93,8 +93,8 @@ func NewRayServiceReconciler(_ context.Context, mgr manager.Manager, provider ut // +kubebuilder:rbac:groups=core,resources=services/proxy,verbs=get;update;patch // +kubebuilder:rbac:groups=coordination.k8s.io,resources=leases,verbs=get;list;create;update // +kubebuilder:rbac:groups=core,resources=serviceaccounts,verbs=get;list;watch;create;delete -// +kubebuilder:rbac:groups="gateway.networking.k8s.io",resources=gateways,verbs=get;list;watch;create;update;patch;delete -// +kubebuilder:rbac:groups="gateway.networking.k8s.io",resources=httproutes,verbs=get;list;watch;create;update;patch;delete +// +kubebuilder:rbac:groups="gateway.networking.k8s.io",resources=gateways,verbs=get;create;update;delete +// +kubebuilder:rbac:groups="gateway.networking.k8s.io",resources=httproutes,verbs=get;create;update;delete // +kubebuilder:rbac:groups="rbac.authorization.k8s.io",resources=roles,verbs=get;list;watch;create;delete;update // 
+kubebuilder:rbac:groups="rbac.authorization.k8s.io",resources=rolebindings,verbs=get;list;watch;create;delete From 0d145bce31201c83874d7cab6e3bf269c1936440 Mon Sep 17 00:00:00 2001 From: Ryan O'Leary Date: Mon, 29 Sep 2025 12:37:06 +0000 Subject: [PATCH 18/56] Remove delete perm and run lint Signed-off-by: Ryan O'Leary --- helm-chart/kuberay-operator/README.md | 2 ++ ray-operator/config/rbac/role.yaml | 1 - ray-operator/controllers/ray/rayservice_controller.go | 4 ++-- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/helm-chart/kuberay-operator/README.md b/helm-chart/kuberay-operator/README.md index fedcc71829b..179ab95cd0c 100644 --- a/helm-chart/kuberay-operator/README.md +++ b/helm-chart/kuberay-operator/README.md @@ -172,6 +172,8 @@ spec: | featureGates[0].enabled | bool | `true` | | | featureGates[1].name | string | `"RayJobDeletionPolicy"` | | | featureGates[1].enabled | bool | `false` | | +| featureGates[2].name | string | `"RayServiceIncrementalUpgrade"` | | +| featureGates[2].enabled | bool | `true` | | | metrics.enabled | bool | `true` | Whether KubeRay operator should emit control plane metrics. | | metrics.serviceMonitor.enabled | bool | `false` | Enable a prometheus ServiceMonitor | | metrics.serviceMonitor.interval | string | `"30s"` | Prometheus ServiceMonitor interval | diff --git a/ray-operator/config/rbac/role.yaml b/ray-operator/config/rbac/role.yaml index f3bc0b272ce..f6890110e07 100644 --- a/ray-operator/config/rbac/role.yaml +++ b/ray-operator/config/rbac/role.yaml @@ -114,7 +114,6 @@ rules: - httproutes verbs: - create - - delete - get - update - apiGroups: diff --git a/ray-operator/controllers/ray/rayservice_controller.go b/ray-operator/controllers/ray/rayservice_controller.go index a9aa5d882f1..9b07a069e2f 100644 --- a/ray-operator/controllers/ray/rayservice_controller.go +++ b/ray-operator/controllers/ray/rayservice_controller.go @@ -93,8 +93,8 @@ func NewRayServiceReconciler(_ context.Context, mgr manager.Manager, provider ut // +kubebuilder:rbac:groups=core,resources=services/proxy,verbs=get;update;patch // +kubebuilder:rbac:groups=coordination.k8s.io,resources=leases,verbs=get;list;create;update // +kubebuilder:rbac:groups=core,resources=serviceaccounts,verbs=get;list;watch;create;delete -// +kubebuilder:rbac:groups="gateway.networking.k8s.io",resources=gateways,verbs=get;create;update;delete -// +kubebuilder:rbac:groups="gateway.networking.k8s.io",resources=httproutes,verbs=get;create;update;delete +// +kubebuilder:rbac:groups="gateway.networking.k8s.io",resources=gateways,verbs=get;create;update; +// +kubebuilder:rbac:groups="gateway.networking.k8s.io",resources=httproutes,verbs=get;create;update; // +kubebuilder:rbac:groups="rbac.authorization.k8s.io",resources=roles,verbs=get;list;watch;create;delete;update // +kubebuilder:rbac:groups="rbac.authorization.k8s.io",resources=rolebindings,verbs=get;list;watch;create;delete From ebbd2806e89015d9458d4bb98b1c3792b46d5aa5 Mon Sep 17 00:00:00 2001 From: Ryan O'Leary Date: Mon, 29 Sep 2025 12:48:21 +0000 Subject: [PATCH 19/56] Fix helm roles Signed-off-by: Ryan O'Leary --- helm-chart/kuberay-operator/templates/_helpers.tpl | 4 ---- 1 file changed, 4 deletions(-) diff --git a/helm-chart/kuberay-operator/templates/_helpers.tpl b/helm-chart/kuberay-operator/templates/_helpers.tpl index a827c92ad4b..143746b07d1 100644 --- a/helm-chart/kuberay-operator/templates/_helpers.tpl +++ b/helm-chart/kuberay-operator/templates/_helpers.tpl @@ -229,12 +229,8 @@ rules: - httproutes verbs: - create - - delete - get - - 
list - - patch - update - - watch - apiGroups: - networking.k8s.io resources: From 7ac33711ecd14c8e637dee673e07b97c90ba36fe Mon Sep 17 00:00:00 2001 From: Ryan O'Leary Date: Tue, 30 Sep 2025 01:03:49 +0000 Subject: [PATCH 20/56] add back required perms Signed-off-by: Ryan O'Leary --- helm-chart/kuberay-operator/templates/_helpers.tpl | 2 ++ ray-operator/config/rbac/role.yaml | 2 ++ ray-operator/controllers/ray/rayservice_controller.go | 4 ++-- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/helm-chart/kuberay-operator/templates/_helpers.tpl b/helm-chart/kuberay-operator/templates/_helpers.tpl index 143746b07d1..49bbd71f490 100644 --- a/helm-chart/kuberay-operator/templates/_helpers.tpl +++ b/helm-chart/kuberay-operator/templates/_helpers.tpl @@ -230,6 +230,8 @@ rules: verbs: - create - get + - list + - watch - update - apiGroups: - networking.k8s.io diff --git a/ray-operator/config/rbac/role.yaml b/ray-operator/config/rbac/role.yaml index f6890110e07..9ea1db93190 100644 --- a/ray-operator/config/rbac/role.yaml +++ b/ray-operator/config/rbac/role.yaml @@ -115,7 +115,9 @@ rules: verbs: - create - get + - list - update + - watch - apiGroups: - networking.k8s.io resources: diff --git a/ray-operator/controllers/ray/rayservice_controller.go b/ray-operator/controllers/ray/rayservice_controller.go index 9b07a069e2f..d2724d00cc1 100644 --- a/ray-operator/controllers/ray/rayservice_controller.go +++ b/ray-operator/controllers/ray/rayservice_controller.go @@ -93,8 +93,8 @@ func NewRayServiceReconciler(_ context.Context, mgr manager.Manager, provider ut // +kubebuilder:rbac:groups=core,resources=services/proxy,verbs=get;update;patch // +kubebuilder:rbac:groups=coordination.k8s.io,resources=leases,verbs=get;list;create;update // +kubebuilder:rbac:groups=core,resources=serviceaccounts,verbs=get;list;watch;create;delete -// +kubebuilder:rbac:groups="gateway.networking.k8s.io",resources=gateways,verbs=get;create;update; -// +kubebuilder:rbac:groups="gateway.networking.k8s.io",resources=httproutes,verbs=get;create;update; +// +kubebuilder:rbac:groups="gateway.networking.k8s.io",resources=gateways,verbs=get;list;watch;create;update; +// +kubebuilder:rbac:groups="gateway.networking.k8s.io",resources=httproutes,verbs=get;list;watch;create;update; // +kubebuilder:rbac:groups="rbac.authorization.k8s.io",resources=roles,verbs=get;list;watch;create;delete;update // +kubebuilder:rbac:groups="rbac.authorization.k8s.io",resources=rolebindings,verbs=get;list;watch;create;delete From fd5a6573d3746cb10cfa691ba32e57d05d89cfcc Mon Sep 17 00:00:00 2001 From: Ryan O'Leary <113500783+ryanaoleary@users.noreply.github.com> Date: Wed, 1 Oct 2025 02:08:07 -0700 Subject: [PATCH 21/56] Update ray-operator/controllers/ray/utils/validation.go Co-authored-by: Han-Ju Chen (Future-Outlier) Signed-off-by: Ryan O'Leary <113500783+ryanaoleary@users.noreply.github.com> --- ray-operator/controllers/ray/utils/validation.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ray-operator/controllers/ray/utils/validation.go b/ray-operator/controllers/ray/utils/validation.go index 828b679ee70..bf7f8ab1aaa 100644 --- a/ray-operator/controllers/ray/utils/validation.go +++ b/ray-operator/controllers/ray/utils/validation.go @@ -338,7 +338,7 @@ func ValidateIncrementalUpgradeOptions(rayService *rayv1.RayService) error { return fmt.Errorf("IncrementalUpgradeOptions are required for IncrementalUpgrade") } - if options.MaxSurgePercent != nil && (*options.MaxSurgePercent < 0 || *options.MaxSurgePercent > 100) { + if 
*options.MaxSurgePercent < 0 || *options.MaxSurgePercent > 100 { return fmt.Errorf("maxSurgePercent must be between 0 and 100") } From a87fb1e90d1c1571098d0139024c132d17e848b8 Mon Sep 17 00:00:00 2001 From: Ryan O'Leary <113500783+ryanaoleary@users.noreply.github.com> Date: Wed, 1 Oct 2025 02:11:25 -0700 Subject: [PATCH 22/56] Update ray-operator/controllers/ray/utils/util.go Co-authored-by: Han-Ju Chen (Future-Outlier) Signed-off-by: Ryan O'Leary <113500783+ryanaoleary@users.noreply.github.com> --- ray-operator/controllers/ray/utils/util.go | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/ray-operator/controllers/ray/utils/util.go b/ray-operator/controllers/ray/utils/util.go index 95765b208b9..72f8ae9e57d 100644 --- a/ray-operator/controllers/ray/utils/util.go +++ b/ray-operator/controllers/ray/utils/util.go @@ -787,8 +787,7 @@ func GetGatewayListenersForRayService(rayServiceInstance *rayv1.RayService) []gw listener := gwv1.Listener{ Name: gwv1.SectionName(listenerName), Protocol: gwv1.HTTPProtocolType, // only support HTTP - Port: gwv1.PortNumber(int32(80)), // hardcoded to 80 for now - } + Port: utils.FindContainerPort(rayContainer, utils.ServingPortName, utils.DefaultServingPort) listeners = append(listeners, listener) return listeners From 5e092931124dc18de3ad96a02eacf0b72a56c901 Mon Sep 17 00:00:00 2001 From: Ryan O'Leary <113500783+ryanaoleary@users.noreply.github.com> Date: Wed, 1 Oct 2025 02:15:04 -0700 Subject: [PATCH 23/56] Update ray-operator/controllers/ray/rayservice_controller.go Co-authored-by: Han-Ju Chen (Future-Outlier) Signed-off-by: Ryan O'Leary <113500783+ryanaoleary@users.noreply.github.com> --- ray-operator/controllers/ray/rayservice_controller.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ray-operator/controllers/ray/rayservice_controller.go b/ray-operator/controllers/ray/rayservice_controller.go index d2724d00cc1..c18eb1b1584 100644 --- a/ray-operator/controllers/ray/rayservice_controller.go +++ b/ray-operator/controllers/ray/rayservice_controller.go @@ -557,7 +557,7 @@ func (r *RayServiceReconciler) createHTTPRoute(ctx context.Context, rayServiceIn if rayServiceInstance.Spec.HTTPRoute != "" { httpRouteName = rayServiceInstance.Spec.HTTPRoute } else { - httpRouteName = fmt.Sprintf("httproute-%s", rayServiceInstance.Name) + httpRouteName = fmt.Sprintf("httproute-%s", gatewayInstance.Name) } httpRouteName = utils.CheckHTTPRouteName(httpRouteName) desiredHTTPRoute := &gwv1.HTTPRoute{ From 3bc8ab5ef166583f330fcb4ffa49fa058310c4f9 Mon Sep 17 00:00:00 2001 From: Ryan O'Leary Date: Wed, 1 Oct 2025 11:59:54 +0000 Subject: [PATCH 24/56] Change controller to use two serve services during upgrade Signed-off-by: Ryan O'Leary --- .../controllers/ray/common/service.go | 7 +- .../controllers/ray/rayservice_controller.go | 98 +++++-- .../ray/rayservice_controller_unit_test.go | 250 +++++++++++++----- .../rayservice_incremental_upgrade_test.go | 35 ++- 4 files changed, 304 insertions(+), 86 deletions(-) diff --git a/ray-operator/controllers/ray/common/service.go b/ray-operator/controllers/ray/common/service.go index 71cea97c005..7675a30b3bb 100644 --- a/ray-operator/controllers/ray/common/service.go +++ b/ray-operator/controllers/ray/common/service.go @@ -184,7 +184,10 @@ func BuildServeService(ctx context.Context, rayService rayv1.RayService, rayClus namespace := rayCluster.Namespace crdType := utils.RayClusterCRD if isRayService { - name = rayService.Name + // For IncrementalUpgrade, the name is based on the unique RayCluster. 
+ if !utils.IsIncrementalUpgradeEnabled(&rayService.Spec) { + name = rayService.Name + } namespace = rayService.Namespace crdType = utils.RayServiceCRD } @@ -225,7 +228,7 @@ func BuildServeService(ctx context.Context, rayService rayv1.RayService, rayClus "otherwise, the Kubernetes service for Ray Serve will not be created.") } - if rayService.Spec.ServeService != nil { + if rayService.Spec.ServeService != nil && !utils.IsIncrementalUpgradeEnabled(&rayService.Spec) { // Use the provided "custom" ServeService. // Deep copy the ServeService to avoid modifying the original object serveService := rayService.Spec.ServeService.DeepCopy() diff --git a/ray-operator/controllers/ray/rayservice_controller.go b/ray-operator/controllers/ray/rayservice_controller.go index c18eb1b1584..28b18b7efc5 100644 --- a/ray-operator/controllers/ray/rayservice_controller.go +++ b/ray-operator/controllers/ray/rayservice_controller.go @@ -149,7 +149,14 @@ func (r *RayServiceReconciler) Reconcile(ctx context.Context, request ctrl.Reque // Check if IncrementalUpgrade is enabled, if so reconcile Gateway objects. if utils.IsIncrementalUpgradeEnabled(&rayServiceInstance.Spec) { - // Creates a Gateway CR that points to the head services of + // Ensure per-cluster Serve service exists for the active and pending RayClusters. + if _, err = r.reconcilePerClusterServeService(ctx, rayServiceInstance, activeRayClusterInstance); err != nil { + return ctrl.Result{RequeueAfter: ServiceDefaultRequeueDuration}, err + } + if _, err = r.reconcilePerClusterServeService(ctx, rayServiceInstance, pendingRayClusterInstance); err != nil { + return ctrl.Result{RequeueAfter: ServiceDefaultRequeueDuration}, err + } + // Creates a Gateway CR that points to the Serve services of // the active and pending (if it exists) RayClusters. For incremental upgrades, // the Gateway endpoint is used rather than the Serve service. gateway, err := r.reconcileGateway(ctx, rayServiceInstance) @@ -330,7 +337,12 @@ func (r *RayServiceReconciler) calculateStatus(ctx context.Context, rayServiceIn } serveEndPoints := &corev1.Endpoints{} - if err := r.Get(ctx, common.RayServiceServeServiceNamespacedName(rayServiceInstance), serveEndPoints); err != nil && !errors.IsNotFound(err) { + serveServiceName := common.RayServiceServeServiceNamespacedName(rayServiceInstance) + // For IncrementalUpgrade, the Serve service name is based on the RayCluster. + if utils.IsIncrementalUpgradeEnabled(&rayServiceInstance.Spec) && activeCluster != nil { + serveServiceName.Name = utils.GenerateServeServiceName(activeCluster.Name) + } + if err := r.Get(ctx, serveServiceName, serveEndPoints); err != nil && !errors.IsNotFound(err) { return err } @@ -343,6 +355,21 @@ func (r *RayServiceReconciler) calculateStatus(ctx context.Context, rayServiceIn if numServeEndpoints > math.MaxInt32 { return errstd.New("numServeEndpoints exceeds math.MaxInt32") } + + // During an IncrementalUpgrade, the pending RayCluster is also serving. 
+ if utils.IsIncrementalUpgradeEnabled(&rayServiceInstance.Spec) && pendingCluster != nil { + pendingServeServiceName := common.RayClusterServeServiceNamespacedName(pendingCluster) + if err := r.Get(ctx, pendingServeServiceName, serveEndPoints); err != nil && !errors.IsNotFound(err) { + return err + } + for _, subset := range serveEndPoints.Subsets { + numServeEndpoints += len(subset.Addresses) + } + if numServeEndpoints > math.MaxInt32 { + return errstd.New("numServeEndpoints exceeds math.MaxInt32") + } + } + rayServiceInstance.Status.NumServeEndpoints = int32(numServeEndpoints) //nolint:gosec // This is a false positive from gosec. See https://github.com/securego/gosec/issues/1212 for more details. calculateConditions(rayServiceInstance) @@ -583,20 +610,21 @@ func (r *RayServiceReconciler) createHTTPRoute(ctx context.Context, rayServiceIn logger.Error(err, "Failed to retrieve active RayCluster") return nil, err } - if activeRayCluster == nil || activeRayCluster.Status.Head.ServiceName == "" { + if activeRayCluster == nil { logger.Info("Active RayCluster not found, skipping HTTPRoute creation.") return nil, nil } - oldClusterHeadSvcName := activeRayCluster.Status.Head.ServiceName - oldHeadSvc := &corev1.Service{} - if err := r.Get(ctx, client.ObjectKey{Name: oldClusterHeadSvcName, Namespace: rayServiceInstance.Namespace}, oldHeadSvc); err != nil { - logger.Error(err, "Failed to retrieve active RayCluster head service") + // Serve service points to the active RayCluster until the upgrade is complete. + oldClusterServeSvcName := utils.GenerateServeServiceName(activeRayCluster.Name) + oldServeSvc := &corev1.Service{} + if err := r.Get(ctx, client.ObjectKey{Name: oldClusterServeSvcName, Namespace: rayServiceInstance.Namespace}, oldServeSvc); err != nil { + logger.Error(err, "Failed to retrieve active RayCluster serve service.") return nil, err } // Attempt to retrieve pending RayCluster pendingRayCluster, err := r.getRayClusterByNamespacedName(ctx, common.RayServicePendingRayClusterNamespacedName(rayServiceInstance)) - hasPendingCluster := (err == nil && pendingRayCluster != nil && pendingRayCluster.Status.Head.ServiceName != "") + hasPendingCluster := (err == nil && pendingRayCluster != nil) if err != nil && !errors.IsNotFound(err) { logger.Info("Failed to retrieve pending RayCluster.") } @@ -607,16 +635,16 @@ func (r *RayServiceReconciler) createHTTPRoute(ctx context.Context, rayServiceIn // Configure HTTPRoute to split traffic between active and pending clusters during an incremental upgrade if hasPendingCluster { - newClusterHeadSvcName := pendingRayCluster.Status.Head.ServiceName - newHeadSvc := &corev1.Service{} - if err := r.Get(ctx, client.ObjectKey{Name: newClusterHeadSvcName, Namespace: rayServiceInstance.Namespace}, newHeadSvc); err != nil { - logger.Error(err, "Failed to retrieve pending RayCluster head service") + newClusterServeSvcName := utils.GenerateServeServiceName(pendingRayCluster.Name) + newServeSvc := &corev1.Service{} + if err := r.Get(ctx, client.ObjectKey{Name: newClusterServeSvcName, Namespace: rayServiceInstance.Namespace}, newServeSvc); err != nil { + logger.Error(err, "Failed to retrieve pending RayCluster serve service.") return nil, err } options := utils.GetRayServiceIncrementalUpgradeOptions(&rayServiceInstance.Spec) if options == nil { - return nil, errstd.New("Missing RayService IncrementalUpgradeOptions") + return nil, errstd.New("Missing RayService IncrementalUpgradeOptions.") } // Retrieve TrafficRoutedPercent for old and upgraded RayClusters. 
@@ -631,7 +659,7 @@ func (r *RayServiceReconciler) createHTTPRoute(ctx context.Context, rayServiceIn if (newClusterWeight != nil && oldClusterWeight != nil) && (lastTrafficMigratedTime == nil || time.Since(lastTrafficMigratedTime.Time) >= intervalSeconds) { // Wait an initial iteration before migrating StepSizePercent. if lastTrafficMigratedTime != nil { - logger.Info("Updating cluster weights by StepSizePercent each") + logger.Info("Updating active and pending cluster weights each by StepSizePercent.") oldClusterWeight = ptr.To(max(*oldClusterWeight-*options.StepSizePercent, 0)) newClusterWeight = ptr.To(min(*newClusterWeight+*options.StepSizePercent, 100)) } @@ -662,7 +690,7 @@ func (r *RayServiceReconciler) createHTTPRoute(ctx context.Context, rayServiceIn { BackendRef: gwv1.BackendRef{ BackendObjectReference: gwv1.BackendObjectReference{ - Name: gwv1.ObjectName(oldClusterHeadSvcName), + Name: gwv1.ObjectName(oldClusterServeSvcName), Namespace: ptr.To(gwv1.Namespace(rayServiceInstance.Namespace)), Port: ptr.To(gwv1.PortNumber(8000)), // set to Serve port }, @@ -672,7 +700,7 @@ func (r *RayServiceReconciler) createHTTPRoute(ctx context.Context, rayServiceIn { BackendRef: gwv1.BackendRef{ BackendObjectReference: gwv1.BackendObjectReference{ - Name: gwv1.ObjectName(newClusterHeadSvcName), + Name: gwv1.ObjectName(newClusterServeSvcName), Namespace: ptr.To(gwv1.Namespace(rayServiceInstance.Namespace)), Port: ptr.To(gwv1.PortNumber(8000)), }, @@ -689,7 +717,7 @@ func (r *RayServiceReconciler) createHTTPRoute(ctx context.Context, rayServiceIn { BackendRef: gwv1.BackendRef{ BackendObjectReference: gwv1.BackendObjectReference{ - Name: gwv1.ObjectName(oldClusterHeadSvcName), + Name: gwv1.ObjectName(oldClusterServeSvcName), Namespace: ptr.To(gwv1.Namespace(rayServiceInstance.Namespace)), Port: ptr.To(gwv1.PortNumber(8000)), }, @@ -1570,3 +1598,39 @@ func (r *RayServiceReconciler) isHeadPodRunningAndReady(ctx context.Context, ins } return utils.IsRunningAndReady(headPod), nil } + +// reconcilePerClusterServeService reconciles a load-balancing serve service for a given RayCluster. +func (r *RayServiceReconciler) reconcilePerClusterServeService(ctx context.Context, rayServiceInstance *rayv1.RayService, rayClusterInstance *rayv1.RayCluster) (*corev1.Service, error) { + if rayClusterInstance == nil { + return nil, nil + } + + logger := ctrl.LoggerFrom(ctx).WithValues("RayCluster", rayClusterInstance.Name) + + logger.Info("Building per-cluster RayService") + + // Create a serve service for the RayCluster associated with this RayService. During an incremental + // upgrade, this will be called for the pending RayCluster instance. 
+ desiredSvc, err := common.BuildServeService(ctx, *rayServiceInstance, *rayClusterInstance, true) + if err != nil { + logger.Error(err, "Failed to build per-cluster serve service spec") + return nil, err + } + if err := ctrl.SetControllerReference(rayClusterInstance, desiredSvc, r.Scheme); err != nil { + return nil, err + } + + existingSvc := &corev1.Service{} + err = r.Get(ctx, client.ObjectKey{Name: desiredSvc.Name, Namespace: desiredSvc.Namespace}, existingSvc) + if errors.IsNotFound(err) { + logger.Info("Creating new per-cluster serve service for incremental upgrade.", "Service", desiredSvc.Name) + if createErr := r.Create(ctx, desiredSvc); createErr != nil { + return nil, createErr + } + return desiredSvc, nil + } else if err != nil { + return nil, err + } + + return existingSvc, nil +} diff --git a/ray-operator/controllers/ray/rayservice_controller_unit_test.go b/ray-operator/controllers/ray/rayservice_controller_unit_test.go index f82a05f0576..cbd32cb57d9 100644 --- a/ray-operator/controllers/ray/rayservice_controller_unit_test.go +++ b/ray-operator/controllers/ray/rayservice_controller_unit_test.go @@ -1451,67 +1451,55 @@ func TestCreateGateway(t *testing.T) { func TestCreateHTTPRoute(t *testing.T) { // Create re-used runtime objects for test cases - activeService := &corev1.Service{ - ObjectMeta: metav1.ObjectMeta{ - Name: "active-service", - Namespace: "test-ns", - }, - } - pendingService := &corev1.Service{ - ObjectMeta: metav1.ObjectMeta{ - Name: "pending-service", - Namespace: "test-ns", - }, - } + namespace := "test-ns" activeCluster := &rayv1.RayCluster{ ObjectMeta: metav1.ObjectMeta{ Name: "active-ray-cluster", - Namespace: "test-ns", - }, - Status: rayv1.RayClusterStatus{ - Head: rayv1.HeadInfo{ - ServiceName: "active-service", - }, + Namespace: namespace, }, } pendingCluster := &rayv1.RayCluster{ ObjectMeta: metav1.ObjectMeta{ Name: "pending-ray-cluster", - Namespace: "test-ns", - }, - Status: rayv1.RayClusterStatus{ - Head: rayv1.HeadInfo{ - ServiceName: "pending-service", - }, + Namespace: namespace, }, } gateway := &gwv1.Gateway{ ObjectMeta: metav1.ObjectMeta{ Name: "incremental-ray-service-gateway", - Namespace: "test-ns", + Namespace: namespace, + }, + } + activeServeService := &corev1.Service{ + ObjectMeta: metav1.ObjectMeta{ + Name: utils.GenerateServeServiceName(activeCluster.Name), + Namespace: namespace, + }, + } + pendingServeService := &corev1.Service{ + ObjectMeta: metav1.ObjectMeta{ + Name: utils.GenerateServeServiceName(pendingCluster.Name), + Namespace: namespace, }, } tests := []struct { - name string - rayService *rayv1.RayService - runtimeObjects []runtime.Object - routedPercent int32 - expectError bool + rayService *rayv1.RayService + name string + routedPercent int32 + expectError bool }{ { - name: "valid HTTPRoute creation", - routedPercent: int32(80), - rayService: makeIncrementalUpgradeRayService(true, "gateway-class", ptr.To(int32(50)), ptr.To(int32(1000)), ptr.To(int32(80)), &metav1.Time{Time: time.Now()}), - runtimeObjects: []runtime.Object{activeService, pendingService, pendingCluster, activeCluster, gateway}, - expectError: false, + name: "valid HTTPRoute creation", + routedPercent: int32(80), + rayService: makeIncrementalUpgradeRayService(true, "gateway-class", ptr.To(int32(50)), ptr.To(int32(1000)), ptr.To(int32(80)), &metav1.Time{Time: time.Now()}), + expectError: false, }, { - name: "missing IncrementalUpgradeOptions", - routedPercent: int32(50), - rayService: makeIncrementalUpgradeRayService(false, "gateway-class", ptr.To(int32(50)), 
ptr.To(int32(120)), ptr.To(int32(50)), &metav1.Time{Time: time.Now()}), - runtimeObjects: []runtime.Object{activeService, pendingService, pendingCluster, activeCluster, gateway}, - expectError: true, + name: "missing IncrementalUpgradeOptions", + routedPercent: int32(50), + rayService: makeIncrementalUpgradeRayService(false, "gateway-class", ptr.To(int32(50)), ptr.To(int32(120)), ptr.To(int32(50)), &metav1.Time{Time: time.Now()}), + expectError: true, }, } @@ -1522,7 +1510,13 @@ func TestCreateHTTPRoute(t *testing.T) { _ = rayv1.AddToScheme(newScheme) _ = gwv1.AddToScheme(newScheme) - fakeClient := clientFake.NewClientBuilder().WithScheme(newScheme).WithRuntimeObjects(tt.runtimeObjects...).Build() + // Setup runtime test objects. + runtimeObjects := []runtime.Object{ + tt.rayService, activeServeService, pendingServeService, + pendingCluster, activeCluster, gateway, + } + + fakeClient := clientFake.NewClientBuilder().WithScheme(newScheme).WithRuntimeObjects(runtimeObjects...).Build() reconciler := RayServiceReconciler{ Client: fakeClient, Scheme: newScheme, @@ -1545,8 +1539,8 @@ func TestCreateHTTPRoute(t *testing.T) { rule := route.Spec.Rules[0] require.Len(t, rule.BackendRefs, 2) - assert.Equal(t, gwv1.ObjectName("active-service"), rule.BackendRefs[0].BackendRef.Name) - assert.Equal(t, gwv1.ObjectName("pending-service"), rule.BackendRefs[1].BackendRef.Name) + assert.Equal(t, gwv1.ObjectName(activeServeService.Name), rule.BackendRefs[0].BackendRef.Name) + assert.Equal(t, gwv1.ObjectName(pendingServeService.Name), rule.BackendRefs[1].BackendRef.Name) assert.Equal(t, tt.routedPercent, *rule.BackendRefs[0].Weight) assert.Equal(t, int32(100)-tt.routedPercent, *rule.BackendRefs[1].Weight) @@ -1573,39 +1567,29 @@ func TestReconcileHTTPRoute(t *testing.T) { ptr.To(metav1.Now()), ) - activeService := &corev1.Service{ + activeCluster := &rayv1.RayCluster{ ObjectMeta: metav1.ObjectMeta{ - Name: "active-service", + Name: "active-ray-cluster", Namespace: namespace, }, } - pendingService := &corev1.Service{ + pendingCluster := &rayv1.RayCluster{ ObjectMeta: metav1.ObjectMeta{ - Name: "pending-service", + Name: "pending-ray-cluster", Namespace: namespace, }, } - activeCluster := &rayv1.RayCluster{ + activeServeService := &corev1.Service{ ObjectMeta: metav1.ObjectMeta{ - Name: "active-ray-cluster", + Name: utils.GenerateServeServiceName(activeCluster.Name), Namespace: namespace, }, - Status: rayv1.RayClusterStatus{ - Head: rayv1.HeadInfo{ - ServiceName: "active-service", - }, - }, } - pendingCluster := &rayv1.RayCluster{ + pendingServeService := &corev1.Service{ ObjectMeta: metav1.ObjectMeta{ - Name: "pending-ray-cluster", + Name: utils.GenerateServeServiceName(pendingCluster.Name), Namespace: namespace, }, - Status: rayv1.RayClusterStatus{ - Head: rayv1.HeadInfo{ - ServiceName: "pending-service", - }, - }, } gateway := &gwv1.Gateway{ ObjectMeta: metav1.ObjectMeta{ @@ -1628,7 +1612,7 @@ func TestReconcileHTTPRoute(t *testing.T) { { BackendRef: gwv1.BackendRef{ BackendObjectReference: gwv1.BackendObjectReference{ - Name: "active-service", + Name: gwv1.ObjectName(activeServeService.Name), Namespace: ptr.To(gwv1.Namespace(namespace)), Port: ptr.To(gwv1.PortNumber(8000)), }, @@ -1638,7 +1622,7 @@ func TestReconcileHTTPRoute(t *testing.T) { { BackendRef: gwv1.BackendRef{ BackendObjectReference: gwv1.BackendObjectReference{ - Name: "pending-service", + Name: gwv1.ObjectName(pendingServeService.Name), Namespace: ptr.To(gwv1.Namespace(namespace)), Port: ptr.To(gwv1.PortNumber(8000)), }, @@ -1658,7 +1642,7 @@ 
func TestReconcileHTTPRoute(t *testing.T) { { name: "creates new HTTPRoute if not present", runtimeObjects: []runtime.Object{ - rayService, activeService, pendingService, + rayService, activeServeService, pendingServeService, activeCluster, pendingCluster, gateway, }, expectedRouteName: "httproute-incremental-ray-service", @@ -1667,7 +1651,7 @@ func TestReconcileHTTPRoute(t *testing.T) { { name: "updates HTTPRoute if spec differs", runtimeObjects: []runtime.Object{ - rayService, activeService, pendingService, + rayService, activeServeService, pendingServeService, activeCluster, pendingCluster, gateway, existingHTTPRoute, }, @@ -1696,6 +1680,9 @@ func TestReconcileHTTPRoute(t *testing.T) { assert.Equal(t, tt.expectedRouteName, route.Name) assert.Equal(t, namespace, route.Namespace) + assert.Equal(t, gwv1.ObjectName(activeServeService.Name), route.Spec.Rules[0].BackendRefs[0].Name) + assert.Equal(t, gwv1.ObjectName(pendingServeService.Name), route.Spec.Rules[0].BackendRefs[1].Name) + require.Len(t, route.Spec.Rules[0].BackendRefs, 2) assert.Equal(t, tt.expectedWeight, *route.Spec.Rules[0].BackendRefs[0].Weight) assert.Equal(t, 100-tt.expectedWeight, *route.Spec.Rules[0].BackendRefs[1].Weight) @@ -2031,3 +2018,136 @@ func TestCheckIfNeedIncrementalUpgradeUpdate(t *testing.T) { }) } } + +func TestReconcilePerClusterServeService(t *testing.T) { + features.SetFeatureGateDuringTest(t, features.RayServiceIncrementalUpgrade, true) + + ctx := context.TODO() + namespace := "test-ns" + + // Minimal RayCluster with at least one container. + rayCluster := &rayv1.RayCluster{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-ray-cluster", + Namespace: namespace, + UID: "test-uid", + }, + Spec: rayv1.RayClusterSpec{ + HeadGroupSpec: rayv1.HeadGroupSpec{ + Template: corev1.PodTemplateSpec{ + Spec: corev1.PodSpec{ + Containers: []corev1.Container{ + {Name: "ray-head"}, + }, + }, + }, + }, + }, + } + rayService := makeIncrementalUpgradeRayService( + true, + "gateway-class", + ptr.To(int32(20)), + ptr.To(int32(30)), + ptr.To(int32(80)), + ptr.To(metav1.Now()), + ) + + // The expected pending RayCluster serve service. 
+ expectedServeSvcName := utils.GenerateServeServiceName(rayCluster.Name) + expectedServeService := &corev1.Service{ + ObjectMeta: metav1.ObjectMeta{ + Name: expectedServeSvcName, + Namespace: namespace, + }, + Spec: corev1.ServiceSpec{ + Selector: map[string]string{ + utils.RayClusterLabelKey: rayCluster.Name, + utils.RayClusterServingServiceLabelKey: "true", + }, + }, + } + + tests := []struct { + name string + rayCluster *rayv1.RayCluster + runtimeObjects []runtime.Object + expectServiceCreated bool + expectError bool + }{ + { + name: "RayCluster is nil, no-op.", + rayCluster: nil, + runtimeObjects: []runtime.Object{rayService}, + expectServiceCreated: false, + expectError: false, + }, + { + name: "Create a new Serve service for the RayCluster.", + rayCluster: rayCluster, + runtimeObjects: []runtime.Object{rayService, rayCluster}, + expectServiceCreated: true, + expectError: false, + }, + { + name: "Pending RayCluster serve service already exists, no-op.", + rayCluster: rayCluster, + runtimeObjects: []runtime.Object{rayService, rayCluster, expectedServeService}, + expectServiceCreated: false, + expectError: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + newScheme := runtime.NewScheme() + _ = rayv1.AddToScheme(newScheme) + _ = corev1.AddToScheme(newScheme) + + fakeClient := clientFake.NewClientBuilder().WithScheme(newScheme).WithRuntimeObjects(tt.runtimeObjects...).Build() + reconciler := RayServiceReconciler{ + Client: fakeClient, + Scheme: newScheme, + Recorder: record.NewFakeRecorder(1), + } + + reconciledSvc, err := reconciler.reconcilePerClusterServeService(ctx, rayService, tt.rayCluster) + + if tt.expectError { + require.Error(t, err) + return + } + require.NoError(t, err) + + // No-op case, no service should be created when RayCluster is nil. + if tt.rayCluster == nil { + assert.Nil(t, reconciledSvc) + return + } + + // Validate the expected Serve service exists for the RayCluster. + require.NotNil(t, reconciledSvc) + assert.Equal(t, expectedServeSvcName, reconciledSvc.Name) + + createdSvc := &corev1.Service{} + err = fakeClient.Get(ctx, client.ObjectKey{Name: expectedServeSvcName, Namespace: namespace}, createdSvc) + require.NoError(t, err, "The Serve service should exist in the client") + + // Verify the Serve service selector. + expectedSelector := map[string]string{ + utils.RayClusterLabelKey: rayCluster.Name, + utils.RayClusterServingServiceLabelKey: "true", + } + assert.Equal(t, expectedSelector, createdSvc.Spec.Selector) + + // Validate owner ref is set to the expected RayCluster. 
+ if tt.expectServiceCreated { + require.Len(t, createdSvc.OwnerReferences, 1) + ownerRef := createdSvc.OwnerReferences[0] + assert.Equal(t, rayCluster.Name, ownerRef.Name) + assert.Equal(t, "RayCluster", ownerRef.Kind) + assert.Equal(t, rayCluster.UID, ownerRef.UID) + } + }) + } +} diff --git a/ray-operator/test/e2eincrementalupgrade/rayservice_incremental_upgrade_test.go b/ray-operator/test/e2eincrementalupgrade/rayservice_incremental_upgrade_test.go index 8a4aa517cb5..04c0cff111c 100644 --- a/ray-operator/test/e2eincrementalupgrade/rayservice_incremental_upgrade_test.go +++ b/ray-operator/test/e2eincrementalupgrade/rayservice_incremental_upgrade_test.go @@ -104,7 +104,7 @@ func TestRayServiceIncrementalUpgrade(t *testing.T) { rayService, err = GetRayService(test, namespace.Name, rayService.Name) g.Expect(err).NotTo(HaveOccurred()) - rayService.Spec.RayClusterSpec.WorkerGroupSpecs[0].Template.Spec.Containers[0].Resources.Requests["CPU"] = resource.MustParse("500m") + rayService.Spec.RayClusterSpec.WorkerGroupSpecs[0].Template.Spec.Containers[0].Resources.Requests[corev1.ResourceCPU] = resource.MustParse("500m") serveConfig := rayService.Spec.ServeConfigV2 serveConfig = strings.Replace(serveConfig, "price: 3", "price: 4", -1) serveConfig = strings.Replace(serveConfig, "factor: 5", "factor: 3", -1) @@ -119,6 +119,36 @@ func TestRayServiceIncrementalUpgrade(t *testing.T) { LogWithTimestamp(test.T(), "Waiting for RayService %s/%s UpgradeInProgress condition to be true", rayService.Namespace, rayService.Name) g.Eventually(RayService(test, rayService.Namespace, rayService.Name), TestTimeoutShort).Should(WithTransform(IsRayServiceUpgrading, BeTrue())) + LogWithTimestamp(test.T(), "Verifying temporary service creation and HTTPRoute backends") + upgradingRaySvc, err := GetRayService(test, namespace.Name, rayServiceName) + g.Expect(err).NotTo(HaveOccurred()) + activeClusterName := upgradingRaySvc.Status.ActiveServiceStatus.RayClusterName + g.Expect(activeClusterName).NotTo(BeEmpty(), "The active cluster should be set when a RayService is ready.") + pendingClusterName := upgradingRaySvc.Status.PendingServiceStatus.RayClusterName + g.Expect(pendingClusterName).NotTo(BeEmpty(), "The controller should have created a pending cluster.") + + // Validate serve service for the active cluster exists. + activeServeSvcName := utils.GenerateServeServiceName(activeClusterName) + _, err = test.Client().Core().CoreV1().Services(namespace.Name).Get(test.Ctx(), activeServeSvcName, metav1.GetOptions{}) + g.Expect(err).NotTo(HaveOccurred(), "The serve service for the active cluster should be created.") + + // Validate serve service for the pending cluster has been created for the upgrade. + pendingServeSvcName := utils.GenerateServeServiceName(pendingClusterName) + g.Eventually(func(g Gomega) { + _, err = test.Client().Core().CoreV1().Services(namespace.Name).Get(test.Ctx(), pendingServeSvcName, metav1.GetOptions{}) + g.Expect(err).NotTo(HaveOccurred(), "The serve service for the pending cluster should be created.") + }, TestTimeoutShort).Should(Succeed()) + + // Verify HTTPRoute is pointing to the correct two backends. 
+ g.Eventually(func(g Gomega) { + route, err := GetHTTPRoute(test, namespace.Name, httpRouteName) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(route.Spec.Rules).To(HaveLen(1)) + g.Expect(route.Spec.Rules[0].BackendRefs).To(HaveLen(2)) + g.Expect(string(route.Spec.Rules[0].BackendRefs[0].Name)).To(Equal(activeServeSvcName)) + g.Expect(string(route.Spec.Rules[0].BackendRefs[1].Name)).To(Equal(pendingServeSvcName)) + }, TestTimeoutShort).Should(Succeed()) + LogWithTimestamp(test.T(), "Validating stepwise traffic and capacity migration") stepSizeVal := *stepSize intervalVal := *interval @@ -126,7 +156,8 @@ func TestRayServiceIncrementalUpgrade(t *testing.T) { var lastPendingCapacity, lastPendingTraffic, lastActiveCapacity, lastActiveTraffic int32 - // Validate expected behavior during IncrementalUpgrade + // Validate expected behavior during an IncrementalUpgrade. The following checks ensures + // that no requests are dropped throughout the upgrade process. for { // Wait IntervalSeconds in between updates time.Sleep(time.Duration(intervalVal) * time.Second) From df4e4fd286a34e688e119b447d6b7d80da12ccf9 Mon Sep 17 00:00:00 2001 From: Ryan O'Leary Date: Wed, 1 Oct 2025 13:26:35 +0000 Subject: [PATCH 25/56] Remove Gateway and HTTPRoute API fields Signed-off-by: Ryan O'Leary --- docs/reference/api.md | 2 - .../crds/ray.io_rayservices.yaml | 4 - ray-operator/apis/ray/v1/rayservice_types.go | 8 +- .../config/crd/bases/ray.io_rayservices.yaml | 4 - .../controllers/ray/common/association.go | 16 +--- .../controllers/ray/rayservice_controller.go | 85 +++++++------------ .../ray/rayservice_controller_unit_test.go | 49 +++++++---- .../ray/v1/rayservicespec.go | 18 ---- 8 files changed, 65 insertions(+), 121 deletions(-) diff --git a/docs/reference/api.md b/docs/reference/api.md index 2039c6e5e32..305941e8c2f 100644 --- a/docs/reference/api.md +++ b/docs/reference/api.md @@ -376,8 +376,6 @@ _Appears in:_ | `serveService` _[Service](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#service-v1-core)_ | ServeService is the Kubernetes service for head node and worker nodes who have healthy http proxy to serve traffics. | | | | `upgradeStrategy` _[RayServiceUpgradeStrategy](#rayserviceupgradestrategy)_ | UpgradeStrategy defines the scaling policy used when upgrading the RayService. | | | | `serveConfigV2` _string_ | Important: Run "make" to regenerate code after modifying this file
Defines the applications and deployments to deploy, should be a YAML multi-line scalar string. | | | -| `gateway` _string_ | Gateway is the name of the Gateway object for the RayService to serve traffics during an IncrementalUpgrade.
RayServiceIncrementalUpgrade feature gate must be enabled set the Gateway name. | | | -| `httpRoute` _string_ | HTTPRoute is the name of the HTTPRoute object for the RayService to split traffics during an IncrementalUpgrade.
RayServiceIncrementalUpgrade feature gate must be enabled to set the HTTPRoute name. | | | | `rayClusterConfig` _[RayClusterSpec](#rayclusterspec)_ | | | | | `excludeHeadPodFromServeSvc` _boolean_ | If the field is set to true, the value of the label `ray.io/serve` on the head Pod should always be false.
Therefore, the head Pod's endpoint will not be added to the Kubernetes Serve service. | | | diff --git a/helm-chart/kuberay-operator/crds/ray.io_rayservices.yaml b/helm-chart/kuberay-operator/crds/ray.io_rayservices.yaml index 9cd7c7cc10b..4dd8e3ea48e 100644 --- a/helm-chart/kuberay-operator/crds/ray.io_rayservices.yaml +++ b/helm-chart/kuberay-operator/crds/ray.io_rayservices.yaml @@ -40,10 +40,6 @@ spec: type: integer excludeHeadPodFromServeSvc: type: boolean - gateway: - type: string - httpRoute: - type: string rayClusterConfig: properties: autoscalerOptions: diff --git a/ray-operator/apis/ray/v1/rayservice_types.go b/ray-operator/apis/ray/v1/rayservice_types.go index 6616c71c443..ef11b80d2d6 100644 --- a/ray-operator/apis/ray/v1/rayservice_types.go +++ b/ray-operator/apis/ray/v1/rayservice_types.go @@ -104,13 +104,7 @@ type RayServiceSpec struct { // Important: Run "make" to regenerate code after modifying this file // Defines the applications and deployments to deploy, should be a YAML multi-line scalar string. // +optional - ServeConfigV2 string `json:"serveConfigV2,omitempty"` - // Gateway is the name of the Gateway object for the RayService to serve traffics during an IncrementalUpgrade. - // RayServiceIncrementalUpgrade feature gate must be enabled set the Gateway name. - Gateway string `json:"gateway,omitempty"` - // HTTPRoute is the name of the HTTPRoute object for the RayService to split traffics during an IncrementalUpgrade. - // RayServiceIncrementalUpgrade feature gate must be enabled to set the HTTPRoute name. - HTTPRoute string `json:"httpRoute,omitempty"` + ServeConfigV2 string `json:"serveConfigV2,omitempty"` RayClusterSpec RayClusterSpec `json:"rayClusterConfig"` // If the field is set to true, the value of the label `ray.io/serve` on the head Pod should always be false. // Therefore, the head Pod's endpoint will not be added to the Kubernetes Serve service. 
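A minimal sketch, not taken from this patch series, of what a RayService manifest might look like once the gateway and httpRoute spec fields above are removed: the controller derives the Gateway name from the RayService name ("<name>-gateway") and gives the HTTPRoute an "httproute-" prefixed name, so only the upgrade strategy options remain user-facing. The incrementalUpgradeOptions field name, gatewayClassName, and the type value shown are assumptions; maxSurgePercent, stepSizePercent, and intervalSeconds correspond to the options referenced elsewhere in this series.

  apiVersion: ray.io/v1
  kind: RayService
  metadata:
    name: incremental-ray-service
  spec:
    upgradeStrategy:
      # Assumed type value; the exact enum for enabling incremental upgrades is not shown in this hunk.
      type: NewClusterWithIncrementalUpgrade
      # Assumed field name for the options block (IncrementalUpgradeOptions in the Go API).
      incrementalUpgradeOptions:
        gatewayClassName: gateway-class   # assumed field; matches the class name used in the unit tests
        maxSurgePercent: 80               # validated to be between 0 and 100
        stepSizePercent: 20               # traffic shifted toward the pending cluster each interval
        intervalSeconds: 30               # wait between traffic migration steps
    serveConfigV2: |
      # Serve applications go here.
    rayClusterConfig:
      # RayClusterSpec for the cluster goes here.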
diff --git a/ray-operator/config/crd/bases/ray.io_rayservices.yaml b/ray-operator/config/crd/bases/ray.io_rayservices.yaml index 9cd7c7cc10b..4dd8e3ea48e 100644 --- a/ray-operator/config/crd/bases/ray.io_rayservices.yaml +++ b/ray-operator/config/crd/bases/ray.io_rayservices.yaml @@ -40,10 +40,6 @@ spec: type: integer excludeHeadPodFromServeSvc: type: boolean - gateway: - type: string - httpRoute: - type: string rayClusterConfig: properties: autoscalerOptions: diff --git a/ray-operator/controllers/ray/common/association.go b/ray-operator/controllers/ray/common/association.go index 50714cbe608..f053e8da583 100644 --- a/ray-operator/controllers/ray/common/association.go +++ b/ray-operator/controllers/ray/common/association.go @@ -205,13 +205,7 @@ func RayClusterNetworkResourcesOptions(instance *rayv1.RayCluster) AssociationOp } func RayServiceGatewayNamespacedName(rayService *rayv1.RayService) types.NamespacedName { - var gatewayName string - if rayService.Spec.Gateway != "" { - gatewayName = rayService.Spec.Gateway - } else { - gatewayName = fmt.Sprintf("%s-gateway", rayService.Name) - } - gatewayName = utils.CheckGatewayName(gatewayName) + gatewayName := utils.CheckGatewayName(fmt.Sprintf("%s-gateway", rayService.Name)) return types.NamespacedName{ Name: gatewayName, Namespace: rayService.Namespace, @@ -219,13 +213,7 @@ func RayServiceGatewayNamespacedName(rayService *rayv1.RayService) types.Namespa } func RayServiceHTTPRouteNamespacedName(rayService *rayv1.RayService) types.NamespacedName { - var httpRouteName string - if rayService.Spec.HTTPRoute != "" { - httpRouteName = rayService.Spec.HTTPRoute - } else { - httpRouteName = fmt.Sprintf("httproute-%s", rayService.Name) - } - httpRouteName = utils.CheckHTTPRouteName(httpRouteName) + httpRouteName := utils.CheckHTTPRouteName(fmt.Sprintf("httproute-%s", rayService.Name)) return types.NamespacedName{ Name: httpRouteName, Namespace: rayService.Namespace, diff --git a/ray-operator/controllers/ray/rayservice_controller.go b/ray-operator/controllers/ray/rayservice_controller.go index 28b18b7efc5..a341ee21ba3 100644 --- a/ray-operator/controllers/ray/rayservice_controller.go +++ b/ray-operator/controllers/ray/rayservice_controller.go @@ -150,30 +150,24 @@ func (r *RayServiceReconciler) Reconcile(ctx context.Context, request ctrl.Reque // Check if IncrementalUpgrade is enabled, if so reconcile Gateway objects. if utils.IsIncrementalUpgradeEnabled(&rayServiceInstance.Spec) { // Ensure per-cluster Serve service exists for the active and pending RayClusters. - if _, err = r.reconcilePerClusterServeService(ctx, rayServiceInstance, activeRayClusterInstance); err != nil { + if err = r.reconcilePerClusterServeService(ctx, rayServiceInstance, activeRayClusterInstance); err != nil { return ctrl.Result{RequeueAfter: ServiceDefaultRequeueDuration}, err } - if _, err = r.reconcilePerClusterServeService(ctx, rayServiceInstance, pendingRayClusterInstance); err != nil { + if err = r.reconcilePerClusterServeService(ctx, rayServiceInstance, pendingRayClusterInstance); err != nil { return ctrl.Result{RequeueAfter: ServiceDefaultRequeueDuration}, err } - // Creates a Gateway CR that points to the Serve services of + // Creates or updates a Gateway CR that points to the Serve services of // the active and pending (if it exists) RayClusters. For incremental upgrades, // the Gateway endpoint is used rather than the Serve service. 
- gateway, err := r.reconcileGateway(ctx, rayServiceInstance) + err = r.reconcileGateway(ctx, rayServiceInstance) if err != nil { return ctrl.Result{RequeueAfter: ServiceDefaultRequeueDuration}, client.IgnoreNotFound(err) } - if gateway != nil { - rayServiceInstance.Spec.Gateway = gateway.Name - } // Create or update the HTTPRoute attached to this RayService's Gateway - httpRoute, err := r.reconcileHTTPRoute(ctx, rayServiceInstance) + err = r.reconcileHTTPRoute(ctx, rayServiceInstance) if err != nil { return ctrl.Result{RequeueAfter: ServiceDefaultRequeueDuration}, client.IgnoreNotFound(err) } - if httpRoute != nil { - rayServiceInstance.Spec.HTTPRoute = httpRoute.Name - } } // Reconcile serve applications for active and/or pending clusters @@ -495,13 +489,7 @@ func (r *RayServiceReconciler) createGateway(rayServiceInstance *rayv1.RayServic return nil, errstd.New("Missing RayService IncrementalUpgradeOptions during upgrade") } - var gatewayName string - if rayServiceInstance.Spec.Gateway != "" { - gatewayName = rayServiceInstance.Spec.Gateway - } else { - gatewayName = rayServiceInstance.Name + "-gateway" - } - gatewayName = utils.CheckGatewayName(gatewayName) + gatewayName := utils.CheckGatewayName(rayServiceInstance.Name + "-gateway") // Define the desired Gateway object rayServiceGateway := &gwv1.Gateway{ ObjectMeta: metav1.ObjectMeta{ @@ -520,7 +508,7 @@ func (r *RayServiceReconciler) createGateway(rayServiceInstance *rayv1.RayServic // `reconcileGateway` reconciles a Gateway resource for a RayService. The possible cases are: // (1) Create a new Gateway instance. (2) Update the Gateway instance if RayService has updated. (3) Do nothing. -func (r *RayServiceReconciler) reconcileGateway(ctx context.Context, rayServiceInstance *rayv1.RayService) (*gwv1.Gateway, error) { +func (r *RayServiceReconciler) reconcileGateway(ctx context.Context, rayServiceInstance *rayv1.RayService) error { logger := ctrl.LoggerFrom(ctx) var err error @@ -528,11 +516,11 @@ func (r *RayServiceReconciler) reconcileGateway(ctx context.Context, rayServiceI desiredGateway, err := r.createGateway(rayServiceInstance) if err != nil { logger.Error(err, "Failed to build Gateway object for Rayservice") - return nil, err + return err } if desiredGateway == nil { logger.Info("Skipping Gateway reconciliation: desired Gateway is nil") - return nil, nil + return nil } // Check for existing RayService Gateway, create the desired Gateway if none is found @@ -541,17 +529,17 @@ func (r *RayServiceReconciler) reconcileGateway(ctx context.Context, rayServiceI if errors.IsNotFound(err) { // Set the ownership in order to do the garbage collection by k8s. 
if err := ctrl.SetControllerReference(rayServiceInstance, desiredGateway, r.Scheme); err != nil { - return nil, err + return err } logger.Info("Creating a new Gateway instance", "Gateway Listeners", desiredGateway.Spec.Listeners) if err := r.Create(ctx, desiredGateway); err != nil { r.Recorder.Eventf(rayServiceInstance, corev1.EventTypeWarning, string(utils.FailedToCreateGateway), "Failed to create Gateway for RayService %s/%s: %v", desiredGateway.Namespace, desiredGateway.Name, err) - return nil, err + return err } r.Recorder.Eventf(rayServiceInstance, corev1.EventTypeNormal, string(utils.UpdatedRayCluster), "Created Gateway for RayService %s/%s", desiredGateway.Namespace, desiredGateway.Name) - return desiredGateway, nil + return nil } - return nil, err + return err } // If Gateway already exists, check if update is needed to reach desired state @@ -560,12 +548,12 @@ func (r *RayServiceReconciler) reconcileGateway(ctx context.Context, rayServiceI existingGateway.Spec = desiredGateway.Spec if err := r.Update(ctx, existingGateway); err != nil { r.Recorder.Eventf(rayServiceInstance, corev1.EventTypeWarning, string(utils.FailedToUpdateGateway), "Failed to update the Gateway %s/%s: %v", existingGateway.Namespace, existingGateway.Name, err) - return nil, err + return err } r.Recorder.Eventf(rayServiceInstance, corev1.EventTypeNormal, string(utils.UpdatedGateway), "Updated the Gateway %s/%s", existingGateway.Namespace, existingGateway.Name) } - return existingGateway, nil + return nil } // createHTTPRoute creates a desired HTTPRoute object based on a given RayService instance with @@ -580,13 +568,7 @@ func (r *RayServiceReconciler) createHTTPRoute(ctx context.Context, rayServiceIn } // Define the desired HTTPRoute name and basic object - var httpRouteName string - if rayServiceInstance.Spec.HTTPRoute != "" { - httpRouteName = rayServiceInstance.Spec.HTTPRoute - } else { - httpRouteName = fmt.Sprintf("httproute-%s", gatewayInstance.Name) - } - httpRouteName = utils.CheckHTTPRouteName(httpRouteName) + httpRouteName := utils.CheckHTTPRouteName(fmt.Sprintf("httproute-%s", gatewayInstance.Name)) desiredHTTPRoute := &gwv1.HTTPRoute{ ObjectMeta: metav1.ObjectMeta{ Name: httpRouteName, @@ -746,18 +728,18 @@ func (r *RayServiceReconciler) createHTTPRoute(ctx context.Context, rayServiceIn } // reconcileHTTPRoute reconciles a HTTPRoute resource for a RayService to route traffic during an IncrementalUpgrade. -func (r *RayServiceReconciler) reconcileHTTPRoute(ctx context.Context, rayServiceInstance *rayv1.RayService) (*gwv1.HTTPRoute, error) { +func (r *RayServiceReconciler) reconcileHTTPRoute(ctx context.Context, rayServiceInstance *rayv1.RayService) error { logger := ctrl.LoggerFrom(ctx) var err error desiredHTTPRoute, err := r.createHTTPRoute(ctx, rayServiceInstance) if err != nil { logger.Error(err, "Failed to build HTTPRoute for RayService upgrade") - return nil, err + return err } if desiredHTTPRoute == nil { logger.Info("Skipping HTTPRoute reconciliation: desired HTTPRoute is nil") - return nil, nil + return nil } // Check for existing HTTPRoute for RayService @@ -766,16 +748,16 @@ func (r *RayServiceReconciler) reconcileHTTPRoute(ctx context.Context, rayServic if errors.IsNotFound(err) { // Set the ownership in order to do the garbage collection by k8s. 
if err := ctrl.SetControllerReference(rayServiceInstance, desiredHTTPRoute, r.Scheme); err != nil { - return nil, err + return err } if err = r.Create(ctx, desiredHTTPRoute); err != nil { r.Recorder.Eventf(rayServiceInstance, corev1.EventTypeWarning, string(utils.FailedToCreateHTTPRoute), "Failed to create the HTTPRoute for RayService %s/%s: %v", desiredHTTPRoute.Namespace, desiredHTTPRoute.Name, err) - return nil, err + return err } r.Recorder.Eventf(rayServiceInstance, corev1.EventTypeNormal, string(utils.FailedToCreateHTTPRoute), "Created HTTPRoute for RayService %s/%s", desiredHTTPRoute.Namespace, desiredHTTPRoute.Name) - return desiredHTTPRoute, nil + return nil } - return nil, err + return err } // If HTTPRoute already exists, check if update is needed @@ -784,12 +766,12 @@ func (r *RayServiceReconciler) reconcileHTTPRoute(ctx context.Context, rayServic existingHTTPRoute.Spec = desiredHTTPRoute.Spec if err := r.Update(ctx, existingHTTPRoute); err != nil { r.Recorder.Eventf(rayServiceInstance, corev1.EventTypeWarning, string(utils.FailedToUpdateHTTPRoute), "Failed to update the HTTPRoute %s/%s: %v", existingHTTPRoute.Namespace, existingHTTPRoute.Name, err) - return nil, err + return err } r.Recorder.Eventf(rayServiceInstance, corev1.EventTypeNormal, string(utils.UpdatedHTTPRoute), "Updated the HTTPRoute %s/%s", existingHTTPRoute.Namespace, existingHTTPRoute.Name) } - return existingHTTPRoute, nil + return nil } // `reconcileRayCluster` reconciles the active and pending Ray clusters. There are 4 possible cases: @@ -1600,9 +1582,9 @@ func (r *RayServiceReconciler) isHeadPodRunningAndReady(ctx context.Context, ins } // reconcilePerClusterServeService reconciles a load-balancing serve service for a given RayCluster. -func (r *RayServiceReconciler) reconcilePerClusterServeService(ctx context.Context, rayServiceInstance *rayv1.RayService, rayClusterInstance *rayv1.RayCluster) (*corev1.Service, error) { +func (r *RayServiceReconciler) reconcilePerClusterServeService(ctx context.Context, rayServiceInstance *rayv1.RayService, rayClusterInstance *rayv1.RayCluster) error { if rayClusterInstance == nil { - return nil, nil + return nil } logger := ctrl.LoggerFrom(ctx).WithValues("RayCluster", rayClusterInstance.Name) @@ -1614,23 +1596,18 @@ func (r *RayServiceReconciler) reconcilePerClusterServeService(ctx context.Conte desiredSvc, err := common.BuildServeService(ctx, *rayServiceInstance, *rayClusterInstance, true) if err != nil { logger.Error(err, "Failed to build per-cluster serve service spec") - return nil, err + return err } if err := ctrl.SetControllerReference(rayClusterInstance, desiredSvc, r.Scheme); err != nil { - return nil, err + return err } existingSvc := &corev1.Service{} err = r.Get(ctx, client.ObjectKey{Name: desiredSvc.Name, Namespace: desiredSvc.Namespace}, existingSvc) if errors.IsNotFound(err) { logger.Info("Creating new per-cluster serve service for incremental upgrade.", "Service", desiredSvc.Name) - if createErr := r.Create(ctx, desiredSvc); createErr != nil { - return nil, createErr - } - return desiredSvc, nil - } else if err != nil { - return nil, err + return r.Create(ctx, desiredSvc) } - return existingSvc, nil + return err } diff --git a/ray-operator/controllers/ray/rayservice_controller_unit_test.go b/ray-operator/controllers/ray/rayservice_controller_unit_test.go index cbd32cb57d9..693e26ee8d5 100644 --- a/ray-operator/controllers/ray/rayservice_controller_unit_test.go +++ b/ray-operator/controllers/ray/rayservice_controller_unit_test.go @@ -13,6 +13,7 @@ import ( 
"github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/api/meta" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" @@ -1673,21 +1674,24 @@ func TestReconcileHTTPRoute(t *testing.T) { Recorder: record.NewFakeRecorder(10), } - route, err := reconciler.reconcileHTTPRoute(ctx, rayService) + err := reconciler.reconcileHTTPRoute(ctx, rayService) require.NoError(t, err) - require.NotNil(t, route) - assert.Equal(t, tt.expectedRouteName, route.Name) - assert.Equal(t, namespace, route.Namespace) + reconciledRoute := &gwv1.HTTPRoute{} + err = fakeClient.Get(ctx, client.ObjectKey{Name: tt.expectedRouteName, Namespace: namespace}, reconciledRoute) + require.NoError(t, err, "Failed to fetch the reconciled HTTPRoute") - assert.Equal(t, gwv1.ObjectName(activeServeService.Name), route.Spec.Rules[0].BackendRefs[0].Name) - assert.Equal(t, gwv1.ObjectName(pendingServeService.Name), route.Spec.Rules[0].BackendRefs[1].Name) + assert.Equal(t, tt.expectedRouteName, reconciledRoute.Name) + assert.Equal(t, namespace, reconciledRoute.Namespace) - require.Len(t, route.Spec.Rules[0].BackendRefs, 2) - assert.Equal(t, tt.expectedWeight, *route.Spec.Rules[0].BackendRefs[0].Weight) - assert.Equal(t, 100-tt.expectedWeight, *route.Spec.Rules[0].BackendRefs[1].Weight) + assert.Equal(t, gwv1.ObjectName(activeServeService.Name), reconciledRoute.Spec.Rules[0].BackendRefs[0].Name) + assert.Equal(t, gwv1.ObjectName(pendingServeService.Name), reconciledRoute.Spec.Rules[0].BackendRefs[1].Name) - parent := route.Spec.ParentRefs[0] + require.Len(t, reconciledRoute.Spec.Rules[0].BackendRefs, 2) + assert.Equal(t, tt.expectedWeight, *reconciledRoute.Spec.Rules[0].BackendRefs[0].Weight) + assert.Equal(t, 100-tt.expectedWeight, *reconciledRoute.Spec.Rules[0].BackendRefs[1].Weight) + + parent := reconciledRoute.Spec.ParentRefs[0] assert.Equal(t, gwv1.ObjectName("incremental-ray-service-gateway"), parent.Name) assert.Equal(t, ptr.To(gwv1.Namespace(namespace)), parent.Namespace) }) @@ -1749,14 +1753,17 @@ func TestReconcileGateway(t *testing.T) { Recorder: record.NewFakeRecorder(10), } - gw, err := reconciler.reconcileGateway(ctx, rayService) + err := reconciler.reconcileGateway(ctx, rayService) require.NoError(t, err) - require.NotNil(t, gw) - assert.Equal(t, tt.expectedGatewayName, gw.Name) - assert.Equal(t, namespace, gw.Namespace) - assert.Equal(t, gwv1.ObjectName(tt.expectedClass), gw.Spec.GatewayClassName) - assert.Len(t, gw.Spec.Listeners, tt.expectedNumListeners) + reconciledGateway := &gwv1.Gateway{} + err = fakeClient.Get(ctx, client.ObjectKey{Name: tt.expectedGatewayName, Namespace: namespace}, reconciledGateway) + require.NoError(t, err, "Failed to get the reconciled Gateway") + + assert.Equal(t, tt.expectedGatewayName, reconciledGateway.Name) + assert.Equal(t, namespace, reconciledGateway.Namespace) + assert.Equal(t, gwv1.ObjectName(tt.expectedClass), reconciledGateway.Spec.GatewayClassName) + assert.Len(t, reconciledGateway.Spec.Listeners, tt.expectedNumListeners) }) } } @@ -2111,7 +2118,7 @@ func TestReconcilePerClusterServeService(t *testing.T) { Recorder: record.NewFakeRecorder(1), } - reconciledSvc, err := reconciler.reconcilePerClusterServeService(ctx, rayService, tt.rayCluster) + err := reconciler.reconcilePerClusterServeService(ctx, rayService, tt.rayCluster) if tt.expectError { require.Error(t, err) @@ -2119,12 +2126,18 @@ func TestReconcilePerClusterServeService(t *testing.T) { } 
require.NoError(t, err) + reconciledSvc := &corev1.Service{} + err = fakeClient.Get(ctx, client.ObjectKey{Name: expectedServeSvcName, Namespace: namespace}, reconciledSvc) + // No-op case, no service should be created when RayCluster is nil. if tt.rayCluster == nil { - assert.Nil(t, reconciledSvc) + assert.True(t, errors.IsNotFound(err)) return } + // Otherwise, a valid serve service should be created for the RayCluster. + require.NoError(t, err, "The Serve service should exist in the client") + // Validate the expected Serve service exists for the RayCluster. require.NotNil(t, reconciledSvc) assert.Equal(t, expectedServeSvcName, reconciledSvc.Name) diff --git a/ray-operator/pkg/client/applyconfiguration/ray/v1/rayservicespec.go b/ray-operator/pkg/client/applyconfiguration/ray/v1/rayservicespec.go index ad31b98af96..faef5106dfe 100644 --- a/ray-operator/pkg/client/applyconfiguration/ray/v1/rayservicespec.go +++ b/ray-operator/pkg/client/applyconfiguration/ray/v1/rayservicespec.go @@ -15,8 +15,6 @@ type RayServiceSpecApplyConfiguration struct { ServeService *corev1.Service `json:"serveService,omitempty"` UpgradeStrategy *RayServiceUpgradeStrategyApplyConfiguration `json:"upgradeStrategy,omitempty"` ServeConfigV2 *string `json:"serveConfigV2,omitempty"` - Gateway *string `json:"gateway,omitempty"` - HTTPRoute *string `json:"httpRoute,omitempty"` RayClusterSpec *RayClusterSpecApplyConfiguration `json:"rayClusterConfig,omitempty"` ExcludeHeadPodFromServeSvc *bool `json:"excludeHeadPodFromServeSvc,omitempty"` } @@ -75,22 +73,6 @@ func (b *RayServiceSpecApplyConfiguration) WithServeConfigV2(value string) *RayS return b } -// WithGateway sets the Gateway field in the declarative configuration to the given value -// and returns the receiver, so that objects can be built by chaining "With" function invocations. -// If called multiple times, the Gateway field is set to the value of the last call. -func (b *RayServiceSpecApplyConfiguration) WithGateway(value string) *RayServiceSpecApplyConfiguration { - b.Gateway = &value - return b -} - -// WithHTTPRoute sets the HTTPRoute field in the declarative configuration to the given value -// and returns the receiver, so that objects can be built by chaining "With" function invocations. -// If called multiple times, the HTTPRoute field is set to the value of the last call. -func (b *RayServiceSpecApplyConfiguration) WithHTTPRoute(value string) *RayServiceSpecApplyConfiguration { - b.HTTPRoute = &value - return b -} - // WithRayClusterSpec sets the RayClusterSpec field in the declarative configuration to the given value // and returns the receiver, so that objects can be built by chaining "With" function invocations. // If called multiple times, the RayClusterSpec field is set to the value of the last call. 
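For readers following the controller changes above: during an incremental upgrade the HTTPRoute backend weights are advanced at most once per IntervalSeconds, shifting StepSizePercent of traffic from the active to the pending cluster while never exceeding 100% or the pending cluster's current TargetCapacity. The sketch below is a self-contained illustration of that single step under those assumptions; the function name `nextTrafficSplit` is hypothetical, and the actual logic lives in the HTTPRoute/traffic reconciliation shown earlier and refactored later in this series.

package main

import "fmt"

// nextTrafficSplit is a hypothetical sketch of one traffic-migration step:
// the pending cluster's weight grows by stepSizePercent, capped at both 100
// and its current target_capacity; the active cluster receives the remainder.
func nextTrafficSplit(pendingWeight, stepSizePercent, pendingTargetCapacity int32) (active, pending int32) {
	pending = pendingWeight + stepSizePercent
	if pending > 100 {
		pending = 100
	}
	if pending > pendingTargetCapacity {
		pending = pendingTargetCapacity
	}
	active = 100 - pending
	return active, pending
}

func main() {
	// Example: step size 25%, pending cluster currently at 25% with target_capacity 50%.
	active, pending := nextTrafficSplit(25, 25, 50)
	fmt.Println(active, pending) // 50 50
}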
From f6653539f68d431430d54f2652d70b802ea15c65 Mon Sep 17 00:00:00 2001 From: Ryan O'Leary Date: Wed, 1 Oct 2025 13:58:37 +0000 Subject: [PATCH 26/56] Fix port errors Signed-off-by: Ryan O'Leary --- ray-operator/controllers/ray/rayservice_controller.go | 2 +- ray-operator/controllers/ray/utils/util.go | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/ray-operator/controllers/ray/rayservice_controller.go b/ray-operator/controllers/ray/rayservice_controller.go index a341ee21ba3..273c6ef77cf 100644 --- a/ray-operator/controllers/ray/rayservice_controller.go +++ b/ray-operator/controllers/ray/rayservice_controller.go @@ -486,7 +486,7 @@ func isZeroDowntimeUpgradeEnabled(ctx context.Context, upgradeStrategy *rayv1.Ra func (r *RayServiceReconciler) createGateway(rayServiceInstance *rayv1.RayService) (*gwv1.Gateway, error) { options := utils.GetRayServiceIncrementalUpgradeOptions(&rayServiceInstance.Spec) if options == nil { - return nil, errstd.New("Missing RayService IncrementalUpgradeOptions during upgrade") + return nil, errstd.New("Missing RayService IncrementalUpgradeOptions during upgrade.") } gatewayName := utils.CheckGatewayName(rayServiceInstance.Name + "-gateway") diff --git a/ray-operator/controllers/ray/utils/util.go b/ray-operator/controllers/ray/utils/util.go index 72f8ae9e57d..e36c805c612 100644 --- a/ray-operator/controllers/ray/utils/util.go +++ b/ray-operator/controllers/ray/utils/util.go @@ -786,8 +786,9 @@ func GetGatewayListenersForRayService(rayServiceInstance *rayv1.RayService) []gw listenerName := fmt.Sprintf("%s-listener", rayServiceInstance.Name) listener := gwv1.Listener{ Name: gwv1.SectionName(listenerName), - Protocol: gwv1.HTTPProtocolType, // only support HTTP - Port: utils.FindContainerPort(rayContainer, utils.ServingPortName, utils.DefaultServingPort) + Protocol: gwv1.HTTPProtocolType, // only support HTTP + Port: gwv1.PortNumber(int32(80)), + } listeners = append(listeners, listener) return listeners From f5fb7aea85d218e59f7339c26f54353d5ecb50cc Mon Sep 17 00:00:00 2001 From: Ryan O'Leary Date: Thu, 2 Oct 2025 07:52:45 +0000 Subject: [PATCH 27/56] Fix comments and build issues Signed-off-by: Ryan O'Leary --- helm-chart/kuberay-operator/README.md | 2 +- helm-chart/kuberay-operator/values.yaml | 2 +- ray-operator/controllers/ray/common/association.go | 2 +- .../controllers/ray/rayservice_controller_unit_test.go | 8 ++++---- ray-operator/controllers/ray/utils/validation.go | 1 + ray-operator/controllers/ray/utils/validation_test.go | 4 ++++ 6 files changed, 12 insertions(+), 7 deletions(-) diff --git a/helm-chart/kuberay-operator/README.md b/helm-chart/kuberay-operator/README.md index 179ab95cd0c..34d6692775d 100644 --- a/helm-chart/kuberay-operator/README.md +++ b/helm-chart/kuberay-operator/README.md @@ -173,7 +173,7 @@ spec: | featureGates[1].name | string | `"RayJobDeletionPolicy"` | | | featureGates[1].enabled | bool | `false` | | | featureGates[2].name | string | `"RayServiceIncrementalUpgrade"` | | -| featureGates[2].enabled | bool | `true` | | +| featureGates[2].enabled | bool | `false` | | | metrics.enabled | bool | `true` | Whether KubeRay operator should emit control plane metrics. 
| | metrics.serviceMonitor.enabled | bool | `false` | Enable a prometheus ServiceMonitor | | metrics.serviceMonitor.interval | string | `"30s"` | Prometheus ServiceMonitor interval | diff --git a/helm-chart/kuberay-operator/values.yaml b/helm-chart/kuberay-operator/values.yaml index 78b6ac6e592..fb3c3ba4c33 100644 --- a/helm-chart/kuberay-operator/values.yaml +++ b/helm-chart/kuberay-operator/values.yaml @@ -118,7 +118,7 @@ featureGates: - name: RayJobDeletionPolicy enabled: false - name: RayServiceIncrementalUpgrade - enabled: true + enabled: false # Configurations for KubeRay operator metrics. metrics: diff --git a/ray-operator/controllers/ray/common/association.go b/ray-operator/controllers/ray/common/association.go index f053e8da583..922a31d924f 100644 --- a/ray-operator/controllers/ray/common/association.go +++ b/ray-operator/controllers/ray/common/association.go @@ -213,7 +213,7 @@ func RayServiceGatewayNamespacedName(rayService *rayv1.RayService) types.Namespa } func RayServiceHTTPRouteNamespacedName(rayService *rayv1.RayService) types.NamespacedName { - httpRouteName := utils.CheckHTTPRouteName(fmt.Sprintf("httproute-%s", rayService.Name)) + httpRouteName := utils.CheckHTTPRouteName(fmt.Sprintf("httproute-%s-gateway", rayService.Name)) return types.NamespacedName{ Name: httpRouteName, Namespace: rayService.Namespace, diff --git a/ray-operator/controllers/ray/rayservice_controller_unit_test.go b/ray-operator/controllers/ray/rayservice_controller_unit_test.go index 693e26ee8d5..2a63dd3ce69 100644 --- a/ray-operator/controllers/ray/rayservice_controller_unit_test.go +++ b/ray-operator/controllers/ray/rayservice_controller_unit_test.go @@ -1533,7 +1533,7 @@ func TestCreateHTTPRoute(t *testing.T) { require.NoError(t, err) require.NotNil(t, route) - assert.Equal(t, "httproute-incremental-ray-service", route.Name) + assert.Equal(t, "httproute-incremental-ray-service-gateway", route.Name) assert.Equal(t, "test-ns", route.Namespace) require.Len(t, route.Spec.Rules, 1) @@ -1646,7 +1646,7 @@ func TestReconcileHTTPRoute(t *testing.T) { rayService, activeServeService, pendingServeService, activeCluster, pendingCluster, gateway, }, - expectedRouteName: "httproute-incremental-ray-service", + expectedRouteName: "httproute-incremental-ray-service-gateway", expectedWeight: 80, }, { @@ -1656,7 +1656,7 @@ func TestReconcileHTTPRoute(t *testing.T) { activeCluster, pendingCluster, gateway, existingHTTPRoute, }, - expectedRouteName: "httproute-incremental-ray-service", + expectedRouteName: "httproute-incremental-ray-service-gateway", expectedWeight: 80, }, } @@ -1925,7 +1925,7 @@ func makeHTTPRoute(name, namespace string, isReady bool) *gwv1.HTTPRoute { func TestCheckIfNeedIncrementalUpgradeUpdate(t *testing.T) { rayServiceName := "test-rayservice" gatewayName := fmt.Sprintf("%s-%s", rayServiceName, "gateway") - httpRouteName := fmt.Sprintf("%s-%s", "httproute", rayServiceName) + httpRouteName := fmt.Sprintf("%s-%s", "httproute", gatewayName) namespace := "test-ns" tests := []struct { diff --git a/ray-operator/controllers/ray/utils/validation.go b/ray-operator/controllers/ray/utils/validation.go index bf7f8ab1aaa..e80201afdfe 100644 --- a/ray-operator/controllers/ray/utils/validation.go +++ b/ray-operator/controllers/ray/utils/validation.go @@ -338,6 +338,7 @@ func ValidateIncrementalUpgradeOptions(rayService *rayv1.RayService) error { return fmt.Errorf("IncrementalUpgradeOptions are required for IncrementalUpgrade") } + // MaxSurgePercent defaults to 100% if unset. 
if *options.MaxSurgePercent < 0 || *options.MaxSurgePercent > 100 { return fmt.Errorf("maxSurgePercent must be between 0 and 100") } diff --git a/ray-operator/controllers/ray/utils/validation_test.go b/ray-operator/controllers/ray/utils/validation_test.go index 4cefdda28b8..ee346b8b6c1 100644 --- a/ray-operator/controllers/ray/utils/validation_test.go +++ b/ray-operator/controllers/ray/utils/validation_test.go @@ -1687,6 +1687,7 @@ func TestValidateIncrementalUpgradeOptions(t *testing.T) { }, { name: "missing autoscaler", + maxSurgePercent: ptr.To(int32(50)), stepSizePercent: ptr.To(int32(50)), intervalSeconds: ptr.To(int32(10)), gatewayClassName: "istio", @@ -1709,6 +1710,7 @@ func TestValidateIncrementalUpgradeOptions(t *testing.T) { }, { name: "missing StepSizePercent", + maxSurgePercent: ptr.To(int32(50)), intervalSeconds: ptr.To(int32(10)), gatewayClassName: "istio", enableAutoscaling: true, @@ -1716,6 +1718,7 @@ func TestValidateIncrementalUpgradeOptions(t *testing.T) { }, { name: "invalid IntervalSeconds", + maxSurgePercent: ptr.To(int32(50)), stepSizePercent: ptr.To(int32(50)), intervalSeconds: ptr.To(int32(0)), gatewayClassName: "istio", @@ -1724,6 +1727,7 @@ func TestValidateIncrementalUpgradeOptions(t *testing.T) { }, { name: "missing GatewayClassName", + maxSurgePercent: ptr.To(int32(50)), stepSizePercent: ptr.To(int32(50)), intervalSeconds: ptr.To(int32(10)), enableAutoscaling: true, From a553b1e4dffbd2da3a3d628816dbe90fd11364d6 Mon Sep 17 00:00:00 2001 From: Ryan O'Leary Date: Thu, 2 Oct 2025 07:58:35 +0000 Subject: [PATCH 28/56] fix helm-chart-verify-rbac Signed-off-by: Ryan O'Leary --- helm-chart/kuberay-operator/templates/_helpers.tpl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/helm-chart/kuberay-operator/templates/_helpers.tpl b/helm-chart/kuberay-operator/templates/_helpers.tpl index 49bbd71f490..d5e0e7352d0 100644 --- a/helm-chart/kuberay-operator/templates/_helpers.tpl +++ b/helm-chart/kuberay-operator/templates/_helpers.tpl @@ -231,8 +231,8 @@ rules: - create - get - list - - watch - update + - watch - apiGroups: - networking.k8s.io resources: From 7e231db5ba214fe0882489f8b3206c1ac3af6092 Mon Sep 17 00:00:00 2001 From: Ryan O'Leary Date: Fri, 3 Oct 2025 14:14:10 +0000 Subject: [PATCH 29/56] Refactor tests and create HTTPRoute to be clearer Signed-off-by: Ryan O'Leary --- .../controllers/ray/rayservice_controller.go | 333 ++++++++-------- .../ray/rayservice_controller_unit_test.go | 355 +++++++++--------- .../rayservice_incremental_upgrade_test.go | 133 +++---- .../test/e2eincrementalupgrade/support.go | 86 +++++ 4 files changed, 479 insertions(+), 428 deletions(-) diff --git a/ray-operator/controllers/ray/rayservice_controller.go b/ray-operator/controllers/ray/rayservice_controller.go index 273c6ef77cf..5467bc29361 100644 --- a/ray-operator/controllers/ray/rayservice_controller.go +++ b/ray-operator/controllers/ray/rayservice_controller.go @@ -163,7 +163,7 @@ func (r *RayServiceReconciler) Reconcile(ctx context.Context, request ctrl.Reque if err != nil { return ctrl.Result{RequeueAfter: ServiceDefaultRequeueDuration}, client.IgnoreNotFound(err) } - // Create or update the HTTPRoute attached to this RayService's Gateway + // Create or update the HTTPRoute attached to this RayService's Gateway. 
err = r.reconcileHTTPRoute(ctx, rayServiceInstance) if err != nil { return ctrl.Result{RequeueAfter: ServiceDefaultRequeueDuration}, client.IgnoreNotFound(err) @@ -556,36 +556,69 @@ func (r *RayServiceReconciler) reconcileGateway(ctx context.Context, rayServiceI return nil } +// reconcileTrafficRoutedPercent determines the traffic split between the active and pending clusters during an upgrade, +// returning the weights for the old and new clusters respectively, or an error if misconfigured. +func (r *RayServiceReconciler) reconcileTrafficRoutedPercent(ctx context.Context, rayServiceInstance *rayv1.RayService, hasPendingCluster bool) (activeClusterWeight, pendingClusterWeight int32, err error) { + logger := ctrl.LoggerFrom(ctx) + activeServiceStatus := &rayServiceInstance.Status.ActiveServiceStatus + pendingServiceStatus := &rayServiceInstance.Status.PendingServiceStatus + + // Default to 100% traffic on the active cluster. + activeClusterWeight = 100 + pendingClusterWeight = 0 + + if hasPendingCluster { + // Zero-downtime upgrade in progress. + options := utils.GetRayServiceIncrementalUpgradeOptions(&rayServiceInstance.Spec) + if options == nil { + return 0, 0, errstd.New("IncrementalUpgradeOptions are not set during upgrade.") + } + + // Check that target_capacity has been updated before migrating traffic. + pendingClusterWeight = ptr.Deref(pendingServiceStatus.TrafficRoutedPercent, 0) + pendingClusterTargetCapacity := ptr.Deref(pendingServiceStatus.TargetCapacity, 0) + activeClusterWeight = ptr.Deref(activeServiceStatus.TrafficRoutedPercent, 100) + + if pendingClusterWeight == pendingClusterTargetCapacity { + // return without changing current traffic weights since cluster being migrated to is at capacity. + return activeClusterWeight, pendingClusterWeight, nil + } + + // If IntervalSeconds has passed since LastTrafficMigratedTime, migrate StepSizePercent traffic + // from the active RayCluster to the pending RayCluster. + intervalSeconds := time.Duration(*options.IntervalSeconds) * time.Second + lastTrafficMigratedTime := pendingServiceStatus.LastTrafficMigratedTime + if lastTrafficMigratedTime == nil || time.Since(lastTrafficMigratedTime.Time) >= intervalSeconds { + // Gradually shift traffic from the active to the pending cluster. + logger.Info("Upgrade in progress. Migrating traffic by StepSizePercent.", "stepSize", *options.StepSizePercent) + proposedPendingWeight := pendingClusterWeight + *options.StepSizePercent + pendingClusterWeight = min(100, proposedPendingWeight, pendingClusterTargetCapacity) + activeClusterWeight = 100 - pendingClusterWeight + + pendingServiceStatus.LastTrafficMigratedTime = &metav1.Time{Time: time.Now()} + activeServiceStatus.LastTrafficMigratedTime = &metav1.Time{Time: time.Now()} + } + } + + // Update the RayService status with the calculated traffic weights. + activeServiceStatus.TrafficRoutedPercent = ptr.To(activeClusterWeight) + pendingServiceStatus.TrafficRoutedPercent = ptr.To(pendingClusterWeight) + logger.Info("Updated TrafficRoutedPercent", "activeClusterWeight", activeClusterWeight, "pendingClusterWeight", pendingClusterWeight) + + return activeClusterWeight, pendingClusterWeight, nil +} + // createHTTPRoute creates a desired HTTPRoute object based on a given RayService instance with // weights based on TrafficRoutedPercent. 
func (r *RayServiceReconciler) createHTTPRoute(ctx context.Context, rayServiceInstance *rayv1.RayService) (*gwv1.HTTPRoute, error) { logger := ctrl.LoggerFrom(ctx) - // Retrieve Gateway instance to attach this HTTPRoute to + // Retrieve Gateway instance to attach this HTTPRoute to. gatewayInstance := &gwv1.Gateway{} if err := r.Get(ctx, common.RayServiceGatewayNamespacedName(rayServiceInstance), gatewayInstance); err != nil { return nil, err } - // Define the desired HTTPRoute name and basic object - httpRouteName := utils.CheckHTTPRouteName(fmt.Sprintf("httproute-%s", gatewayInstance.Name)) - desiredHTTPRoute := &gwv1.HTTPRoute{ - ObjectMeta: metav1.ObjectMeta{ - Name: httpRouteName, - Namespace: rayServiceInstance.Namespace, - }, - Spec: gwv1.HTTPRouteSpec{ - CommonRouteSpec: gwv1.CommonRouteSpec{ - ParentRefs: []gwv1.ParentReference{ - { - Name: gwv1.ObjectName(gatewayInstance.Name), - Namespace: ptr.To(gwv1.Namespace(gatewayInstance.Namespace)), - }, - }, - }, - }, - } - // Retrieve the active RayCluster activeRayCluster, err := r.getRayClusterByNamespacedName(ctx, common.RayServiceActiveRayClusterNamespacedName(rayServiceInstance)) if err != nil && !errors.IsNotFound(err) { @@ -596,131 +629,76 @@ func (r *RayServiceReconciler) createHTTPRoute(ctx context.Context, rayServiceIn logger.Info("Active RayCluster not found, skipping HTTPRoute creation.") return nil, nil } - // Serve service points to the active RayCluster until the upgrade is complete. - oldClusterServeSvcName := utils.GenerateServeServiceName(activeRayCluster.Name) - oldServeSvc := &corev1.Service{} - if err := r.Get(ctx, client.ObjectKey{Name: oldClusterServeSvcName, Namespace: rayServiceInstance.Namespace}, oldServeSvc); err != nil { - logger.Error(err, "Failed to retrieve active RayCluster serve service.") - return nil, err - } // Attempt to retrieve pending RayCluster pendingRayCluster, err := r.getRayClusterByNamespacedName(ctx, common.RayServicePendingRayClusterNamespacedName(rayServiceInstance)) hasPendingCluster := (err == nil && pendingRayCluster != nil) if err != nil && !errors.IsNotFound(err) { logger.Info("Failed to retrieve pending RayCluster.") + return nil, err } - activeServiceStatus := rayServiceInstance.Status.ActiveServiceStatus - - var backendRefs []gwv1.HTTPBackendRef - - // Configure HTTPRoute to split traffic between active and pending clusters during an incremental upgrade - if hasPendingCluster { - newClusterServeSvcName := utils.GenerateServeServiceName(pendingRayCluster.Name) - newServeSvc := &corev1.Service{} - if err := r.Get(ctx, client.ObjectKey{Name: newClusterServeSvcName, Namespace: rayServiceInstance.Namespace}, newServeSvc); err != nil { - logger.Error(err, "Failed to retrieve pending RayCluster serve service.") - return nil, err - } - - options := utils.GetRayServiceIncrementalUpgradeOptions(&rayServiceInstance.Spec) - if options == nil { - return nil, errstd.New("Missing RayService IncrementalUpgradeOptions.") - } - - // Retrieve TrafficRoutedPercent for old and upgraded RayClusters. - pendingServiceStatus := rayServiceInstance.Status.PendingServiceStatus - newClusterWeight := pendingServiceStatus.TrafficRoutedPercent - oldClusterWeight := activeServiceStatus.TrafficRoutedPercent - - // If IntervalSeconds has passed since LastTrafficMigratedTime, migrate - // StepSizePercent traffic to the pending cluster. 
- intervalSeconds := time.Duration(*options.IntervalSeconds) * time.Second - lastTrafficMigratedTime := pendingServiceStatus.LastTrafficMigratedTime - if (newClusterWeight != nil && oldClusterWeight != nil) && (lastTrafficMigratedTime == nil || time.Since(lastTrafficMigratedTime.Time) >= intervalSeconds) { - // Wait an initial iteration before migrating StepSizePercent. - if lastTrafficMigratedTime != nil { - logger.Info("Updating active and pending cluster weights each by StepSizePercent.") - oldClusterWeight = ptr.To(max(*oldClusterWeight-*options.StepSizePercent, 0)) - newClusterWeight = ptr.To(min(*newClusterWeight+*options.StepSizePercent, 100)) - } - rayServiceInstance.Status.PendingServiceStatus.LastTrafficMigratedTime = &metav1.Time{Time: time.Now()} - rayServiceInstance.Status.ActiveServiceStatus.LastTrafficMigratedTime = &metav1.Time{Time: time.Now()} - } + activeClusterWeight, pendingClusterWeight, err := r.reconcileTrafficRoutedPercent(ctx, rayServiceInstance, hasPendingCluster) + if err != nil { + logger.Info("Failed to reconcile TrafficRoutedPercent for active and pending clusters.") + return nil, err + } - // Set weights for initial iteration. - if newClusterWeight == nil { - // Pending RayCluster should scale up from 0 TrafficRoutedPercent. - newClusterWeight = ptr.To(int32(0)) - } - if oldClusterWeight == nil { - // Active RayCluster should scale down from 100 TrafficRoutedPercent. - oldClusterWeight = ptr.To(int32(100)) - } - // HTTPRoute weights should never exceed current TargetCapacity for each cluster. - newClusterTargetCapacity := pendingServiceStatus.TargetCapacity - oldClusterTargetCapacity := activeServiceStatus.TargetCapacity - if newClusterTargetCapacity != nil { - newClusterWeight = ptr.To(min(*newClusterWeight, *newClusterTargetCapacity)) - } - if oldClusterTargetCapacity != nil { - oldClusterWeight = ptr.To(min(*oldClusterWeight, *oldClusterTargetCapacity)) - } + activeClusterServeSvcName := utils.GenerateServeServiceName(activeRayCluster.Name) - backendRefs = []gwv1.HTTPBackendRef{ - { - BackendRef: gwv1.BackendRef{ - BackendObjectReference: gwv1.BackendObjectReference{ - Name: gwv1.ObjectName(oldClusterServeSvcName), - Namespace: ptr.To(gwv1.Namespace(rayServiceInstance.Namespace)), - Port: ptr.To(gwv1.PortNumber(8000)), // set to Serve port - }, - Weight: oldClusterWeight, + backendRefs := []gwv1.HTTPBackendRef{ + { + BackendRef: gwv1.BackendRef{ + BackendObjectReference: gwv1.BackendObjectReference{ + Name: gwv1.ObjectName(activeClusterServeSvcName), + Namespace: ptr.To(gwv1.Namespace(gatewayInstance.Namespace)), + Port: ptr.To(gwv1.PortNumber(8000)), }, + Weight: ptr.To(activeClusterWeight), }, - { - BackendRef: gwv1.BackendRef{ - BackendObjectReference: gwv1.BackendObjectReference{ - Name: gwv1.ObjectName(newClusterServeSvcName), - Namespace: ptr.To(gwv1.Namespace(rayServiceInstance.Namespace)), - Port: ptr.To(gwv1.PortNumber(8000)), - }, - Weight: newClusterWeight, + }, + } + + if hasPendingCluster { + pendingClusterServeSvcName := utils.GenerateServeServiceName(pendingRayCluster.Name) + + backendRefs = append(backendRefs, gwv1.HTTPBackendRef{ + BackendRef: gwv1.BackendRef{ + BackendObjectReference: gwv1.BackendObjectReference{ + Name: gwv1.ObjectName(pendingClusterServeSvcName), + Namespace: ptr.To(gwv1.Namespace(gatewayInstance.Namespace)), + Port: ptr.To(gwv1.PortNumber(8000)), }, + Weight: ptr.To(pendingClusterWeight), }, - } - logger.Info("Updating TrafficRoutedPercent to", "oldClusterWeight", oldClusterWeight, "newClusterWeight", 
newClusterWeight) - rayServiceInstance.Status.ActiveServiceStatus.TrafficRoutedPercent = oldClusterWeight - rayServiceInstance.Status.PendingServiceStatus.TrafficRoutedPercent = newClusterWeight - } else { - // No pending cluster — route 100% to active RayCluster - backendRefs = []gwv1.HTTPBackendRef{ - { - BackendRef: gwv1.BackendRef{ - BackendObjectReference: gwv1.BackendObjectReference{ - Name: gwv1.ObjectName(oldClusterServeSvcName), - Namespace: ptr.To(gwv1.Namespace(rayServiceInstance.Namespace)), - Port: ptr.To(gwv1.PortNumber(8000)), + }) + } + + httpRouteName := utils.CheckHTTPRouteName(fmt.Sprintf("httproute-%s", gatewayInstance.Name)) + desiredHTTPRoute := &gwv1.HTTPRoute{ + ObjectMeta: metav1.ObjectMeta{Name: httpRouteName, Namespace: gatewayInstance.Namespace}, + Spec: gwv1.HTTPRouteSpec{ + CommonRouteSpec: gwv1.CommonRouteSpec{ + ParentRefs: []gwv1.ParentReference{ + { + Name: gwv1.ObjectName(gatewayInstance.Name), + Namespace: ptr.To(gwv1.Namespace(gatewayInstance.Namespace)), }, - Weight: ptr.To(int32(100)), }, }, - } - rayServiceInstance.Status.ActiveServiceStatus.TrafficRoutedPercent = ptr.To(int32(100)) - } - - desiredHTTPRoute.Spec.Rules = []gwv1.HTTPRouteRule{ - { - Matches: []gwv1.HTTPRouteMatch{ + Rules: []gwv1.HTTPRouteRule{ { - Path: &gwv1.HTTPPathMatch{ - Type: ptr.To(gwv1.PathMatchPathPrefix), - Value: ptr.To("/"), + Matches: []gwv1.HTTPRouteMatch{ + { + Path: &gwv1.HTTPPathMatch{ + Type: ptr.To(gwv1.PathMatchPathPrefix), + Value: ptr.To("/"), + }, + }, }, + BackendRefs: backendRefs, }, }, - BackendRefs: backendRefs, }, } @@ -1180,6 +1158,52 @@ func (r *RayServiceReconciler) checkIfNeedIncrementalUpgradeUpdate(ctx context.C return true, "Active RayCluster TargetCapacity has not finished scaling down." } +// applyServeTargetCapacity updates the target_capacity for a given RayCluster's Serve applications. +func (r *RayServiceReconciler) applyServeTargetCapacity(ctx context.Context, rayServiceInstance *rayv1.RayService, rayClusterInstance *rayv1.RayCluster, rayDashboardClient dashboardclient.RayDashboardClientInterface, goalTargetCapacity int32) error { + logger := ctrl.LoggerFrom(ctx).WithValues("RayCluster", rayClusterInstance.Name) + + // Retrieve cached ServeConfig from last reconciliation for cluster to update + cachedConfig := r.getServeConfigFromCache(rayServiceInstance, rayClusterInstance.Name) + if cachedConfig == "" { + cachedConfig = rayServiceInstance.Spec.ServeConfigV2 + } + + serveConfig := make(map[string]interface{}) + if err := yaml.Unmarshal([]byte(cachedConfig), &serveConfig); err != nil { + return err + } + + // Check if ServeConfig requires update + if currentTargetCapacity, ok := serveConfig["target_capacity"].(float64); ok { + if int32(currentTargetCapacity) == goalTargetCapacity { + logger.Info("target_capacity already updated on RayCluster", "target_capacity", currentTargetCapacity) + // No update required, return early + return nil + } + } + + serveConfig["target_capacity"] = goalTargetCapacity + configJson, err := json.Marshal(serveConfig) + if err != nil { + return fmt.Errorf("failed to marshal serve config: %w", err) + } + + logger.Info("Applying new target_capacity to Ray cluster.", "goal", goalTargetCapacity) + if err := rayDashboardClient.UpdateDeployments(ctx, configJson); err != nil { + return fmt.Errorf("failed to update target_capacity for Serve applications: %w", err) + } + + // Update the status fields and cache new Serve config. 
+ if rayClusterInstance.Name == rayServiceInstance.Status.ActiveServiceStatus.RayClusterName { + rayServiceInstance.Status.ActiveServiceStatus.TargetCapacity = ptr.To(goalTargetCapacity) + } else if rayClusterInstance.Name == rayServiceInstance.Status.PendingServiceStatus.RayClusterName { + rayServiceInstance.Status.PendingServiceStatus.TargetCapacity = ptr.To(goalTargetCapacity) + } + r.cacheServeConfig(rayServiceInstance, rayClusterInstance.Name) + + return nil +} + // reconcileServeTargetCapacity reconciles the target_capacity of the ServeConfig for a given RayCluster during // an IncrementalUpgrade while also updating the Status.TargetCapacity of the Active and Pending RayServices. func (r *RayServiceReconciler) reconcileServeTargetCapacity(ctx context.Context, rayServiceInstance *rayv1.RayService, rayClusterInstance *rayv1.RayCluster, rayDashboardClient dashboardclient.RayDashboardClientInterface) error { @@ -1202,22 +1226,22 @@ func (r *RayServiceReconciler) reconcileServeTargetCapacity(ctx context.Context, } // Retrieve the current observed Status fields for IncrementalUpgrade - activeTargetCapacity := *activeRayServiceStatus.TargetCapacity - pendingTargetCapacity := *pendingRayServiceStatus.TargetCapacity - pendingTrafficRoutedPercent := *pendingRayServiceStatus.TrafficRoutedPercent - - // Defer updating the target_capacity until traffic weights are updated - if pendingTargetCapacity != pendingTrafficRoutedPercent { - logger.Info("Traffic is currently being migrated to pending cluster", "RayCluster", pendingRayServiceStatus.RayClusterName, "TargetCapacity", pendingTargetCapacity, "TrafficRoutedPercent", pendingTrafficRoutedPercent) - return nil - } + activeTargetCapacity := ptr.Deref(activeRayServiceStatus.TargetCapacity, 100) + pendingTargetCapacity := ptr.Deref(pendingRayServiceStatus.TargetCapacity, 0) + pendingTrafficRoutedPercent := ptr.Deref(pendingRayServiceStatus.TrafficRoutedPercent, 0) // Retrieve MaxSurgePercent - the maximum amount to change TargetCapacity by options := utils.GetRayServiceIncrementalUpgradeOptions(&rayServiceInstance.Spec) if options == nil { return errstd.New("Missing RayService IncrementalUpgradeOptions during upgrade") } - maxSurgePercent := *options.MaxSurgePercent + maxSurgePercent := ptr.Deref(options.MaxSurgePercent, 100) + + // Defer updating the target_capacity until traffic weights are updated + if pendingTargetCapacity != pendingTrafficRoutedPercent { + logger.Info("Traffic is currently being migrated to pending cluster", "RayCluster", pendingRayServiceStatus.RayClusterName, "TargetCapacity", pendingTargetCapacity, "TrafficRoutedPercent", pendingTrafficRoutedPercent) + return nil + } // There are two cases: // 1. The total target_capacity is greater than 100. 
This means the pending RayCluster has @@ -1248,44 +1272,7 @@ func (r *RayServiceReconciler) reconcileServeTargetCapacity(ctx context.Context, logger.Info("Setting target_capacity for pending Raycluster", "Raycluster", clusterName, "target_capacity", goalTargetCapacity) } - // Retrieve cached ServeConfig from last reconciliation for cluster to update - cachedConfig := r.getServeConfigFromCache(rayServiceInstance, clusterName) - if cachedConfig == "" { - cachedConfig = rayServiceInstance.Spec.ServeConfigV2 - } - logger.Info("Retrieving ServeConfig", "cached", cachedConfig, "ServeConfigV2", rayServiceInstance.Spec.ServeConfigV2) - serveConfig := make(map[string]interface{}) - if err := yaml.Unmarshal([]byte(cachedConfig), &serveConfig); err != nil { - return err - } - - // Check if ServeConfig requires update - if currentTargetCapacity, ok := serveConfig["target_capacity"].(float64); ok { - if int32(currentTargetCapacity) == goalTargetCapacity { - logger.Info("target_capacity already updated on RayCluster", "RayCluster", clusterName, "target_capacity", currentTargetCapacity) - // No update required, return early - return nil - } - } - - // Otherwise, update the target_capacity for the cached ServeConfig - serveConfig["target_capacity"] = goalTargetCapacity - configJson, err := json.Marshal(serveConfig) - if err != nil { - return fmt.Errorf("failed to marshal converted serve config into bytes: %w", err) - } - logger.Info("reconcileServeTargetCapacity", "MULTI_APP json config", string(configJson)) - if err := rayDashboardClient.UpdateDeployments(ctx, configJson); err != nil { - err = fmt.Errorf( - "fail to create / update target_capacity for Serve applications. err: %w", err) - return err - } - - // Only update the target_capacity of one RayCluster at a time. 
- r.cacheServeConfig(rayServiceInstance, clusterName) - logger.Info("reconcileServeTargetCapacity", "message", "Cached Serve config for Ray cluster with the key", "rayClusterName", clusterName) - - return nil + return r.applyServeTargetCapacity(ctx, rayServiceInstance, rayClusterInstance, rayDashboardClient, goalTargetCapacity) } // `getAndCheckServeStatus` gets Serve applications' and deployments' statuses and check whether the diff --git a/ray-operator/controllers/ray/rayservice_controller_unit_test.go b/ray-operator/controllers/ray/rayservice_controller_unit_test.go index 2a63dd3ce69..9076c184b13 100644 --- a/ray-operator/controllers/ray/rayservice_controller_unit_test.go +++ b/ray-operator/controllers/ray/rayservice_controller_unit_test.go @@ -1451,81 +1451,119 @@ func TestCreateGateway(t *testing.T) { } func TestCreateHTTPRoute(t *testing.T) { - // Create re-used runtime objects for test cases + ctx := context.TODO() namespace := "test-ns" - activeCluster := &rayv1.RayCluster{ - ObjectMeta: metav1.ObjectMeta{ - Name: "active-ray-cluster", - Namespace: namespace, - }, - } - pendingCluster := &rayv1.RayCluster{ - ObjectMeta: metav1.ObjectMeta{ - Name: "pending-ray-cluster", - Namespace: namespace, - }, - } - gateway := &gwv1.Gateway{ - ObjectMeta: metav1.ObjectMeta{ - Name: "incremental-ray-service-gateway", - Namespace: namespace, - }, - } - activeServeService := &corev1.Service{ - ObjectMeta: metav1.ObjectMeta{ - Name: utils.GenerateServeServiceName(activeCluster.Name), - Namespace: namespace, + stepSize := int32(10) + interval := int32(30) + + activeCluster := &rayv1.RayCluster{ObjectMeta: metav1.ObjectMeta{Name: "rayservice-active", Namespace: namespace}} + pendingCluster := &rayv1.RayCluster{ObjectMeta: metav1.ObjectMeta{Name: "rayservice-pending", Namespace: namespace}} + gateway := &gwv1.Gateway{ObjectMeta: metav1.ObjectMeta{Name: "test-rayservice-gateway", Namespace: namespace}} + activeServeService := &corev1.Service{ObjectMeta: metav1.ObjectMeta{Name: utils.GenerateServeServiceName(activeCluster.Name), Namespace: namespace}} + pendingServeService := &corev1.Service{ObjectMeta: metav1.ObjectMeta{Name: utils.GenerateServeServiceName(pendingCluster.Name), Namespace: namespace}} + + baseRayService := &rayv1.RayService{ + ObjectMeta: metav1.ObjectMeta{Name: "test-rayservice", Namespace: namespace}, + Spec: rayv1.RayServiceSpec{ + UpgradeStrategy: &rayv1.RayServiceUpgradeStrategy{ + Type: ptr.To(rayv1.IncrementalUpgrade), + IncrementalUpgradeOptions: &rayv1.IncrementalUpgradeOptions{ + StepSizePercent: &stepSize, + IntervalSeconds: &interval, + GatewayClassName: "istio", + }, + }, }, - } - pendingServeService := &corev1.Service{ - ObjectMeta: metav1.ObjectMeta{ - Name: utils.GenerateServeServiceName(pendingCluster.Name), - Namespace: namespace, + Status: rayv1.RayServiceStatuses{ + ActiveServiceStatus: rayv1.RayServiceStatus{ + RayClusterName: activeCluster.Name, + TrafficRoutedPercent: ptr.To(int32(100)), + TargetCapacity: ptr.To(int32(100)), + }, + PendingServiceStatus: rayv1.RayServiceStatus{ + RayClusterName: pendingCluster.Name, + TrafficRoutedPercent: ptr.To(int32(0)), + TargetCapacity: ptr.To(int32(30)), + }, }, } tests := []struct { - rayService *rayv1.RayService - name string - routedPercent int32 - expectError bool + name string + modifier func(rs *rayv1.RayService) + runtimeObjects []runtime.Object + expectError bool + expectedActiveWeight int32 + expectedPendingWeight int32 }{ { - name: "valid HTTPRoute creation", - routedPercent: int32(80), - rayService: 
makeIncrementalUpgradeRayService(true, "gateway-class", ptr.To(int32(50)), ptr.To(int32(1000)), ptr.To(int32(80)), &metav1.Time{Time: time.Now()}), - expectError: false, + name: "Incremental upgrade, time since LastTrafficMigratedTime < IntervalSeconds.", + modifier: func(rs *rayv1.RayService) { + rs.Status.PendingServiceStatus.LastTrafficMigratedTime = &metav1.Time{Time: time.Now()} + }, + runtimeObjects: []runtime.Object{activeCluster, pendingCluster, gateway, activeServeService, pendingServeService}, + expectedActiveWeight: 100, + expectedPendingWeight: 0, + }, + { + name: "Incremental upgrade, time since LastTrafficMigratedTime >= IntervalSeconds.", + modifier: func(rs *rayv1.RayService) { + rs.Status.PendingServiceStatus.LastTrafficMigratedTime = &metav1.Time{Time: time.Now().Add(-time.Duration(interval+1) * time.Second)} + rs.Status.PendingServiceStatus.TargetCapacity = ptr.To(int32(60)) + }, + runtimeObjects: []runtime.Object{activeCluster, pendingCluster, gateway, activeServeService, pendingServeService}, + expectedActiveWeight: 90, + expectedPendingWeight: 10, + }, + { + name: "Incremental upgrade, TrafficRoutedPercent capped to pending TargetCapacity.", + modifier: func(rs *rayv1.RayService) { + rs.Status.PendingServiceStatus.LastTrafficMigratedTime = &metav1.Time{Time: time.Now().Add(-time.Duration(interval+1) * time.Second)} + rs.Status.PendingServiceStatus.TargetCapacity = ptr.To(int32(5)) + }, + runtimeObjects: []runtime.Object{activeCluster, pendingCluster, gateway, activeServeService, pendingServeService}, + expectedActiveWeight: 95, + expectedPendingWeight: 5, // can only migrate 5% to pending until TargetCapacity reached + }, + { + name: "Create HTTPRoute called with missing IncrementalUpgradeOptions.", + modifier: func(rs *rayv1.RayService) { + rs.Spec.UpgradeStrategy.IncrementalUpgradeOptions = nil + }, + runtimeObjects: []runtime.Object{activeCluster, pendingCluster, gateway, activeServeService, pendingServeService}, + expectError: true, }, { - name: "missing IncrementalUpgradeOptions", - routedPercent: int32(50), - rayService: makeIncrementalUpgradeRayService(false, "gateway-class", ptr.To(int32(50)), ptr.To(int32(120)), ptr.To(int32(50)), &metav1.Time{Time: time.Now()}), - expectError: true, + name: "No on-going upgrade, pending cluster does not exist.", + modifier: func(rs *rayv1.RayService) { + rs.Status.PendingServiceStatus = rayv1.RayServiceStatus{} + }, + runtimeObjects: []runtime.Object{activeCluster, gateway, activeServeService}, + expectedActiveWeight: 100, + expectedPendingWeight: 0, }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { + rayService := baseRayService.DeepCopy() + tt.modifier(rayService) + tt.runtimeObjects = append(tt.runtimeObjects, rayService) + newScheme := runtime.NewScheme() - _ = corev1.AddToScheme(newScheme) _ = rayv1.AddToScheme(newScheme) + _ = corev1.AddToScheme(newScheme) _ = gwv1.AddToScheme(newScheme) + fakeClient := clientFake.NewClientBuilder().WithScheme(newScheme).WithRuntimeObjects(tt.runtimeObjects...).Build() - // Setup runtime test objects. 
- runtimeObjects := []runtime.Object{ - tt.rayService, activeServeService, pendingServeService, - pendingCluster, activeCluster, gateway, - } - - fakeClient := clientFake.NewClientBuilder().WithScheme(newScheme).WithRuntimeObjects(runtimeObjects...).Build() reconciler := RayServiceReconciler{ Client: fakeClient, Scheme: newScheme, Recorder: record.NewFakeRecorder(1), } - ctx := context.TODO() - route, err := reconciler.createHTTPRoute(ctx, tt.rayService) + route, err := reconciler.createHTTPRoute(ctx, rayService) + if tt.expectError { require.Error(t, err) assert.Nil(t, route) @@ -1533,18 +1571,22 @@ func TestCreateHTTPRoute(t *testing.T) { require.NoError(t, err) require.NotNil(t, route) - assert.Equal(t, "httproute-incremental-ray-service-gateway", route.Name) + assert.Equal(t, "httproute-test-rayservice-gateway", route.Name) assert.Equal(t, "test-ns", route.Namespace) require.Len(t, route.Spec.Rules, 1) rule := route.Spec.Rules[0] - require.Len(t, rule.BackendRefs, 2) + require.GreaterOrEqual(t, len(rule.BackendRefs), 1) assert.Equal(t, gwv1.ObjectName(activeServeService.Name), rule.BackendRefs[0].BackendRef.Name) - assert.Equal(t, gwv1.ObjectName(pendingServeService.Name), rule.BackendRefs[1].BackendRef.Name) - - assert.Equal(t, tt.routedPercent, *rule.BackendRefs[0].Weight) - assert.Equal(t, int32(100)-tt.routedPercent, *rule.BackendRefs[1].Weight) + assert.Equal(t, tt.expectedActiveWeight, *rule.BackendRefs[0].Weight) + + if len(rule.BackendRefs) > 1 { + assert.Equal(t, gwv1.ObjectName(pendingServeService.Name), rule.BackendRefs[1].BackendRef.Name) + assert.Equal(t, tt.expectedPendingWeight, *rule.BackendRefs[1].Weight) + } else { + assert.Equal(t, int32(0), tt.expectedPendingWeight) + } } }) } @@ -1558,141 +1600,110 @@ func TestReconcileHTTPRoute(t *testing.T) { ctx := context.TODO() namespace := "test-ns" - - rayService := makeIncrementalUpgradeRayService( - true, - "incremental-ray-service-gateway", - ptr.To(int32(20)), - ptr.To(int32(30)), - ptr.To(int32(80)), - ptr.To(metav1.Now()), - ) - - activeCluster := &rayv1.RayCluster{ - ObjectMeta: metav1.ObjectMeta{ - Name: "active-ray-cluster", - Namespace: namespace, - }, - } - pendingCluster := &rayv1.RayCluster{ - ObjectMeta: metav1.ObjectMeta{ - Name: "pending-ray-cluster", - Namespace: namespace, - }, - } - activeServeService := &corev1.Service{ - ObjectMeta: metav1.ObjectMeta{ - Name: utils.GenerateServeServiceName(activeCluster.Name), - Namespace: namespace, - }, - } - pendingServeService := &corev1.Service{ - ObjectMeta: metav1.ObjectMeta{ - Name: utils.GenerateServeServiceName(pendingCluster.Name), - Namespace: namespace, - }, - } - gateway := &gwv1.Gateway{ - ObjectMeta: metav1.ObjectMeta{ - Name: "incremental-ray-service-gateway", - Namespace: namespace, - }, - } - - // Pre-existing HTTPRoute with incorrect weights - existingHTTPRoute := makeHTTPRoute(fmt.Sprintf("httproute-%s", rayService.Name), namespace, true) - existingHTTPRoute.Spec = gwv1.HTTPRouteSpec{ - CommonRouteSpec: gwv1.CommonRouteSpec{ - ParentRefs: []gwv1.ParentReference{{ - Name: gwv1.ObjectName("incremental-ray-service-gateway"), - Namespace: ptr.To(gwv1.Namespace(namespace)), - }}, - }, - Rules: []gwv1.HTTPRouteRule{{ - BackendRefs: []gwv1.HTTPBackendRef{ - { - BackendRef: gwv1.BackendRef{ - BackendObjectReference: gwv1.BackendObjectReference{ - Name: gwv1.ObjectName(activeServeService.Name), - Namespace: ptr.To(gwv1.Namespace(namespace)), - Port: ptr.To(gwv1.PortNumber(8000)), - }, - Weight: ptr.To(int32(5)), - }, - }, - { - BackendRef: gwv1.BackendRef{ - 
BackendObjectReference: gwv1.BackendObjectReference{ - Name: gwv1.ObjectName(pendingServeService.Name), - Namespace: ptr.To(gwv1.Namespace(namespace)), - Port: ptr.To(gwv1.PortNumber(8000)), - }, - Weight: ptr.To(int32(95)), - }, + stepSize := int32(10) + interval := int32(30) + gatewayName := "test-rayservice-gateway" + routeName := fmt.Sprintf("httproute-%s", gatewayName) + + activeCluster := &rayv1.RayCluster{ObjectMeta: metav1.ObjectMeta{Name: "active-ray-cluster", Namespace: namespace}} + pendingCluster := &rayv1.RayCluster{ObjectMeta: metav1.ObjectMeta{Name: "pending-ray-cluster", Namespace: namespace}} + activeServeService := &corev1.Service{ObjectMeta: metav1.ObjectMeta{Name: utils.GenerateServeServiceName(activeCluster.Name), Namespace: namespace}} + pendingServeService := &corev1.Service{ObjectMeta: metav1.ObjectMeta{Name: utils.GenerateServeServiceName(pendingCluster.Name), Namespace: namespace}} + gateway := &gwv1.Gateway{ObjectMeta: metav1.ObjectMeta{Name: gatewayName, Namespace: namespace}} + + baseRayService := &rayv1.RayService{ + ObjectMeta: metav1.ObjectMeta{Name: "test-rayservice", Namespace: namespace}, + Spec: rayv1.RayServiceSpec{ + UpgradeStrategy: &rayv1.RayServiceUpgradeStrategy{ + Type: ptr.To(rayv1.IncrementalUpgrade), + IncrementalUpgradeOptions: &rayv1.IncrementalUpgradeOptions{ + StepSizePercent: &stepSize, + IntervalSeconds: &interval, + GatewayClassName: "istio", }, }, - }}, + }, + Status: rayv1.RayServiceStatuses{ + ActiveServiceStatus: rayv1.RayServiceStatus{ + RayClusterName: activeCluster.Name, + TrafficRoutedPercent: ptr.To(int32(80)), + TargetCapacity: ptr.To(int32(100)), + }, + PendingServiceStatus: rayv1.RayServiceStatus{ + RayClusterName: pendingCluster.Name, + TrafficRoutedPercent: ptr.To(int32(20)), + TargetCapacity: ptr.To(int32(100)), + }, + }, } tests := []struct { - name string - expectedRouteName string - runtimeObjects []runtime.Object - expectedWeight int32 + modifier func(rs *rayv1.RayService) + existingRoute *gwv1.HTTPRoute + name string + expectedActiveWeight int32 + expectedPendingWeight int32 }{ { - name: "creates new HTTPRoute if not present", - runtimeObjects: []runtime.Object{ - rayService, activeServeService, pendingServeService, - activeCluster, pendingCluster, gateway, + name: "Create new HTTPRoute with weights.", + expectedActiveWeight: 70, + expectedPendingWeight: 30, + }, + { + name: "Existing HTTPRoute, time since LastTrafficMigratedTime >= IntervalSeconds so updates HTTPRoute.", + modifier: func(rs *rayv1.RayService) { + rs.Status.PendingServiceStatus.LastTrafficMigratedTime = &metav1.Time{Time: time.Now().Add(-time.Duration(interval+1) * time.Second)} }, - expectedRouteName: "httproute-incremental-ray-service-gateway", - expectedWeight: 80, + existingRoute: &gwv1.HTTPRoute{ + ObjectMeta: metav1.ObjectMeta{Name: routeName, Namespace: namespace}, + Spec: gwv1.HTTPRouteSpec{}, + }, + expectedActiveWeight: 70, + expectedPendingWeight: 30, }, { - name: "updates HTTPRoute if spec differs", - runtimeObjects: []runtime.Object{ - rayService, activeServeService, pendingServeService, - activeCluster, pendingCluster, gateway, - existingHTTPRoute, + name: "Existing HTTPRoute, time since LastTrafficMigratedTime < IntervalSeconds so no update.", + modifier: func(rs *rayv1.RayService) { + rs.Status.PendingServiceStatus.LastTrafficMigratedTime = &metav1.Time{Time: time.Now()} }, - expectedRouteName: "httproute-incremental-ray-service-gateway", - expectedWeight: 80, + expectedActiveWeight: 80, + expectedPendingWeight: 20, }, } for _, tt := 
range tests { t.Run(tt.name, func(t *testing.T) { - fakeClient := clientFake.NewClientBuilder(). - WithScheme(newScheme). - WithRuntimeObjects(tt.runtimeObjects...). - Build() + rayService := baseRayService.DeepCopy() + if tt.modifier != nil { + tt.modifier(rayService) + } - reconciler := RayServiceReconciler{ - Client: fakeClient, - Scheme: newScheme, - Recorder: record.NewFakeRecorder(10), + runtimeObjects := []runtime.Object{rayService, activeCluster, pendingCluster, gateway, activeServeService, pendingServeService} + if tt.existingRoute != nil { + runtimeObjects = append(runtimeObjects, tt.existingRoute) } + fakeClient := clientFake.NewClientBuilder().WithScheme(newScheme).WithRuntimeObjects(runtimeObjects...).Build() + reconciler := RayServiceReconciler{Client: fakeClient, Scheme: newScheme, Recorder: record.NewFakeRecorder(10)} + err := reconciler.reconcileHTTPRoute(ctx, rayService) require.NoError(t, err) reconciledRoute := &gwv1.HTTPRoute{} - err = fakeClient.Get(ctx, client.ObjectKey{Name: tt.expectedRouteName, Namespace: namespace}, reconciledRoute) + err = fakeClient.Get(ctx, client.ObjectKey{Name: routeName, Namespace: namespace}, reconciledRoute) require.NoError(t, err, "Failed to fetch the reconciled HTTPRoute") - assert.Equal(t, tt.expectedRouteName, reconciledRoute.Name) - assert.Equal(t, namespace, reconciledRoute.Namespace) - - assert.Equal(t, gwv1.ObjectName(activeServeService.Name), reconciledRoute.Spec.Rules[0].BackendRefs[0].Name) - assert.Equal(t, gwv1.ObjectName(pendingServeService.Name), reconciledRoute.Spec.Rules[0].BackendRefs[1].Name) + require.Len(t, reconciledRoute.Spec.Rules, 1) + rule := reconciledRoute.Spec.Rules[0] + require.Len(t, rule.BackendRefs, 2) - require.Len(t, reconciledRoute.Spec.Rules[0].BackendRefs, 2) - assert.Equal(t, tt.expectedWeight, *reconciledRoute.Spec.Rules[0].BackendRefs[0].Weight) - assert.Equal(t, 100-tt.expectedWeight, *reconciledRoute.Spec.Rules[0].BackendRefs[1].Weight) + // Assert weights are set as expected. + assert.Equal(t, tt.expectedActiveWeight, *rule.BackendRefs[0].Weight) + assert.Equal(t, tt.expectedPendingWeight, *rule.BackendRefs[1].Weight) + // Assert ParentRef namespace is now correctly set. 
parent := reconciledRoute.Spec.ParentRefs[0] - assert.Equal(t, gwv1.ObjectName("incremental-ray-service-gateway"), parent.Name) + assert.Equal(t, gwv1.ObjectName(gatewayName), parent.Name) assert.Equal(t, ptr.To(gwv1.Namespace(namespace)), parent.Namespace) }) } @@ -1776,16 +1787,17 @@ func TestReconcileServeTargetCapacity(t *testing.T) { updatedCluster string activeCapacity int32 pendingCapacity int32 - routedPercent int32 + activeRoutedPercent int32 + pendingRoutedPercent int32 maxSurgePercent int32 expectedActiveCapacity int32 expectedPendingCapacity int32 }{ { name: "Scale up pending RayCluster when total TargetCapacity < 100", + pendingRoutedPercent: 10, activeCapacity: 70, pendingCapacity: 10, - routedPercent: 10, maxSurgePercent: 20, expectedActiveCapacity: 70, expectedPendingCapacity: 30, @@ -1793,9 +1805,9 @@ func TestReconcileServeTargetCapacity(t *testing.T) { }, { name: "Scale down active RayCluster when total TargetCapacity > 100", + pendingRoutedPercent: 30, activeCapacity: 80, pendingCapacity: 30, - routedPercent: 30, maxSurgePercent: 20, expectedActiveCapacity: 60, expectedPendingCapacity: 30, @@ -1818,13 +1830,14 @@ func TestReconcileServeTargetCapacity(t *testing.T) { }, Status: rayv1.RayServiceStatuses{ ActiveServiceStatus: rayv1.RayServiceStatus{ - RayClusterName: "active", - TargetCapacity: ptr.To(tt.activeCapacity), + RayClusterName: "active", + TargetCapacity: ptr.To(tt.activeCapacity), + TrafficRoutedPercent: ptr.To(tt.activeRoutedPercent), }, PendingServiceStatus: rayv1.RayServiceStatus{ RayClusterName: "pending", TargetCapacity: ptr.To(tt.pendingCapacity), - TrafficRoutedPercent: ptr.To(tt.routedPercent), + TrafficRoutedPercent: ptr.To(tt.pendingRoutedPercent), }, }, } @@ -1838,7 +1851,7 @@ func TestReconcileServeTargetCapacity(t *testing.T) { fakeDashboard := &utils.FakeRayDashboardClient{} reconciler := &RayServiceReconciler{ - ServeConfigs: lru.New(10), // empty initial cache + ServeConfigs: lru.New(10), } err := reconciler.reconcileServeTargetCapacity(ctx, rayService, rayCluster, fakeDashboard) @@ -2053,7 +2066,7 @@ func TestReconcilePerClusterServeService(t *testing.T) { } rayService := makeIncrementalUpgradeRayService( true, - "gateway-class", + "istio", ptr.To(int32(20)), ptr.To(int32(30)), ptr.To(int32(80)), diff --git a/ray-operator/test/e2eincrementalupgrade/rayservice_incremental_upgrade_test.go b/ray-operator/test/e2eincrementalupgrade/rayservice_incremental_upgrade_test.go index 04c0cff111c..219e4636e6f 100644 --- a/ray-operator/test/e2eincrementalupgrade/rayservice_incremental_upgrade_test.go +++ b/ray-operator/test/e2eincrementalupgrade/rayservice_incremental_upgrade_test.go @@ -4,7 +4,6 @@ import ( "fmt" "strings" "testing" - "time" . "github.com/onsi/gomega" corev1 "k8s.io/api/core/v1" @@ -43,7 +42,7 @@ func TestRayServiceIncrementalUpgrade(t *testing.T) { // Create a RayService with IncrementalUpgrade enabled stepSize := ptr.To(int32(25)) - interval := ptr.To(int32(10)) + interval := ptr.To(int32(5)) maxSurge := ptr.To(int32(50)) rayServiceAC := rayv1ac.RayService(rayServiceName, namespace.Name). @@ -65,15 +64,17 @@ func TestRayServiceIncrementalUpgrade(t *testing.T) { g.Eventually(Gateway(test, rayService.Namespace, gatewayName), TestTimeoutMedium). 
Should(WithTransform(utils.IsGatewayReady, BeTrue())) + // Get the Gateway endpoint to send requests to gateway, err := GetGateway(test, namespace.Name, fmt.Sprintf("%s-%s", rayServiceName, "gateway")) g.Expect(err).NotTo(HaveOccurred()) g.Expect(gateway).NotTo(BeNil()) - httpRouteName := fmt.Sprintf("%s-%s", "httproute", rayServiceName) + httpRouteName := fmt.Sprintf("%s-%s", "httproute", gatewayName) LogWithTimestamp(test.T(), "Waiting for HTTPRoute %s/%s to be ready", rayService.Namespace, httpRouteName) g.Eventually(HTTPRoute(test, rayService.Namespace, httpRouteName), TestTimeoutMedium). Should(Not(BeNil())) - httpRoute, err := GetHTTPRoute(test, namespace.Name, fmt.Sprintf("%s-%s", "httproute", rayServiceName)) + + httpRoute, err := GetHTTPRoute(test, namespace.Name, httpRouteName) g.Expect(err).NotTo(HaveOccurred()) g.Expect(utils.IsHTTPRouteReady(gateway, httpRoute)).To(BeTrue()) @@ -90,7 +91,6 @@ func TestRayServiceIncrementalUpgrade(t *testing.T) { return updatedPod }, TestTimeoutShort).Should(WithTransform(IsPodRunningAndReady, BeTrue())) - // Get the Gateway endpoint to send requests to gatewayIP := GetGatewayIP(gateway) g.Expect(gatewayIP).NotTo(BeEmpty()) @@ -100,21 +100,25 @@ func TestRayServiceIncrementalUpgrade(t *testing.T) { stdout, _ = CurlRayServiceGateway(test, gatewayIP, curlPod, curlContainerName, "/calc", `["MUL", 3]`) g.Expect(stdout.String()).To(Equal("15 pizzas please!")) - // Trigger incremental upgrade by updating RayService serve config and RayCluster spec - rayService, err = GetRayService(test, namespace.Name, rayService.Name) - g.Expect(err).NotTo(HaveOccurred()) - - rayService.Spec.RayClusterSpec.WorkerGroupSpecs[0].Template.Spec.Containers[0].Resources.Requests[corev1.ResourceCPU] = resource.MustParse("500m") - serveConfig := rayService.Spec.ServeConfigV2 - serveConfig = strings.Replace(serveConfig, "price: 3", "price: 4", -1) - serveConfig = strings.Replace(serveConfig, "factor: 5", "factor: 3", -1) - rayService.Spec.ServeConfigV2 = serveConfig - _, err = test.Client().Ray().RayV1().RayServices(namespace.Name).Update( - test.Ctx(), - rayService, - metav1.UpdateOptions{}, - ) - g.Expect(err).NotTo(HaveOccurred()) + // Attempt to trigger incremental upgrade by updating RayService serve config and RayCluster spec + g.Eventually(func() error { + latestRayService, err := GetRayService(test, namespace.Name, rayServiceName) + if err != nil { + return err + } + latestRayService.Spec.RayClusterSpec.WorkerGroupSpecs[0].Template.Spec.Containers[0].Resources.Requests[corev1.ResourceCPU] = resource.MustParse("500m") + serveConfig := latestRayService.Spec.ServeConfigV2 + serveConfig = strings.Replace(serveConfig, "price: 3", "price: 4", -1) + serveConfig = strings.Replace(serveConfig, "factor: 5", "factor: 3", -1) + latestRayService.Spec.ServeConfigV2 = serveConfig + + _, err = test.Client().Ray().RayV1().RayServices(namespace.Name).Update( + test.Ctx(), + latestRayService, + metav1.UpdateOptions{}, + ) + return err + }, TestTimeoutShort).Should(Succeed(), "Failed to update RayService to trigger upgrade") LogWithTimestamp(test.T(), "Waiting for RayService %s/%s UpgradeInProgress condition to be true", rayService.Namespace, rayService.Name) g.Eventually(RayService(test, rayService.Namespace, rayService.Name), TestTimeoutShort).Should(WithTransform(IsRayServiceUpgrading, BeTrue())) @@ -150,78 +154,39 @@ func TestRayServiceIncrementalUpgrade(t *testing.T) { }, TestTimeoutShort).Should(Succeed()) LogWithTimestamp(test.T(), "Validating stepwise traffic and capacity 
migration") - stepSizeVal := *stepSize - intervalVal := *interval - maxSurgeVal := *maxSurge - - var lastPendingCapacity, lastPendingTraffic, lastActiveCapacity, lastActiveTraffic int32 + intervalSeconds := *interval + var lastMigratedTime *metav1.Time // Validate expected behavior during an IncrementalUpgrade. The following checks ensures // that no requests are dropped throughout the upgrade process. - for { - // Wait IntervalSeconds in between updates - time.Sleep(time.Duration(intervalVal) * time.Second) - - // Fetch updated RayService - rayService, err := GetRayService(test, namespace.Name, rayServiceName) - g.Expect(err).NotTo(HaveOccurred()) + upgradeSteps := generateUpgradeSteps(*stepSize, *maxSurge) + for _, step := range upgradeSteps { + LogWithTimestamp(test.T(), "%s", step.name) + g.Eventually(func(g Gomega) int32 { + // Fetch updated RayService. + svc, err := GetRayService(test, namespace.Name, rayServiceName) + g.Expect(err).NotTo(HaveOccurred()) + return step.getValue(svc) + }, TestTimeoutShort).Should(Equal(step.expectedValue)) - pending := rayService.Status.PendingServiceStatus - active := rayService.Status.ActiveServiceStatus + // Send a request to the RayService to validate no requests are dropped. + stdout, _ := CurlRayServiceGateway(test, gatewayIP, curlPod, curlContainerName, "/fruit", `["MANGO", 2]`) + g.Expect(stdout.String()).To(Or(Equal("6"), Equal("8")), "Response should be from the old or new app version during the upgrade") - if pending.RayClusterName == "" { - // No pending cluster - upgrade has completed - break - } + if strings.Contains(step.name, "pending traffic to shift") { + svc, err := GetRayService(test, namespace.Name, rayServiceName) + g.Expect(err).NotTo(HaveOccurred()) - // Incremental Upgrade related status fields should be set - g.Expect(pending.TrafficRoutedPercent).NotTo(BeNil()) - g.Expect(pending.TargetCapacity).NotTo(BeNil()) - g.Expect(active.TrafficRoutedPercent).NotTo(BeNil()) - g.Expect(active.TargetCapacity).NotTo(BeNil()) - - pendingTraffic := *pending.TrafficRoutedPercent - pendingCapacity := *pending.TargetCapacity - activeTraffic := *active.TrafficRoutedPercent - activeCapacity := *active.TargetCapacity - - LogWithTimestamp(test.T(), "pendingTraffic: %d, pendingCapacity: %d, activeTraffic: %d, activeCapacity: %d", pendingTraffic, pendingCapacity, activeTraffic, activeCapacity) - - // Initial iteration - set weights - if pendingTraffic == 0 && pendingCapacity == 0 && activeTraffic == 100 && activeCapacity == 100 { - lastPendingCapacity = pendingCapacity - lastPendingTraffic = pendingTraffic - lastActiveCapacity = activeCapacity - lastActiveTraffic = activeTraffic - continue - } + currentMigratedTime := svc.Status.PendingServiceStatus.LastTrafficMigratedTime + g.Expect(currentMigratedTime).NotTo(BeNil()) - // Validate that pending TargetCapacity increases by MaxSurgePercent - if pendingCapacity > lastPendingCapacity { - g.Expect(pendingCapacity - lastPendingCapacity).To(Equal(maxSurgeVal)) - lastPendingCapacity = pendingCapacity - } - - // Incremental traffic migration steps - if pendingTraffic < pendingCapacity { - if lastPendingTraffic != 0 { - g.Expect(pendingTraffic - lastPendingTraffic).To(Equal(stepSizeVal)) - g.Expect(lastActiveTraffic - activeTraffic).To(Equal(stepSizeVal)) + // Verify IntervalSeconds have passed since last TrafficRoutedPercent update. 
+ if lastMigratedTime != nil { + duration := currentMigratedTime.Sub(lastMigratedTime.Time) + g.Expect(duration).To(BeNumerically(">=", intervalSeconds), + "Time between traffic steps should be >= IntervalSeconds") } - lastPendingTraffic = pendingTraffic - lastActiveTraffic = activeTraffic - continue - } - - // Once pending TrafficRoutedPercent equals TargetCapacity, active - // TargetCapacity can be reduced by MaxSurgePercent. - if pendingTraffic == pendingCapacity && activeCapacity > 0 { - rayService, err = GetRayService(test, namespace.Name, rayServiceName) - g.Expect(err).NotTo(HaveOccurred()) - newActiveCapacity := *rayService.Status.ActiveServiceStatus.TargetCapacity - g.Expect(lastActiveCapacity - newActiveCapacity).To(Equal(maxSurgeVal)) - lastActiveCapacity = newActiveCapacity - continue + lastMigratedTime = currentMigratedTime } } // Check that RayService completed upgrade diff --git a/ray-operator/test/e2eincrementalupgrade/support.go b/ray-operator/test/e2eincrementalupgrade/support.go index 8cd59fb0df4..68c9e96460e 100644 --- a/ray-operator/test/e2eincrementalupgrade/support.go +++ b/ray-operator/test/e2eincrementalupgrade/support.go @@ -6,7 +6,9 @@ import ( corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" corev1ac "k8s.io/client-go/applyconfigurations/core/v1" + "k8s.io/utils/ptr" gwv1 "sigs.k8s.io/gateway-api/apis/v1" rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1" @@ -157,3 +159,87 @@ func GetGatewayIP(gateway *gwv1.Gateway) string { return "" } + +func GetPendingCapacity(rs *rayv1.RayService) int32 { + return ptr.Deref(rs.Status.PendingServiceStatus.TargetCapacity, 0) +} + +func GetPendingTraffic(rs *rayv1.RayService) int32 { + return ptr.Deref(rs.Status.PendingServiceStatus.TrafficRoutedPercent, 0) +} + +func GetActiveCapacity(rs *rayv1.RayService) int32 { + return ptr.Deref(rs.Status.ActiveServiceStatus.TargetCapacity, 100) +} + +func GetActiveTraffic(rs *rayv1.RayService) int32 { + return ptr.Deref(rs.Status.ActiveServiceStatus.TrafficRoutedPercent, 100) +} + +func GetLastTrafficMigratedTime(rs *rayv1.RayService) *metav1.Time { + return rs.Status.ActiveServiceStatus.LastTrafficMigratedTime +} + +// testStep defines a validation condition to wait for during the upgrade. +type testStep struct { + getValue func(rs *rayv1.RayService) int32 + name string + expectedValue int32 +} + +// generateUpgradeSteps is a helper function for testing that the controller follows the expected +// sequence of updates to TrafficRoutedPercent and TargetCapacity during an incremental upgrade. +func generateUpgradeSteps(stepSize, maxSurge int32) []testStep { + var steps []testStep + + pendingCapacity := int32(0) + pendingTraffic := int32(0) + activeCapacity := int32(100) + activeTraffic := int32(100) + + for pendingTraffic < 100 { + // Scale up the pending cluster's TargetCapacity. + if pendingTraffic == pendingCapacity { + nextPendingCapacity := min(pendingCapacity+maxSurge, 100) + if nextPendingCapacity > pendingCapacity { + steps = append(steps, testStep{ + name: fmt.Sprintf("Waiting for pending capacity to scale up to %d", nextPendingCapacity), + getValue: GetPendingCapacity, + expectedValue: nextPendingCapacity, + }) + pendingCapacity = nextPendingCapacity + } + } + + // Shift traffic over from the active to the pending cluster by StepSizePercent. 
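+		// For illustration, with the StepSizePercent=25 and MaxSurgePercent=50 values used in this test, the
+		// generated sequence is: pending capacity 0→50, pending traffic 25 then 50 (active traffic 75 then 50),
+		// active capacity 100→50, pending capacity 50→100, pending traffic 75 then 100 (active traffic 25 then 0),
+		// and finally active capacity 50→0.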
+ for pendingTraffic < pendingCapacity { + nextPendingTraffic := min(pendingTraffic+stepSize, 100) + steps = append(steps, testStep{ + name: fmt.Sprintf("Waiting for pending traffic to shift to %d", nextPendingTraffic), + getValue: GetPendingTraffic, + expectedValue: nextPendingTraffic, + }) + pendingTraffic = nextPendingTraffic + + nextActiveTraffic := max(activeTraffic-stepSize, 0) + steps = append(steps, testStep{ + name: fmt.Sprintf("Waiting for active traffic to shift down to %d", nextActiveTraffic), + getValue: GetActiveTraffic, + expectedValue: nextActiveTraffic, + }) + activeTraffic = nextActiveTraffic + } + + // Scale down the active cluster's target capacity. + nextActiveCapacity := max(activeCapacity-maxSurge, 0) + if nextActiveCapacity < activeCapacity { + steps = append(steps, testStep{ + name: fmt.Sprintf("Waiting for active capacity to scale down to %d", nextActiveCapacity), + getValue: GetActiveCapacity, + expectedValue: nextActiveCapacity, + }) + activeCapacity = nextActiveCapacity + } + } + return steps +} From acdcc8a7cc2d8eb16144b4705a51235de18671c2 Mon Sep 17 00:00:00 2001 From: Ryan O'Leary Date: Fri, 3 Oct 2025 14:20:13 +0000 Subject: [PATCH 30/56] Use time &now Signed-off-by: Ryan O'Leary --- ray-operator/controllers/ray/rayservice_controller.go | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/ray-operator/controllers/ray/rayservice_controller.go b/ray-operator/controllers/ray/rayservice_controller.go index 5467bc29361..55002554fd0 100644 --- a/ray-operator/controllers/ray/rayservice_controller.go +++ b/ray-operator/controllers/ray/rayservice_controller.go @@ -595,8 +595,9 @@ func (r *RayServiceReconciler) reconcileTrafficRoutedPercent(ctx context.Context pendingClusterWeight = min(100, proposedPendingWeight, pendingClusterTargetCapacity) activeClusterWeight = 100 - pendingClusterWeight - pendingServiceStatus.LastTrafficMigratedTime = &metav1.Time{Time: time.Now()} - activeServiceStatus.LastTrafficMigratedTime = &metav1.Time{Time: time.Now()} + now := metav1.Time{Time: time.Now()} + pendingServiceStatus.LastTrafficMigratedTime = &now + activeServiceStatus.LastTrafficMigratedTime = &now } } From 44faa8e045cb4d8b59c3e5e6fd3f0e405f6cf149 Mon Sep 17 00:00:00 2001 From: Ryan O'Leary <113500783+ryanaoleary@users.noreply.github.com> Date: Fri, 3 Oct 2025 07:26:59 -0700 Subject: [PATCH 31/56] Update ray-operator/controllers/ray/rayservice_controller.go Co-authored-by: Han-Ju Chen (Future-Outlier) Signed-off-by: Ryan O'Leary <113500783+ryanaoleary@users.noreply.github.com> --- ray-operator/controllers/ray/rayservice_controller.go | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/ray-operator/controllers/ray/rayservice_controller.go b/ray-operator/controllers/ray/rayservice_controller.go index 55002554fd0..c081bc40e96 100644 --- a/ray-operator/controllers/ray/rayservice_controller.go +++ b/ray-operator/controllers/ray/rayservice_controller.go @@ -633,11 +633,14 @@ func (r *RayServiceReconciler) createHTTPRoute(ctx context.Context, rayServiceIn // Attempt to retrieve pending RayCluster pendingRayCluster, err := r.getRayClusterByNamespacedName(ctx, common.RayServicePendingRayClusterNamespacedName(rayServiceInstance)) - hasPendingCluster := (err == nil && pendingRayCluster != nil) - if err != nil && !errors.IsNotFound(err) { - logger.Info("Failed to retrieve pending RayCluster.") - return nil, err + pendingRayCluster, err := r.getRayClusterByNamespacedName(ctx, common.RayServicePendingRayClusterNamespacedName(rayServiceInstance)) 
+ hasPendingCluster = false + if err != nil && !errors.IsNotFound(err){ + logger.Error(err, "Failed to retrieve pending RayCluster") + return nil, err + } + hasPendingCluster = pendingRayCluster != nil activeClusterWeight, pendingClusterWeight, err := r.reconcileTrafficRoutedPercent(ctx, rayServiceInstance, hasPendingCluster) if err != nil { From cbb1f25f727ee04731255db4f1ed074fb26928ef Mon Sep 17 00:00:00 2001 From: Ryan O'Leary Date: Fri, 3 Oct 2025 14:36:17 +0000 Subject: [PATCH 32/56] Add function comments Signed-off-by: Ryan O'Leary --- .../controllers/ray/rayservice_controller.go | 25 ++++++++++++++++--- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/ray-operator/controllers/ray/rayservice_controller.go b/ray-operator/controllers/ray/rayservice_controller.go index c081bc40e96..3305b6c075d 100644 --- a/ray-operator/controllers/ray/rayservice_controller.go +++ b/ray-operator/controllers/ray/rayservice_controller.go @@ -483,6 +483,7 @@ func isZeroDowntimeUpgradeEnabled(ctx context.Context, upgradeStrategy *rayv1.Ra return true } +// `createGateway` creates a Gateway for a RayService or updates an existing Gateway. func (r *RayServiceReconciler) createGateway(rayServiceInstance *rayv1.RayService) (*gwv1.Gateway, error) { options := utils.GetRayServiceIncrementalUpgradeOptions(&rayServiceInstance.Spec) if options == nil { @@ -556,8 +557,17 @@ func (r *RayServiceReconciler) reconcileGateway(ctx context.Context, rayServiceI return nil } -// reconcileTrafficRoutedPercent determines the traffic split between the active and pending clusters during an upgrade, -// returning the weights for the old and new clusters respectively, or an error if misconfigured. +// reconcileTrafficRoutedPercent determines the traffic split between the active and pending RayClusters. +// +// The function calculates the HTTPRoute weights for the active and pending RayClusters respectively, +// and updates the RayService status with new TrafficRoutedPercent values. +// +// The new weights are calculated using: +// - Current TrafficRoutedPercent values +// - Time-based migration using StepSizePercent and IntervalSeconds +// - TargetCapacity constraints +// +// Returns the active cluster traffic weight, pending cluster traffic weight, and an error if any. func (r *RayServiceReconciler) reconcileTrafficRoutedPercent(ctx context.Context, rayServiceInstance *rayv1.RayService, hasPendingCluster bool) (activeClusterWeight, pendingClusterWeight int32, err error) { logger := ctrl.LoggerFrom(ctx) activeServiceStatus := &rayServiceInstance.Status.ActiveServiceStatus @@ -609,8 +619,15 @@ func (r *RayServiceReconciler) reconcileTrafficRoutedPercent(ctx context.Context return activeClusterWeight, pendingClusterWeight, nil } -// createHTTPRoute creates a desired HTTPRoute object based on a given RayService instance with -// weights based on TrafficRoutedPercent. +// createHTTPRoute creates a desired HTTPRoute object for RayService incremental upgrade. +// +// The function performs the following operations: +// 1. Retrieves Gateway instance to attach the HTTPRoute +// 2. Gets active and pending RayCluster instances and their Serve services +// 3. Calls `reconcileTrafficRoutedPercent` to calculate the new traffic weights +// 4. Configures HTTPRoute with appropriate backend references and weights +// +// Returns the configured HTTPRoute object or error if any step fails. 
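+// For example, with StepSizePercent=10, a current pending weight of 20, and a pending TargetCapacity of 30,
+// the next reconcile after IntervalSeconds have elapsed sets the pending weight to min(100, 20+10, 30) = 30
+// and the active weight to 100 - 30 = 70.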
func (r *RayServiceReconciler) createHTTPRoute(ctx context.Context, rayServiceInstance *rayv1.RayService) (*gwv1.HTTPRoute, error) { logger := ctrl.LoggerFrom(ctx) From 3cd620fa5e54186173c97d96b3b84dc1e54214c5 Mon Sep 17 00:00:00 2001 From: Ryan O'Leary Date: Fri, 3 Oct 2025 14:43:36 +0000 Subject: [PATCH 33/56] Fix bad merge Signed-off-by: Ryan O'Leary --- .../controllers/ray/rayservice_controller.go | 21 +++++++------------ 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/ray-operator/controllers/ray/rayservice_controller.go b/ray-operator/controllers/ray/rayservice_controller.go index 3305b6c075d..53e409d2e90 100644 --- a/ray-operator/controllers/ray/rayservice_controller.go +++ b/ray-operator/controllers/ray/rayservice_controller.go @@ -346,9 +346,6 @@ func (r *RayServiceReconciler) calculateStatus(ctx context.Context, rayServiceIn for _, subset := range serveEndPoints.Subsets { numServeEndpoints += len(subset.Addresses) } - if numServeEndpoints > math.MaxInt32 { - return errstd.New("numServeEndpoints exceeds math.MaxInt32") - } // During an IncrementalUpgrade, the pending RayCluster is also serving. if utils.IsIncrementalUpgradeEnabled(&rayServiceInstance.Spec) && pendingCluster != nil { @@ -359,9 +356,10 @@ func (r *RayServiceReconciler) calculateStatus(ctx context.Context, rayServiceIn for _, subset := range serveEndPoints.Subsets { numServeEndpoints += len(subset.Addresses) } - if numServeEndpoints > math.MaxInt32 { - return errstd.New("numServeEndpoints exceeds math.MaxInt32") - } + } + + if numServeEndpoints > math.MaxInt32 { + return errstd.New("numServeEndpoints exceeds math.MaxInt32") } rayServiceInstance.Status.NumServeEndpoints = int32(numServeEndpoints) //nolint:gosec // This is a false positive from gosec. See https://github.com/securego/gosec/issues/1212 for more details. 
@@ -650,14 +648,11 @@ func (r *RayServiceReconciler) createHTTPRoute(ctx context.Context, rayServiceIn // Attempt to retrieve pending RayCluster pendingRayCluster, err := r.getRayClusterByNamespacedName(ctx, common.RayServicePendingRayClusterNamespacedName(rayServiceInstance)) - pendingRayCluster, err := r.getRayClusterByNamespacedName(ctx, common.RayServicePendingRayClusterNamespacedName(rayServiceInstance)) - hasPendingCluster = false - if err != nil && !errors.IsNotFound(err){ - logger.Error(err, "Failed to retrieve pending RayCluster") - return nil, err - + if err != nil && !errors.IsNotFound(err) { + logger.Error(err, "Failed to retrieve pending RayCluster.") + return nil, err } - hasPendingCluster = pendingRayCluster != nil + hasPendingCluster := pendingRayCluster != nil activeClusterWeight, pendingClusterWeight, err := r.reconcileTrafficRoutedPercent(ctx, rayServiceInstance, hasPendingCluster) if err != nil { From 64661fe462cab6bbad6e829d01e2d4e455b530f6 Mon Sep 17 00:00:00 2001 From: Ryan O'Leary Date: Fri, 3 Oct 2025 14:55:02 +0000 Subject: [PATCH 34/56] Add more comments Signed-off-by: Ryan O'Leary --- ray-operator/controllers/ray/rayservice_controller.go | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/ray-operator/controllers/ray/rayservice_controller.go b/ray-operator/controllers/ray/rayservice_controller.go index 53e409d2e90..9794cc8fee7 100644 --- a/ray-operator/controllers/ray/rayservice_controller.go +++ b/ray-operator/controllers/ray/rayservice_controller.go @@ -315,14 +315,17 @@ func (r *RayServiceReconciler) calculateStatus(ctx context.Context, rayServiceIn "clusterName", rayServiceInstance.Status.PendingServiceStatus.RayClusterName) if utils.IsIncrementalUpgradeEnabled(&rayServiceInstance.Spec) { - // Set IncrementalUpgrade related Status fields for new pending RayCluster if enabled + // Set IncrementalUpgrade related Status fields for new pending RayCluster if enabled. if rayServiceInstance.Status.ActiveServiceStatus.RayClusterName == "" { // If no Active RayCluster exists - default to starting with 100% TargetCapacity. + // This is the case when a RayCluster is first starting for a RayService, so we should + // immediately scale it to full target capacity. if rayServiceInstance.Status.ActiveServiceStatus.TargetCapacity == nil { rayServiceInstance.Status.PendingServiceStatus.TargetCapacity = ptr.To(int32(100)) } } else if meta.IsStatusConditionTrue(rayServiceInstance.Status.Conditions, string(rayv1.UpgradeInProgress)) { - // Pending RayCluster during an upgrade should start with 0% TargetCapacity. + // Pending RayCluster during an upgrade should start with 0% TargetCapacity, since + // traffic will be gradually migrated to the new cluster. if rayServiceInstance.Status.PendingServiceStatus.TargetCapacity == nil { rayServiceInstance.Status.PendingServiceStatus.TargetCapacity = ptr.To(int32(0)) } @@ -332,8 +335,9 @@ func (r *RayServiceReconciler) calculateStatus(ctx context.Context, rayServiceIn serveEndPoints := &corev1.Endpoints{} serveServiceName := common.RayServiceServeServiceNamespacedName(rayServiceInstance) - // For IncrementalUpgrade, the Serve service name is based on the RayCluster. if utils.IsIncrementalUpgradeEnabled(&rayServiceInstance.Spec) && activeCluster != nil { + // The Serve service name is based on the unique RayCluster name, since we use the + // per-cluster Serve services for traffic routing during an incremental upgrade. 
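+		// For example, an active RayCluster named "myservice-raycluster-abcde" is assumed to expose its Serve
+		// endpoints through the per-cluster Service produced by GenerateServeServiceName (typically of the form
+		// "<cluster-name>-serve-svc"), rather than through the shared RayService-level Serve service.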
serveServiceName.Name = utils.GenerateServeServiceName(activeCluster.Name) } if err := r.Get(ctx, serveServiceName, serveEndPoints); err != nil && !errors.IsNotFound(err) { From 33060daec9ea2fc48d9e24f69a8efd6f7a6c8eef Mon Sep 17 00:00:00 2001 From: Ryan O'Leary <113500783+ryanaoleary@users.noreply.github.com> Date: Sat, 4 Oct 2025 14:49:25 -0700 Subject: [PATCH 35/56] Update ray-operator/controllers/ray/rayservice_controller.go Co-authored-by: Han-Ju Chen (Future-Outlier) Signed-off-by: Ryan O'Leary <113500783+ryanaoleary@users.noreply.github.com> --- ray-operator/controllers/ray/rayservice_controller.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ray-operator/controllers/ray/rayservice_controller.go b/ray-operator/controllers/ray/rayservice_controller.go index 9794cc8fee7..0c5aaff5f9a 100644 --- a/ray-operator/controllers/ray/rayservice_controller.go +++ b/ray-operator/controllers/ray/rayservice_controller.go @@ -1139,7 +1139,7 @@ func (r *RayServiceReconciler) checkIfNeedIncrementalUpgradeUpdate(ctx context.C pendingRayServiceStatus := rayServiceInstance.Status.PendingServiceStatus if activeRayServiceStatus.RayClusterName == "" || pendingRayServiceStatus.RayClusterName == "" { - return false, "Both active and pending RayCluster instances required for incremental upgrade." + return false, "Both active and pending RayCluster instances are required for incremental upgrade." } // Validate Gateway and HTTPRoute objects are ready From e89c1b4c481c5c419e3af08b2aca17dacc9e5c87 Mon Sep 17 00:00:00 2001 From: Ryan O'Leary Date: Sun, 5 Oct 2025 09:00:20 +0000 Subject: [PATCH 36/56] Add Ray Serve hostname and serve port logic Signed-off-by: Ryan O'Leary --- .../controllers/ray/common/service.go | 33 ++++++++ .../controllers/ray/common/service_test.go | 25 ++++++ .../controllers/ray/rayservice_controller.go | 47 +++++++---- .../ray/rayservice_controller_unit_test.go | 84 ++++++++++++++++--- .../controllers/ray/utils/constant.go | 4 + ray-operator/controllers/ray/utils/util.go | 14 ---- .../rayservice_incremental_upgrade_test.go | 19 +++-- .../test/e2eincrementalupgrade/support.go | 4 +- 8 files changed, 178 insertions(+), 52 deletions(-) diff --git a/ray-operator/controllers/ray/common/service.go b/ray-operator/controllers/ray/common/service.go index 7675a30b3bb..b106cdb5848 100644 --- a/ray-operator/controllers/ray/common/service.go +++ b/ray-operator/controllers/ray/common/service.go @@ -10,6 +10,7 @@ import ( corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ctrl "sigs.k8s.io/controller-runtime" + gwv1 "sigs.k8s.io/gateway-api/apis/v1" rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1" "github.com/ray-project/kuberay/ray-operator/controllers/ray/utils" @@ -320,6 +321,38 @@ func BuildHeadlessServiceForRayCluster(rayCluster rayv1.RayCluster) *corev1.Serv return headlessService } +// GetServePort finds the container port named "serve" in the RayCluster's head group spec. +// It returns the default Ray Serve port 8000 if not explicitly defined. +func GetServePort(cluster *rayv1.RayCluster) gwv1.PortNumber { + if cluster == nil || len(cluster.Spec.HeadGroupSpec.Template.Spec.Containers) == 0 { + return utils.DefaultServingPort + } + + // Find the port named "serve" in the head group's container spec. 
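+	// For example, a head container declaring a port {name: "serve", containerPort: 9000} causes the Gateway
+	// backendRef to target port 9000; without such a port the default of 8000 is used.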
+ headContainer := cluster.Spec.HeadGroupSpec.Template.Spec.Containers[utils.RayContainerIndex] + for _, port := range headContainer.Ports { + if port.Name == utils.ServingPortName { + return gwv1.PortNumber(port.ContainerPort) + } + } + + return utils.DefaultServingPort +} + +// GetGatewayListenersForRayService constructs the default HTTP listener for a RayService Gateway. +func GetGatewayListenersForRayService(rayServiceInstance *rayv1.RayService) []gwv1.Listener { + hostname := fmt.Sprintf("%s.%s.svc.cluster.local", rayServiceInstance.Name, rayServiceInstance.Namespace) + + return []gwv1.Listener{ + { + Name: gwv1.SectionName(utils.GatewayListenerPortName), + Protocol: gwv1.HTTPProtocolType, + Port: utils.DefaultGatewayListenerPort, + Hostname: (*gwv1.Hostname)(&hostname), // backwards compatibility with Serve service + }, + } +} + func setServiceTypeForUserProvidedService(ctx context.Context, service *corev1.Service, defaultType corev1.ServiceType) { log := ctrl.LoggerFrom(ctx) // If the user has not specified a service type, use the default service type diff --git a/ray-operator/controllers/ray/common/service_test.go b/ray-operator/controllers/ray/common/service_test.go index bda8588a058..dd552f380b4 100644 --- a/ray-operator/controllers/ray/common/service_test.go +++ b/ray-operator/controllers/ray/common/service_test.go @@ -11,6 +11,7 @@ import ( "github.com/stretchr/testify/require" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + gwv1 "sigs.k8s.io/gateway-api/apis/v1" rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1" "github.com/ray-project/kuberay/ray-operator/controllers/ray/utils" @@ -601,6 +602,30 @@ func TestUserSpecifiedServeService(t *testing.T) { validateNameAndNamespaceForUserSpecifiedService(svc, testRayServiceWithServeService.ObjectMeta.Namespace, userName, t) } +func TestGetGatewayListenersForRayService(t *testing.T) { + rayService := &rayv1.RayService{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-ray-service", + Namespace: "test-ns", + }, + } + + listeners := GetGatewayListenersForRayService(rayService) + + // Validate expected Gateway HTTP listener is created. + require.Len(t, listeners, 1) + listener := listeners[0] + + assert.Equal(t, gwv1.SectionName(utils.GatewayListenerPortName), listener.Name) + assert.Equal(t, gwv1.HTTPProtocolType, listener.Protocol) + assert.Equal(t, gwv1.PortNumber(utils.DefaultGatewayListenerPort), listener.Port) + + // Verify hostname is created for compatibility with standard RayService Serve service endpoint. 
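+	// For example, a RayService named "test-ray-service" in namespace "test-ns" yields the hostname
+	// "test-ray-service.test-ns.svc.cluster.local", matching the DNS name of the standard Serve service endpoint.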
+ expectedHostname := fmt.Sprintf("%s.%s.svc.cluster.local", rayService.Name, rayService.Namespace) + require.NotNil(t, listener.Hostname) + assert.Equal(t, expectedHostname, string(*listener.Hostname)) +} + func validateServiceTypeForUserSpecifiedService(svc *corev1.Service, userType corev1.ServiceType, t *testing.T) { // Test that the user service type takes priority over the default service type (example: ClusterIP) if svc.Spec.Type != userType { diff --git a/ray-operator/controllers/ray/rayservice_controller.go b/ray-operator/controllers/ray/rayservice_controller.go index 0c5aaff5f9a..2e93ca3286b 100644 --- a/ray-operator/controllers/ray/rayservice_controller.go +++ b/ray-operator/controllers/ray/rayservice_controller.go @@ -504,7 +504,7 @@ func (r *RayServiceReconciler) createGateway(rayServiceInstance *rayv1.RayServic }, } - rayServiceGateway.Spec.Listeners = utils.GetGatewayListenersForRayService(rayServiceInstance) + rayServiceGateway.Spec.Listeners = common.GetGatewayListenersForRayService(rayServiceInstance) return rayServiceGateway, nil } @@ -559,10 +559,7 @@ func (r *RayServiceReconciler) reconcileGateway(ctx context.Context, rayServiceI return nil } -// reconcileTrafficRoutedPercent determines the traffic split between the active and pending RayClusters. -// -// The function calculates the HTTPRoute weights for the active and pending RayClusters respectively, -// and updates the RayService status with new TrafficRoutedPercent values. +// calculateTrafficRoutedPercent determines the HTTPRoute traffic split between the active and pending RayClusters. // // The new weights are calculated using: // - Current TrafficRoutedPercent values @@ -570,7 +567,7 @@ func (r *RayServiceReconciler) reconcileGateway(ctx context.Context, rayServiceI // - TargetCapacity constraints // // Returns the active cluster traffic weight, pending cluster traffic weight, and an error if any. -func (r *RayServiceReconciler) reconcileTrafficRoutedPercent(ctx context.Context, rayServiceInstance *rayv1.RayService, hasPendingCluster bool) (activeClusterWeight, pendingClusterWeight int32, err error) { +func (r *RayServiceReconciler) calculateTrafficRoutedPercent(ctx context.Context, rayServiceInstance *rayv1.RayService, isPendingClusterReady bool) (activeClusterWeight, pendingClusterWeight int32, err error) { logger := ctrl.LoggerFrom(ctx) activeServiceStatus := &rayServiceInstance.Status.ActiveServiceStatus pendingServiceStatus := &rayServiceInstance.Status.PendingServiceStatus @@ -579,7 +576,7 @@ func (r *RayServiceReconciler) reconcileTrafficRoutedPercent(ctx context.Context activeClusterWeight = 100 pendingClusterWeight = 0 - if hasPendingCluster { + if isPendingClusterReady { // Zero-downtime upgrade in progress. options := utils.GetRayServiceIncrementalUpgradeOptions(&rayServiceInstance.Spec) if options == nil { @@ -613,11 +610,6 @@ func (r *RayServiceReconciler) reconcileTrafficRoutedPercent(ctx context.Context } } - // Update the RayService status with the calculated traffic weights. - activeServiceStatus.TrafficRoutedPercent = ptr.To(activeClusterWeight) - pendingServiceStatus.TrafficRoutedPercent = ptr.To(pendingClusterWeight) - logger.Info("Updated TrafficRoutedPercent", "activeClusterWeight", activeClusterWeight, "pendingClusterWeight", pendingClusterWeight) - return activeClusterWeight, pendingClusterWeight, nil } @@ -626,8 +618,9 @@ func (r *RayServiceReconciler) reconcileTrafficRoutedPercent(ctx context.Context // The function performs the following operations: // 1. 
Retrieves Gateway instance to attach the HTTPRoute // 2. Gets active and pending RayCluster instances and their Serve services -// 3. Calls `reconcileTrafficRoutedPercent` to calculate the new traffic weights +// 3. Calls `calculateTrafficRoutedPercent` to calculate the new traffic weights // 4. Configures HTTPRoute with appropriate backend references and weights +// 5. Updates the active and pending RayServiceStatus.TrafficRoutedPercent based on the new weights. // // Returns the configured HTTPRoute object or error if any step fails. func (r *RayServiceReconciler) createHTTPRoute(ctx context.Context, rayServiceInstance *rayv1.RayService) (*gwv1.HTTPRoute, error) { @@ -656,15 +649,26 @@ func (r *RayServiceReconciler) createHTTPRoute(ctx context.Context, rayServiceIn logger.Error(err, "Failed to retrieve pending RayCluster.") return nil, err } - hasPendingCluster := pendingRayCluster != nil - activeClusterWeight, pendingClusterWeight, err := r.reconcileTrafficRoutedPercent(ctx, rayServiceInstance, hasPendingCluster) + isPendingClusterReady := false + if pendingRayCluster != nil { + isReady, err := r.isHeadPodRunningAndReady(ctx, pendingRayCluster) + if err != nil { + logger.Error(err, "Failed to check readiness of pending RayCluster's head pod.", "RayCluster", pendingRayCluster.Name) + } else if isReady { + isPendingClusterReady = true + logger.Info("Pending RayCluster is ready. Including it in HTTPRoute.", "RayCluster", pendingRayCluster.Name) + } + } + + activeClusterWeight, pendingClusterWeight, err := r.calculateTrafficRoutedPercent(ctx, rayServiceInstance, isPendingClusterReady) if err != nil { logger.Info("Failed to reconcile TrafficRoutedPercent for active and pending clusters.") return nil, err } activeClusterServeSvcName := utils.GenerateServeServiceName(activeRayCluster.Name) + activeServePort := common.GetServePort(activeRayCluster) backendRefs := []gwv1.HTTPBackendRef{ { @@ -672,26 +676,33 @@ func (r *RayServiceReconciler) createHTTPRoute(ctx context.Context, rayServiceIn BackendObjectReference: gwv1.BackendObjectReference{ Name: gwv1.ObjectName(activeClusterServeSvcName), Namespace: ptr.To(gwv1.Namespace(gatewayInstance.Namespace)), - Port: ptr.To(gwv1.PortNumber(8000)), + Port: ptr.To(activeServePort), }, Weight: ptr.To(activeClusterWeight), }, }, } + // Update the RayService status with the calculated traffic weights. 
+ rayServiceInstance.Status.ActiveServiceStatus.TrafficRoutedPercent = ptr.To(activeClusterWeight) + logger.Info("Updated TrafficRoutedPercent", "activeClusterWeight", activeClusterWeight) - if hasPendingCluster { + if isPendingClusterReady { pendingClusterServeSvcName := utils.GenerateServeServiceName(pendingRayCluster.Name) + pendingServePort := common.GetServePort(pendingRayCluster) backendRefs = append(backendRefs, gwv1.HTTPBackendRef{ BackendRef: gwv1.BackendRef{ BackendObjectReference: gwv1.BackendObjectReference{ Name: gwv1.ObjectName(pendingClusterServeSvcName), Namespace: ptr.To(gwv1.Namespace(gatewayInstance.Namespace)), - Port: ptr.To(gwv1.PortNumber(8000)), + Port: ptr.To(pendingServePort), }, Weight: ptr.To(pendingClusterWeight), }, }) + + rayServiceInstance.Status.PendingServiceStatus.TrafficRoutedPercent = ptr.To(pendingClusterWeight) + logger.Info("Updated TrafficRoutedPercent", "pendingClusterWeight", pendingClusterWeight) } httpRouteName := utils.CheckHTTPRouteName(fmt.Sprintf("httproute-%s", gatewayInstance.Name)) diff --git a/ray-operator/controllers/ray/rayservice_controller_unit_test.go b/ray-operator/controllers/ray/rayservice_controller_unit_test.go index 9076c184b13..999a74cdb65 100644 --- a/ray-operator/controllers/ray/rayservice_controller_unit_test.go +++ b/ray-operator/controllers/ray/rayservice_controller_unit_test.go @@ -1450,6 +1450,29 @@ func TestCreateGateway(t *testing.T) { } } +// createReadyHeadPod is a helper function to create a running and ready head pod for a given RayCluster. +func createReadyHeadPod(clusterName, namespace string) *corev1.Pod { + return &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: clusterName + "-head-pod", + Namespace: namespace, + Labels: map[string]string{ + utils.RayClusterLabelKey: clusterName, + utils.RayNodeTypeLabelKey: string(rayv1.HeadNode), + }, + }, + Status: corev1.PodStatus{ + Phase: corev1.PodRunning, + Conditions: []corev1.PodCondition{ + { + Type: corev1.PodReady, + Status: corev1.ConditionTrue, + }, + }, + }, + } +} + func TestCreateHTTPRoute(t *testing.T) { ctx := context.TODO() namespace := "test-ns" @@ -1487,6 +1510,7 @@ func TestCreateHTTPRoute(t *testing.T) { }, }, } + readyHeadPod := createReadyHeadPod(pendingCluster.Name, namespace) tests := []struct { name string @@ -1496,12 +1520,21 @@ func TestCreateHTTPRoute(t *testing.T) { expectedActiveWeight int32 expectedPendingWeight int32 }{ + { + name: "Incremental upgrade, but pending cluster is not ready, so no traffic shift.", + modifier: func(rs *rayv1.RayService) { + rs.Status.PendingServiceStatus.LastTrafficMigratedTime = &metav1.Time{Time: time.Now().Add(-time.Duration(interval+1) * time.Second)} + }, + runtimeObjects: []runtime.Object{activeCluster, pendingCluster, gateway, activeServeService, pendingServeService}, + expectedActiveWeight: 100, + expectedPendingWeight: 0, + }, { name: "Incremental upgrade, time since LastTrafficMigratedTime < IntervalSeconds.", modifier: func(rs *rayv1.RayService) { rs.Status.PendingServiceStatus.LastTrafficMigratedTime = &metav1.Time{Time: time.Now()} }, - runtimeObjects: []runtime.Object{activeCluster, pendingCluster, gateway, activeServeService, pendingServeService}, + runtimeObjects: []runtime.Object{activeCluster, pendingCluster, gateway, activeServeService, pendingServeService, readyHeadPod}, expectedActiveWeight: 100, expectedPendingWeight: 0, }, @@ -1511,7 +1544,7 @@ func TestCreateHTTPRoute(t *testing.T) { rs.Status.PendingServiceStatus.LastTrafficMigratedTime = &metav1.Time{Time: 
time.Now().Add(-time.Duration(interval+1) * time.Second)} rs.Status.PendingServiceStatus.TargetCapacity = ptr.To(int32(60)) }, - runtimeObjects: []runtime.Object{activeCluster, pendingCluster, gateway, activeServeService, pendingServeService}, + runtimeObjects: []runtime.Object{activeCluster, pendingCluster, gateway, activeServeService, pendingServeService, readyHeadPod}, expectedActiveWeight: 90, expectedPendingWeight: 10, }, @@ -1521,7 +1554,7 @@ func TestCreateHTTPRoute(t *testing.T) { rs.Status.PendingServiceStatus.LastTrafficMigratedTime = &metav1.Time{Time: time.Now().Add(-time.Duration(interval+1) * time.Second)} rs.Status.PendingServiceStatus.TargetCapacity = ptr.To(int32(5)) }, - runtimeObjects: []runtime.Object{activeCluster, pendingCluster, gateway, activeServeService, pendingServeService}, + runtimeObjects: []runtime.Object{activeCluster, pendingCluster, gateway, activeServeService, pendingServeService, readyHeadPod}, expectedActiveWeight: 95, expectedPendingWeight: 5, // can only migrate 5% to pending until TargetCapacity reached }, @@ -1530,7 +1563,7 @@ func TestCreateHTTPRoute(t *testing.T) { modifier: func(rs *rayv1.RayService) { rs.Spec.UpgradeStrategy.IncrementalUpgradeOptions = nil }, - runtimeObjects: []runtime.Object{activeCluster, pendingCluster, gateway, activeServeService, pendingServeService}, + runtimeObjects: []runtime.Object{activeCluster, pendingCluster, gateway, activeServeService, pendingServeService, readyHeadPod}, expectError: true, }, { @@ -1610,6 +1643,7 @@ func TestReconcileHTTPRoute(t *testing.T) { activeServeService := &corev1.Service{ObjectMeta: metav1.ObjectMeta{Name: utils.GenerateServeServiceName(activeCluster.Name), Namespace: namespace}} pendingServeService := &corev1.Service{ObjectMeta: metav1.ObjectMeta{Name: utils.GenerateServeServiceName(pendingCluster.Name), Namespace: namespace}} gateway := &gwv1.Gateway{ObjectMeta: metav1.ObjectMeta{Name: gatewayName, Namespace: namespace}} + readyHeadPod := createReadyHeadPod(pendingCluster.Name, namespace) baseRayService := &rayv1.RayService{ ObjectMeta: metav1.ObjectMeta{Name: "test-rayservice", Namespace: namespace}, @@ -1643,14 +1677,29 @@ func TestReconcileHTTPRoute(t *testing.T) { name string expectedActiveWeight int32 expectedPendingWeight int32 + pendingClusterIsReady bool }{ + { + name: "Update HTTPRoute when pending cluster is ready.", + pendingClusterIsReady: true, + expectedActiveWeight: 70, + expectedPendingWeight: 30, + }, + { + name: "Do not split traffic when pending cluster is NOT ready.", + pendingClusterIsReady: false, + expectedActiveWeight: 100, + expectedPendingWeight: 0, + }, { name: "Create new HTTPRoute with weights.", + pendingClusterIsReady: true, expectedActiveWeight: 70, expectedPendingWeight: 30, }, { - name: "Existing HTTPRoute, time since LastTrafficMigratedTime >= IntervalSeconds so updates HTTPRoute.", + name: "Existing HTTPRoute, time since LastTrafficMigratedTime >= IntervalSeconds so updates HTTPRoute.", + pendingClusterIsReady: true, modifier: func(rs *rayv1.RayService) { rs.Status.PendingServiceStatus.LastTrafficMigratedTime = &metav1.Time{Time: time.Now().Add(-time.Duration(interval+1) * time.Second)} }, @@ -1662,7 +1711,8 @@ func TestReconcileHTTPRoute(t *testing.T) { expectedPendingWeight: 30, }, { - name: "Existing HTTPRoute, time since LastTrafficMigratedTime < IntervalSeconds so no update.", + name: "Existing HTTPRoute, time since LastTrafficMigratedTime < IntervalSeconds so no update.", + pendingClusterIsReady: true, modifier: func(rs *rayv1.RayService) { 
rs.Status.PendingServiceStatus.LastTrafficMigratedTime = &metav1.Time{Time: time.Now()} }, @@ -1682,6 +1732,9 @@ func TestReconcileHTTPRoute(t *testing.T) { if tt.existingRoute != nil { runtimeObjects = append(runtimeObjects, tt.existingRoute) } + if tt.pendingClusterIsReady { + runtimeObjects = append(runtimeObjects, readyHeadPod) + } fakeClient := clientFake.NewClientBuilder().WithScheme(newScheme).WithRuntimeObjects(runtimeObjects...).Build() reconciler := RayServiceReconciler{Client: fakeClient, Scheme: newScheme, Recorder: record.NewFakeRecorder(10)} @@ -1695,13 +1748,18 @@ func TestReconcileHTTPRoute(t *testing.T) { require.Len(t, reconciledRoute.Spec.Rules, 1) rule := reconciledRoute.Spec.Rules[0] - require.Len(t, rule.BackendRefs, 2) - - // Assert weights are set as expected. - assert.Equal(t, tt.expectedActiveWeight, *rule.BackendRefs[0].Weight) - assert.Equal(t, tt.expectedPendingWeight, *rule.BackendRefs[1].Weight) + if tt.pendingClusterIsReady { + require.Len(t, rule.BackendRefs, 2) + // Assert weights are set as expected. + assert.Equal(t, tt.expectedActiveWeight, *rule.BackendRefs[0].Weight) + assert.Equal(t, tt.expectedPendingWeight, *rule.BackendRefs[1].Weight) + } else { + require.Len(t, rule.BackendRefs, 1) - // Assert ParentRef namespace is now correctly set. + // Assert active weight is as expected. + assert.Equal(t, tt.expectedActiveWeight, *rule.BackendRefs[0].Weight) + } + // Assert ParentRef namespace is correctly set. parent := reconciledRoute.Spec.ParentRefs[0] assert.Equal(t, gwv1.ObjectName(gatewayName), parent.Name) assert.Equal(t, ptr.To(gwv1.Namespace(namespace)), parent.Namespace) @@ -1952,7 +2010,7 @@ func TestCheckIfNeedIncrementalUpgradeUpdate(t *testing.T) { { name: "Missing RayClusterNames", expectedNeedsUpdate: false, - expectedReason: "Both active and pending RayCluster instances required for incremental upgrade.", + expectedReason: "Both active and pending RayCluster instances are required for incremental upgrade.", }, { name: "Gateway not ready", diff --git a/ray-operator/controllers/ray/utils/constant.go b/ray-operator/controllers/ray/utils/constant.go index ccdc967de86..1ece5858956 100644 --- a/ray-operator/controllers/ray/utils/constant.go +++ b/ray-operator/controllers/ray/utils/constant.go @@ -79,6 +79,10 @@ const ( MetricsPortName = "metrics" ServingPortName = "serve" + // Gateway defaults for HTTP protocol + GatewayListenerPortName = "http" + DefaultGatewayListenerPort = 80 + // The default AppProtocol for Kubernetes service DefaultServiceAppProtocol = "tcp" diff --git a/ray-operator/controllers/ray/utils/util.go b/ray-operator/controllers/ray/utils/util.go index e36c805c612..19259090e5a 100644 --- a/ray-operator/controllers/ray/utils/util.go +++ b/ray-operator/controllers/ray/utils/util.go @@ -780,20 +780,6 @@ func GetRayServiceIncrementalUpgradeOptions(spec *rayv1.RayServiceSpec) *rayv1.I return nil } -// addGatewayListenersForRayService is a helper function to returns Gateway Listeners -func GetGatewayListenersForRayService(rayServiceInstance *rayv1.RayService) []gwv1.Listener { - listeners := make([]gwv1.Listener, 0, 1) - listenerName := fmt.Sprintf("%s-listener", rayServiceInstance.Name) - listener := gwv1.Listener{ - Name: gwv1.SectionName(listenerName), - Protocol: gwv1.HTTPProtocolType, // only support HTTP - Port: gwv1.PortNumber(int32(80)), - } - listeners = append(listeners, listener) - - return listeners -} - // Check where we are running. 
We are trying to distinguish here whether // this is vanilla kubernetes cluster or Openshift func GetClusterType() bool { diff --git a/ray-operator/test/e2eincrementalupgrade/rayservice_incremental_upgrade_test.go b/ray-operator/test/e2eincrementalupgrade/rayservice_incremental_upgrade_test.go index 219e4636e6f..7f6309a70d3 100644 --- a/ray-operator/test/e2eincrementalupgrade/rayservice_incremental_upgrade_test.go +++ b/ray-operator/test/e2eincrementalupgrade/rayservice_incremental_upgrade_test.go @@ -11,6 +11,7 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/utils/ptr" + rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1" "github.com/ray-project/kuberay/ray-operator/controllers/ray/utils" rayv1ac "github.com/ray-project/kuberay/ray-operator/pkg/client/applyconfiguration/ray/v1" "github.com/ray-project/kuberay/ray-operator/pkg/features" @@ -94,10 +95,12 @@ func TestRayServiceIncrementalUpgrade(t *testing.T) { gatewayIP := GetGatewayIP(gateway) g.Expect(gatewayIP).NotTo(BeEmpty()) + hostname := fmt.Sprintf("%s.%s.svc.cluster.local", rayService.Name, rayService.Namespace) + LogWithTimestamp(test.T(), "Verifying RayService is serving traffic") - stdout, _ := CurlRayServiceGateway(test, gatewayIP, curlPod, curlContainerName, "/fruit", `["MANGO", 2]`) + stdout, _ := CurlRayServiceGateway(test, gatewayIP, hostname, curlPod, curlContainerName, "/fruit", `["MANGO", 2]`) g.Expect(stdout.String()).To(Equal("6")) - stdout, _ = CurlRayServiceGateway(test, gatewayIP, curlPod, curlContainerName, "/calc", `["MUL", 3]`) + stdout, _ = CurlRayServiceGateway(test, gatewayIP, hostname, curlPod, curlContainerName, "/calc", `["MUL", 3]`) g.Expect(stdout.String()).To(Equal("15 pizzas please!")) // Attempt to trigger incremental upgrade by updating RayService serve config and RayCluster spec @@ -143,13 +146,17 @@ func TestRayServiceIncrementalUpgrade(t *testing.T) { g.Expect(err).NotTo(HaveOccurred(), "The serve service for the pending cluster should be created.") }, TestTimeoutShort).Should(Succeed()) - // Verify HTTPRoute is pointing to the correct two backends. + LogWithTimestamp(test.T(), "Waiting for pending RayCluster %s to have a ready head pod", pendingClusterName) + g.Eventually(RayCluster(test, namespace.Name, pendingClusterName), TestTimeoutMedium). + Should(WithTransform(StatusCondition(rayv1.HeadPodReady), MatchCondition(metav1.ConditionTrue, rayv1.HeadPodRunningAndReady))) + + // Wait for the HTTPRoute to reflect the two backends. + LogWithTimestamp(test.T(), "Waiting for HTTPRoute to have two backends") g.Eventually(func(g Gomega) { route, err := GetHTTPRoute(test, namespace.Name, httpRouteName) g.Expect(err).NotTo(HaveOccurred()) g.Expect(route.Spec.Rules).To(HaveLen(1)) g.Expect(route.Spec.Rules[0].BackendRefs).To(HaveLen(2)) - g.Expect(string(route.Spec.Rules[0].BackendRefs[0].Name)).To(Equal(activeServeSvcName)) g.Expect(string(route.Spec.Rules[0].BackendRefs[1].Name)).To(Equal(pendingServeSvcName)) }, TestTimeoutShort).Should(Succeed()) @@ -170,7 +177,7 @@ func TestRayServiceIncrementalUpgrade(t *testing.T) { }, TestTimeoutShort).Should(Equal(step.expectedValue)) // Send a request to the RayService to validate no requests are dropped. 
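+			// With the serve config change above (price 3 -> 4), the old app answers 6 and the upgraded app
+			// answers 8 for `["MANGO", 2]`, so either value shows that a backend served the request.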
- stdout, _ := CurlRayServiceGateway(test, gatewayIP, curlPod, curlContainerName, "/fruit", `["MANGO", 2]`) + stdout, _ := CurlRayServiceGateway(test, gatewayIP, hostname, curlPod, curlContainerName, "/fruit", `["MANGO", 2]`) g.Expect(stdout.String()).To(Or(Equal("6"), Equal("8")), "Response should be from the old or new app version during the upgrade") if strings.Contains(step.name, "pending traffic to shift") { @@ -194,6 +201,6 @@ func TestRayServiceIncrementalUpgrade(t *testing.T) { g.Eventually(RayService(test, rayService.Namespace, rayService.Name), TestTimeoutShort).Should(WithTransform(IsRayServiceUpgrading, BeFalse())) LogWithTimestamp(test.T(), "Verifying RayService uses updated ServeConfig after upgrade completes") - stdout, _ = CurlRayServiceGateway(test, gatewayIP, curlPod, curlContainerName, "/fruit", `["MANGO", 2]`) + stdout, _ = CurlRayServiceGateway(test, gatewayIP, hostname, curlPod, curlContainerName, "/fruit", `["MANGO", 2]`) g.Expect(stdout.String()).To(Equal("8")) } diff --git a/ray-operator/test/e2eincrementalupgrade/support.go b/ray-operator/test/e2eincrementalupgrade/support.go index 68c9e96460e..4a55ecbfc24 100644 --- a/ray-operator/test/e2eincrementalupgrade/support.go +++ b/ray-operator/test/e2eincrementalupgrade/support.go @@ -20,6 +20,7 @@ import ( func CurlRayServiceGateway( t Test, gatewayIP string, + hostname string, curlPod *corev1.Pod, curlPodContainerName, rayServicePath, @@ -30,7 +31,8 @@ func CurlRayServiceGateway( "--max-time", "10", "-X", "POST", "-H", "Content-Type: application/json", - fmt.Sprintf("%s:80%s", gatewayIP, rayServicePath), + "-H", fmt.Sprintf("Host: %s", hostname), + fmt.Sprintf("http://%s%s", gatewayIP, rayServicePath), "-d", body, } From 023fd6cf150ba1201da03002b0b9e7b5a27cdf45 Mon Sep 17 00:00:00 2001 From: Ryan O'Leary <113500783+ryanaoleary@users.noreply.github.com> Date: Mon, 6 Oct 2025 22:01:49 -0700 Subject: [PATCH 37/56] Update ray-operator/controllers/ray/rayservice_controller.go Co-authored-by: Han-Ju Chen (Future-Outlier) Signed-off-by: Ryan O'Leary <113500783+ryanaoleary@users.noreply.github.com> --- ray-operator/controllers/ray/rayservice_controller.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ray-operator/controllers/ray/rayservice_controller.go b/ray-operator/controllers/ray/rayservice_controller.go index 2e93ca3286b..f1c479c2224 100644 --- a/ray-operator/controllers/ray/rayservice_controller.go +++ b/ray-operator/controllers/ray/rayservice_controller.go @@ -589,7 +589,7 @@ func (r *RayServiceReconciler) calculateTrafficRoutedPercent(ctx context.Context activeClusterWeight = ptr.Deref(activeServiceStatus.TrafficRoutedPercent, 100) if pendingClusterWeight == pendingClusterTargetCapacity { - // return without changing current traffic weights since cluster being migrated to is at capacity. + // Stop traffic migration because the pending cluster's current traffic weight has reached its target capacity limit. 
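+			// For example, with the pending TargetCapacity at 40 and 40% of traffic already routed to it, the
+			// weights stay at active=60/pending=40 until reconcileServeTargetCapacity raises the pending
+			// TargetCapacity again.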
return activeClusterWeight, pendingClusterWeight, nil } From 629d0b646f54227c296df7f0a63912579828cd96 Mon Sep 17 00:00:00 2001 From: Ryan O'Leary <113500783+ryanaoleary@users.noreply.github.com> Date: Mon, 6 Oct 2025 22:02:11 -0700 Subject: [PATCH 38/56] Update ray-operator/controllers/ray/common/service.go Co-authored-by: Han-Ju Chen (Future-Outlier) Signed-off-by: Ryan O'Leary <113500783+ryanaoleary@users.noreply.github.com> --- ray-operator/controllers/ray/common/service.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ray-operator/controllers/ray/common/service.go b/ray-operator/controllers/ray/common/service.go index b106cdb5848..540bc86c3e4 100644 --- a/ray-operator/controllers/ray/common/service.go +++ b/ray-operator/controllers/ray/common/service.go @@ -325,7 +325,7 @@ func BuildHeadlessServiceForRayCluster(rayCluster rayv1.RayCluster) *corev1.Serv // It returns the default Ray Serve port 8000 if not explicitly defined. func GetServePort(cluster *rayv1.RayCluster) gwv1.PortNumber { if cluster == nil || len(cluster.Spec.HeadGroupSpec.Template.Spec.Containers) == 0 { - return utils.DefaultServingPort + return gwv1.PortNumber(utils.DefaultServingPort) } // Find the port named "serve" in the head group's container spec. From 65156a664e1019b72f22040d34862c7dc958b7b3 Mon Sep 17 00:00:00 2001 From: Ryan O'Leary <113500783+ryanaoleary@users.noreply.github.com> Date: Mon, 6 Oct 2025 22:02:25 -0700 Subject: [PATCH 39/56] Update ray-operator/controllers/ray/common/service.go Co-authored-by: Han-Ju Chen (Future-Outlier) Signed-off-by: Ryan O'Leary <113500783+ryanaoleary@users.noreply.github.com> --- ray-operator/controllers/ray/common/service.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ray-operator/controllers/ray/common/service.go b/ray-operator/controllers/ray/common/service.go index 540bc86c3e4..128e0c684ff 100644 --- a/ray-operator/controllers/ray/common/service.go +++ b/ray-operator/controllers/ray/common/service.go @@ -336,7 +336,7 @@ func GetServePort(cluster *rayv1.RayCluster) gwv1.PortNumber { } } - return utils.DefaultServingPort + return gwv1.PortNumber(utils.DefaultServingPort) } // GetGatewayListenersForRayService constructs the default HTTP listener for a RayService Gateway. From c19068b61c07b27e648941799439a4de95f45554 Mon Sep 17 00:00:00 2001 From: Ryan O'Leary <113500783+ryanaoleary@users.noreply.github.com> Date: Mon, 6 Oct 2025 22:03:04 -0700 Subject: [PATCH 40/56] Update ray-operator/controllers/ray/rayservice_controller.go Co-authored-by: Han-Ju Chen (Future-Outlier) Signed-off-by: Ryan O'Leary <113500783+ryanaoleary@users.noreply.github.com> --- ray-operator/controllers/ray/rayservice_controller.go | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/ray-operator/controllers/ray/rayservice_controller.go b/ray-operator/controllers/ray/rayservice_controller.go index f1c479c2224..7e1e6e8de7b 100644 --- a/ray-operator/controllers/ray/rayservice_controller.go +++ b/ray-operator/controllers/ray/rayservice_controller.go @@ -1181,10 +1181,12 @@ func (r *RayServiceReconciler) checkIfNeedIncrementalUpgradeUpdate(ctx context.C pendingTargetCapacity := int(*pendingRayServiceStatus.TargetCapacity) pendingTrafficRoutedPercent := int(*pendingRayServiceStatus.TrafficRoutedPercent) - if pendingTargetCapacity < 100 || pendingTrafficRoutedPercent < 100 { - return true, "Pending RayCluster has not finished scaling up." 
- } else if activeTargetCapacity == 0 && pendingTargetCapacity == 100 { + if activeTargetCapacity == 0 && pendingTargetCapacity == 100 { return false, "All traffic has migrated to the upgraded cluster and IncrementalUpgrade is complete." + } else if pendingTargetCapacity < 100 || pendingTrafficRoutedPercent < 100 { + return true, "Pending RayCluster has not finished scaling up." + } + return true, "Active RayCluster TargetCapacity has not finished scaling down." } return true, "Active RayCluster TargetCapacity has not finished scaling down." } From 5d953a875d5a2b17423aacfa848137c4de19aa3c Mon Sep 17 00:00:00 2001 From: Ryan O'Leary <113500783+ryanaoleary@users.noreply.github.com> Date: Mon, 6 Oct 2025 22:04:05 -0700 Subject: [PATCH 41/56] Update ray-operator/controllers/ray/rayservice_controller.go Co-authored-by: Han-Ju Chen (Future-Outlier) Signed-off-by: Ryan O'Leary <113500783+ryanaoleary@users.noreply.github.com> --- .../controllers/ray/rayservice_controller.go | 23 +++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/ray-operator/controllers/ray/rayservice_controller.go b/ray-operator/controllers/ray/rayservice_controller.go index 7e1e6e8de7b..534801fa455 100644 --- a/ray-operator/controllers/ray/rayservice_controller.go +++ b/ray-operator/controllers/ray/rayservice_controller.go @@ -1145,6 +1145,29 @@ func (r *RayServiceReconciler) updateServeDeployment(ctx context.Context, raySer // checkIfNeedIncrementalUpgradeUpdate returns whether the controller should adjust the target_capacity // of the Serve config associated with a RayCluster during an IncrementalUpgrade. +// +// This function implements the incremental upgrade state machine as defined in the design document: +// https://github.com/ray-project/enhancements/blob/main/reps/2024-12-4-ray-service-incr-upgrade.md +// +// The upgrade process follows these phases: +// 1. Phase 1 (Steps 7-8): New cluster scales up to target capacity +// - pendingTargetCapacity: 0% → 100% +// - Returns true: "Pending RayCluster has not finished scaling up." +// +// 2. Phase 2 (Step 9): Traffic gradually migrates to new cluster +// - pendingTrafficRoutedPercent: 0% → 100% +// - Returns true: "Pending RayCluster has not finished scaling up." +// +// 3. Phase 3 (Step 10): Old cluster scales down after new cluster is ready +// - activeTargetCapacity: 100% → 0% +// - Returns true: "Active RayCluster TargetCapacity has not finished scaling down." +// +// 4. Phase 4 (Step 11): Upgrade completion +// - Both clusters reach final state: active=0%, pending=100% +// - Returns false: "All traffic has migrated to the upgraded cluster and IncrementalUpgrade is complete." +// +// The function ensures that traffic migration only proceeds when the target cluster has reached +// its capacity limit, preventing resource conflicts and ensuring upgrade stability. 
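// Illustration only, not part of this patch: the four-phase decision described in the doc
// comment above, reduced to a pure function over the dereferenced status percentages.
// needsTargetCapacityUpdate is a hypothetical name; the real method also reads the
// RayService status pointers and handles missing values.
func needsTargetCapacityUpdate(activeTargetCapacity, pendingTargetCapacity, pendingTrafficRoutedPercent int32) (bool, string) {
	switch {
	case activeTargetCapacity == 0 && pendingTargetCapacity == 100:
		// Phase 4: nothing left to adjust.
		return false, "All traffic has migrated to the upgraded cluster and IncrementalUpgrade is complete."
	case pendingTargetCapacity < 100 || pendingTrafficRoutedPercent < 100:
		// Phases 1-2: the new cluster is still scaling up and taking on traffic.
		return true, "Pending RayCluster has not finished scaling up."
	default:
		// Phase 3: only the old cluster's target_capacity remains to be scaled down.
		return true, "Active RayCluster TargetCapacity has not finished scaling down."
	}
}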
func (r *RayServiceReconciler) checkIfNeedIncrementalUpgradeUpdate(ctx context.Context, rayServiceInstance *rayv1.RayService) (bool, string) { activeRayServiceStatus := rayServiceInstance.Status.ActiveServiceStatus pendingRayServiceStatus := rayServiceInstance.Status.PendingServiceStatus From 3af80d686fbfa46ee72e05f90ff97134656b72c0 Mon Sep 17 00:00:00 2001 From: Ryan O'Leary <113500783+ryanaoleary@users.noreply.github.com> Date: Mon, 6 Oct 2025 22:04:24 -0700 Subject: [PATCH 42/56] Update ray-operator/controllers/ray/rayservice_controller.go Co-authored-by: Han-Ju Chen (Future-Outlier) Signed-off-by: Ryan O'Leary <113500783+ryanaoleary@users.noreply.github.com> --- ray-operator/controllers/ray/rayservice_controller.go | 3 --- 1 file changed, 3 deletions(-) diff --git a/ray-operator/controllers/ray/rayservice_controller.go b/ray-operator/controllers/ray/rayservice_controller.go index 534801fa455..8dd587cbec3 100644 --- a/ray-operator/controllers/ray/rayservice_controller.go +++ b/ray-operator/controllers/ray/rayservice_controller.go @@ -1266,9 +1266,6 @@ func (r *RayServiceReconciler) reconcileServeTargetCapacity(ctx context.Context, logger := ctrl.LoggerFrom(ctx) logger.Info("reconcileServeTargetCapacity", "RayService", rayServiceInstance.Name) - if !utils.IsIncrementalUpgradeEnabled(&rayServiceInstance.Spec) { - return nil - } activeRayServiceStatus := &rayServiceInstance.Status.ActiveServiceStatus pendingRayServiceStatus := &rayServiceInstance.Status.PendingServiceStatus From c68c4cf9375e585597c3416830699c38670f845d Mon Sep 17 00:00:00 2001 From: Ryan O'Leary <113500783+ryanaoleary@users.noreply.github.com> Date: Mon, 6 Oct 2025 22:05:18 -0700 Subject: [PATCH 43/56] Update ray-operator/controllers/ray/rayservice_controller.go Co-authored-by: Han-Ju Chen (Future-Outlier) Signed-off-by: Ryan O'Leary <113500783+ryanaoleary@users.noreply.github.com> --- ray-operator/controllers/ray/rayservice_controller.go | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/ray-operator/controllers/ray/rayservice_controller.go b/ray-operator/controllers/ray/rayservice_controller.go index 8dd587cbec3..36b8f13df80 100644 --- a/ray-operator/controllers/ray/rayservice_controller.go +++ b/ray-operator/controllers/ray/rayservice_controller.go @@ -1245,9 +1245,12 @@ func (r *RayServiceReconciler) applyServeTargetCapacity(ctx context.Context, ray } logger.Info("Applying new target_capacity to Ray cluster.", "goal", goalTargetCapacity) - if err := rayDashboardClient.UpdateDeployments(ctx, configJson); err != nil { - return fmt.Errorf("failed to update target_capacity for Serve applications: %w", err) - } + err = fmt.Errorf( + "fail to create / update Serve applications. If you observe this error consistently, "+ + "please check \"Issue 5: Fail to create / update Serve applications.\" in "+ + "https://docs.ray.io/en/master/cluster/kubernetes/troubleshooting/rayservice-troubleshooting.html#kuberay-raysvc-troubleshoot for more details. "+ + "err: %v", err) + return err // Update the status fields and cache new Serve config. 
if rayClusterInstance.Name == rayServiceInstance.Status.ActiveServiceStatus.RayClusterName { From 8e1ade4c3bbf67f381917a52cc637eda04bea542 Mon Sep 17 00:00:00 2001 From: Ryan O'Leary Date: Thu, 9 Oct 2025 18:18:51 +0000 Subject: [PATCH 44/56] Fix dropped requests and old cluster config not being served Signed-off-by: Ryan O'Leary --- .../controllers/ray/rayservice_controller.go | 172 ++++++++++-------- .../ray/rayservice_controller_unit_test.go | 91 +++++---- .../rayservice_incremental_upgrade_test.go | 18 +- .../test/e2eincrementalupgrade/support.go | 6 +- 4 files changed, 158 insertions(+), 129 deletions(-) diff --git a/ray-operator/controllers/ray/rayservice_controller.go b/ray-operator/controllers/ray/rayservice_controller.go index 36b8f13df80..36fda0628e6 100644 --- a/ray-operator/controllers/ray/rayservice_controller.go +++ b/ray-operator/controllers/ray/rayservice_controller.go @@ -147,29 +147,6 @@ func (r *RayServiceReconciler) Reconcile(ctx context.Context, request ctrl.Reque return ctrl.Result{RequeueAfter: ServiceDefaultRequeueDuration}, err } - // Check if IncrementalUpgrade is enabled, if so reconcile Gateway objects. - if utils.IsIncrementalUpgradeEnabled(&rayServiceInstance.Spec) { - // Ensure per-cluster Serve service exists for the active and pending RayClusters. - if err = r.reconcilePerClusterServeService(ctx, rayServiceInstance, activeRayClusterInstance); err != nil { - return ctrl.Result{RequeueAfter: ServiceDefaultRequeueDuration}, err - } - if err = r.reconcilePerClusterServeService(ctx, rayServiceInstance, pendingRayClusterInstance); err != nil { - return ctrl.Result{RequeueAfter: ServiceDefaultRequeueDuration}, err - } - // Creates or updates a Gateway CR that points to the Serve services of - // the active and pending (if it exists) RayClusters. For incremental upgrades, - // the Gateway endpoint is used rather than the Serve service. - err = r.reconcileGateway(ctx, rayServiceInstance) - if err != nil { - return ctrl.Result{RequeueAfter: ServiceDefaultRequeueDuration}, client.IgnoreNotFound(err) - } - // Create or update the HTTPRoute attached to this RayService's Gateway. - err = r.reconcileHTTPRoute(ctx, rayServiceInstance) - if err != nil { - return ctrl.Result{RequeueAfter: ServiceDefaultRequeueDuration}, client.IgnoreNotFound(err) - } - } - // Reconcile serve applications for active and/or pending clusters // 1. If there is a pending cluster, reconcile serve applications for the pending cluster. // 2. If there are both active and pending clusters, reconcile serve applications for the pending cluster only. 
@@ -192,20 +169,46 @@ func (r *RayServiceReconciler) Reconcile(ctx context.Context, request ctrl.Reque if isActiveClusterReady, activeClusterServeApplications, err = r.reconcileServe(ctx, rayServiceInstance, activeRayClusterInstance); err != nil { return ctrl.Result{RequeueAfter: ServiceDefaultRequeueDuration}, err } - } else if activeRayClusterInstance != nil && utils.IsIncrementalUpgradeEnabled(&rayServiceInstance.Spec) { + } else if activeRayClusterInstance != nil && pendingRayClusterInstance != nil && utils.IsIncrementalUpgradeEnabled(&rayServiceInstance.Spec) { logger.Info("Reconciling the Serve applications for active cluster during IncrementalUpgrade", "clusterName", activeRayClusterInstance.Name) if isActiveClusterReady, activeClusterServeApplications, err = r.reconcileServe(ctx, rayServiceInstance, activeRayClusterInstance); err != nil { return ctrl.Result{RequeueAfter: ServiceDefaultRequeueDuration}, err } } + // Check if IncrementalUpgrade is enabled, if so reconcile Gateway objects. + if utils.IsIncrementalUpgradeEnabled(&rayServiceInstance.Spec) { + // Ensure per-cluster Serve service exists for the active and pending RayClusters. + if err = r.reconcilePerClusterServeService(ctx, rayServiceInstance, activeRayClusterInstance); err != nil { + return ctrl.Result{RequeueAfter: ServiceDefaultRequeueDuration}, err + } + if err = r.reconcilePerClusterServeService(ctx, rayServiceInstance, pendingRayClusterInstance); err != nil { + return ctrl.Result{RequeueAfter: ServiceDefaultRequeueDuration}, err + } + // Creates or updates a Gateway CR that points to the Serve services of + // the active and pending (if it exists) RayClusters. For incremental upgrades, + // the Gateway endpoint is used rather than the Serve service. + err = r.reconcileGateway(ctx, rayServiceInstance) + if err != nil { + return ctrl.Result{RequeueAfter: ServiceDefaultRequeueDuration}, client.IgnoreNotFound(err) + } + // Create or update the HTTPRoute for the Gateway, passing in the pending cluster readiness status. + err = r.reconcileHTTPRoute(ctx, rayServiceInstance, isPendingClusterReady) + if err != nil { + return ctrl.Result{RequeueAfter: ServiceDefaultRequeueDuration}, client.IgnoreNotFound(err) + } + } + // Reconcile K8s services and make sure it points to the correct RayCluster. var headSvc, serveSvc *corev1.Service if isPendingClusterReady || isActiveClusterReady { targetCluster := activeRayClusterInstance logMsg := "Reconciling K8s services to point to the active Ray cluster." - if isPendingClusterReady { + isIncrementalUpgradeInProgress := utils.IsIncrementalUpgradeEnabled(&rayServiceInstance.Spec) && meta.IsStatusConditionTrue(rayServiceInstance.Status.Conditions, string(rayv1.UpgradeInProgress)) + if isPendingClusterReady && !isIncrementalUpgradeInProgress { + // This step is skipped for incremental upgrade, because the pending cluster is ready during the upgrade + // and creates its own per-cluster Serve service. targetCluster = pendingRayClusterInstance logMsg = "Reconciling K8s services to point to the pending Ray cluster to switch traffic because it is ready." 
} @@ -280,30 +283,49 @@ func (r *RayServiceReconciler) calculateStatus(ctx context.Context, rayServiceIn rayServiceInstance.Status.PendingServiceStatus.Applications = pendingClusterServeApplications isPendingClusterServing := false + promotedPendingCluster := false if headSvc != nil && serveSvc != nil { pendingClusterName := rayServiceInstance.Status.PendingServiceStatus.RayClusterName activeClusterName := rayServiceInstance.Status.ActiveServiceStatus.RayClusterName - // Promote the pending cluster to the active cluster if both RayService's head and serve services - // have already pointed to the pending cluster. - clusterName := utils.GetRayClusterNameFromService(headSvc) - if clusterName != utils.GetRayClusterNameFromService(serveSvc) { - panic("headSvc and serveSvc are not pointing to the same cluster") - } - // Verify cluster name matches either pending or active cluster - if clusterName != pendingClusterName && clusterName != activeClusterName { - panic("clusterName is not equal to pendingCluster or activeCluster") + if utils.IsIncrementalUpgradeEnabled(&rayServiceInstance.Spec) && meta.IsStatusConditionTrue(rayServiceInstance.Status.Conditions, string(rayv1.UpgradeInProgress)) { + // An incremental upgrade is complete when the active cluster has 0% capacity and the pending cluster has + // 100% of the traffic. We can't promote the pending cluster until traffic has been fully migrated. + if pendingCluster != nil && + ptr.Deref(rayServiceInstance.Status.ActiveServiceStatus.TargetCapacity, -1) == 0 && + ptr.Deref(rayServiceInstance.Status.PendingServiceStatus.TrafficRoutedPercent, -1) == 100 { + + logger.Info("Promoting pending cluster to active: Incremental upgrade complete.", + "oldCluster", rayServiceInstance.Status.ActiveServiceStatus.RayClusterName, + "newCluster", rayServiceInstance.Status.PendingServiceStatus.RayClusterName) + + rayServiceInstance.Status.ActiveServiceStatus = rayServiceInstance.Status.PendingServiceStatus + rayServiceInstance.Status.PendingServiceStatus = rayv1.RayServiceStatus{} + promotedPendingCluster = true + } } - isPendingClusterServing = clusterName == pendingClusterName - - // If services point to a different cluster than the active one, promote pending to active - logger.Info("calculateStatus", "clusterSvcPointingTo", clusterName, "pendingClusterName", pendingClusterName, "activeClusterName", activeClusterName) - if activeClusterName != clusterName { - logger.Info("Promoting pending cluster to active", - "oldCluster", rayServiceInstance.Status.ActiveServiceStatus.RayClusterName, - "newCluster", clusterName) - rayServiceInstance.Status.ActiveServiceStatus = rayServiceInstance.Status.PendingServiceStatus - rayServiceInstance.Status.PendingServiceStatus = rayv1.RayServiceStatus{} + if !utils.IsIncrementalUpgradeEnabled(&rayServiceInstance.Spec) || !promotedPendingCluster { + // Promote the pending cluster to the active cluster if both RayService's head and serve services + // have already pointed to the pending cluster. 
+ clusterName := utils.GetRayClusterNameFromService(headSvc) + if clusterName != utils.GetRayClusterNameFromService(serveSvc) { + panic("headSvc and serveSvc are not pointing to the same cluster") + } + // Verify cluster name matches either pending or active cluster + if clusterName != pendingClusterName && clusterName != activeClusterName { + panic("clusterName is not equal to pendingCluster or activeCluster") + } + isPendingClusterServing = clusterName == pendingClusterName + + // If services point to a different cluster than the active one, promote pending to active + logger.Info("calculateStatus", "clusterSvcPointingTo", clusterName, "pendingClusterName", pendingClusterName, "activeClusterName", activeClusterName) + if activeClusterName != clusterName { + logger.Info("Promoting pending cluster to active", + "oldCluster", rayServiceInstance.Status.ActiveServiceStatus.RayClusterName, + "newCluster", clusterName) + rayServiceInstance.Status.ActiveServiceStatus = rayServiceInstance.Status.PendingServiceStatus + rayServiceInstance.Status.PendingServiceStatus = rayv1.RayServiceStatus{} + } } } @@ -623,7 +645,7 @@ func (r *RayServiceReconciler) calculateTrafficRoutedPercent(ctx context.Context // 5. Updates the active and pending RayServiceStatus.TrafficRoutedPercent based on the new weights. // // Returns the configured HTTPRoute object or error if any step fails. -func (r *RayServiceReconciler) createHTTPRoute(ctx context.Context, rayServiceInstance *rayv1.RayService) (*gwv1.HTTPRoute, error) { +func (r *RayServiceReconciler) createHTTPRoute(ctx context.Context, rayServiceInstance *rayv1.RayService, isPendingClusterReady bool) (*gwv1.HTTPRoute, error) { logger := ctrl.LoggerFrom(ctx) // Retrieve Gateway instance to attach this HTTPRoute to. @@ -650,17 +672,6 @@ func (r *RayServiceReconciler) createHTTPRoute(ctx context.Context, rayServiceIn return nil, err } - isPendingClusterReady := false - if pendingRayCluster != nil { - isReady, err := r.isHeadPodRunningAndReady(ctx, pendingRayCluster) - if err != nil { - logger.Error(err, "Failed to check readiness of pending RayCluster's head pod.", "RayCluster", pendingRayCluster.Name) - } else if isReady { - isPendingClusterReady = true - logger.Info("Pending RayCluster is ready. Including it in HTTPRoute.", "RayCluster", pendingRayCluster.Name) - } - } - activeClusterWeight, pendingClusterWeight, err := r.calculateTrafficRoutedPercent(ctx, rayServiceInstance, isPendingClusterReady) if err != nil { logger.Info("Failed to reconcile TrafficRoutedPercent for active and pending clusters.") @@ -686,7 +697,8 @@ func (r *RayServiceReconciler) createHTTPRoute(ctx context.Context, rayServiceIn rayServiceInstance.Status.ActiveServiceStatus.TrafficRoutedPercent = ptr.To(activeClusterWeight) logger.Info("Updated TrafficRoutedPercent", "activeClusterWeight", activeClusterWeight) - if isPendingClusterReady { + if pendingRayCluster != nil { + logger.Info("Pending RayCluster exists. Including it in HTTPRoute.", "RayCluster", pendingRayCluster.Name) pendingClusterServeSvcName := utils.GenerateServeServiceName(pendingRayCluster.Name) pendingServePort := common.GetServePort(pendingRayCluster) @@ -737,11 +749,11 @@ func (r *RayServiceReconciler) createHTTPRoute(ctx context.Context, rayServiceIn } // reconcileHTTPRoute reconciles a HTTPRoute resource for a RayService to route traffic during an IncrementalUpgrade. 
-func (r *RayServiceReconciler) reconcileHTTPRoute(ctx context.Context, rayServiceInstance *rayv1.RayService) error { +func (r *RayServiceReconciler) reconcileHTTPRoute(ctx context.Context, rayServiceInstance *rayv1.RayService, isPendingClusterReady bool) error { logger := ctrl.LoggerFrom(ctx) var err error - desiredHTTPRoute, err := r.createHTTPRoute(ctx, rayServiceInstance) + desiredHTTPRoute, err := r.createHTTPRoute(ctx, rayServiceInstance, isPendingClusterReady) if err != nil { logger.Error(err, "Failed to build HTTPRoute for RayService upgrade") return err @@ -1151,20 +1163,20 @@ func (r *RayServiceReconciler) updateServeDeployment(ctx context.Context, raySer // // The upgrade process follows these phases: // 1. Phase 1 (Steps 7-8): New cluster scales up to target capacity -// - pendingTargetCapacity: 0% → 100% -// - Returns true: "Pending RayCluster has not finished scaling up." +// - pendingTargetCapacity: 0% → 100% +// - Returns true: "Pending RayCluster has not finished scaling up." // // 2. Phase 2 (Step 9): Traffic gradually migrates to new cluster -// - pendingTrafficRoutedPercent: 0% → 100% -// - Returns true: "Pending RayCluster has not finished scaling up." +// - pendingTrafficRoutedPercent: 0% → 100% +// - Returns true: "Pending RayCluster has not finished scaling up." // // 3. Phase 3 (Step 10): Old cluster scales down after new cluster is ready -// - activeTargetCapacity: 100% → 0% -// - Returns true: "Active RayCluster TargetCapacity has not finished scaling down." +// - activeTargetCapacity: 100% → 0% +// - Returns true: "Active RayCluster TargetCapacity has not finished scaling down." // // 4. Phase 4 (Step 11): Upgrade completion -// - Both clusters reach final state: active=0%, pending=100% -// - Returns false: "All traffic has migrated to the upgraded cluster and IncrementalUpgrade is complete." +// - Both clusters reach final state: active=0%, pending=100% +// - Returns false: "All traffic has migrated to the upgraded cluster and IncrementalUpgrade is complete." // // The function ensures that traffic migration only proceeds when the target cluster has reached // its capacity limit, preventing resource conflicts and ensuring upgrade stability. @@ -1210,8 +1222,6 @@ func (r *RayServiceReconciler) checkIfNeedIncrementalUpgradeUpdate(ctx context.C return true, "Pending RayCluster has not finished scaling up." } return true, "Active RayCluster TargetCapacity has not finished scaling down." - } - return true, "Active RayCluster TargetCapacity has not finished scaling down." } // applyServeTargetCapacity updates the target_capacity for a given RayCluster's Serve applications. @@ -1245,12 +1255,14 @@ func (r *RayServiceReconciler) applyServeTargetCapacity(ctx context.Context, ray } logger.Info("Applying new target_capacity to Ray cluster.", "goal", goalTargetCapacity) + if err := rayDashboardClient.UpdateDeployments(ctx, configJson); err != nil { err = fmt.Errorf( "fail to create / update Serve applications. If you observe this error consistently, "+ "please check \"Issue 5: Fail to create / update Serve applications.\" in "+ "https://docs.ray.io/en/master/cluster/kubernetes/troubleshooting/rayservice-troubleshooting.html#kuberay-raysvc-troubleshoot for more details. "+ "err: %v", err) return err + } // Update the status fields and cache new Serve config. 
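// Illustration only, not part of this patch: the general shape of preparing the payload that
// the UpdateDeployments call above submits. The yaml/json round-trip, the helper name, and
// the assumed imports ("encoding/json", "sigs.k8s.io/yaml") are this sketch's assumptions;
// only the top-level Serve config field "target_capacity" is taken from the patch itself.
func buildTargetCapacityPayload(cachedServeConfigV2 string, goalTargetCapacity int32) ([]byte, error) {
	serveConfig := map[string]interface{}{}
	if err := yaml.Unmarshal([]byte(cachedServeConfigV2), &serveConfig); err != nil {
		return nil, err
	}
	// Roughly, Ray Serve scales each application's replica counts by this percentage.
	serveConfig["target_capacity"] = goalTargetCapacity
	return json.Marshal(serveConfig)
}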
if rayClusterInstance.Name == rayServiceInstance.Status.ActiveServiceStatus.RayClusterName { @@ -1269,7 +1281,6 @@ func (r *RayServiceReconciler) reconcileServeTargetCapacity(ctx context.Context, logger := ctrl.LoggerFrom(ctx) logger.Info("reconcileServeTargetCapacity", "RayService", rayServiceInstance.Name) - activeRayServiceStatus := &rayServiceInstance.Status.ActiveServiceStatus pendingRayServiceStatus := &rayServiceInstance.Status.PendingServiceStatus @@ -1514,6 +1525,18 @@ func (r *RayServiceReconciler) reconcileServe(ctx context.Context, rayServiceIns return false, serveApplications, err } + skipConfigUpdate := false + isActiveCluster := rayClusterInstance.Name == rayServiceInstance.Status.ActiveServiceStatus.RayClusterName + isIncrementalUpgradeInProgress := utils.IsIncrementalUpgradeEnabled(&rayServiceInstance.Spec) && + rayServiceInstance.Status.PendingServiceStatus.RayClusterName != "" + + if isActiveCluster && isIncrementalUpgradeInProgress { + // Skip updating the Serve config for the Active cluster during IncrementalUpgrade. The updated + // Serve config is applied to the pending RayService's RayCluster. + skipConfigUpdate = true + logger.Info("Blocking new Serve config submission for Active cluster during IncrementalUpgrade.", "clusterName", rayClusterInstance.Name) + } + cachedServeConfigV2 := r.getServeConfigFromCache(rayServiceInstance, rayClusterInstance.Name) isReady, serveApplications, err := getAndCheckServeStatus(ctx, rayDashboardClient) if err != nil { @@ -1522,14 +1545,14 @@ func (r *RayServiceReconciler) reconcileServe(ctx context.Context, rayServiceIns shouldUpdate, reason := checkIfNeedSubmitServeApplications(cachedServeConfigV2, rayServiceInstance.Spec.ServeConfigV2, serveApplications) logger.Info("checkIfNeedSubmitServeApplications", "shouldUpdate", shouldUpdate, "reason", reason) - if shouldUpdate { + if shouldUpdate && !skipConfigUpdate { if err = r.updateServeDeployment(ctx, rayServiceInstance, rayDashboardClient, rayClusterInstance.Name); err != nil { r.Recorder.Eventf(rayServiceInstance, corev1.EventTypeWarning, string(utils.FailedToUpdateServeApplications), "Failed to update serve applications to the RayCluster %s/%s: %v", rayClusterInstance.Namespace, rayClusterInstance.Name, err) return false, serveApplications, err } r.Recorder.Eventf(rayServiceInstance, corev1.EventTypeNormal, string(utils.UpdatedServeApplications), "Updated serve applications to the RayCluster %s/%s", rayClusterInstance.Namespace, rayClusterInstance.Name) } - if utils.IsIncrementalUpgradeEnabled(&rayServiceInstance.Spec) && meta.IsStatusConditionTrue(rayServiceInstance.Status.Conditions, string(rayv1.UpgradeInProgress)) { + if isIncrementalUpgradeInProgress { incrementalUpgradeUpdate, reason := r.checkIfNeedIncrementalUpgradeUpdate(ctx, rayServiceInstance) logger.Info("checkIfNeedIncrementalUpgradeUpdate", "incrementalUpgradeUpdate", incrementalUpgradeUpdate, "reason", reason) if incrementalUpgradeUpdate { @@ -1539,11 +1562,6 @@ func (r *RayServiceReconciler) reconcileServe(ctx context.Context, rayServiceIns } r.Recorder.Eventf(rayServiceInstance, corev1.EventTypeNormal, string(utils.UpdatedServeTargetCapacity), "Updated target_capacity of serve applications to to the RayCluster %s/%s", rayClusterInstance.Namespace, rayClusterInstance.Name) - - // Don't switch to the pending RayCluster until IncrementalUpgrade is complete. 
- if rayServiceInstance.Status.PendingServiceStatus.RayClusterName == rayClusterInstance.Name { - return false, serveApplications, nil - } } } diff --git a/ray-operator/controllers/ray/rayservice_controller_unit_test.go b/ray-operator/controllers/ray/rayservice_controller_unit_test.go index 999a74cdb65..6d3833b3f72 100644 --- a/ray-operator/controllers/ray/rayservice_controller_unit_test.go +++ b/ray-operator/controllers/ray/rayservice_controller_unit_test.go @@ -1450,29 +1450,6 @@ func TestCreateGateway(t *testing.T) { } } -// createReadyHeadPod is a helper function to create a running and ready head pod for a given RayCluster. -func createReadyHeadPod(clusterName, namespace string) *corev1.Pod { - return &corev1.Pod{ - ObjectMeta: metav1.ObjectMeta{ - Name: clusterName + "-head-pod", - Namespace: namespace, - Labels: map[string]string{ - utils.RayClusterLabelKey: clusterName, - utils.RayNodeTypeLabelKey: string(rayv1.HeadNode), - }, - }, - Status: corev1.PodStatus{ - Phase: corev1.PodRunning, - Conditions: []corev1.PodCondition{ - { - Type: corev1.PodReady, - Status: corev1.ConditionTrue, - }, - }, - }, - } -} - func TestCreateHTTPRoute(t *testing.T) { ctx := context.TODO() namespace := "test-ns" @@ -1510,7 +1487,6 @@ func TestCreateHTTPRoute(t *testing.T) { }, }, } - readyHeadPod := createReadyHeadPod(pendingCluster.Name, namespace) tests := []struct { name string @@ -1519,6 +1495,7 @@ func TestCreateHTTPRoute(t *testing.T) { expectError bool expectedActiveWeight int32 expectedPendingWeight int32 + isPendingClusterReady bool }{ { name: "Incremental upgrade, but pending cluster is not ready, so no traffic shift.", @@ -1526,6 +1503,7 @@ func TestCreateHTTPRoute(t *testing.T) { rs.Status.PendingServiceStatus.LastTrafficMigratedTime = &metav1.Time{Time: time.Now().Add(-time.Duration(interval+1) * time.Second)} }, runtimeObjects: []runtime.Object{activeCluster, pendingCluster, gateway, activeServeService, pendingServeService}, + isPendingClusterReady: false, expectedActiveWeight: 100, expectedPendingWeight: 0, }, @@ -1534,7 +1512,8 @@ func TestCreateHTTPRoute(t *testing.T) { modifier: func(rs *rayv1.RayService) { rs.Status.PendingServiceStatus.LastTrafficMigratedTime = &metav1.Time{Time: time.Now()} }, - runtimeObjects: []runtime.Object{activeCluster, pendingCluster, gateway, activeServeService, pendingServeService, readyHeadPod}, + runtimeObjects: []runtime.Object{activeCluster, pendingCluster, gateway, activeServeService, pendingServeService}, + isPendingClusterReady: true, expectedActiveWeight: 100, expectedPendingWeight: 0, }, @@ -1544,7 +1523,8 @@ func TestCreateHTTPRoute(t *testing.T) { rs.Status.PendingServiceStatus.LastTrafficMigratedTime = &metav1.Time{Time: time.Now().Add(-time.Duration(interval+1) * time.Second)} rs.Status.PendingServiceStatus.TargetCapacity = ptr.To(int32(60)) }, - runtimeObjects: []runtime.Object{activeCluster, pendingCluster, gateway, activeServeService, pendingServeService, readyHeadPod}, + runtimeObjects: []runtime.Object{activeCluster, pendingCluster, gateway, activeServeService, pendingServeService}, + isPendingClusterReady: true, expectedActiveWeight: 90, expectedPendingWeight: 10, }, @@ -1554,7 +1534,8 @@ func TestCreateHTTPRoute(t *testing.T) { rs.Status.PendingServiceStatus.LastTrafficMigratedTime = &metav1.Time{Time: time.Now().Add(-time.Duration(interval+1) * time.Second)} rs.Status.PendingServiceStatus.TargetCapacity = ptr.To(int32(5)) }, - runtimeObjects: []runtime.Object{activeCluster, pendingCluster, gateway, activeServeService, 
pendingServeService, readyHeadPod}, + runtimeObjects: []runtime.Object{activeCluster, pendingCluster, gateway, activeServeService, pendingServeService}, + isPendingClusterReady: true, expectedActiveWeight: 95, expectedPendingWeight: 5, // can only migrate 5% to pending until TargetCapacity reached }, @@ -1563,8 +1544,9 @@ func TestCreateHTTPRoute(t *testing.T) { modifier: func(rs *rayv1.RayService) { rs.Spec.UpgradeStrategy.IncrementalUpgradeOptions = nil }, - runtimeObjects: []runtime.Object{activeCluster, pendingCluster, gateway, activeServeService, pendingServeService, readyHeadPod}, - expectError: true, + runtimeObjects: []runtime.Object{activeCluster, pendingCluster, gateway, activeServeService, pendingServeService}, + isPendingClusterReady: true, + expectError: true, }, { name: "No on-going upgrade, pending cluster does not exist.", @@ -1572,6 +1554,7 @@ func TestCreateHTTPRoute(t *testing.T) { rs.Status.PendingServiceStatus = rayv1.RayServiceStatus{} }, runtimeObjects: []runtime.Object{activeCluster, gateway, activeServeService}, + isPendingClusterReady: false, expectedActiveWeight: 100, expectedPendingWeight: 0, }, @@ -1595,7 +1578,7 @@ func TestCreateHTTPRoute(t *testing.T) { Recorder: record.NewFakeRecorder(1), } - route, err := reconciler.createHTTPRoute(ctx, rayService) + route, err := reconciler.createHTTPRoute(ctx, rayService, tt.isPendingClusterReady) if tt.expectError { require.Error(t, err) @@ -1643,7 +1626,6 @@ func TestReconcileHTTPRoute(t *testing.T) { activeServeService := &corev1.Service{ObjectMeta: metav1.ObjectMeta{Name: utils.GenerateServeServiceName(activeCluster.Name), Namespace: namespace}} pendingServeService := &corev1.Service{ObjectMeta: metav1.ObjectMeta{Name: utils.GenerateServeServiceName(pendingCluster.Name), Namespace: namespace}} gateway := &gwv1.Gateway{ObjectMeta: metav1.ObjectMeta{Name: gatewayName, Namespace: namespace}} - readyHeadPod := createReadyHeadPod(pendingCluster.Name, namespace) baseRayService := &rayv1.RayService{ ObjectMeta: metav1.ObjectMeta{Name: "test-rayservice", Namespace: namespace}, @@ -1677,29 +1659,41 @@ func TestReconcileHTTPRoute(t *testing.T) { name string expectedActiveWeight int32 expectedPendingWeight int32 - pendingClusterIsReady bool + pendingClusterExists bool + isPendingClusterReady bool }{ { - name: "Update HTTPRoute when pending cluster is ready.", - pendingClusterIsReady: true, - expectedActiveWeight: 70, - expectedPendingWeight: 30, + name: "Create HTTPRoute with no pending cluster.", + isPendingClusterReady: false, + pendingClusterExists: false, + expectedActiveWeight: 100, + expectedPendingWeight: 0, }, { - name: "Do not split traffic when pending cluster is NOT ready.", - pendingClusterIsReady: false, + name: "Create HTTPRoute when pending cluster exists, but is not ready.", + isPendingClusterReady: false, + pendingClusterExists: true, expectedActiveWeight: 100, expectedPendingWeight: 0, }, { - name: "Create new HTTPRoute with weights.", - pendingClusterIsReady: true, + name: "Create new HTTPRoute with existing weights.", + isPendingClusterReady: true, + pendingClusterExists: true, + expectedActiveWeight: 70, + expectedPendingWeight: 30, + }, + { + name: "Update HTTPRoute when pending cluster is ready.", + isPendingClusterReady: true, + pendingClusterExists: true, expectedActiveWeight: 70, expectedPendingWeight: 30, }, { name: "Existing HTTPRoute, time since LastTrafficMigratedTime >= IntervalSeconds so updates HTTPRoute.", - pendingClusterIsReady: true, + isPendingClusterReady: true, + pendingClusterExists: 
true, modifier: func(rs *rayv1.RayService) { rs.Status.PendingServiceStatus.LastTrafficMigratedTime = &metav1.Time{Time: time.Now().Add(-time.Duration(interval+1) * time.Second)} }, @@ -1712,7 +1706,8 @@ func TestReconcileHTTPRoute(t *testing.T) { }, { name: "Existing HTTPRoute, time since LastTrafficMigratedTime < IntervalSeconds so no update.", - pendingClusterIsReady: true, + isPendingClusterReady: true, + pendingClusterExists: true, modifier: func(rs *rayv1.RayService) { rs.Status.PendingServiceStatus.LastTrafficMigratedTime = &metav1.Time{Time: time.Now()} }, @@ -1728,18 +1723,19 @@ func TestReconcileHTTPRoute(t *testing.T) { tt.modifier(rayService) } + if !tt.pendingClusterExists { + rayService.Status.PendingServiceStatus.RayClusterName = "" + } + runtimeObjects := []runtime.Object{rayService, activeCluster, pendingCluster, gateway, activeServeService, pendingServeService} if tt.existingRoute != nil { runtimeObjects = append(runtimeObjects, tt.existingRoute) } - if tt.pendingClusterIsReady { - runtimeObjects = append(runtimeObjects, readyHeadPod) - } fakeClient := clientFake.NewClientBuilder().WithScheme(newScheme).WithRuntimeObjects(runtimeObjects...).Build() reconciler := RayServiceReconciler{Client: fakeClient, Scheme: newScheme, Recorder: record.NewFakeRecorder(10)} - err := reconciler.reconcileHTTPRoute(ctx, rayService) + err := reconciler.reconcileHTTPRoute(ctx, rayService, tt.isPendingClusterReady) require.NoError(t, err) reconciledRoute := &gwv1.HTTPRoute{} @@ -1748,14 +1744,13 @@ func TestReconcileHTTPRoute(t *testing.T) { require.Len(t, reconciledRoute.Spec.Rules, 1) rule := reconciledRoute.Spec.Rules[0] - if tt.pendingClusterIsReady { + if tt.pendingClusterExists { require.Len(t, rule.BackendRefs, 2) // Assert weights are set as expected. assert.Equal(t, tt.expectedActiveWeight, *rule.BackendRefs[0].Weight) assert.Equal(t, tt.expectedPendingWeight, *rule.BackendRefs[1].Weight) } else { require.Len(t, rule.BackendRefs, 1) - // Assert active weight is as expected. assert.Equal(t, tt.expectedActiveWeight, *rule.BackendRefs[0].Weight) } diff --git a/ray-operator/test/e2eincrementalupgrade/rayservice_incremental_upgrade_test.go b/ray-operator/test/e2eincrementalupgrade/rayservice_incremental_upgrade_test.go index 7f6309a70d3..f1dbefdeb3a 100644 --- a/ray-operator/test/e2eincrementalupgrade/rayservice_incremental_upgrade_test.go +++ b/ray-operator/test/e2eincrementalupgrade/rayservice_incremental_upgrade_test.go @@ -163,6 +163,8 @@ func TestRayServiceIncrementalUpgrade(t *testing.T) { LogWithTimestamp(test.T(), "Validating stepwise traffic and capacity migration") intervalSeconds := *interval var lastMigratedTime *metav1.Time + oldVersionServed := false + newVersionServed := false // Validate expected behavior during an IncrementalUpgrade. The following checks ensures // that no requests are dropped throughout the upgrade process. @@ -176,9 +178,17 @@ func TestRayServiceIncrementalUpgrade(t *testing.T) { return step.getValue(svc) }, TestTimeoutShort).Should(Equal(step.expectedValue)) - // Send a request to the RayService to validate no requests are dropped. + // Send a request to the RayService to validate no requests are dropped. Check that + // both endpoints are serving requests. 
stdout, _ := CurlRayServiceGateway(test, gatewayIP, hostname, curlPod, curlContainerName, "/fruit", `["MANGO", 2]`) - g.Expect(stdout.String()).To(Or(Equal("6"), Equal("8")), "Response should be from the old or new app version during the upgrade") + response := stdout.String() + g.Expect(response).To(Or(Equal("6"), Equal("8")), "Response should be from the old or new app version during the upgrade") + if response == "6" { + oldVersionServed = true + } + if response == "8" { + newVersionServed = true + } if strings.Contains(step.name, "pending traffic to shift") { svc, err := GetRayService(test, namespace.Name, rayServiceName) @@ -196,6 +206,10 @@ func TestRayServiceIncrementalUpgrade(t *testing.T) { lastMigratedTime = currentMigratedTime } } + LogWithTimestamp(test.T(), "Verifying both old and new versions served traffic during the upgrade") + g.Expect(oldVersionServed).To(BeTrue(), "The old version of the service should have served traffic during the upgrade.") + g.Expect(newVersionServed).To(BeTrue(), "The new version of the service should have served traffic during the upgrade.") + // Check that RayService completed upgrade LogWithTimestamp(test.T(), "Waiting for RayService %s/%s UpgradeInProgress condition to be false", rayService.Namespace, rayService.Name) g.Eventually(RayService(test, rayService.Namespace, rayService.Name), TestTimeoutShort).Should(WithTransform(IsRayServiceUpgrading, BeFalse())) diff --git a/ray-operator/test/e2eincrementalupgrade/support.go b/ray-operator/test/e2eincrementalupgrade/support.go index 4a55ecbfc24..58e26146831 100644 --- a/ray-operator/test/e2eincrementalupgrade/support.go +++ b/ray-operator/test/e2eincrementalupgrade/support.go @@ -30,6 +30,7 @@ func CurlRayServiceGateway( "curl", "--max-time", "10", "-X", "POST", + "-H", "Connection: close", // avoid re-using the same connection for test "-H", "Content-Type: application/json", "-H", fmt.Sprintf("Host: %s", hostname), fmt.Sprintf("http://%s%s", gatewayIP, rayServicePath), @@ -232,9 +233,10 @@ func generateUpgradeSteps(stepSize, maxSurge int32) []testStep { activeTraffic = nextActiveTraffic } - // Scale down the active cluster's target capacity. + // Scale down the active cluster's target capacity. The final scale + // down is when the pending cluster is promoted to active. 
nextActiveCapacity := max(activeCapacity-maxSurge, 0) - if nextActiveCapacity < activeCapacity { + if nextActiveCapacity < activeCapacity && nextActiveCapacity > 0 { steps = append(steps, testStep{ name: fmt.Sprintf("Waiting for active capacity to scale down to %d", nextActiveCapacity), getValue: GetActiveCapacity, From f94b99691883ed3699c353ce939fe0402af65621 Mon Sep 17 00:00:00 2001 From: Ryan O'Leary Date: Fri, 10 Oct 2025 20:58:29 +0000 Subject: [PATCH 45/56] Resolve readability comments and improve structure Signed-off-by: Ryan O'Leary --- .../controllers/ray/rayservice_controller.go | 42 +++++++++---------- .../controllers/ray/utils/constant.go | 2 + 2 files changed, 23 insertions(+), 21 deletions(-) diff --git a/ray-operator/controllers/ray/rayservice_controller.go b/ray-operator/controllers/ray/rayservice_controller.go index 36fda0628e6..1879db2a1b2 100644 --- a/ray-operator/controllers/ray/rayservice_controller.go +++ b/ray-operator/controllers/ray/rayservice_controller.go @@ -561,7 +561,7 @@ func (r *RayServiceReconciler) reconcileGateway(ctx context.Context, rayServiceI r.Recorder.Eventf(rayServiceInstance, corev1.EventTypeWarning, string(utils.FailedToCreateGateway), "Failed to create Gateway for RayService %s/%s: %v", desiredGateway.Namespace, desiredGateway.Name, err) return err } - r.Recorder.Eventf(rayServiceInstance, corev1.EventTypeNormal, string(utils.UpdatedRayCluster), "Created Gateway for RayService %s/%s", desiredGateway.Namespace, desiredGateway.Name) + r.Recorder.Eventf(rayServiceInstance, corev1.EventTypeNormal, string(utils.CreatedGateway), "Created Gateway for RayService %s/%s", desiredGateway.Namespace, desiredGateway.Name) return nil } return err @@ -775,7 +775,7 @@ func (r *RayServiceReconciler) reconcileHTTPRoute(ctx context.Context, rayServic r.Recorder.Eventf(rayServiceInstance, corev1.EventTypeWarning, string(utils.FailedToCreateHTTPRoute), "Failed to create the HTTPRoute for RayService %s/%s: %v", desiredHTTPRoute.Namespace, desiredHTTPRoute.Name, err) return err } - r.Recorder.Eventf(rayServiceInstance, corev1.EventTypeNormal, string(utils.FailedToCreateHTTPRoute), "Created HTTPRoute for RayService %s/%s", desiredHTTPRoute.Namespace, desiredHTTPRoute.Name) + r.Recorder.Eventf(rayServiceInstance, corev1.EventTypeNormal, string(utils.CreatedHTTPRoute), "Created HTTPRoute for RayService %s/%s", desiredHTTPRoute.Namespace, desiredHTTPRoute.Name) return nil } return err @@ -1293,8 +1293,8 @@ func (r *RayServiceReconciler) reconcileServeTargetCapacity(ctx context.Context, } // Retrieve the current observed Status fields for IncrementalUpgrade - activeTargetCapacity := ptr.Deref(activeRayServiceStatus.TargetCapacity, 100) - pendingTargetCapacity := ptr.Deref(pendingRayServiceStatus.TargetCapacity, 0) + activeTargetCapacity := *activeRayServiceStatus.TargetCapacity + pendingTargetCapacity := *pendingRayServiceStatus.TargetCapacity pendingTrafficRoutedPercent := ptr.Deref(pendingRayServiceStatus.TrafficRoutedPercent, 0) // Retrieve MaxSurgePercent - the maximum amount to change TargetCapacity by @@ -1317,26 +1317,26 @@ func (r *RayServiceReconciler) reconcileServeTargetCapacity(ctx context.Context, // increase its target_capacity by MaxSurgePercent. // If the rayClusterInstance passed into this function is not the cluster to update based // on the above conditions, we return without doing anything. 
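// Illustration only, not part of this patch: the stepping rule described in the comment
// above, written as a pure function. nextTargetCapacity is a hypothetical name; the real
// reconciler also writes the result back to status and calls applyServeTargetCapacity.
func nextTargetCapacity(isActiveCluster bool, activeCapacity, pendingCapacity, maxSurgePercent int32) (goal int32, shouldUpdate bool) {
	if isActiveCluster && activeCapacity+pendingCapacity > 100 {
		// Combined capacity exceeds 100%, so it is the active cluster's turn to step down.
		return max(int32(0), activeCapacity-maxSurgePercent), true
	}
	if !isActiveCluster && activeCapacity+pendingCapacity <= 100 {
		// Combined capacity is at or below 100%, so the pending cluster steps up first.
		return min(int32(100), pendingCapacity+maxSurgePercent), true
	}
	// Not this cluster's turn; leave its target_capacity untouched.
	return 0, false
}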
- var clusterName string var goalTargetCapacity int32 - if activeTargetCapacity+pendingTargetCapacity > int32(100) { - // Scale down the Active RayCluster TargetCapacity on this iteration. - goalTargetCapacity = max(int32(0), activeTargetCapacity-maxSurgePercent) - clusterName = activeRayServiceStatus.RayClusterName - if clusterName != rayClusterInstance.Name { - return nil + shouldUpdate := false + if rayClusterInstance.Name == activeRayServiceStatus.RayClusterName { + if activeTargetCapacity+pendingTargetCapacity > 100 { + // Scale down the Active RayCluster TargetCapacity on this iteration. + goalTargetCapacity = max(int32(0), activeTargetCapacity-maxSurgePercent) + shouldUpdate = true + logger.Info("Setting target_capacity for active Raycluster", "Raycluster", rayClusterInstance.Name, "target_capacity", goalTargetCapacity) } - activeRayServiceStatus.TargetCapacity = ptr.To(goalTargetCapacity) - logger.Info("Setting target_capacity for active Raycluster", "Raycluster", clusterName, "target_capacity", goalTargetCapacity) - } else { - // Scale up the Pending RayCluster TargetCapacity on this iteration. - goalTargetCapacity = min(int32(100), pendingTargetCapacity+maxSurgePercent) - clusterName = pendingRayServiceStatus.RayClusterName - if clusterName != rayClusterInstance.Name { - return nil + } else if rayClusterInstance.Name == pendingRayServiceStatus.RayClusterName { + if activeTargetCapacity+pendingTargetCapacity <= 100 { + // Scale up the Pending RayCluster TargetCapacity on this iteration. + goalTargetCapacity = min(int32(100), pendingTargetCapacity+maxSurgePercent) + shouldUpdate = true + logger.Info("Setting target_capacity for pending Raycluster", "Raycluster", rayClusterInstance.Name, "target_capacity", goalTargetCapacity) } - pendingRayServiceStatus.TargetCapacity = ptr.To(goalTargetCapacity) - logger.Info("Setting target_capacity for pending Raycluster", "Raycluster", clusterName, "target_capacity", goalTargetCapacity) + } + + if !shouldUpdate { + return nil } return r.applyServeTargetCapacity(ctx, rayServiceInstance, rayClusterInstance, rayDashboardClient, goalTargetCapacity) diff --git a/ray-operator/controllers/ray/utils/constant.go b/ray-operator/controllers/ray/utils/constant.go index 1ece5858956..30ea756ae75 100644 --- a/ray-operator/controllers/ray/utils/constant.go +++ b/ray-operator/controllers/ray/utils/constant.go @@ -320,6 +320,8 @@ const ( RayClusterNotFound K8sEventType = "RayClusterNotFound" // RayService event list + CreatedGateway K8sEventType = "CreatedGateway" + CreatedHTTPRoute K8sEventType = "CreatedHTTPRoute" InvalidRayServiceSpec K8sEventType = "InvalidRayServiceSpec" InvalidRayServiceMetadata K8sEventType = "InvalidRayServiceMetadata" UpdatedHeadPodServeLabel K8sEventType = "UpdatedHeadPodServeLabel" From 7f20ecbfd405df0287e6401985dd15470e461da2 Mon Sep 17 00:00:00 2001 From: Ryan O'Leary Date: Wed, 15 Oct 2025 11:04:27 +0000 Subject: [PATCH 46/56] Refactor based on comments Signed-off-by: Ryan O'Leary --- docs/reference/api.md | 21 +- .../crds/ray.io_rayservices.yaml | 2 +- ray-operator/apis/ray/v1/rayservice_types.go | 9 +- .../apis/ray/v1/zz_generated.deepcopy.go | 36 +--- .../config/crd/bases/ray.io_rayservices.yaml | 2 +- .../controllers/ray/rayservice_controller.go | 182 ++++++++++++------ .../ray/rayservice_controller_unit_test.go | 135 ++++++++++++- ray-operator/controllers/ray/utils/util.go | 44 ++++- .../controllers/ray/utils/util_test.go | 109 ++++++++++- .../controllers/ray/utils/validation.go | 10 +- 
.../controllers/ray/utils/validation_test.go | 6 +- ...adeoptions.go => clusterupgradeoptions.go} | 18 +- .../ray/v1/rayserviceupgradestrategy.go | 12 +- .../pkg/client/applyconfiguration/utils.go | 2 - .../test/e2eincrementalupgrade/support.go | 4 +- 15 files changed, 425 insertions(+), 167 deletions(-) rename ray-operator/pkg/client/applyconfiguration/ray/v1/{incrementalupgradeoptions.go => clusterupgradeoptions.go} (63%) diff --git a/docs/reference/api.md b/docs/reference/api.md index 305941e8c2f..d2eca7d97e9 100644 --- a/docs/reference/api.md +++ b/docs/reference/api.md @@ -199,25 +199,6 @@ _Appears in:_ | `serviceType` _[ServiceType](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#servicetype-v1-core)_ | ServiceType is Kubernetes service type of the head service. it will be used by the workers to connect to the head pod | | | -#### IncrementalUpgradeOptions - - - - - - - -_Appears in:_ -- [RayServiceUpgradeStrategy](#rayserviceupgradestrategy) - -| Field | Description | Default | Validation | -| --- | --- | --- | --- | -| `maxSurgePercent` _integer_ | The capacity of serve requests the upgraded cluster should scale to handle each interval.
Defaults to 100%. | 100 | | -| `stepSizePercent` _integer_ | The percentage of traffic to switch to the upgraded RayCluster at a set interval after scaling by MaxSurgePercent. | | | -| `intervalSeconds` _integer_ | The interval in seconds between transferring StepSize traffic from the old to new RayCluster. | | | -| `gatewayClassName` _string_ | The name of the Gateway Class installed by the Kubernetes Cluster admin. | | | - - #### JobSubmissionMode @@ -396,7 +377,7 @@ _Appears in:_ | Field | Description | Default | Validation | | --- | --- | --- | --- | | `type` _[RayServiceUpgradeType](#rayserviceupgradetype)_ | Type represents the strategy used when upgrading the RayService. Currently supports `NewCluster` and `None`. | | | -| `incrementalUpgradeOptions` _[IncrementalUpgradeOptions](#incrementalupgradeoptions)_ | IncrementalUpgradeOptions defines the behavior of an IncrementalUpgrade.
RayServiceIncrementalUpgrade feature gate must be enabled to set IncrementalUpgradeOptions. | | | +| `clusterUpgradeOptions` _[ClusterUpgradeOptions](#clusterupgradeoptions)_ | ClusterUpgradeOptions defines the behavior of an IncrementalUpgrade.
RayServiceIncrementalUpgrade feature gate must be enabled to set ClusterUpgradeOptions. | | | #### RayServiceUpgradeType diff --git a/helm-chart/kuberay-operator/crds/ray.io_rayservices.yaml b/helm-chart/kuberay-operator/crds/ray.io_rayservices.yaml index 4dd8e3ea48e..267de9a20f8 100644 --- a/helm-chart/kuberay-operator/crds/ray.io_rayservices.yaml +++ b/helm-chart/kuberay-operator/crds/ray.io_rayservices.yaml @@ -8241,7 +8241,7 @@ spec: type: integer upgradeStrategy: properties: - incrementalUpgradeOptions: + clusterUpgradeOptions: properties: gatewayClassName: type: string diff --git a/ray-operator/apis/ray/v1/rayservice_types.go b/ray-operator/apis/ray/v1/rayservice_types.go index ef11b80d2d6..9c599a882c3 100644 --- a/ray-operator/apis/ray/v1/rayservice_types.go +++ b/ray-operator/apis/ray/v1/rayservice_types.go @@ -60,7 +60,8 @@ var DeploymentStatusEnum = struct { UNHEALTHY: "UNHEALTHY", } -type IncrementalUpgradeOptions struct { +// These options are currently only supported for the IncrementalUpgrade type. +type ClusterUpgradeOptions struct { // The capacity of serve requests the upgraded cluster should scale to handle each interval. // Defaults to 100%. // +kubebuilder:default:=100 @@ -77,9 +78,9 @@ type RayServiceUpgradeStrategy struct { // Type represents the strategy used when upgrading the RayService. Currently supports `NewCluster` and `None`. // +optional Type *RayServiceUpgradeType `json:"type,omitempty"` - // IncrementalUpgradeOptions defines the behavior of an IncrementalUpgrade. - // RayServiceIncrementalUpgrade feature gate must be enabled to set IncrementalUpgradeOptions. - IncrementalUpgradeOptions *IncrementalUpgradeOptions `json:"incrementalUpgradeOptions,omitempty"` + // ClusterUpgradeOptions defines the behavior of an IncrementalUpgrade. + // RayServiceIncrementalUpgrade feature gate must be enabled to set ClusterUpgradeOptions. + ClusterUpgradeOptions *ClusterUpgradeOptions `json:"clusterUpgradeOptions,omitempty"` } // RayServiceSpec defines the desired state of RayService diff --git a/ray-operator/apis/ray/v1/zz_generated.deepcopy.go b/ray-operator/apis/ray/v1/zz_generated.deepcopy.go index 80deee9db07..5ed52746d85 100644 --- a/ray-operator/apis/ray/v1/zz_generated.deepcopy.go +++ b/ray-operator/apis/ray/v1/zz_generated.deepcopy.go @@ -271,36 +271,6 @@ func (in *HeadInfo) DeepCopy() *HeadInfo { return out } -// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *IncrementalUpgradeOptions) DeepCopyInto(out *IncrementalUpgradeOptions) { - *out = *in - if in.MaxSurgePercent != nil { - in, out := &in.MaxSurgePercent, &out.MaxSurgePercent - *out = new(int32) - **out = **in - } - if in.StepSizePercent != nil { - in, out := &in.StepSizePercent, &out.StepSizePercent - *out = new(int32) - **out = **in - } - if in.IntervalSeconds != nil { - in, out := &in.IntervalSeconds, &out.IntervalSeconds - *out = new(int32) - **out = **in - } -} - -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new IncrementalUpgradeOptions. -func (in *IncrementalUpgradeOptions) DeepCopy() *IncrementalUpgradeOptions { - if in == nil { - return nil - } - out := new(IncrementalUpgradeOptions) - in.DeepCopyInto(out) - return out -} - // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
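// Illustration only, not part of this patch: populating the renamed options from Go. The
// chosen numbers and the "istio" GatewayClass are arbitrary example values, and the
// RayServiceUpgradeType constant that selects an incremental upgrade is not shown in this
// hunk, so it is omitted here.
var exampleUpgradeStrategy = rayv1.RayServiceUpgradeStrategy{
	ClusterUpgradeOptions: &rayv1.ClusterUpgradeOptions{
		MaxSurgePercent:  ptr.To(int32(20)), // grow/shrink target_capacity by at most 20% per step
		StepSizePercent:  ptr.To(int32(10)), // shift 10% of traffic per interval
		IntervalSeconds:  ptr.To(int32(30)), // wait 30 seconds between traffic shifts
		GatewayClassName: "istio",           // must name a GatewayClass installed by the cluster admin
	},
}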
func (in *RayCluster) DeepCopyInto(out *RayCluster) { *out = *in @@ -814,9 +784,9 @@ func (in *RayServiceUpgradeStrategy) DeepCopyInto(out *RayServiceUpgradeStrategy *out = new(RayServiceUpgradeType) **out = **in } - if in.IncrementalUpgradeOptions != nil { - in, out := &in.IncrementalUpgradeOptions, &out.IncrementalUpgradeOptions - *out = new(IncrementalUpgradeOptions) + if in.ClusterUpgradeOptions != nil { + in, out := &in.ClusterUpgradeOptions, &out.ClusterUpgradeOptions + *out = new(ClusterUpgradeOptions) (*in).DeepCopyInto(*out) } } diff --git a/ray-operator/config/crd/bases/ray.io_rayservices.yaml b/ray-operator/config/crd/bases/ray.io_rayservices.yaml index 4dd8e3ea48e..267de9a20f8 100644 --- a/ray-operator/config/crd/bases/ray.io_rayservices.yaml +++ b/ray-operator/config/crd/bases/ray.io_rayservices.yaml @@ -8241,7 +8241,7 @@ spec: type: integer upgradeStrategy: properties: - incrementalUpgradeOptions: + clusterUpgradeOptions: properties: gatewayClassName: type: string diff --git a/ray-operator/controllers/ray/rayservice_controller.go b/ray-operator/controllers/ray/rayservice_controller.go index 1879db2a1b2..3739441d32c 100644 --- a/ray-operator/controllers/ray/rayservice_controller.go +++ b/ray-operator/controllers/ray/rayservice_controller.go @@ -262,6 +262,68 @@ func (r *RayServiceReconciler) reconcileServicesToReadyCluster(ctx context.Conte return headSvc, serveSvc, nil } +// promotePendingClusterToActive handles the logic for promoting the pending RayCluster to active in RayService status. +func (r *RayServiceReconciler) promotePendingClusterToActive(ctx context.Context, rayServiceInstance *rayv1.RayService) { + logger := ctrl.LoggerFrom(ctx) + logger.Info("Promoting pending cluster to active.", + "oldCluster", rayServiceInstance.Status.ActiveServiceStatus.RayClusterName, + "newCluster", rayServiceInstance.Status.PendingServiceStatus.RayClusterName) + + rayServiceInstance.Status.ActiveServiceStatus = rayServiceInstance.Status.PendingServiceStatus + rayServiceInstance.Status.PendingServiceStatus = rayv1.RayServiceStatus{} +} + +// reconcilePromotionAndServingStatus handles the promotion logic after an upgrade, returning +// isPendingClusterServing: True if the main Kubernetes services are pointing to the pending cluster. +func (r *RayServiceReconciler) reconcilePromotionAndServingStatus(ctx context.Context, headSvc, serveSvc *corev1.Service, rayServiceInstance *rayv1.RayService, pendingCluster *rayv1.RayCluster) (isPendingClusterServing bool) { + logger := ctrl.LoggerFrom(ctx) + + // Step 1: Service Consistency Check. Ensure head and serve services point to the + // same cluster (active or pending). + clusterSvcPointsTo := utils.GetRayClusterNameFromService(headSvc) + if clusterSvcPointsTo != utils.GetRayClusterNameFromService(serveSvc) { + // This indicates a broken state that the controller cannot recover from automatically. + panic("headSvc and serveSvc are not pointing to the same cluster") + } + + // Step 2: Cluster Switching Logic. Determine which cluster the services are currently pointing to and + // determine if promotion should occur. + pendingClusterName := rayServiceInstance.Status.PendingServiceStatus.RayClusterName + activeClusterName := rayServiceInstance.Status.ActiveServiceStatus.RayClusterName + + // Verify that the service points to a known cluster (either active or pending). 
+ if clusterSvcPointsTo != pendingClusterName && clusterSvcPointsTo != activeClusterName { + panic("clusterName from services is not equal to pendingCluster or activeCluster") + } + + var shouldPromote bool + if utils.IsIncrementalUpgradeEnabled(&rayServiceInstance.Spec) { + // An incremental upgrade is complete when the active cluster has 0% capacity and the pending cluster has + // 100% of the traffic. We can't promote the pending cluster until traffic has been fully migrated. + if meta.IsStatusConditionTrue(rayServiceInstance.Status.Conditions, string(rayv1.UpgradeInProgress)) { + if utils.IsIncrementalUpgradeComplete(rayServiceInstance, pendingCluster) { + shouldPromote = true + logger.Info("Incremental upgrade completed, triggering promotion.", "rayService", rayServiceInstance.Name) + } + } else if activeClusterName == "" && pendingClusterName != "" { + // The Active cluster is empty when the RayCluster is first scaling up. + shouldPromote = true + } + } else { + // For traditional blue/green upgrade, promotion is complete when the Service selector has switched. + if activeClusterName != clusterSvcPointsTo { + shouldPromote = true + } + } + + // Step 3: Promote the pending cluster if prior conditions are met. + if shouldPromote { + r.promotePendingClusterToActive(ctx, rayServiceInstance) + } + + return (clusterSvcPointsTo == pendingClusterName) +} + func (r *RayServiceReconciler) calculateStatus(ctx context.Context, rayServiceInstance *rayv1.RayService, headSvc, serveSvc *corev1.Service, activeCluster, pendingCluster *rayv1.RayCluster, activeClusterServeApplications, pendingClusterServeApplications map[string]rayv1.AppStatus) error { logger := ctrl.LoggerFrom(ctx) @@ -282,51 +344,40 @@ func (r *RayServiceReconciler) calculateStatus(ctx context.Context, rayServiceIn rayServiceInstance.Status.ActiveServiceStatus.Applications = activeClusterServeApplications rayServiceInstance.Status.PendingServiceStatus.Applications = pendingClusterServeApplications - isPendingClusterServing := false - promotedPendingCluster := false + var isPendingClusterServing bool if headSvc != nil && serveSvc != nil { - pendingClusterName := rayServiceInstance.Status.PendingServiceStatus.RayClusterName - activeClusterName := rayServiceInstance.Status.ActiveServiceStatus.RayClusterName - - if utils.IsIncrementalUpgradeEnabled(&rayServiceInstance.Spec) && meta.IsStatusConditionTrue(rayServiceInstance.Status.Conditions, string(rayv1.UpgradeInProgress)) { - // An incremental upgrade is complete when the active cluster has 0% capacity and the pending cluster has - // 100% of the traffic. We can't promote the pending cluster until traffic has been fully migrated. 
- if pendingCluster != nil && - ptr.Deref(rayServiceInstance.Status.ActiveServiceStatus.TargetCapacity, -1) == 0 && - ptr.Deref(rayServiceInstance.Status.PendingServiceStatus.TrafficRoutedPercent, -1) == 100 { - - logger.Info("Promoting pending cluster to active: Incremental upgrade complete.", - "oldCluster", rayServiceInstance.Status.ActiveServiceStatus.RayClusterName, - "newCluster", rayServiceInstance.Status.PendingServiceStatus.RayClusterName) - - rayServiceInstance.Status.ActiveServiceStatus = rayServiceInstance.Status.PendingServiceStatus - rayServiceInstance.Status.PendingServiceStatus = rayv1.RayServiceStatus{} - promotedPendingCluster = true - } - } - if !utils.IsIncrementalUpgradeEnabled(&rayServiceInstance.Spec) || !promotedPendingCluster { - // Promote the pending cluster to the active cluster if both RayService's head and serve services - // have already pointed to the pending cluster. - clusterName := utils.GetRayClusterNameFromService(headSvc) - if clusterName != utils.GetRayClusterNameFromService(serveSvc) { - panic("headSvc and serveSvc are not pointing to the same cluster") + if utils.IsIncrementalUpgradeEnabled(&rayServiceInstance.Spec) { + logger.Info("Processing incremental upgrade strategy.", "rayService", rayServiceInstance.Name) + oldActivePercent := ptr.Deref(rayServiceInstance.Status.ActiveServiceStatus.TrafficRoutedPercent, -1) + oldPendingPercent := ptr.Deref(rayServiceInstance.Status.PendingServiceStatus.TrafficRoutedPercent, -1) + + // Update traffic weights for incremental upgrade by fetching the current HTTPRoute. + activeWeight, pendingWeight, err := r.getHTTPRouteTrafficWeights(ctx, rayServiceInstance) + if err != nil { + logger.Error(err, "Failed to get HTTPRoute traffic weights.") + return err } - // Verify cluster name matches either pending or active cluster - if clusterName != pendingClusterName && clusterName != activeClusterName { - panic("clusterName is not equal to pendingCluster or activeCluster") + + now := metav1.Time{Time: time.Now()} + if activeWeight >= 0 { + rayServiceInstance.Status.ActiveServiceStatus.TrafficRoutedPercent = ptr.To(activeWeight) + logger.Info("Updated active TrafficRoutedPercent from HTTPRoute", "activeClusterWeight", activeWeight) + if activeWeight != oldActivePercent { + rayServiceInstance.Status.ActiveServiceStatus.LastTrafficMigratedTime = &now + logger.Info("Updated LastTrafficMigratedTime of Active Service.") + } } - isPendingClusterServing = clusterName == pendingClusterName - - // If services point to a different cluster than the active one, promote pending to active - logger.Info("calculateStatus", "clusterSvcPointingTo", clusterName, "pendingClusterName", pendingClusterName, "activeClusterName", activeClusterName) - if activeClusterName != clusterName { - logger.Info("Promoting pending cluster to active", - "oldCluster", rayServiceInstance.Status.ActiveServiceStatus.RayClusterName, - "newCluster", clusterName) - rayServiceInstance.Status.ActiveServiceStatus = rayServiceInstance.Status.PendingServiceStatus - rayServiceInstance.Status.PendingServiceStatus = rayv1.RayServiceStatus{} + if pendingWeight >= 0 { + rayServiceInstance.Status.PendingServiceStatus.TrafficRoutedPercent = ptr.To(pendingWeight) + logger.Info("Updated pending TrafficRoutedPercent from HTTPRoute", "pendingClusterWeight", pendingWeight) + if pendingWeight != oldPendingPercent { + rayServiceInstance.Status.PendingServiceStatus.LastTrafficMigratedTime = &now + logger.Info("Updated LastTrafficMigratedTime of Pending Service.") + } } } + // 
Reconcile serving status and promotion logic for all upgrade strategies. + isPendingClusterServing = r.reconcilePromotionAndServingStatus(ctx, headSvc, serveSvc, rayServiceInstance, pendingCluster) } if shouldPrepareNewCluster(ctx, rayServiceInstance, activeCluster, pendingCluster, isPendingClusterServing) { @@ -509,9 +560,9 @@ func isZeroDowntimeUpgradeEnabled(ctx context.Context, upgradeStrategy *rayv1.Ra // `createGateway` creates a Gateway for a RayService or updates an existing Gateway. func (r *RayServiceReconciler) createGateway(rayServiceInstance *rayv1.RayService) (*gwv1.Gateway, error) { - options := utils.GetRayServiceIncrementalUpgradeOptions(&rayServiceInstance.Spec) + options := utils.GetRayServiceClusterUpgradeOptions(&rayServiceInstance.Spec) if options == nil { - return nil, errstd.New("Missing RayService IncrementalUpgradeOptions during upgrade.") + return nil, errstd.New("Missing RayService ClusterUpgradeOptions during upgrade.") } gatewayName := utils.CheckGatewayName(rayServiceInstance.Name + "-gateway") @@ -600,9 +651,9 @@ func (r *RayServiceReconciler) calculateTrafficRoutedPercent(ctx context.Context if isPendingClusterReady { // Zero-downtime upgrade in progress. - options := utils.GetRayServiceIncrementalUpgradeOptions(&rayServiceInstance.Spec) + options := utils.GetRayServiceClusterUpgradeOptions(&rayServiceInstance.Spec) if options == nil { - return 0, 0, errstd.New("IncrementalUpgradeOptions are not set during upgrade.") + return 0, 0, errstd.New("ClusterUpgradeOptions are not set during upgrade.") } // Check that target_capacity has been updated before migrating traffic. @@ -625,10 +676,6 @@ func (r *RayServiceReconciler) calculateTrafficRoutedPercent(ctx context.Context proposedPendingWeight := pendingClusterWeight + *options.StepSizePercent pendingClusterWeight = min(100, proposedPendingWeight, pendingClusterTargetCapacity) activeClusterWeight = 100 - pendingClusterWeight - - now := metav1.Time{Time: time.Now()} - pendingServiceStatus.LastTrafficMigratedTime = &now - activeServiceStatus.LastTrafficMigratedTime = &now } } @@ -642,7 +689,6 @@ func (r *RayServiceReconciler) calculateTrafficRoutedPercent(ctx context.Context // 2. Gets active and pending RayCluster instances and their Serve services // 3. Calls `calculateTrafficRoutedPercent` to calculate the new traffic weights // 4. Configures HTTPRoute with appropriate backend references and weights -// 5. Updates the active and pending RayServiceStatus.TrafficRoutedPercent based on the new weights. // // Returns the configured HTTPRoute object or error if any step fails. func (r *RayServiceReconciler) createHTTPRoute(ctx context.Context, rayServiceInstance *rayv1.RayService, isPendingClusterReady bool) (*gwv1.HTTPRoute, error) { @@ -693,9 +739,6 @@ func (r *RayServiceReconciler) createHTTPRoute(ctx context.Context, rayServiceIn }, }, } - // Update the RayService status with the calculated traffic weights. - rayServiceInstance.Status.ActiveServiceStatus.TrafficRoutedPercent = ptr.To(activeClusterWeight) - logger.Info("Updated TrafficRoutedPercent", "activeClusterWeight", activeClusterWeight) if pendingRayCluster != nil { logger.Info("Pending RayCluster exists. 
Including it in HTTPRoute.", "RayCluster", pendingRayCluster.Name) @@ -712,9 +755,6 @@ func (r *RayServiceReconciler) createHTTPRoute(ctx context.Context, rayServiceIn Weight: ptr.To(pendingClusterWeight), }, }) - - rayServiceInstance.Status.PendingServiceStatus.TrafficRoutedPercent = ptr.To(pendingClusterWeight) - logger.Info("Updated TrafficRoutedPercent", "pendingClusterWeight", pendingClusterWeight) } httpRouteName := utils.CheckHTTPRouteName(fmt.Sprintf("httproute-%s", gatewayInstance.Name)) @@ -795,6 +835,24 @@ func (r *RayServiceReconciler) reconcileHTTPRoute(ctx context.Context, rayServic return nil } +// getHTTPRouteTrafficWeights fetches the HTTPRoute associated with a RayService and returns +// the traffic weights for the active and pending clusters. +func (r *RayServiceReconciler) getHTTPRouteTrafficWeights(ctx context.Context, rayServiceInstance *rayv1.RayService) (activeWeight int32, pendingWeight int32, err error) { + activeWeight, pendingWeight = 100, 0 + + httpRoute := &gwv1.HTTPRoute{} + if err := r.Get(ctx, common.RayServiceHTTPRouteNamespacedName(rayServiceInstance), httpRoute); err != nil { + if errors.IsNotFound(err) { + // If HTTPRoute doesn't exist yet, return the default weights. + return activeWeight, pendingWeight, nil + } + return 0, 0, err + } + activeWeight, pendingWeight = utils.GetWeightsFromHTTPRoute(httpRoute, rayServiceInstance) + + return activeWeight, pendingWeight, nil +} + // `reconcileRayCluster` reconciles the active and pending Ray clusters. There are 4 possible cases: // (1) Create a new pending cluster. (2) Update the active cluster. (3) Update the pending cluster. (4) Do nothing. func (r *RayServiceReconciler) reconcileRayCluster(ctx context.Context, rayServiceInstance *rayv1.RayService) (*rayv1.RayCluster, *rayv1.RayCluster, error) { @@ -1155,7 +1213,7 @@ func (r *RayServiceReconciler) updateServeDeployment(ctx context.Context, raySer return nil } -// checkIfNeedIncrementalUpgradeUpdate returns whether the controller should adjust the target_capacity +// checkIfNeedTargetCapacityUpdate returns whether the controller should adjust the target_capacity // of the Serve config associated with a RayCluster during an IncrementalUpgrade. // // This function implements the incremental upgrade state machine as defined in the design document: @@ -1180,7 +1238,7 @@ func (r *RayServiceReconciler) updateServeDeployment(ctx context.Context, raySer // // The function ensures that traffic migration only proceeds when the target cluster has reached // its capacity limit, preventing resource conflicts and ensuring upgrade stability. 
-func (r *RayServiceReconciler) checkIfNeedIncrementalUpgradeUpdate(ctx context.Context, rayServiceInstance *rayv1.RayService) (bool, string) { +func (r *RayServiceReconciler) checkIfNeedTargetCapacityUpdate(ctx context.Context, rayServiceInstance *rayv1.RayService) (bool, string) { activeRayServiceStatus := rayServiceInstance.Status.ActiveServiceStatus pendingRayServiceStatus := rayServiceInstance.Status.PendingServiceStatus @@ -1298,9 +1356,9 @@ func (r *RayServiceReconciler) reconcileServeTargetCapacity(ctx context.Context, pendingTrafficRoutedPercent := ptr.Deref(pendingRayServiceStatus.TrafficRoutedPercent, 0) // Retrieve MaxSurgePercent - the maximum amount to change TargetCapacity by - options := utils.GetRayServiceIncrementalUpgradeOptions(&rayServiceInstance.Spec) + options := utils.GetRayServiceClusterUpgradeOptions(&rayServiceInstance.Spec) if options == nil { - return errstd.New("Missing RayService IncrementalUpgradeOptions during upgrade") + return errstd.New("Missing RayService ClusterUpgradeOptions during upgrade") } maxSurgePercent := ptr.Deref(options.MaxSurgePercent, 100) @@ -1528,7 +1586,7 @@ func (r *RayServiceReconciler) reconcileServe(ctx context.Context, rayServiceIns skipConfigUpdate := false isActiveCluster := rayClusterInstance.Name == rayServiceInstance.Status.ActiveServiceStatus.RayClusterName isIncrementalUpgradeInProgress := utils.IsIncrementalUpgradeEnabled(&rayServiceInstance.Spec) && - rayServiceInstance.Status.PendingServiceStatus.RayClusterName != "" + meta.IsStatusConditionTrue(rayServiceInstance.Status.Conditions, string(rayv1.UpgradeInProgress)) if isActiveCluster && isIncrementalUpgradeInProgress { // Skip updating the Serve config for the Active cluster during IncrementalUpgrade. The updated @@ -1553,8 +1611,8 @@ func (r *RayServiceReconciler) reconcileServe(ctx context.Context, rayServiceIns r.Recorder.Eventf(rayServiceInstance, corev1.EventTypeNormal, string(utils.UpdatedServeApplications), "Updated serve applications to the RayCluster %s/%s", rayClusterInstance.Namespace, rayClusterInstance.Name) } if isIncrementalUpgradeInProgress { - incrementalUpgradeUpdate, reason := r.checkIfNeedIncrementalUpgradeUpdate(ctx, rayServiceInstance) - logger.Info("checkIfNeedIncrementalUpgradeUpdate", "incrementalUpgradeUpdate", incrementalUpgradeUpdate, "reason", reason) + incrementalUpgradeUpdate, reason := r.checkIfNeedTargetCapacityUpdate(ctx, rayServiceInstance) + logger.Info("checkIfNeedTargetCapacityUpdate", "incrementalUpgradeUpdate", incrementalUpgradeUpdate, "reason", reason) if incrementalUpgradeUpdate { if err := r.reconcileServeTargetCapacity(ctx, rayServiceInstance, rayClusterInstance, rayDashboardClient); err != nil { r.Recorder.Eventf(rayServiceInstance, corev1.EventTypeWarning, string(utils.FailedToUpdateTargetCapacity), "Failed to update target_capacity of serve applications to the RayCluster %s/%s: %v", rayClusterInstance.Namespace, rayClusterInstance.Name, err) diff --git a/ray-operator/controllers/ray/rayservice_controller_unit_test.go b/ray-operator/controllers/ray/rayservice_controller_unit_test.go index 6d3833b3f72..bba28480d1a 100644 --- a/ray-operator/controllers/ray/rayservice_controller_unit_test.go +++ b/ray-operator/controllers/ray/rayservice_controller_unit_test.go @@ -1352,7 +1352,7 @@ func makeIncrementalUpgradeRayService( if withOptions { spec.UpgradeStrategy = &rayv1.RayServiceUpgradeStrategy{ Type: ptr.To(rayv1.IncrementalUpgrade), - IncrementalUpgradeOptions: &rayv1.IncrementalUpgradeOptions{ + ClusterUpgradeOptions: 
&rayv1.ClusterUpgradeOptions{ GatewayClassName: gatewayClassName, StepSizePercent: stepSizePercent, IntervalSeconds: intervalSeconds, @@ -1426,7 +1426,7 @@ func TestCreateGateway(t *testing.T) { expectedListeners: 1, }, { - name: "missing IncrementalUpgradeOptions", + name: "missing ClusterUpgradeOptions", rayService: makeIncrementalUpgradeRayService(false, "gateway-class", ptr.To(int32(0)), ptr.To(int32(0)), ptr.To(int32(0)), &metav1.Time{Time: time.Now()}), expectErr: true, }, @@ -1467,7 +1467,7 @@ func TestCreateHTTPRoute(t *testing.T) { Spec: rayv1.RayServiceSpec{ UpgradeStrategy: &rayv1.RayServiceUpgradeStrategy{ Type: ptr.To(rayv1.IncrementalUpgrade), - IncrementalUpgradeOptions: &rayv1.IncrementalUpgradeOptions{ + ClusterUpgradeOptions: &rayv1.ClusterUpgradeOptions{ StepSizePercent: &stepSize, IntervalSeconds: &interval, GatewayClassName: "istio", @@ -1540,9 +1540,9 @@ func TestCreateHTTPRoute(t *testing.T) { expectedPendingWeight: 5, // can only migrate 5% to pending until TargetCapacity reached }, { - name: "Create HTTPRoute called with missing IncrementalUpgradeOptions.", + name: "Create HTTPRoute called with missing ClusterUpgradeOptions.", modifier: func(rs *rayv1.RayService) { - rs.Spec.UpgradeStrategy.IncrementalUpgradeOptions = nil + rs.Spec.UpgradeStrategy.ClusterUpgradeOptions = nil }, runtimeObjects: []runtime.Object{activeCluster, pendingCluster, gateway, activeServeService, pendingServeService}, isPendingClusterReady: true, @@ -1632,7 +1632,7 @@ func TestReconcileHTTPRoute(t *testing.T) { Spec: rayv1.RayServiceSpec{ UpgradeStrategy: &rayv1.RayServiceUpgradeStrategy{ Type: ptr.To(rayv1.IncrementalUpgrade), - IncrementalUpgradeOptions: &rayv1.IncrementalUpgradeOptions{ + ClusterUpgradeOptions: &rayv1.ClusterUpgradeOptions{ StepSizePercent: &stepSize, IntervalSeconds: &interval, GatewayClassName: "istio", @@ -1875,7 +1875,7 @@ func TestReconcileServeTargetCapacity(t *testing.T) { Spec: rayv1.RayServiceSpec{ UpgradeStrategy: &rayv1.RayServiceUpgradeStrategy{ Type: ptr.To(rayv1.IncrementalUpgrade), - IncrementalUpgradeOptions: &rayv1.IncrementalUpgradeOptions{ + ClusterUpgradeOptions: &rayv1.ClusterUpgradeOptions{ MaxSurgePercent: ptr.To(tt.maxSurgePercent), }, }, @@ -1988,7 +1988,7 @@ func makeHTTPRoute(name, namespace string, isReady bool) *gwv1.HTTPRoute { } } -func TestCheckIfNeedIncrementalUpgradeUpdate(t *testing.T) { +func TestCheckIfNeedTargetCapacityUpdate(t *testing.T) { rayServiceName := "test-rayservice" gatewayName := fmt.Sprintf("%s-%s", rayServiceName, "gateway") httpRouteName := fmt.Sprintf("%s-%s", "httproute", gatewayName) @@ -2085,7 +2085,7 @@ func TestCheckIfNeedIncrementalUpgradeUpdate(t *testing.T) { PendingServiceStatus: tt.pendingStatus, }, } - needsUpdate, reason := r.checkIfNeedIncrementalUpgradeUpdate(ctx, rayService) + needsUpdate, reason := r.checkIfNeedTargetCapacityUpdate(ctx, rayService) assert.Equal(t, tt.expectedNeedsUpdate, needsUpdate) assert.Equal(t, tt.expectedReason, reason) }) @@ -2230,3 +2230,120 @@ func TestReconcilePerClusterServeService(t *testing.T) { }) } } + +func TestGetHTTPRouteTrafficWeights(t *testing.T) { + namespace := "test-ns" + rayServiceName := "test-rayservice" + activeClusterName := "rayservice-active" + pendingClusterName := "rayservice-pending" + routeName := "httproute-test-rayservice-gateway" + + baseRayService := &rayv1.RayService{ + ObjectMeta: metav1.ObjectMeta{Name: rayServiceName, Namespace: namespace}, + Status: rayv1.RayServiceStatuses{ + ActiveServiceStatus: rayv1.RayServiceStatus{RayClusterName: 
activeClusterName}, + PendingServiceStatus: rayv1.RayServiceStatus{RayClusterName: pendingClusterName}, + }, + } + + tests := []struct { + rayService *rayv1.RayService + httpRoute *gwv1.HTTPRoute + name string + expectedActiveWeight int32 + expectedPendingWeight int32 + expectError bool + }{ + { + name: "HTTPRoute does not exist", + rayService: baseRayService, + httpRoute: nil, + expectedActiveWeight: 100, + expectedPendingWeight: 0, + expectError: false, + }, + { + name: "HTTPRoute exists with active cluster backend", + rayService: baseRayService, + httpRoute: &gwv1.HTTPRoute{ + ObjectMeta: metav1.ObjectMeta{Name: routeName, Namespace: namespace}, + Spec: gwv1.HTTPRouteSpec{ + Rules: []gwv1.HTTPRouteRule{ + { + BackendRefs: []gwv1.HTTPBackendRef{ + { + BackendRef: gwv1.BackendRef{ + BackendObjectReference: gwv1.BackendObjectReference{Name: gwv1.ObjectName(utils.GenerateServeServiceName(activeClusterName))}, + Weight: ptr.To(int32(100)), + }, + }, + }, + }, + }, + }, + }, + expectedActiveWeight: 100, + expectedPendingWeight: -1, + expectError: false, + }, + { + name: "HTTPRoute exists with active and pending cluster backends", + rayService: baseRayService, + httpRoute: &gwv1.HTTPRoute{ + ObjectMeta: metav1.ObjectMeta{Name: routeName, Namespace: namespace}, + Spec: gwv1.HTTPRouteSpec{ + Rules: []gwv1.HTTPRouteRule{ + { + BackendRefs: []gwv1.HTTPBackendRef{ + { + BackendRef: gwv1.BackendRef{ + BackendObjectReference: gwv1.BackendObjectReference{Name: gwv1.ObjectName(utils.GenerateServeServiceName(activeClusterName))}, + Weight: ptr.To(int32(80)), + }, + }, + { + BackendRef: gwv1.BackendRef{ + BackendObjectReference: gwv1.BackendObjectReference{Name: gwv1.ObjectName(utils.GenerateServeServiceName(pendingClusterName))}, + Weight: ptr.To(int32(20)), + }, + }, + }, + }, + }, + }, + }, + expectedActiveWeight: 80, + expectedPendingWeight: 20, + expectError: false, + }, + } + + newScheme := runtime.NewScheme() + _ = rayv1.AddToScheme(newScheme) + _ = gwv1.AddToScheme(newScheme) + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + runtimeObjects := []runtime.Object{tt.rayService} + if tt.httpRoute != nil { + tt.httpRoute.Name = utils.CheckHTTPRouteName(fmt.Sprintf("httproute-%s-gateway", tt.rayService.Name)) + runtimeObjects = append(runtimeObjects, tt.httpRoute) + } + fakeClient := clientFake.NewClientBuilder().WithScheme(newScheme).WithRuntimeObjects(runtimeObjects...).Build() + + r := RayServiceReconciler{Client: fakeClient} + ctx := context.TODO() + + // Validates retried weights match what is expected. 
+ activeWeight, pendingWeight, err := r.getHTTPRouteTrafficWeights(ctx, tt.rayService) + + if tt.expectError { + require.Error(t, err) + } else { + require.NoError(t, err) + assert.Equal(t, tt.expectedActiveWeight, activeWeight) + assert.Equal(t, tt.expectedPendingWeight, pendingWeight) + } + }) + } +} diff --git a/ray-operator/controllers/ray/utils/util.go b/ray-operator/controllers/ray/utils/util.go index 19259090e5a..98f4c726736 100644 --- a/ray-operator/controllers/ray/utils/util.go +++ b/ray-operator/controllers/ray/utils/util.go @@ -21,6 +21,7 @@ import ( "k8s.io/apimachinery/pkg/util/json" "k8s.io/apimachinery/pkg/util/rand" "k8s.io/client-go/discovery" + "k8s.io/utils/ptr" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/manager" @@ -773,13 +774,52 @@ func IsIncrementalUpgradeEnabled(spec *rayv1.RayServiceSpec) bool { *spec.UpgradeStrategy.Type == rayv1.IncrementalUpgrade } -func GetRayServiceIncrementalUpgradeOptions(spec *rayv1.RayServiceSpec) *rayv1.IncrementalUpgradeOptions { +func GetRayServiceClusterUpgradeOptions(spec *rayv1.RayServiceSpec) *rayv1.ClusterUpgradeOptions { if spec != nil && spec.UpgradeStrategy != nil { - return spec.UpgradeStrategy.IncrementalUpgradeOptions + return spec.UpgradeStrategy.ClusterUpgradeOptions } return nil } +// IsIncrementalUpgradeComplete checks if the conditions for completing an incremental upgrade are met. +func IsIncrementalUpgradeComplete(rayServiceInstance *rayv1.RayService, pendingCluster *rayv1.RayCluster) bool { + return pendingCluster != nil && + ptr.Deref(rayServiceInstance.Status.ActiveServiceStatus.TargetCapacity, -1) == 0 && + ptr.Deref(rayServiceInstance.Status.PendingServiceStatus.TrafficRoutedPercent, -1) == 100 +} + +// GetWeightsFromHTTPRoute parses a given HTTPRoute object and extracts the traffic weights +// for the active and pending clusters (if present) of a RayService. +func GetWeightsFromHTTPRoute(httpRoute *gwv1.HTTPRoute, rayServiceInstance *rayv1.RayService) (activeWeight int32, pendingWeight int32) { + var activeClusterName, pendingClusterName string + if rayServiceInstance != nil { + activeClusterName = rayServiceInstance.Status.ActiveServiceStatus.RayClusterName + pendingClusterName = rayServiceInstance.Status.PendingServiceStatus.RayClusterName + } + + // Defaults if weights can't be detected. + activeWeight = -1 + pendingWeight = -1 + + if httpRoute == nil || len(httpRoute.Spec.Rules) == 0 || len(httpRoute.Spec.Rules[0].BackendRefs) == 0 { + return + } + + for _, backendRef := range httpRoute.Spec.Rules[0].BackendRefs { + backendName := string(backendRef.Name) + weight := ptr.Deref(backendRef.Weight, 0) + + if activeClusterName != "" && strings.Contains(backendName, activeClusterName) { + activeWeight = weight + } + if pendingClusterName != "" && strings.Contains(backendName, pendingClusterName) { + pendingWeight = weight + } + } + + return +} + // Check where we are running. 
We are trying to distinguish here whether // this is vanilla kubernetes cluster or Openshift func GetClusterType() bool { diff --git a/ray-operator/controllers/ray/utils/util_test.go b/ray-operator/controllers/ray/utils/util_test.go index 2d87c12ac46..b50268b534e 100644 --- a/ray-operator/controllers/ray/utils/util_test.go +++ b/ray-operator/controllers/ray/utils/util_test.go @@ -1442,29 +1442,29 @@ func TestIsIncrementalUpgradeEnabled(t *testing.T) { } } -func TestGetRayServiceIncrementalUpgradeOptions(t *testing.T) { - upgradeOptions := &rayv1.IncrementalUpgradeOptions{GatewayClassName: "gateway-class"} +func TestGetRayServiceClusterUpgradeOptions(t *testing.T) { + upgradeOptions := &rayv1.ClusterUpgradeOptions{GatewayClassName: "gateway-class"} tests := []struct { rayServiceSpec *rayv1.RayServiceSpec - expectedOptions *rayv1.IncrementalUpgradeOptions + expectedOptions *rayv1.ClusterUpgradeOptions name string }{ { - name: "RayServiceSpec is nil, return nil IncrementalUpgradeOptions", + name: "RayServiceSpec is nil, return nil ClusterUpgradeOptions", rayServiceSpec: nil, expectedOptions: nil, }, { - name: "UpgradeStrategy is nil, return nil IncrementalUpgradeOptions", + name: "UpgradeStrategy is nil, return nil ClusterUpgradeOptions", rayServiceSpec: &rayv1.RayServiceSpec{}, expectedOptions: nil, }, { - name: "Valid IncrementalUpgradeOptions", + name: "Valid ClusterUpgradeOptions", rayServiceSpec: &rayv1.RayServiceSpec{ UpgradeStrategy: &rayv1.RayServiceUpgradeStrategy{ - IncrementalUpgradeOptions: upgradeOptions, + ClusterUpgradeOptions: upgradeOptions, }, }, expectedOptions: upgradeOptions, @@ -1473,7 +1473,7 @@ func TestGetRayServiceIncrementalUpgradeOptions(t *testing.T) { for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - actualOptions := GetRayServiceIncrementalUpgradeOptions(tt.rayServiceSpec) + actualOptions := GetRayServiceClusterUpgradeOptions(tt.rayServiceSpec) assert.Equal(t, tt.expectedOptions, actualOptions) }) } @@ -1522,3 +1522,96 @@ func TestGetContainerCommand(t *testing.T) { }) } } + +func TestGetWeightsFromHTTPRoute(t *testing.T) { + activeClusterName := "rayservice-active" + pendingClusterName := "rayservice-pending" + + // Helper to create a RayService with specified cluster names in its status. + makeRayService := func(activeName, pendingName string) *rayv1.RayService { + return &rayv1.RayService{ + Status: rayv1.RayServiceStatuses{ + ActiveServiceStatus: rayv1.RayServiceStatus{RayClusterName: activeName}, + PendingServiceStatus: rayv1.RayServiceStatus{RayClusterName: pendingName}, + }, + } + } + + // Helper to create an HTTPRoute with specified backend weights. 
+ makeHTTPRoute := func(activeWeight, pendingWeight *int32) *gwv1.HTTPRoute { + backends := []gwv1.HTTPBackendRef{} + if activeWeight != nil { + backends = append(backends, gwv1.HTTPBackendRef{ + BackendRef: gwv1.BackendRef{ + BackendObjectReference: gwv1.BackendObjectReference{Name: gwv1.ObjectName(GenerateServeServiceName(activeClusterName))}, + Weight: activeWeight, + }, + }) + } + if pendingWeight != nil { + backends = append(backends, gwv1.HTTPBackendRef{ + BackendRef: gwv1.BackendRef{ + BackendObjectReference: gwv1.BackendObjectReference{Name: gwv1.ObjectName(GenerateServeServiceName(pendingClusterName))}, + Weight: pendingWeight, + }, + }) + } + return &gwv1.HTTPRoute{ + Spec: gwv1.HTTPRouteSpec{ + Rules: []gwv1.HTTPRouteRule{{BackendRefs: backends}}, + }, + } + } + + tests := []struct { + httpRoute *gwv1.HTTPRoute + rayService *rayv1.RayService + name string + expectedActive int32 + expectedPending int32 + }{ + { + name: "No HTTPRoute, return defaults for both weights", + httpRoute: nil, + rayService: makeRayService(activeClusterName, ""), + expectedActive: -1, + expectedPending: -1, + }, + { + name: "HTTPRoute with missing backends, return defaults for both weights", + httpRoute: &gwv1.HTTPRoute{Spec: gwv1.HTTPRouteSpec{Rules: []gwv1.HTTPRouteRule{{}}}}, + rayService: makeRayService(activeClusterName, pendingClusterName), + expectedActive: -1, + expectedPending: -1, + }, + { + name: "Valid weights returned for both active and pending clusters", + httpRoute: makeHTTPRoute(ptr.To(int32(80)), ptr.To(int32(20))), + rayService: makeRayService(activeClusterName, pendingClusterName), + expectedActive: 80, + expectedPending: 20, + }, + { + name: "Valid HTTPRoute with only active cluster backend", + httpRoute: makeHTTPRoute(ptr.To(int32(100)), nil), + rayService: makeRayService(activeClusterName, ""), + expectedActive: 100, + expectedPending: -1, + }, + { + name: "Valid HTTPRoute with only pending cluster backend", + httpRoute: makeHTTPRoute(nil, ptr.To(int32(100))), + rayService: makeRayService("", pendingClusterName), + expectedActive: -1, + expectedPending: 100, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + active, pending := GetWeightsFromHTTPRoute(tt.httpRoute, tt.rayService) + assert.Equal(t, tt.expectedActive, active, "Active weight mismatch") + assert.Equal(t, tt.expectedPending, pending, "Pending weight mismatch") + }) + } +} diff --git a/ray-operator/controllers/ray/utils/validation.go b/ray-operator/controllers/ray/utils/validation.go index e80201afdfe..5a419c22a23 100644 --- a/ray-operator/controllers/ray/utils/validation.go +++ b/ray-operator/controllers/ray/utils/validation.go @@ -320,22 +320,22 @@ func ValidateRayServiceSpec(rayService *rayv1.RayService) error { return fmt.Errorf("Spec.RayClusterDeletionDelaySeconds should be a non-negative integer, got %d", *rayService.Spec.RayClusterDeletionDelaySeconds) } - // If type is IncrementalUpgrade, validate the IncrementalUpgradeOptions + // If type is IncrementalUpgrade, validate the ClusterUpgradeOptions if IsIncrementalUpgradeEnabled(&rayService.Spec) { - return ValidateIncrementalUpgradeOptions(rayService) + return ValidateClusterUpgradeOptions(rayService) } return nil } -func ValidateIncrementalUpgradeOptions(rayService *rayv1.RayService) error { +func ValidateClusterUpgradeOptions(rayService *rayv1.RayService) error { if !IsAutoscalingEnabled(&rayService.Spec.RayClusterSpec) { return fmt.Errorf("Ray Autoscaler is required for IncrementalUpgrade") } - options := 
rayService.Spec.UpgradeStrategy.IncrementalUpgradeOptions + options := rayService.Spec.UpgradeStrategy.ClusterUpgradeOptions if options == nil { - return fmt.Errorf("IncrementalUpgradeOptions are required for IncrementalUpgrade") + return fmt.Errorf("ClusterUpgradeOptions are required for IncrementalUpgrade") } // MaxSurgePercent defaults to 100% if unset. diff --git a/ray-operator/controllers/ray/utils/validation_test.go b/ray-operator/controllers/ray/utils/validation_test.go index ee346b8b6c1..bd47fd773b9 100644 --- a/ray-operator/controllers/ray/utils/validation_test.go +++ b/ray-operator/controllers/ray/utils/validation_test.go @@ -1665,7 +1665,7 @@ func createBasicRayClusterSpec() *rayv1.RayClusterSpec { } } -func TestValidateIncrementalUpgradeOptions(t *testing.T) { +func TestValidateClusterUpgradeOptions(t *testing.T) { tests := []struct { maxSurgePercent *int32 stepSizePercent *int32 @@ -1741,7 +1741,7 @@ func TestValidateIncrementalUpgradeOptions(t *testing.T) { if tt.maxSurgePercent != nil || tt.stepSizePercent != nil || tt.intervalSeconds != nil || tt.gatewayClassName != "" { upgradeStrategy = &rayv1.RayServiceUpgradeStrategy{ Type: ptr.To(rayv1.IncrementalUpgrade), - IncrementalUpgradeOptions: &rayv1.IncrementalUpgradeOptions{ + ClusterUpgradeOptions: &rayv1.ClusterUpgradeOptions{ MaxSurgePercent: tt.maxSurgePercent, StepSizePercent: tt.stepSizePercent, IntervalSeconds: tt.intervalSeconds, @@ -1764,7 +1764,7 @@ func TestValidateIncrementalUpgradeOptions(t *testing.T) { }, } - err := ValidateIncrementalUpgradeOptions(rayService) + err := ValidateClusterUpgradeOptions(rayService) if tt.expectError { require.Error(t, err, tt.name) } else { diff --git a/ray-operator/pkg/client/applyconfiguration/ray/v1/incrementalupgradeoptions.go b/ray-operator/pkg/client/applyconfiguration/ray/v1/clusterupgradeoptions.go similarity index 63% rename from ray-operator/pkg/client/applyconfiguration/ray/v1/incrementalupgradeoptions.go rename to ray-operator/pkg/client/applyconfiguration/ray/v1/clusterupgradeoptions.go index a736a964cdb..1e43d339716 100644 --- a/ray-operator/pkg/client/applyconfiguration/ray/v1/incrementalupgradeoptions.go +++ b/ray-operator/pkg/client/applyconfiguration/ray/v1/clusterupgradeoptions.go @@ -2,25 +2,25 @@ package v1 -// IncrementalUpgradeOptionsApplyConfiguration represents a declarative configuration of the IncrementalUpgradeOptions type for use +// ClusterUpgradeOptionsApplyConfiguration represents a declarative configuration of the ClusterUpgradeOptions type for use // with apply. -type IncrementalUpgradeOptionsApplyConfiguration struct { +type ClusterUpgradeOptionsApplyConfiguration struct { MaxSurgePercent *int32 `json:"maxSurgePercent,omitempty"` StepSizePercent *int32 `json:"stepSizePercent,omitempty"` IntervalSeconds *int32 `json:"intervalSeconds,omitempty"` GatewayClassName *string `json:"gatewayClassName,omitempty"` } -// IncrementalUpgradeOptionsApplyConfiguration constructs a declarative configuration of the IncrementalUpgradeOptions type for use with +// ClusterUpgradeOptionsApplyConfiguration constructs a declarative configuration of the ClusterUpgradeOptions type for use with // apply. 
-func IncrementalUpgradeOptions() *IncrementalUpgradeOptionsApplyConfiguration { - return &IncrementalUpgradeOptionsApplyConfiguration{} +func ClusterUpgradeOptions() *ClusterUpgradeOptionsApplyConfiguration { + return &ClusterUpgradeOptionsApplyConfiguration{} } // WithMaxSurgePercent sets the MaxSurgePercent field in the declarative configuration to the given value // and returns the receiver, so that objects can be built by chaining "With" function invocations. // If called multiple times, the MaxSurgePercent field is set to the value of the last call. -func (b *IncrementalUpgradeOptionsApplyConfiguration) WithMaxSurgePercent(value int32) *IncrementalUpgradeOptionsApplyConfiguration { +func (b *ClusterUpgradeOptionsApplyConfiguration) WithMaxSurgePercent(value int32) *ClusterUpgradeOptionsApplyConfiguration { b.MaxSurgePercent = &value return b } @@ -28,7 +28,7 @@ func (b *IncrementalUpgradeOptionsApplyConfiguration) WithMaxSurgePercent(value // WithStepSizePercent sets the StepSizePercent field in the declarative configuration to the given value // and returns the receiver, so that objects can be built by chaining "With" function invocations. // If called multiple times, the StepSizePercent field is set to the value of the last call. -func (b *IncrementalUpgradeOptionsApplyConfiguration) WithStepSizePercent(value int32) *IncrementalUpgradeOptionsApplyConfiguration { +func (b *ClusterUpgradeOptionsApplyConfiguration) WithStepSizePercent(value int32) *ClusterUpgradeOptionsApplyConfiguration { b.StepSizePercent = &value return b } @@ -36,7 +36,7 @@ func (b *IncrementalUpgradeOptionsApplyConfiguration) WithStepSizePercent(value // WithIntervalSeconds sets the IntervalSeconds field in the declarative configuration to the given value // and returns the receiver, so that objects can be built by chaining "With" function invocations. // If called multiple times, the IntervalSeconds field is set to the value of the last call. -func (b *IncrementalUpgradeOptionsApplyConfiguration) WithIntervalSeconds(value int32) *IncrementalUpgradeOptionsApplyConfiguration { +func (b *ClusterUpgradeOptionsApplyConfiguration) WithIntervalSeconds(value int32) *ClusterUpgradeOptionsApplyConfiguration { b.IntervalSeconds = &value return b } @@ -44,7 +44,7 @@ func (b *IncrementalUpgradeOptionsApplyConfiguration) WithIntervalSeconds(value // WithGatewayClassName sets the GatewayClassName field in the declarative configuration to the given value // and returns the receiver, so that objects can be built by chaining "With" function invocations. // If called multiple times, the GatewayClassName field is set to the value of the last call. -func (b *IncrementalUpgradeOptionsApplyConfiguration) WithGatewayClassName(value string) *IncrementalUpgradeOptionsApplyConfiguration { +func (b *ClusterUpgradeOptionsApplyConfiguration) WithGatewayClassName(value string) *ClusterUpgradeOptionsApplyConfiguration { b.GatewayClassName = &value return b } diff --git a/ray-operator/pkg/client/applyconfiguration/ray/v1/rayserviceupgradestrategy.go b/ray-operator/pkg/client/applyconfiguration/ray/v1/rayserviceupgradestrategy.go index 0a190883bff..c8cfc02aed6 100644 --- a/ray-operator/pkg/client/applyconfiguration/ray/v1/rayserviceupgradestrategy.go +++ b/ray-operator/pkg/client/applyconfiguration/ray/v1/rayserviceupgradestrategy.go @@ -9,8 +9,8 @@ import ( // RayServiceUpgradeStrategyApplyConfiguration represents a declarative configuration of the RayServiceUpgradeStrategy type for use // with apply. 
type RayServiceUpgradeStrategyApplyConfiguration struct { - Type *rayv1.RayServiceUpgradeType `json:"type,omitempty"` - IncrementalUpgradeOptions *IncrementalUpgradeOptionsApplyConfiguration `json:"incrementalUpgradeOptions,omitempty"` + Type *rayv1.RayServiceUpgradeType `json:"type,omitempty"` + ClusterUpgradeOptions *ClusterUpgradeOptionsApplyConfiguration `json:"clusterUpgradeOptions,omitempty"` } // RayServiceUpgradeStrategyApplyConfiguration constructs a declarative configuration of the RayServiceUpgradeStrategy type for use with @@ -27,10 +27,10 @@ func (b *RayServiceUpgradeStrategyApplyConfiguration) WithType(value rayv1.RaySe return b } -// WithIncrementalUpgradeOptions sets the IncrementalUpgradeOptions field in the declarative configuration to the given value +// WithClusterUpgradeOptions sets the ClusterUpgradeOptions field in the declarative configuration to the given value // and returns the receiver, so that objects can be built by chaining "With" function invocations. -// If called multiple times, the IncrementalUpgradeOptions field is set to the value of the last call. -func (b *RayServiceUpgradeStrategyApplyConfiguration) WithIncrementalUpgradeOptions(value *IncrementalUpgradeOptionsApplyConfiguration) *RayServiceUpgradeStrategyApplyConfiguration { - b.IncrementalUpgradeOptions = value +// If called multiple times, the ClusterUpgradeOptions field is set to the value of the last call. +func (b *RayServiceUpgradeStrategyApplyConfiguration) WithClusterUpgradeOptions(value *ClusterUpgradeOptionsApplyConfiguration) *RayServiceUpgradeStrategyApplyConfiguration { + b.ClusterUpgradeOptions = value return b } diff --git a/ray-operator/pkg/client/applyconfiguration/utils.go b/ray-operator/pkg/client/applyconfiguration/utils.go index 62ba25bb755..050733b0c5e 100644 --- a/ray-operator/pkg/client/applyconfiguration/utils.go +++ b/ray-operator/pkg/client/applyconfiguration/utils.go @@ -34,8 +34,6 @@ func ForKind(kind schema.GroupVersionKind) interface{} { return &rayv1.HeadGroupSpecApplyConfiguration{} case v1.SchemeGroupVersion.WithKind("HeadInfo"): return &rayv1.HeadInfoApplyConfiguration{} - case v1.SchemeGroupVersion.WithKind("IncrementalUpgradeOptions"): - return &rayv1.IncrementalUpgradeOptionsApplyConfiguration{} case v1.SchemeGroupVersion.WithKind("RayCluster"): return &rayv1.RayClusterApplyConfiguration{} case v1.SchemeGroupVersion.WithKind("RayClusterSpec"): diff --git a/ray-operator/test/e2eincrementalupgrade/support.go b/ray-operator/test/e2eincrementalupgrade/support.go index 58e26146831..b7b0bdb924e 100644 --- a/ray-operator/test/e2eincrementalupgrade/support.go +++ b/ray-operator/test/e2eincrementalupgrade/support.go @@ -46,8 +46,8 @@ func IncrementalUpgradeRayServiceApplyConfiguration( return rayv1ac.RayServiceSpec(). WithUpgradeStrategy(rayv1ac.RayServiceUpgradeStrategy(). WithType(rayv1.IncrementalUpgrade). - WithIncrementalUpgradeOptions( - rayv1ac.IncrementalUpgradeOptions(). + WithClusterUpgradeOptions( + rayv1ac.ClusterUpgradeOptions(). WithGatewayClassName("istio"). WithStepSizePercent(*stepSizePercent). WithIntervalSeconds(*intervalSeconds). 
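For reference, the renamed builders above compose into a RayService spec as sketched below. This is a minimal, illustrative sketch and not part of the patch: the rayv1/rayv1ac aliases and import paths follow the repository layout in the diffstat, and the surge/step/interval values are assumed example numbers. Note that a later patch in this series renames the upgrade type constant to NewClusterWithIncrementalUpgrade.

package main

import (
	"fmt"

	rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1"
	rayv1ac "github.com/ray-project/kuberay/ray-operator/pkg/client/applyconfiguration/ray/v1"
)

func main() {
	// Compose the upgrade strategy with the ClusterUpgradeOptions builders
	// introduced by the rename. The percentages and interval are example values.
	spec := rayv1ac.RayServiceSpec().
		WithUpgradeStrategy(rayv1ac.RayServiceUpgradeStrategy().
			WithType(rayv1.IncrementalUpgrade).
			WithClusterUpgradeOptions(rayv1ac.ClusterUpgradeOptions().
				WithGatewayClassName("istio").
				WithMaxSurgePercent(50).
				WithStepSizePercent(10).
				WithIntervalSeconds(30)))

	// The resulting apply configuration carries the options as pointer fields.
	fmt.Println(*spec.UpgradeStrategy.Type, *spec.UpgradeStrategy.ClusterUpgradeOptions.MaxSurgePercent)
}
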
From e380bcc8236ef426f9c1634c21babf1a00b6dde1 Mon Sep 17 00:00:00 2001 From: Ryan O'Leary <113500783+ryanaoleary@users.noreply.github.com> Date: Fri, 17 Oct 2025 23:54:23 -0700 Subject: [PATCH 47/56] Update ray-operator/controllers/ray/common/service.go Co-authored-by: Rueian Signed-off-by: Ryan O'Leary <113500783+ryanaoleary@users.noreply.github.com> --- ray-operator/controllers/ray/common/service.go | 1 - 1 file changed, 1 deletion(-) diff --git a/ray-operator/controllers/ray/common/service.go b/ray-operator/controllers/ray/common/service.go index 128e0c684ff..b90092806ca 100644 --- a/ray-operator/controllers/ray/common/service.go +++ b/ray-operator/controllers/ray/common/service.go @@ -348,7 +348,6 @@ func GetGatewayListenersForRayService(rayServiceInstance *rayv1.RayService) []gw Name: gwv1.SectionName(utils.GatewayListenerPortName), Protocol: gwv1.HTTPProtocolType, Port: utils.DefaultGatewayListenerPort, - Hostname: (*gwv1.Hostname)(&hostname), // backwards compatibility with Serve service }, } } From 47369902330e6139c8a0b157be5e43711184f515 Mon Sep 17 00:00:00 2001 From: Ryan O'Leary Date: Thu, 16 Oct 2025 09:12:34 +0000 Subject: [PATCH 48/56] Remove hostname from listener Signed-off-by: Ryan O'Leary --- .../controllers/ray/common/service.go | 13 ---------- .../controllers/ray/common/service_test.go | 25 ------------------- .../controllers/ray/rayservice_controller.go | 9 +++++-- .../rayservice_incremental_upgrade_test.go | 10 +++----- .../test/e2eincrementalupgrade/support.go | 2 -- 5 files changed, 11 insertions(+), 48 deletions(-) diff --git a/ray-operator/controllers/ray/common/service.go b/ray-operator/controllers/ray/common/service.go index b90092806ca..6655f0b3a29 100644 --- a/ray-operator/controllers/ray/common/service.go +++ b/ray-operator/controllers/ray/common/service.go @@ -339,19 +339,6 @@ func GetServePort(cluster *rayv1.RayCluster) gwv1.PortNumber { return gwv1.PortNumber(utils.DefaultServingPort) } -// GetGatewayListenersForRayService constructs the default HTTP listener for a RayService Gateway. 
-func GetGatewayListenersForRayService(rayServiceInstance *rayv1.RayService) []gwv1.Listener { - hostname := fmt.Sprintf("%s.%s.svc.cluster.local", rayServiceInstance.Name, rayServiceInstance.Namespace) - - return []gwv1.Listener{ - { - Name: gwv1.SectionName(utils.GatewayListenerPortName), - Protocol: gwv1.HTTPProtocolType, - Port: utils.DefaultGatewayListenerPort, - }, - } -} - func setServiceTypeForUserProvidedService(ctx context.Context, service *corev1.Service, defaultType corev1.ServiceType) { log := ctrl.LoggerFrom(ctx) // If the user has not specified a service type, use the default service type diff --git a/ray-operator/controllers/ray/common/service_test.go b/ray-operator/controllers/ray/common/service_test.go index dd552f380b4..bda8588a058 100644 --- a/ray-operator/controllers/ray/common/service_test.go +++ b/ray-operator/controllers/ray/common/service_test.go @@ -11,7 +11,6 @@ import ( "github.com/stretchr/testify/require" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - gwv1 "sigs.k8s.io/gateway-api/apis/v1" rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1" "github.com/ray-project/kuberay/ray-operator/controllers/ray/utils" @@ -602,30 +601,6 @@ func TestUserSpecifiedServeService(t *testing.T) { validateNameAndNamespaceForUserSpecifiedService(svc, testRayServiceWithServeService.ObjectMeta.Namespace, userName, t) } -func TestGetGatewayListenersForRayService(t *testing.T) { - rayService := &rayv1.RayService{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-ray-service", - Namespace: "test-ns", - }, - } - - listeners := GetGatewayListenersForRayService(rayService) - - // Validate expected Gateway HTTP listener is created. - require.Len(t, listeners, 1) - listener := listeners[0] - - assert.Equal(t, gwv1.SectionName(utils.GatewayListenerPortName), listener.Name) - assert.Equal(t, gwv1.HTTPProtocolType, listener.Protocol) - assert.Equal(t, gwv1.PortNumber(utils.DefaultGatewayListenerPort), listener.Port) - - // Verify hostname is created for compatibility with standard RayService Serve service endpoint. 
- expectedHostname := fmt.Sprintf("%s.%s.svc.cluster.local", rayService.Name, rayService.Namespace) - require.NotNil(t, listener.Hostname) - assert.Equal(t, expectedHostname, string(*listener.Hostname)) -} - func validateServiceTypeForUserSpecifiedService(svc *corev1.Service, userType corev1.ServiceType, t *testing.T) { // Test that the user service type takes priority over the default service type (example: ClusterIP) if svc.Spec.Type != userType { diff --git a/ray-operator/controllers/ray/rayservice_controller.go b/ray-operator/controllers/ray/rayservice_controller.go index 3739441d32c..dc2d37099f9 100644 --- a/ray-operator/controllers/ray/rayservice_controller.go +++ b/ray-operator/controllers/ray/rayservice_controller.go @@ -574,11 +574,16 @@ func (r *RayServiceReconciler) createGateway(rayServiceInstance *rayv1.RayServic }, Spec: gwv1.GatewaySpec{ GatewayClassName: gwv1.ObjectName(options.GatewayClassName), + Listeners: []gwv1.Listener{ + { + Name: gwv1.SectionName(utils.GatewayListenerPortName), + Protocol: gwv1.HTTPProtocolType, + Port: utils.DefaultGatewayListenerPort, + }, + }, }, } - rayServiceGateway.Spec.Listeners = common.GetGatewayListenersForRayService(rayServiceInstance) - return rayServiceGateway, nil } diff --git a/ray-operator/test/e2eincrementalupgrade/rayservice_incremental_upgrade_test.go b/ray-operator/test/e2eincrementalupgrade/rayservice_incremental_upgrade_test.go index f1dbefdeb3a..78a8bd826da 100644 --- a/ray-operator/test/e2eincrementalupgrade/rayservice_incremental_upgrade_test.go +++ b/ray-operator/test/e2eincrementalupgrade/rayservice_incremental_upgrade_test.go @@ -95,12 +95,10 @@ func TestRayServiceIncrementalUpgrade(t *testing.T) { gatewayIP := GetGatewayIP(gateway) g.Expect(gatewayIP).NotTo(BeEmpty()) - hostname := fmt.Sprintf("%s.%s.svc.cluster.local", rayService.Name, rayService.Namespace) - LogWithTimestamp(test.T(), "Verifying RayService is serving traffic") - stdout, _ := CurlRayServiceGateway(test, gatewayIP, hostname, curlPod, curlContainerName, "/fruit", `["MANGO", 2]`) + stdout, _ := CurlRayServiceGateway(test, gatewayIP, curlPod, curlContainerName, "/fruit", `["MANGO", 2]`) g.Expect(stdout.String()).To(Equal("6")) - stdout, _ = CurlRayServiceGateway(test, gatewayIP, hostname, curlPod, curlContainerName, "/calc", `["MUL", 3]`) + stdout, _ = CurlRayServiceGateway(test, gatewayIP, curlPod, curlContainerName, "/calc", `["MUL", 3]`) g.Expect(stdout.String()).To(Equal("15 pizzas please!")) // Attempt to trigger incremental upgrade by updating RayService serve config and RayCluster spec @@ -180,7 +178,7 @@ func TestRayServiceIncrementalUpgrade(t *testing.T) { // Send a request to the RayService to validate no requests are dropped. Check that // both endpoints are serving requests. 
- stdout, _ := CurlRayServiceGateway(test, gatewayIP, hostname, curlPod, curlContainerName, "/fruit", `["MANGO", 2]`) + stdout, _ := CurlRayServiceGateway(test, gatewayIP, curlPod, curlContainerName, "/fruit", `["MANGO", 2]`) response := stdout.String() g.Expect(response).To(Or(Equal("6"), Equal("8")), "Response should be from the old or new app version during the upgrade") if response == "6" { @@ -215,6 +213,6 @@ func TestRayServiceIncrementalUpgrade(t *testing.T) { g.Eventually(RayService(test, rayService.Namespace, rayService.Name), TestTimeoutShort).Should(WithTransform(IsRayServiceUpgrading, BeFalse())) LogWithTimestamp(test.T(), "Verifying RayService uses updated ServeConfig after upgrade completes") - stdout, _ = CurlRayServiceGateway(test, gatewayIP, hostname, curlPod, curlContainerName, "/fruit", `["MANGO", 2]`) + stdout, _ = CurlRayServiceGateway(test, gatewayIP, curlPod, curlContainerName, "/fruit", `["MANGO", 2]`) g.Expect(stdout.String()).To(Equal("8")) } diff --git a/ray-operator/test/e2eincrementalupgrade/support.go b/ray-operator/test/e2eincrementalupgrade/support.go index b7b0bdb924e..b036c06e67c 100644 --- a/ray-operator/test/e2eincrementalupgrade/support.go +++ b/ray-operator/test/e2eincrementalupgrade/support.go @@ -20,7 +20,6 @@ import ( func CurlRayServiceGateway( t Test, gatewayIP string, - hostname string, curlPod *corev1.Pod, curlPodContainerName, rayServicePath, @@ -32,7 +31,6 @@ func CurlRayServiceGateway( "-X", "POST", "-H", "Connection: close", // avoid re-using the same connection for test "-H", "Content-Type: application/json", - "-H", fmt.Sprintf("Host: %s", hostname), fmt.Sprintf("http://%s%s", gatewayIP, rayServicePath), "-d", body, } From 638cbea0a1f842b49b4c85a306629c6066f67a32 Mon Sep 17 00:00:00 2001 From: Ryan O'Leary Date: Mon, 20 Oct 2025 11:09:27 +0000 Subject: [PATCH 49/56] ensure pending cluster scales from 0 target_capacity Signed-off-by: Ryan O'Leary --- .../controllers/ray/rayservice_controller.go | 31 ++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/ray-operator/controllers/ray/rayservice_controller.go b/ray-operator/controllers/ray/rayservice_controller.go index dc2d37099f9..72bcac35b77 100644 --- a/ray-operator/controllers/ray/rayservice_controller.go +++ b/ray-operator/controllers/ray/rayservice_controller.go @@ -1151,6 +1151,17 @@ func constructRayClusterForRayService(rayService *rayv1.RayService, rayClusterNa // set the KubeRay version used to create the RayCluster rayClusterAnnotations[utils.KubeRayVersion] = utils.KUBERAY_VERSION + clusterSpec := rayService.Spec.RayClusterSpec.DeepCopy() + isPendingClusterForUpgrade := utils.IsIncrementalUpgradeEnabled(&rayService.Spec) && + rayService.Status.ActiveServiceStatus.RayClusterName != "" + if isPendingClusterForUpgrade { + // For incremental upgrade, start the pending cluster without a replicas value so + // that it autoscales based on the value of target_capacity from MinReplicas. + for i := range clusterSpec.WorkerGroupSpecs { + clusterSpec.WorkerGroupSpecs[i].Replicas = nil + } + } + rayCluster := &rayv1.RayCluster{ ObjectMeta: metav1.ObjectMeta{ Labels: rayClusterLabel, @@ -1158,7 +1169,7 @@ func constructRayClusterForRayService(rayService *rayv1.RayService, rayClusterNa Name: rayClusterName, Namespace: rayService.Namespace, }, - Spec: rayService.Spec.RayClusterSpec, + Spec: *clusterSpec, } // Set the ownership in order to do the garbage collection by k8s. 
@@ -1199,6 +1210,24 @@ func (r *RayServiceReconciler) updateServeDeployment(ctx context.Context, raySer return err } + if utils.IsIncrementalUpgradeEnabled(&rayServiceInstance.Spec) { + // For incremental upgrades, set target_capacity if specified to avoid + // scaling initial Serve deployment to 100% immediately. + var targetCapacity *int32 + activeStatus := rayServiceInstance.Status.ActiveServiceStatus + pendingStatus := rayServiceInstance.Status.PendingServiceStatus + + if clusterName == activeStatus.RayClusterName && activeStatus.TargetCapacity != nil { + targetCapacity = activeStatus.TargetCapacity + } else if clusterName == pendingStatus.RayClusterName && pendingStatus.TargetCapacity != nil { + targetCapacity = pendingStatus.TargetCapacity + } + if targetCapacity != nil { + logger.Info("Setting target_capacity from status in Serve config.", "target_capacity", *targetCapacity) + serveConfig["target_capacity"] = *targetCapacity + } + } + configJson, err := json.Marshal(serveConfig) if err != nil { return fmt.Errorf("failed to marshal converted serve config into bytes: %w", err) From 7f88d2fc9c5567de7bb9079ef1c3492f75f2b554 Mon Sep 17 00:00:00 2001 From: Ryan O'Leary Date: Mon, 20 Oct 2025 11:44:25 +0000 Subject: [PATCH 50/56] Run make generate after rebase Signed-off-by: Ryan O'Leary --- docs/reference/api.md | 19 ++++++++++++ .../apis/ray/v1/zz_generated.deepcopy.go | 30 +++++++++++++++++++ .../pkg/client/applyconfiguration/utils.go | 2 ++ 3 files changed, 51 insertions(+) diff --git a/docs/reference/api.md b/docs/reference/api.md index d2eca7d97e9..d5c38141efd 100644 --- a/docs/reference/api.md +++ b/docs/reference/api.md @@ -55,6 +55,25 @@ _Appears in:_ +#### ClusterUpgradeOptions + + + +These options are currently only supported for the IncrementalUpgrade type. + + + +_Appears in:_ +- [RayServiceUpgradeStrategy](#rayserviceupgradestrategy) + +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `maxSurgePercent` _integer_ | The capacity of serve requests the upgraded cluster should scale to handle each interval.
Defaults to 100%. | 100 | | +| `stepSizePercent` _integer_ | The percentage of traffic to switch to the upgraded RayCluster at a set interval after scaling by MaxSurgePercent. | | | +| `intervalSeconds` _integer_ | The interval in seconds between transferring StepSize traffic from the old to new RayCluster. | | | +| `gatewayClassName` _string_ | The name of the Gateway Class installed by the Kubernetes Cluster admin. | | | + + #### DeletionCondition diff --git a/ray-operator/apis/ray/v1/zz_generated.deepcopy.go b/ray-operator/apis/ray/v1/zz_generated.deepcopy.go index 5ed52746d85..8deb750000c 100644 --- a/ray-operator/apis/ray/v1/zz_generated.deepcopy.go +++ b/ray-operator/apis/ray/v1/zz_generated.deepcopy.go @@ -103,6 +103,36 @@ func (in *AutoscalerOptions) DeepCopy() *AutoscalerOptions { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ClusterUpgradeOptions) DeepCopyInto(out *ClusterUpgradeOptions) { + *out = *in + if in.MaxSurgePercent != nil { + in, out := &in.MaxSurgePercent, &out.MaxSurgePercent + *out = new(int32) + **out = **in + } + if in.StepSizePercent != nil { + in, out := &in.StepSizePercent, &out.StepSizePercent + *out = new(int32) + **out = **in + } + if in.IntervalSeconds != nil { + in, out := &in.IntervalSeconds, &out.IntervalSeconds + *out = new(int32) + **out = **in + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ClusterUpgradeOptions. +func (in *ClusterUpgradeOptions) DeepCopy() *ClusterUpgradeOptions { + if in == nil { + return nil + } + out := new(ClusterUpgradeOptions) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
func (in *DeletionCondition) DeepCopyInto(out *DeletionCondition) { *out = *in diff --git a/ray-operator/pkg/client/applyconfiguration/utils.go b/ray-operator/pkg/client/applyconfiguration/utils.go index 050733b0c5e..feecbde7f06 100644 --- a/ray-operator/pkg/client/applyconfiguration/utils.go +++ b/ray-operator/pkg/client/applyconfiguration/utils.go @@ -20,6 +20,8 @@ func ForKind(kind schema.GroupVersionKind) interface{} { return &rayv1.AppStatusApplyConfiguration{} case v1.SchemeGroupVersion.WithKind("AutoscalerOptions"): return &rayv1.AutoscalerOptionsApplyConfiguration{} + case v1.SchemeGroupVersion.WithKind("ClusterUpgradeOptions"): + return &rayv1.ClusterUpgradeOptionsApplyConfiguration{} case v1.SchemeGroupVersion.WithKind("DeletionCondition"): return &rayv1.DeletionConditionApplyConfiguration{} case v1.SchemeGroupVersion.WithKind("DeletionPolicy"): From b61b91feaa7230c737cfab3d9bdd1219c62ea553 Mon Sep 17 00:00:00 2001 From: Ryan O'Leary Date: Mon, 20 Oct 2025 21:42:50 +0000 Subject: [PATCH 51/56] rename upgrade type Signed-off-by: Ryan O'Leary --- docs/reference/api.md | 2 +- ray-operator/apis/ray/v1/rayservice_types.go | 6 +-- .../controllers/ray/rayservice_controller.go | 41 ++++++++++--------- .../ray/rayservice_controller_unit_test.go | 26 ++++++------ ray-operator/controllers/ray/utils/util.go | 2 +- .../controllers/ray/utils/util_test.go | 8 ++-- .../controllers/ray/utils/validation.go | 14 +++---- .../controllers/ray/utils/validation_test.go | 4 +- ray-operator/pkg/features/features.go | 2 +- .../rayservice_incremental_upgrade_test.go | 2 +- .../test/e2eincrementalupgrade/support.go | 2 +- 11 files changed, 55 insertions(+), 54 deletions(-) diff --git a/docs/reference/api.md b/docs/reference/api.md index d5c38141efd..dc621718f0a 100644 --- a/docs/reference/api.md +++ b/docs/reference/api.md @@ -396,7 +396,7 @@ _Appears in:_ | Field | Description | Default | Validation | | --- | --- | --- | --- | | `type` _[RayServiceUpgradeType](#rayserviceupgradetype)_ | Type represents the strategy used when upgrading the RayService. Currently supports `NewCluster` and `None`. | | | -| `clusterUpgradeOptions` _[ClusterUpgradeOptions](#clusterupgradeoptions)_ | ClusterUpgradeOptions defines the behavior of an IncrementalUpgrade.
RayServiceIncrementalUpgrade feature gate must be enabled to set ClusterUpgradeOptions. | | | +| `clusterUpgradeOptions` _[ClusterUpgradeOptions](#clusterupgradeoptions)_ | ClusterUpgradeOptions defines the behavior of a NewClusterWithIncrementalUpgrade type.
RayServiceIncrementalUpgrade feature gate must be enabled to set ClusterUpgradeOptions. | | | #### RayServiceUpgradeType diff --git a/ray-operator/apis/ray/v1/rayservice_types.go b/ray-operator/apis/ray/v1/rayservice_types.go index 9c599a882c3..15e9d6e895d 100644 --- a/ray-operator/apis/ray/v1/rayservice_types.go +++ b/ray-operator/apis/ray/v1/rayservice_types.go @@ -22,9 +22,9 @@ const ( type RayServiceUpgradeType string const ( - // During upgrade, IncrementalUpgrade strategy will create an upgraded cluster to gradually scale + // During upgrade, NewClusterWithIncrementalUpgrade strategy will create an upgraded cluster to gradually scale // and migrate traffic to using Gateway API. - IncrementalUpgrade RayServiceUpgradeType = "IncrementalUpgrade" + NewClusterWithIncrementalUpgrade RayServiceUpgradeType = "NewClusterWithIncrementalUpgrade" // During upgrade, NewCluster strategy will create new upgraded cluster and switch to it when it becomes ready NewCluster RayServiceUpgradeType = "NewCluster" // No new cluster will be created while the strategy is set to None @@ -78,7 +78,7 @@ type RayServiceUpgradeStrategy struct { // Type represents the strategy used when upgrading the RayService. Currently supports `NewCluster` and `None`. // +optional Type *RayServiceUpgradeType `json:"type,omitempty"` - // ClusterUpgradeOptions defines the behavior of an IncrementalUpgrade. + // ClusterUpgradeOptions defines the behavior of a NewClusterWithIncrementalUpgrade type. // RayServiceIncrementalUpgrade feature gate must be enabled to set ClusterUpgradeOptions. ClusterUpgradeOptions *ClusterUpgradeOptions `json:"clusterUpgradeOptions,omitempty"` } diff --git a/ray-operator/controllers/ray/rayservice_controller.go b/ray-operator/controllers/ray/rayservice_controller.go index 72bcac35b77..b85943c629b 100644 --- a/ray-operator/controllers/ray/rayservice_controller.go +++ b/ray-operator/controllers/ray/rayservice_controller.go @@ -151,8 +151,8 @@ func (r *RayServiceReconciler) Reconcile(ctx context.Context, request ctrl.Reque // 1. If there is a pending cluster, reconcile serve applications for the pending cluster. // 2. If there are both active and pending clusters, reconcile serve applications for the pending cluster only. // 3. If there is no pending cluster, reconcile serve applications for the active cluster. - // 4. During an IncrementalUpgrade, reconcileServe will reconcile either the pending or active cluster based - // on total TargetCapacity. + // 4. During NewClusterWithIncrementalUpgrade, reconcileServe will reconcile either the pending or active cluster + // based on total TargetCapacity. 
var isActiveClusterReady, isPendingClusterReady bool = false, false var activeClusterServeApplications, pendingClusterServeApplications map[string]rayv1.AppStatus = nil, nil if pendingRayClusterInstance != nil { @@ -170,13 +170,13 @@ func (r *RayServiceReconciler) Reconcile(ctx context.Context, request ctrl.Reque return ctrl.Result{RequeueAfter: ServiceDefaultRequeueDuration}, err } } else if activeRayClusterInstance != nil && pendingRayClusterInstance != nil && utils.IsIncrementalUpgradeEnabled(&rayServiceInstance.Spec) { - logger.Info("Reconciling the Serve applications for active cluster during IncrementalUpgrade", "clusterName", activeRayClusterInstance.Name) + logger.Info("Reconciling the Serve applications for active cluster during NewClusterWithIncrementalUpgrade", "clusterName", activeRayClusterInstance.Name) if isActiveClusterReady, activeClusterServeApplications, err = r.reconcileServe(ctx, rayServiceInstance, activeRayClusterInstance); err != nil { return ctrl.Result{RequeueAfter: ServiceDefaultRequeueDuration}, err } } - // Check if IncrementalUpgrade is enabled, if so reconcile Gateway objects. + // Check if NewClusterWithIncrementalUpgrade is enabled, if so reconcile Gateway objects. if utils.IsIncrementalUpgradeEnabled(&rayServiceInstance.Spec) { // Ensure per-cluster Serve service exists for the active and pending RayClusters. if err = r.reconcilePerClusterServeService(ctx, rayServiceInstance, activeRayClusterInstance); err != nil { @@ -347,7 +347,7 @@ func (r *RayServiceReconciler) calculateStatus(ctx context.Context, rayServiceIn var isPendingClusterServing bool if headSvc != nil && serveSvc != nil { if utils.IsIncrementalUpgradeEnabled(&rayServiceInstance.Spec) { - logger.Info("Processing incremental upgrade strategy.", "rayService", rayServiceInstance.Name) + logger.Info("Processing NewClusterWithIncrementalUpgrade strategy.", "rayService", rayServiceInstance.Name) oldActivePercent := ptr.Deref(rayServiceInstance.Status.ActiveServiceStatus.TrafficRoutedPercent, -1) oldPendingPercent := ptr.Deref(rayServiceInstance.Status.PendingServiceStatus.TrafficRoutedPercent, -1) @@ -424,7 +424,7 @@ func (r *RayServiceReconciler) calculateStatus(ctx context.Context, rayServiceIn numServeEndpoints += len(subset.Addresses) } - // During an IncrementalUpgrade, the pending RayCluster is also serving. + // During NewClusterWithIncrementalUpgrade, the pending RayCluster is also serving. 
if utils.IsIncrementalUpgradeEnabled(&rayServiceInstance.Spec) && pendingCluster != nil { pendingServeServiceName := common.RayClusterServeServiceNamespacedName(pendingCluster) if err := r.Get(ctx, pendingServeServiceName, serveEndPoints); err != nil && !errors.IsNotFound(err) { @@ -539,8 +539,8 @@ func isZeroDowntimeUpgradeEnabled(ctx context.Context, upgradeStrategy *rayv1.Ra upgradeType := upgradeStrategy.Type if upgradeType != nil { if features.Enabled(features.RayServiceIncrementalUpgrade) { - if *upgradeType != rayv1.NewCluster && *upgradeType != rayv1.IncrementalUpgrade { - logger.Info("Zero-downtime upgrade is disabled because UpgradeStrategy.Type is not set to %s or %s.", string(rayv1.NewCluster), string(rayv1.IncrementalUpgrade)) + if *upgradeType != rayv1.NewCluster && *upgradeType != rayv1.NewClusterWithIncrementalUpgrade { + logger.Info("Zero-downtime upgrade is disabled because UpgradeStrategy.Type is not set to %s or %s.", string(rayv1.NewCluster), string(rayv1.NewClusterWithIncrementalUpgrade)) return false } } else if *upgradeType != rayv1.NewCluster { @@ -793,7 +793,7 @@ func (r *RayServiceReconciler) createHTTPRoute(ctx context.Context, rayServiceIn return desiredHTTPRoute, nil } -// reconcileHTTPRoute reconciles a HTTPRoute resource for a RayService to route traffic during an IncrementalUpgrade. +// reconcileHTTPRoute reconciles a HTTPRoute resource for a RayService to route traffic during a NewClusterWithIncrementalUpgrade. func (r *RayServiceReconciler) reconcileHTTPRoute(ctx context.Context, rayServiceInstance *rayv1.RayService, isPendingClusterReady bool) error { logger := ctrl.LoggerFrom(ctx) var err error @@ -1248,7 +1248,7 @@ func (r *RayServiceReconciler) updateServeDeployment(ctx context.Context, raySer } // checkIfNeedTargetCapacityUpdate returns whether the controller should adjust the target_capacity -// of the Serve config associated with a RayCluster during an IncrementalUpgrade. +// of the Serve config associated with a RayCluster during NewClusterWithIncrementalUpgrade. // // This function implements the incremental upgrade state machine as defined in the design document: // https://github.com/ray-project/enhancements/blob/main/reps/2024-12-4-ray-service-incr-upgrade.md @@ -1268,7 +1268,8 @@ func (r *RayServiceReconciler) updateServeDeployment(ctx context.Context, raySer // // 4. Phase 4 (Step 11): Upgrade completion // - Both clusters reach final state: active=0%, pending=100% -// - Returns false: "All traffic has migrated to the upgraded cluster and IncrementalUpgrade is complete." +// - Returns false: "All traffic has migrated to the upgraded cluster and NewClusterWithIncrementalUpgrade migration +// is complete." // // The function ensures that traffic migration only proceeds when the target cluster has reached // its capacity limit, preventing resource conflicts and ensuring upgrade stability. @@ -1277,7 +1278,7 @@ func (r *RayServiceReconciler) checkIfNeedTargetCapacityUpdate(ctx context.Conte pendingRayServiceStatus := rayServiceInstance.Status.PendingServiceStatus if activeRayServiceStatus.RayClusterName == "" || pendingRayServiceStatus.RayClusterName == "" { - return false, "Both active and pending RayCluster instances are required for incremental upgrade." + return false, "Both active and pending RayCluster instances are required for NewClusterWithIncrementalUpgrade." 
} // Validate Gateway and HTTPRoute objects are ready @@ -1286,7 +1287,7 @@ func (r *RayServiceReconciler) checkIfNeedTargetCapacityUpdate(ctx context.Conte return false, fmt.Sprintf("Failed to retrieve Gateway for RayService: %v", err) } if !utils.IsGatewayReady(gatewayInstance) { - return false, "Gateway for RayService IncrementalUpgrade is not ready." + return false, "Gateway for RayService NewClusterWithIncrementalUpgrade is not ready." } httpRouteInstance := &gwv1.HTTPRoute{} @@ -1294,10 +1295,10 @@ func (r *RayServiceReconciler) checkIfNeedTargetCapacityUpdate(ctx context.Conte return false, fmt.Sprintf("Failed to retrieve HTTPRoute for RayService: %v", err) } if !utils.IsHTTPRouteReady(gatewayInstance, httpRouteInstance) { - return false, "HTTPRoute for RayService IncrementalUpgrade is not ready." + return false, "HTTPRoute for RayService NewClusterWithIncrementalUpgrade is not ready." } - // Retrieve the current observed IncrementalUpgrade Status fields for each RayService. + // Retrieve the current observed NewClusterWithIncrementalUpgrade Status fields for each RayService. if activeRayServiceStatus.TargetCapacity == nil || activeRayServiceStatus.TrafficRoutedPercent == nil { return true, "Active RayServiceStatus missing TargetCapacity or TrafficRoutedPercent." } @@ -1309,7 +1310,7 @@ func (r *RayServiceReconciler) checkIfNeedTargetCapacityUpdate(ctx context.Conte pendingTrafficRoutedPercent := int(*pendingRayServiceStatus.TrafficRoutedPercent) if activeTargetCapacity == 0 && pendingTargetCapacity == 100 { - return false, "All traffic has migrated to the upgraded cluster and IncrementalUpgrade is complete." + return false, "All traffic has migrated to the upgraded cluster and NewClusterWithIncrementalUpgrade is complete." } else if pendingTargetCapacity < 100 || pendingTrafficRoutedPercent < 100 { return true, "Pending RayCluster has not finished scaling up." } @@ -1368,7 +1369,7 @@ func (r *RayServiceReconciler) applyServeTargetCapacity(ctx context.Context, ray } // reconcileServeTargetCapacity reconciles the target_capacity of the ServeConfig for a given RayCluster during -// an IncrementalUpgrade while also updating the Status.TargetCapacity of the Active and Pending RayServices. +// a NewClusterWithIncrementalUpgrade while also updating the Status.TargetCapacity of the Active and Pending RayServices. 
func (r *RayServiceReconciler) reconcileServeTargetCapacity(ctx context.Context, rayServiceInstance *rayv1.RayService, rayClusterInstance *rayv1.RayCluster, rayDashboardClient dashboardclient.RayDashboardClientInterface) error { logger := ctrl.LoggerFrom(ctx) logger.Info("reconcileServeTargetCapacity", "RayService", rayServiceInstance.Name) @@ -1384,7 +1385,7 @@ func (r *RayServiceReconciler) reconcileServeTargetCapacity(ctx context.Context, pendingRayServiceStatus.TargetCapacity = ptr.To(int32(0)) } - // Retrieve the current observed Status fields for IncrementalUpgrade + // Retrieve the current observed Status fields for NewClusterWithIncrementalUpgrade activeTargetCapacity := *activeRayServiceStatus.TargetCapacity pendingTargetCapacity := *pendingRayServiceStatus.TargetCapacity pendingTrafficRoutedPercent := ptr.Deref(pendingRayServiceStatus.TrafficRoutedPercent, 0) @@ -1623,10 +1624,10 @@ func (r *RayServiceReconciler) reconcileServe(ctx context.Context, rayServiceIns meta.IsStatusConditionTrue(rayServiceInstance.Status.Conditions, string(rayv1.UpgradeInProgress)) if isActiveCluster && isIncrementalUpgradeInProgress { - // Skip updating the Serve config for the Active cluster during IncrementalUpgrade. The updated + // Skip updating the Serve config for the Active cluster during NewClusterWithIncrementalUpgrade. The updated // Serve config is applied to the pending RayService's RayCluster. skipConfigUpdate = true - logger.Info("Blocking new Serve config submission for Active cluster during IncrementalUpgrade.", "clusterName", rayClusterInstance.Name) + logger.Info("Blocking new Serve config submission for Active cluster during upgrade.", "clusterName", rayClusterInstance.Name) } cachedServeConfigV2 := r.getServeConfigFromCache(rayServiceInstance, rayClusterInstance.Name) diff --git a/ray-operator/controllers/ray/rayservice_controller_unit_test.go b/ray-operator/controllers/ray/rayservice_controller_unit_test.go index bba28480d1a..dce09f5f424 100644 --- a/ray-operator/controllers/ray/rayservice_controller_unit_test.go +++ b/ray-operator/controllers/ray/rayservice_controller_unit_test.go @@ -1351,7 +1351,7 @@ func makeIncrementalUpgradeRayService( } if withOptions { spec.UpgradeStrategy = &rayv1.RayServiceUpgradeStrategy{ - Type: ptr.To(rayv1.IncrementalUpgrade), + Type: ptr.To(rayv1.NewClusterWithIncrementalUpgrade), ClusterUpgradeOptions: &rayv1.ClusterUpgradeOptions{ GatewayClassName: gatewayClassName, StepSizePercent: stepSizePercent, @@ -1466,7 +1466,7 @@ func TestCreateHTTPRoute(t *testing.T) { ObjectMeta: metav1.ObjectMeta{Name: "test-rayservice", Namespace: namespace}, Spec: rayv1.RayServiceSpec{ UpgradeStrategy: &rayv1.RayServiceUpgradeStrategy{ - Type: ptr.To(rayv1.IncrementalUpgrade), + Type: ptr.To(rayv1.NewClusterWithIncrementalUpgrade), ClusterUpgradeOptions: &rayv1.ClusterUpgradeOptions{ StepSizePercent: &stepSize, IntervalSeconds: &interval, @@ -1498,7 +1498,7 @@ func TestCreateHTTPRoute(t *testing.T) { isPendingClusterReady bool }{ { - name: "Incremental upgrade, but pending cluster is not ready, so no traffic shift.", + name: "NewClusterWithIncrementalUpgrade, but pending cluster is not ready, so no traffic shift.", modifier: func(rs *rayv1.RayService) { rs.Status.PendingServiceStatus.LastTrafficMigratedTime = &metav1.Time{Time: time.Now().Add(-time.Duration(interval+1) * time.Second)} }, @@ -1508,7 +1508,7 @@ func TestCreateHTTPRoute(t *testing.T) { expectedPendingWeight: 0, }, { - name: "Incremental upgrade, time since LastTrafficMigratedTime < IntervalSeconds.", + 
name: "NewClusterWithIncrementalUpgrade, time since LastTrafficMigratedTime < IntervalSeconds.", modifier: func(rs *rayv1.RayService) { rs.Status.PendingServiceStatus.LastTrafficMigratedTime = &metav1.Time{Time: time.Now()} }, @@ -1518,7 +1518,7 @@ func TestCreateHTTPRoute(t *testing.T) { expectedPendingWeight: 0, }, { - name: "Incremental upgrade, time since LastTrafficMigratedTime >= IntervalSeconds.", + name: "NewClusterWithIncrementalUpgrade, time since LastTrafficMigratedTime >= IntervalSeconds.", modifier: func(rs *rayv1.RayService) { rs.Status.PendingServiceStatus.LastTrafficMigratedTime = &metav1.Time{Time: time.Now().Add(-time.Duration(interval+1) * time.Second)} rs.Status.PendingServiceStatus.TargetCapacity = ptr.To(int32(60)) @@ -1529,7 +1529,7 @@ func TestCreateHTTPRoute(t *testing.T) { expectedPendingWeight: 10, }, { - name: "Incremental upgrade, TrafficRoutedPercent capped to pending TargetCapacity.", + name: "NewClusterWithIncrementalUpgrade, TrafficRoutedPercent capped to pending TargetCapacity.", modifier: func(rs *rayv1.RayService) { rs.Status.PendingServiceStatus.LastTrafficMigratedTime = &metav1.Time{Time: time.Now().Add(-time.Duration(interval+1) * time.Second)} rs.Status.PendingServiceStatus.TargetCapacity = ptr.To(int32(5)) @@ -1631,7 +1631,7 @@ func TestReconcileHTTPRoute(t *testing.T) { ObjectMeta: metav1.ObjectMeta{Name: "test-rayservice", Namespace: namespace}, Spec: rayv1.RayServiceSpec{ UpgradeStrategy: &rayv1.RayServiceUpgradeStrategy{ - Type: ptr.To(rayv1.IncrementalUpgrade), + Type: ptr.To(rayv1.NewClusterWithIncrementalUpgrade), ClusterUpgradeOptions: &rayv1.ClusterUpgradeOptions{ StepSizePercent: &stepSize, IntervalSeconds: &interval, @@ -1874,7 +1874,7 @@ func TestReconcileServeTargetCapacity(t *testing.T) { rayService := &rayv1.RayService{ Spec: rayv1.RayServiceSpec{ UpgradeStrategy: &rayv1.RayServiceUpgradeStrategy{ - Type: ptr.To(rayv1.IncrementalUpgrade), + Type: ptr.To(rayv1.NewClusterWithIncrementalUpgrade), ClusterUpgradeOptions: &rayv1.ClusterUpgradeOptions{ MaxSurgePercent: ptr.To(tt.maxSurgePercent), }, @@ -2005,7 +2005,7 @@ func TestCheckIfNeedTargetCapacityUpdate(t *testing.T) { { name: "Missing RayClusterNames", expectedNeedsUpdate: false, - expectedReason: "Both active and pending RayCluster instances are required for incremental upgrade.", + expectedReason: "Both active and pending RayCluster instances are required for NewClusterWithIncrementalUpgrade.", }, { name: "Gateway not ready", @@ -2015,7 +2015,7 @@ func TestCheckIfNeedTargetCapacityUpdate(t *testing.T) { makeGateway(gatewayName, namespace, false), makeHTTPRoute(httpRouteName, namespace, true), }, expectedNeedsUpdate: false, - expectedReason: "Gateway for RayService IncrementalUpgrade is not ready.", + expectedReason: "Gateway for RayService NewClusterWithIncrementalUpgrade is not ready.", }, { name: "HTTPRoute not ready", @@ -2025,10 +2025,10 @@ func TestCheckIfNeedTargetCapacityUpdate(t *testing.T) { makeGateway(gatewayName, namespace, true), makeHTTPRoute(httpRouteName, namespace, false), }, expectedNeedsUpdate: false, - expectedReason: "HTTPRoute for RayService IncrementalUpgrade is not ready.", + expectedReason: "HTTPRoute for RayService NewClusterWithIncrementalUpgrade is not ready.", }, { - name: "Incremental upgrade is complete", + name: "NewClusterWithIncrementalUpgrade is complete", activeStatus: rayv1.RayServiceStatus{ RayClusterName: "active", TargetCapacity: ptr.To(int32(0)), @@ -2043,7 +2043,7 @@ func TestCheckIfNeedTargetCapacityUpdate(t *testing.T) { 
makeGateway(gatewayName, namespace, true), makeHTTPRoute(httpRouteName, namespace, true), }, expectedNeedsUpdate: false, - expectedReason: "All traffic has migrated to the upgraded cluster and IncrementalUpgrade is complete.", + expectedReason: "All traffic has migrated to the upgraded cluster and NewClusterWithIncrementalUpgrade is complete.", }, { name: "Pending RayCluster is still incrementally scaling", diff --git a/ray-operator/controllers/ray/utils/util.go b/ray-operator/controllers/ray/utils/util.go index 98f4c726736..e1f2b59062d 100644 --- a/ray-operator/controllers/ray/utils/util.go +++ b/ray-operator/controllers/ray/utils/util.go @@ -771,7 +771,7 @@ func IsIncrementalUpgradeEnabled(spec *rayv1.RayServiceSpec) bool { return false } return spec != nil && spec.UpgradeStrategy != nil && - *spec.UpgradeStrategy.Type == rayv1.IncrementalUpgrade + *spec.UpgradeStrategy.Type == rayv1.NewClusterWithIncrementalUpgrade } func GetRayServiceClusterUpgradeOptions(spec *rayv1.RayServiceSpec) *rayv1.ClusterUpgradeOptions { diff --git a/ray-operator/controllers/ray/utils/util_test.go b/ray-operator/controllers/ray/utils/util_test.go index b50268b534e..a4bb526f8df 100644 --- a/ray-operator/controllers/ray/utils/util_test.go +++ b/ray-operator/controllers/ray/utils/util_test.go @@ -1413,20 +1413,20 @@ func TestIsIncrementalUpgradeEnabled(t *testing.T) { expected: false, }, { - name: "UpgradeStrategy Type is IncrementalUpgrade but feature disabled", + name: "UpgradeStrategy Type is NewClusterWithIncrementalUpgrade but feature disabled", spec: &rayv1.RayServiceSpec{ UpgradeStrategy: &rayv1.RayServiceUpgradeStrategy{ - Type: ptr.To(rayv1.IncrementalUpgrade), + Type: ptr.To(rayv1.NewClusterWithIncrementalUpgrade), }, }, featureEnabled: false, expected: false, }, { - name: "UpgradeStrategy Type is IncrementalUpgrade and feature enabled", + name: "UpgradeStrategy Type is NewClusterWithIncrementalUpgrade and feature enabled", spec: &rayv1.RayServiceSpec{ UpgradeStrategy: &rayv1.RayServiceUpgradeStrategy{ - Type: ptr.To(rayv1.IncrementalUpgrade), + Type: ptr.To(rayv1.NewClusterWithIncrementalUpgrade), }, }, featureEnabled: true, diff --git a/ray-operator/controllers/ray/utils/validation.go b/ray-operator/controllers/ray/utils/validation.go index 5a419c22a23..e03bbd319b8 100644 --- a/ray-operator/controllers/ray/utils/validation.go +++ b/ray-operator/controllers/ray/utils/validation.go @@ -306,13 +306,13 @@ func ValidateRayServiceSpec(rayService *rayv1.RayService) error { return fmt.Errorf("spec.rayClusterConfig.headGroupSpec.headService.metadata.name should not be set") } - // only IncrementalUpgrade, NewCluster, and None are valid upgradeType + // only NewClusterWithIncrementalUpgrade, NewCluster, and None are valid upgradeType if rayService.Spec.UpgradeStrategy != nil && rayService.Spec.UpgradeStrategy.Type != nil && *rayService.Spec.UpgradeStrategy.Type != rayv1.None && *rayService.Spec.UpgradeStrategy.Type != rayv1.NewCluster && - *rayService.Spec.UpgradeStrategy.Type != rayv1.IncrementalUpgrade { - return fmt.Errorf("Spec.UpgradeStrategy.Type value %s is invalid, valid options are %s, %s, or %s", *rayService.Spec.UpgradeStrategy.Type, rayv1.IncrementalUpgrade, rayv1.NewCluster, rayv1.None) + *rayService.Spec.UpgradeStrategy.Type != rayv1.NewClusterWithIncrementalUpgrade { + return fmt.Errorf("Spec.UpgradeStrategy.Type value %s is invalid, valid options are %s, %s, or %s", *rayService.Spec.UpgradeStrategy.Type, rayv1.NewClusterWithIncrementalUpgrade, rayv1.NewCluster, rayv1.None) } if 
rayService.Spec.RayClusterDeletionDelaySeconds != nil && @@ -320,7 +320,7 @@ func ValidateRayServiceSpec(rayService *rayv1.RayService) error { return fmt.Errorf("Spec.RayClusterDeletionDelaySeconds should be a non-negative integer, got %d", *rayService.Spec.RayClusterDeletionDelaySeconds) } - // If type is IncrementalUpgrade, validate the ClusterUpgradeOptions + // If type is NewClusterWithIncrementalUpgrade, validate the ClusterUpgradeOptions if IsIncrementalUpgradeEnabled(&rayService.Spec) { return ValidateClusterUpgradeOptions(rayService) } @@ -330,12 +330,12 @@ func ValidateRayServiceSpec(rayService *rayv1.RayService) error { func ValidateClusterUpgradeOptions(rayService *rayv1.RayService) error { if !IsAutoscalingEnabled(&rayService.Spec.RayClusterSpec) { - return fmt.Errorf("Ray Autoscaler is required for IncrementalUpgrade") + return fmt.Errorf("Ray Autoscaler is required for NewClusterWithIncrementalUpgrade") } options := rayService.Spec.UpgradeStrategy.ClusterUpgradeOptions if options == nil { - return fmt.Errorf("ClusterUpgradeOptions are required for IncrementalUpgrade") + return fmt.Errorf("ClusterUpgradeOptions are required for NewClusterWithIncrementalUpgrade") } // MaxSurgePercent defaults to 100% if unset. @@ -352,7 +352,7 @@ func ValidateClusterUpgradeOptions(rayService *rayv1.RayService) error { } if options.GatewayClassName == "" { - return fmt.Errorf("gatewayClassName is required for IncrementalUpgrade") + return fmt.Errorf("gatewayClassName is required for NewClusterWithIncrementalUpgrade") } return nil diff --git a/ray-operator/controllers/ray/utils/validation_test.go b/ray-operator/controllers/ray/utils/validation_test.go index bd47fd773b9..2f3f7a64502 100644 --- a/ray-operator/controllers/ray/utils/validation_test.go +++ b/ray-operator/controllers/ray/utils/validation_test.go @@ -1740,7 +1740,7 @@ func TestValidateClusterUpgradeOptions(t *testing.T) { var upgradeStrategy *rayv1.RayServiceUpgradeStrategy if tt.maxSurgePercent != nil || tt.stepSizePercent != nil || tt.intervalSeconds != nil || tt.gatewayClassName != "" { upgradeStrategy = &rayv1.RayServiceUpgradeStrategy{ - Type: ptr.To(rayv1.IncrementalUpgrade), + Type: ptr.To(rayv1.NewClusterWithIncrementalUpgrade), ClusterUpgradeOptions: &rayv1.ClusterUpgradeOptions{ MaxSurgePercent: tt.maxSurgePercent, StepSizePercent: tt.stepSizePercent, @@ -1750,7 +1750,7 @@ func TestValidateClusterUpgradeOptions(t *testing.T) { } } else if tt.expectError { upgradeStrategy = &rayv1.RayServiceUpgradeStrategy{ - Type: ptr.To(rayv1.IncrementalUpgrade), + Type: ptr.To(rayv1.NewClusterWithIncrementalUpgrade), } } diff --git a/ray-operator/pkg/features/features.go b/ray-operator/pkg/features/features.go index ce5734cee0a..e3ed02c75a3 100644 --- a/ray-operator/pkg/features/features.go +++ b/ray-operator/pkg/features/features.go @@ -29,7 +29,7 @@ const ( // rep: N/A // alpha: v1.0 // - // Enabled incremental upgrades for RayService zero-downtime upgrades. + // Enables the NewClusterWithIncrementalUpgrade type for RayService zero-downtime upgrades.
RayServiceIncrementalUpgrade featuregate.Feature = "RayServiceIncrementalUpgrade" ) diff --git a/ray-operator/test/e2eincrementalupgrade/rayservice_incremental_upgrade_test.go b/ray-operator/test/e2eincrementalupgrade/rayservice_incremental_upgrade_test.go index 78a8bd826da..f20ea2422a2 100644 --- a/ray-operator/test/e2eincrementalupgrade/rayservice_incremental_upgrade_test.go +++ b/ray-operator/test/e2eincrementalupgrade/rayservice_incremental_upgrade_test.go @@ -101,7 +101,7 @@ func TestRayServiceIncrementalUpgrade(t *testing.T) { stdout, _ = CurlRayServiceGateway(test, gatewayIP, curlPod, curlContainerName, "/calc", `["MUL", 3]`) g.Expect(stdout.String()).To(Equal("15 pizzas please!")) - // Attempt to trigger incremental upgrade by updating RayService serve config and RayCluster spec + // Attempt to trigger NewClusterWithIncrementalUpgrade by updating RayService serve config and RayCluster spec g.Eventually(func() error { latestRayService, err := GetRayService(test, namespace.Name, rayServiceName) if err != nil { diff --git a/ray-operator/test/e2eincrementalupgrade/support.go b/ray-operator/test/e2eincrementalupgrade/support.go index b036c06e67c..b5e6293f491 100644 --- a/ray-operator/test/e2eincrementalupgrade/support.go +++ b/ray-operator/test/e2eincrementalupgrade/support.go @@ -43,7 +43,7 @@ func IncrementalUpgradeRayServiceApplyConfiguration( ) *rayv1ac.RayServiceSpecApplyConfiguration { return rayv1ac.RayServiceSpec(). WithUpgradeStrategy(rayv1ac.RayServiceUpgradeStrategy(). - WithType(rayv1.IncrementalUpgrade). + WithType(rayv1.NewClusterWithIncrementalUpgrade). WithClusterUpgradeOptions( rayv1ac.ClusterUpgradeOptions(). WithGatewayClassName("istio"). From 71f19a947ce869b6c92a89bcf8ec4a7f52e68d79 Mon Sep 17 00:00:00 2001 From: Ryan O'Leary Date: Wed, 22 Oct 2025 02:34:44 +0000 Subject: [PATCH 52/56] Clean up utils and add more comments Signed-off-by: Ryan O'Leary --- ray-operator/apis/ray/v1/rayservice_types.go | 8 ++ .../controllers/ray/common/association.go | 6 +- ray-operator/controllers/ray/common/job.go | 2 +- .../controllers/ray/common/service.go | 16 ++-- .../controllers/ray/rayservice_controller.go | 28 +++--- .../ray/rayservice_controller_unit_test.go | 10 +- ray-operator/controllers/ray/utils/util.go | 91 ++++++------------- .../controllers/ray/utils/util_test.go | 6 +- .../rayservice_incremental_upgrade_test.go | 2 +- 9 files changed, 69 insertions(+), 100 deletions(-) diff --git a/ray-operator/apis/ray/v1/rayservice_types.go b/ray-operator/apis/ray/v1/rayservice_types.go index 15e9d6e895d..b8fb29a81ae 100644 --- a/ray-operator/apis/ray/v1/rayservice_types.go +++ b/ray-operator/apis/ray/v1/rayservice_types.go @@ -149,10 +149,18 @@ type RayServiceStatus struct { // Important: Run "make" to regenerate code after modifying this file // +optional Applications map[string]AppStatus `json:"applicationStatuses,omitempty"` + // TargetCapacity is the `target_capacity` percentage for all Serve replicas + // across the cluster for this RayService. The `num_replicas`, `min_replicas`, `max_replicas`, + // and `initial_replicas` for each deployment will be scaled by this percentage. // +optional TargetCapacity *int32 `json:"targetCapacity,omitempty"` + // TrafficRoutedPercent is the percentage of traffic that is routed to the Serve service + // for this RayService. TrafficRoutedPercent is updated to reflect the weight on the HTTPRoute + // created for this RayService during incremental upgrades to a new cluster.
// +optional TrafficRoutedPercent *int32 `json:"trafficRoutedPercent,omitempty"` + // LastTrafficMigratedTime is the last time that TrafficRoutedPercent was updated to a new value + // for this RayService. // +optional LastTrafficMigratedTime *metav1.Time `json:"lastTrafficMigratedTime,omitempty"` // +optional diff --git a/ray-operator/controllers/ray/common/association.go b/ray-operator/controllers/ray/common/association.go index 922a31d924f..1539e49aa88 100644 --- a/ray-operator/controllers/ray/common/association.go +++ b/ray-operator/controllers/ray/common/association.go @@ -205,17 +205,15 @@ func RayClusterNetworkResourcesOptions(instance *rayv1.RayCluster) AssociationOp } func RayServiceGatewayNamespacedName(rayService *rayv1.RayService) types.NamespacedName { - gatewayName := utils.CheckGatewayName(fmt.Sprintf("%s-gateway", rayService.Name)) return types.NamespacedName{ - Name: gatewayName, + Name: fmt.Sprintf("%s-gateway", rayService.Name), Namespace: rayService.Namespace, } } func RayServiceHTTPRouteNamespacedName(rayService *rayv1.RayService) types.NamespacedName { - httpRouteName := utils.CheckHTTPRouteName(fmt.Sprintf("httproute-%s-gateway", rayService.Name)) return types.NamespacedName{ - Name: httpRouteName, + Name: fmt.Sprintf("%s-httproute", rayService.Name), Namespace: rayService.Namespace, } } diff --git a/ray-operator/controllers/ray/common/job.go b/ray-operator/controllers/ray/common/job.go index 3cb070be168..05025a3e86e 100644 --- a/ray-operator/controllers/ray/common/job.go +++ b/ray-operator/controllers/ray/common/job.go @@ -91,7 +91,7 @@ func BuildJobSubmitCommand(rayJobInstance *rayv1.RayJob, submissionMode rayv1.Jo // The sidecar submitter shares the same network namespace as the Ray dashboard, // so it uses 127.0.0.1 to connect to the Ray dashboard. rayHeadContainer := rayJobInstance.Spec.RayClusterSpec.HeadGroupSpec.Template.Spec.Containers[utils.RayContainerIndex] - port = utils.FindContainerPort(&rayHeadContainer, utils.DashboardPortName, utils.DefaultDashboardPort) + port = int(utils.FindContainerPort(&rayHeadContainer, utils.DashboardPortName, utils.DefaultDashboardPort)) address = "http://127.0.0.1:" + strconv.Itoa(port) case rayv1.K8sJobMode: // Submitter is a separate K8s Job; use cluster dashboard address. diff --git a/ray-operator/controllers/ray/common/service.go b/ray-operator/controllers/ray/common/service.go index 6655f0b3a29..545b3a6ae98 100644 --- a/ray-operator/controllers/ray/common/service.go +++ b/ray-operator/controllers/ray/common/service.go @@ -328,15 +328,17 @@ func GetServePort(cluster *rayv1.RayCluster) gwv1.PortNumber { return gwv1.PortNumber(utils.DefaultServingPort) } + // Get the head container + headContainer := &cluster.Spec.HeadGroupSpec.Template.Spec.Containers[utils.RayContainerIndex] + // Find the port named "serve" in the head group's container spec. 
- headContainer := cluster.Spec.HeadGroupSpec.Template.Spec.Containers[utils.RayContainerIndex] - for _, port := range headContainer.Ports { - if port.Name == utils.ServingPortName { - return gwv1.PortNumber(port.ContainerPort) - } - } + port := utils.FindContainerPort( + headContainer, + utils.ServingPortName, + utils.DefaultServingPort, + ) - return gwv1.PortNumber(utils.DefaultServingPort) + return gwv1.PortNumber(port) } func setServiceTypeForUserProvidedService(ctx context.Context, service *corev1.Service, defaultType corev1.ServiceType) { diff --git a/ray-operator/controllers/ray/rayservice_controller.go b/ray-operator/controllers/ray/rayservice_controller.go index b85943c629b..a59ed0ffe00 100644 --- a/ray-operator/controllers/ray/rayservice_controller.go +++ b/ray-operator/controllers/ray/rayservice_controller.go @@ -262,20 +262,9 @@ func (r *RayServiceReconciler) reconcileServicesToReadyCluster(ctx context.Conte return headSvc, serveSvc, nil } -// promotePendingClusterToActive handles the logic for promoting the pending RayCluster to active in RayService status. -func (r *RayServiceReconciler) promotePendingClusterToActive(ctx context.Context, rayServiceInstance *rayv1.RayService) { - logger := ctrl.LoggerFrom(ctx) - logger.Info("Promoting pending cluster to active.", - "oldCluster", rayServiceInstance.Status.ActiveServiceStatus.RayClusterName, - "newCluster", rayServiceInstance.Status.PendingServiceStatus.RayClusterName) - - rayServiceInstance.Status.ActiveServiceStatus = rayServiceInstance.Status.PendingServiceStatus - rayServiceInstance.Status.PendingServiceStatus = rayv1.RayServiceStatus{} -} - // reconcilePromotionAndServingStatus handles the promotion logic after an upgrade, returning // isPendingClusterServing: True if the main Kubernetes services are pointing to the pending cluster. -func (r *RayServiceReconciler) reconcilePromotionAndServingStatus(ctx context.Context, headSvc, serveSvc *corev1.Service, rayServiceInstance *rayv1.RayService, pendingCluster *rayv1.RayCluster) (isPendingClusterServing bool) { +func reconcilePromotionAndServingStatus(ctx context.Context, headSvc, serveSvc *corev1.Service, rayServiceInstance *rayv1.RayService, pendingCluster *rayv1.RayCluster) (isPendingClusterServing bool) { logger := ctrl.LoggerFrom(ctx) // Step 1: Service Consistency Check. Ensure head and serve services point to the @@ -318,7 +307,12 @@ func (r *RayServiceReconciler) reconcilePromotionAndServingStatus(ctx context.Co // Step 3: Promote the pending cluster if prior conditions are met. if shouldPromote { - r.promotePendingClusterToActive(ctx, rayServiceInstance) + logger.Info("Promoting pending cluster to active.", + "oldCluster", rayServiceInstance.Status.ActiveServiceStatus.RayClusterName, + "newCluster", rayServiceInstance.Status.PendingServiceStatus.RayClusterName) + + rayServiceInstance.Status.ActiveServiceStatus = rayServiceInstance.Status.PendingServiceStatus + rayServiceInstance.Status.PendingServiceStatus = rayv1.RayServiceStatus{} } return (clusterSvcPointsTo == pendingClusterName) @@ -377,7 +371,7 @@ func (r *RayServiceReconciler) calculateStatus(ctx context.Context, rayServiceIn } } // Reconcile serving status and promotion logic for all upgrade strategies. 
- isPendingClusterServing = r.reconcilePromotionAndServingStatus(ctx, headSvc, serveSvc, rayServiceInstance, pendingCluster) + isPendingClusterServing = reconcilePromotionAndServingStatus(ctx, headSvc, serveSvc, rayServiceInstance, pendingCluster) } if shouldPrepareNewCluster(ctx, rayServiceInstance, activeCluster, pendingCluster, isPendingClusterServing) { @@ -565,7 +559,7 @@ func (r *RayServiceReconciler) createGateway(rayServiceInstance *rayv1.RayServic return nil, errstd.New("Missing RayService ClusterUpgradeOptions during upgrade.") } - gatewayName := utils.CheckGatewayName(rayServiceInstance.Name + "-gateway") + gatewayName := rayServiceInstance.Name + "-gateway" // Define the desired Gateway object rayServiceGateway := &gwv1.Gateway{ ObjectMeta: metav1.ObjectMeta{ @@ -762,7 +756,7 @@ func (r *RayServiceReconciler) createHTTPRoute(ctx context.Context, rayServiceIn }) } - httpRouteName := utils.CheckHTTPRouteName(fmt.Sprintf("httproute-%s", gatewayInstance.Name)) + httpRouteName := rayServiceInstance.Name + "-httproute" desiredHTTPRoute := &gwv1.HTTPRoute{ ObjectMeta: metav1.ObjectMeta{Name: httpRouteName, Namespace: gatewayInstance.Namespace}, Spec: gwv1.HTTPRouteSpec{ @@ -1679,7 +1673,7 @@ func (r *RayServiceReconciler) updateHeadPodServeLabel(ctx context.Context, rayS } rayContainer := headPod.Spec.Containers[utils.RayContainerIndex] - servingPort := utils.FindContainerPort(&rayContainer, utils.ServingPortName, utils.DefaultServingPort) + servingPort := int(utils.FindContainerPort(&rayContainer, utils.ServingPortName, utils.DefaultServingPort)) client := r.httpProxyClientFunc(headPod.Status.PodIP, headPod.Namespace, headPod.Name, servingPort) if headPod.Labels == nil { diff --git a/ray-operator/controllers/ray/rayservice_controller_unit_test.go b/ray-operator/controllers/ray/rayservice_controller_unit_test.go index dce09f5f424..85891b9d95f 100644 --- a/ray-operator/controllers/ray/rayservice_controller_unit_test.go +++ b/ray-operator/controllers/ray/rayservice_controller_unit_test.go @@ -1587,7 +1587,7 @@ func TestCreateHTTPRoute(t *testing.T) { require.NoError(t, err) require.NotNil(t, route) - assert.Equal(t, "httproute-test-rayservice-gateway", route.Name) + assert.Equal(t, "test-rayservice-httproute", route.Name) assert.Equal(t, "test-ns", route.Namespace) require.Len(t, route.Spec.Rules, 1) @@ -1619,7 +1619,7 @@ func TestReconcileHTTPRoute(t *testing.T) { stepSize := int32(10) interval := int32(30) gatewayName := "test-rayservice-gateway" - routeName := fmt.Sprintf("httproute-%s", gatewayName) + routeName := "test-rayservice-httproute" activeCluster := &rayv1.RayCluster{ObjectMeta: metav1.ObjectMeta{Name: "active-ray-cluster", Namespace: namespace}} pendingCluster := &rayv1.RayCluster{ObjectMeta: metav1.ObjectMeta{Name: "pending-ray-cluster", Namespace: namespace}} @@ -1991,7 +1991,7 @@ func makeHTTPRoute(name, namespace string, isReady bool) *gwv1.HTTPRoute { func TestCheckIfNeedTargetCapacityUpdate(t *testing.T) { rayServiceName := "test-rayservice" gatewayName := fmt.Sprintf("%s-%s", rayServiceName, "gateway") - httpRouteName := fmt.Sprintf("%s-%s", "httproute", gatewayName) + httpRouteName := fmt.Sprintf("%s-%s", rayServiceName, "httproute") namespace := "test-ns" tests := []struct { @@ -2236,7 +2236,7 @@ func TestGetHTTPRouteTrafficWeights(t *testing.T) { rayServiceName := "test-rayservice" activeClusterName := "rayservice-active" pendingClusterName := "rayservice-pending" - routeName := "httproute-test-rayservice-gateway" + routeName := "test-rayservice-httproute" 
baseRayService := &rayv1.RayService{ ObjectMeta: metav1.ObjectMeta{Name: rayServiceName, Namespace: namespace}, @@ -2326,7 +2326,7 @@ func TestGetHTTPRouteTrafficWeights(t *testing.T) { t.Run(tt.name, func(t *testing.T) { runtimeObjects := []runtime.Object{tt.rayService} if tt.httpRoute != nil { - tt.httpRoute.Name = utils.CheckHTTPRouteName(fmt.Sprintf("httproute-%s-gateway", tt.rayService.Name)) + tt.httpRoute.Name = fmt.Sprintf("%s-httproute", tt.rayService.Name) runtimeObjects = append(runtimeObjects, tt.httpRoute) } fakeClient := clientFake.NewClientBuilder().WithScheme(newScheme).WithRuntimeObjects(runtimeObjects...).Build() diff --git a/ray-operator/controllers/ray/utils/util.go b/ray-operator/controllers/ray/utils/util.go index e1f2b59062d..65df55a83d1 100644 --- a/ray-operator/controllers/ray/utils/util.go +++ b/ray-operator/controllers/ray/utils/util.go @@ -16,6 +16,7 @@ import ( batchv1 "k8s.io/api/batch/v1" corev1 "k8s.io/api/core/v1" + meta "k8s.io/apimachinery/pkg/api/meta" "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/util/json" @@ -212,40 +213,6 @@ func CheckName(s string) string { return s } -func CheckGatewayName(name string) string { - const maxLength = 63 - - if len(name) > maxLength { - offset := len(name) - maxLength - fmt.Printf("Gateway name too long (len = %d), shortening by offset = %d", len(name), offset) - name = name[offset:] - } - - // Cannot start with a digit or punctuation - if len(name) > 0 && (unicode.IsDigit(rune(name[0])) || unicode.IsPunct(rune(name[0]))) { - name = "g" + name[1:] - } - - return name -} - -func CheckHTTPRouteName(name string) string { - const maxLength = 63 - - if len(name) > maxLength { - offset := len(name) - maxLength - fmt.Printf("HTTPRoute name too long (len = %d), shortening by offset = %d", len(name), offset) - name = name[offset:] - } - - // Cannot start with a digit or punctuation - if len(name) > 0 && (unicode.IsDigit(rune(name[0])) || unicode.IsPunct(rune(name[0]))) { - name = "h" + name[1:] - } - - return name -} - // TrimJobName uses CheckLabel to trim Kubernetes job to constrains func TrimJobName(jobName string) string { return CheckLabel(jobName) @@ -636,10 +603,10 @@ func GenerateJsonHash(obj interface{}) (string, error) { // FindContainerPort searches for a specific port $portName in the container. // If the port is found in the container, the corresponding port is returned. // If the port is not found, the $defaultPort is returned instead. -func FindContainerPort(container *corev1.Container, portName string, defaultPort int) int { +func FindContainerPort(container *corev1.Container, portName string, defaultPort int32) int32 { for _, port := range container.Ports { if port.Name == portName { - return int(port.ContainerPort) + return port.ContainerPort } } return defaultPort @@ -712,27 +679,39 @@ func GetRayClusterNameFromService(svc *corev1.Service) string { return svc.Spec.Selector[RayClusterLabelKey] } +// IsGatewayReady checks if a Gateway is considered "ready". +// +// A Gateway is "ready" only if both the `Accepted` and `Programmed` conditions +// are set to 'True'. +// +// 1. 'Accepted': Signifies that the Gateway controller understands and accepts +// the Gateway resource. If 'False', it often indicates a conflict or an invalid +// specification. +// +// 2. 'Programmed': Signifies that the underlying network infrastructure for the Gateway +// (e.g. load balancer) has been successfully provisioned and configured. 
func IsGatewayReady(gatewayInstance *gwv1.Gateway) bool { if gatewayInstance == nil { return false } - hasAccepted := false - hasProgrammed := false - for _, condition := range gatewayInstance.Status.Conditions { - if condition.Type == string(gwv1.GatewayConditionAccepted) && condition.Status == metav1.ConditionTrue { - hasAccepted = true - } - if condition.Type == string(gwv1.GatewayConditionProgrammed) && condition.Status == metav1.ConditionTrue { - hasProgrammed = true - } - } + hasAccepted := meta.IsStatusConditionTrue(gatewayInstance.Status.Conditions, string(gwv1.GatewayConditionAccepted)) + hasProgrammed := meta.IsStatusConditionTrue(gatewayInstance.Status.Conditions, string(gwv1.GatewayConditionProgrammed)) - // If no ready condition found return false return hasAccepted && hasProgrammed } -// IsHTTPRouteReady returns whether the HTTPRoute associated with a given Gateway has a ready condition +// IsHTTPRouteReady checks if an HTTPRoute is considered ready for a given Gateway. +// +// It returns true only if the route's parent status entry matching the Gateway has both +// the 'Accepted' and 'ResolvedRefs' conditions set to 'True'. +// +// 1. 'Accepted': Signifies that the Gateway controller has validated the HTTPRoute's +// configuration (e.g. syntax, filters, matching rules). An 'Accepted' status of +// 'False' means the route's specification is invalid. +// +// 2. 'ResolvedRefs': Signifies that all references within the route are valid, exist, +// and are resolvable by the Gateway. func IsHTTPRouteReady(gatewayInstance *gwv1.Gateway, httpRouteInstance *gwv1.HTTPRoute) bool { if httpRouteInstance == nil { return false @@ -744,21 +723,9 @@ func IsHTTPRouteReady(gatewayInstance *gwv1.Gateway, httpRouteInstance *gwv1.HTT if parent.ParentRef.Namespace != nil && *parent.ParentRef.Namespace != gwv1.Namespace(gatewayInstance.Namespace) { continue } - hasAccepted := false - hasResolved := false + hasAccepted := meta.IsStatusConditionTrue(parent.Conditions, string(gwv1.RouteConditionAccepted)) + hasResolved := meta.IsStatusConditionTrue(parent.Conditions, string(gwv1.RouteConditionResolvedRefs)) - for _, condition := range parent.Conditions { - switch gwv1.RouteConditionType(condition.Type) { - case gwv1.RouteConditionAccepted: - if condition.Status == metav1.ConditionTrue { - hasAccepted = true - } - case gwv1.RouteConditionResolvedRefs: - if condition.Status == metav1.ConditionTrue { - hasResolved = true - } - } - } if hasAccepted && hasResolved { return true } diff --git a/ray-operator/controllers/ray/utils/util_test.go b/ray-operator/controllers/ray/utils/util_test.go index a4bb526f8df..8bd37a2e7f8 100644 --- a/ray-operator/controllers/ray/utils/util_test.go +++ b/ray-operator/controllers/ray/utils/util_test.go @@ -488,11 +488,11 @@ func TestFindContainerPort(t *testing.T) { }, } port := FindContainerPort(&container, "port1", -1) - assert.NotEqual(t, port, -1, "expect port1 found") + assert.NotEqual(t, port, int32(-1), "expect port1 found") port = FindContainerPort(&container, "port2", -1) - assert.NotEqual(t, port, -1, "expect port2 found") + assert.NotEqual(t, port, int32(-1), "expect port2 found") port = FindContainerPort(&container, "port3", -1) - assert.Equal(t, port, -1, "expect port3 not found") + assert.Equal(t, port, int32(-1), "expect port3 not found") } func TestGenerateHeadServiceName(t *testing.T) { diff --git a/ray-operator/test/e2eincrementalupgrade/rayservice_incremental_upgrade_test.go b/ray-operator/test/e2eincrementalupgrade/rayservice_incremental_upgrade_test.go 
index f20ea2422a2..e9290bedda3 100644 --- a/ray-operator/test/e2eincrementalupgrade/rayservice_incremental_upgrade_test.go +++ b/ray-operator/test/e2eincrementalupgrade/rayservice_incremental_upgrade_test.go @@ -70,7 +70,7 @@ func TestRayServiceIncrementalUpgrade(t *testing.T) { g.Expect(err).NotTo(HaveOccurred()) g.Expect(gateway).NotTo(BeNil()) - httpRouteName := fmt.Sprintf("%s-%s", "httproute", gatewayName) + httpRouteName := fmt.Sprintf("%s-%s", rayServiceName, "httproute") LogWithTimestamp(test.T(), "Waiting for HTTPRoute %s/%s to be ready", rayService.Namespace, httpRouteName) g.Eventually(HTTPRoute(test, rayService.Namespace, httpRouteName), TestTimeoutMedium). Should(Not(BeNil())) From c23f9014a81fcb360d9214351f9f072d31be320a Mon Sep 17 00:00:00 2001 From: Ryan O'Leary Date: Wed, 22 Oct 2025 03:47:23 +0000 Subject: [PATCH 53/56] reconcileHTTPRoute should pass created object to calculate status Signed-off-by: Ryan O'Leary --- .../controllers/ray/rayservice_controller.go | 58 ++++----- .../ray/rayservice_controller_unit_test.go | 123 +----------------- ray-operator/controllers/ray/utils/util.go | 3 +- 3 files changed, 25 insertions(+), 159 deletions(-) diff --git a/ray-operator/controllers/ray/rayservice_controller.go b/ray-operator/controllers/ray/rayservice_controller.go index a59ed0ffe00..2d077258a28 100644 --- a/ray-operator/controllers/ray/rayservice_controller.go +++ b/ray-operator/controllers/ray/rayservice_controller.go @@ -177,6 +177,7 @@ func (r *RayServiceReconciler) Reconcile(ctx context.Context, request ctrl.Reque } // Check if NewClusterWithIncrementalUpgrade is enabled, if so reconcile Gateway objects. + var httpRouteInstance *gwv1.HTTPRoute if utils.IsIncrementalUpgradeEnabled(&rayServiceInstance.Spec) { // Ensure per-cluster Serve service exists for the active and pending RayClusters. if err = r.reconcilePerClusterServeService(ctx, rayServiceInstance, activeRayClusterInstance); err != nil { @@ -193,7 +194,7 @@ func (r *RayServiceReconciler) Reconcile(ctx context.Context, request ctrl.Reque return ctrl.Result{RequeueAfter: ServiceDefaultRequeueDuration}, client.IgnoreNotFound(err) } // Create or update the HTTPRoute for the Gateway, passing in the pending cluster readiness status. 
- err = r.reconcileHTTPRoute(ctx, rayServiceInstance, isPendingClusterReady) + httpRouteInstance, err = r.reconcileHTTPRoute(ctx, rayServiceInstance, isPendingClusterReady) if err != nil { return ctrl.Result{RequeueAfter: ServiceDefaultRequeueDuration}, client.IgnoreNotFound(err) } @@ -235,6 +236,7 @@ func (r *RayServiceReconciler) Reconcile(ctx context.Context, request ctrl.Reque pendingRayClusterInstance, activeClusterServeApplications, pendingClusterServeApplications, + httpRouteInstance, ); err != nil { return ctrl.Result{RequeueAfter: ServiceDefaultRequeueDuration}, err } @@ -318,7 +320,14 @@ func reconcilePromotionAndServingStatus(ctx context.Context, headSvc, serveSvc * return (clusterSvcPointsTo == pendingClusterName) } -func (r *RayServiceReconciler) calculateStatus(ctx context.Context, rayServiceInstance *rayv1.RayService, headSvc, serveSvc *corev1.Service, activeCluster, pendingCluster *rayv1.RayCluster, activeClusterServeApplications, pendingClusterServeApplications map[string]rayv1.AppStatus) error { +func (r *RayServiceReconciler) calculateStatus( + ctx context.Context, + rayServiceInstance *rayv1.RayService, + headSvc, serveSvc *corev1.Service, + activeCluster, pendingCluster *rayv1.RayCluster, + activeClusterServeApplications, pendingClusterServeApplications map[string]rayv1.AppStatus, + httpRoute *gwv1.HTTPRoute, +) error { logger := ctrl.LoggerFrom(ctx) rayServiceInstance.Status.ObservedGeneration = rayServiceInstance.ObjectMeta.Generation @@ -345,13 +354,8 @@ func (r *RayServiceReconciler) calculateStatus(ctx context.Context, rayServiceIn oldActivePercent := ptr.Deref(rayServiceInstance.Status.ActiveServiceStatus.TrafficRoutedPercent, -1) oldPendingPercent := ptr.Deref(rayServiceInstance.Status.PendingServiceStatus.TrafficRoutedPercent, -1) - // Update traffic weights for incremental upgrade by fetching the current HTTPRoute. - activeWeight, pendingWeight, err := r.getHTTPRouteTrafficWeights(ctx, rayServiceInstance) - if err != nil { - logger.Error(err, "Failed to get HTTPRoute traffic weights.") - return err - } - + // Update TrafficRoutedPercent to each RayService based on current weights from HTTPRoute. + activeWeight, pendingWeight := utils.GetWeightsFromHTTPRoute(httpRoute, rayServiceInstance) now := metav1.Time{Time: time.Now()} if activeWeight >= 0 { rayServiceInstance.Status.ActiveServiceStatus.TrafficRoutedPercent = ptr.To(activeWeight) @@ -788,18 +792,18 @@ func (r *RayServiceReconciler) createHTTPRoute(ctx context.Context, rayServiceIn } // reconcileHTTPRoute reconciles a HTTPRoute resource for a RayService to route traffic during a NewClusterWithIncrementalUpgrade. 
-func (r *RayServiceReconciler) reconcileHTTPRoute(ctx context.Context, rayServiceInstance *rayv1.RayService, isPendingClusterReady bool) error { +func (r *RayServiceReconciler) reconcileHTTPRoute(ctx context.Context, rayServiceInstance *rayv1.RayService, isPendingClusterReady bool) (*gwv1.HTTPRoute, error) { logger := ctrl.LoggerFrom(ctx) var err error desiredHTTPRoute, err := r.createHTTPRoute(ctx, rayServiceInstance, isPendingClusterReady) if err != nil { logger.Error(err, "Failed to build HTTPRoute for RayService upgrade") - return err + return nil, err } if desiredHTTPRoute == nil { logger.Info("Skipping HTTPRoute reconciliation: desired HTTPRoute is nil") - return nil + return nil, nil } // Check for existing HTTPRoute for RayService @@ -808,16 +812,16 @@ func (r *RayServiceReconciler) reconcileHTTPRoute(ctx context.Context, rayServic if errors.IsNotFound(err) { // Set the ownership in order to do the garbage collection by k8s. if err := ctrl.SetControllerReference(rayServiceInstance, desiredHTTPRoute, r.Scheme); err != nil { - return err + return nil, err } if err = r.Create(ctx, desiredHTTPRoute); err != nil { r.Recorder.Eventf(rayServiceInstance, corev1.EventTypeWarning, string(utils.FailedToCreateHTTPRoute), "Failed to create the HTTPRoute for RayService %s/%s: %v", desiredHTTPRoute.Namespace, desiredHTTPRoute.Name, err) - return err + return nil, err } r.Recorder.Eventf(rayServiceInstance, corev1.EventTypeNormal, string(utils.CreatedHTTPRoute), "Created HTTPRoute for RayService %s/%s", desiredHTTPRoute.Namespace, desiredHTTPRoute.Name) - return nil + return desiredHTTPRoute, nil } - return err + return nil, err } // If HTTPRoute already exists, check if update is needed @@ -826,30 +830,12 @@ func (r *RayServiceReconciler) reconcileHTTPRoute(ctx context.Context, rayServic existingHTTPRoute.Spec = desiredHTTPRoute.Spec if err := r.Update(ctx, existingHTTPRoute); err != nil { r.Recorder.Eventf(rayServiceInstance, corev1.EventTypeWarning, string(utils.FailedToUpdateHTTPRoute), "Failed to update the HTTPRoute %s/%s: %v", existingHTTPRoute.Namespace, existingHTTPRoute.Name, err) - return err + return nil, err } r.Recorder.Eventf(rayServiceInstance, corev1.EventTypeNormal, string(utils.UpdatedHTTPRoute), "Updated the HTTPRoute %s/%s", existingHTTPRoute.Namespace, existingHTTPRoute.Name) } - return nil -} - -// getHTTPRouteTrafficWeights fetches the HTTPRoute associated with a RayService and returns -// the traffic weights for the active and pending clusters. -func (r *RayServiceReconciler) getHTTPRouteTrafficWeights(ctx context.Context, rayServiceInstance *rayv1.RayService) (activeWeight int32, pendingWeight int32, err error) { - activeWeight, pendingWeight = 100, 0 - - httpRoute := &gwv1.HTTPRoute{} - if err := r.Get(ctx, common.RayServiceHTTPRouteNamespacedName(rayServiceInstance), httpRoute); err != nil { - if errors.IsNotFound(err) { - // If HTTPRoute doesn't exist yet, return the default weights. - return activeWeight, pendingWeight, nil - } - return 0, 0, err - } - activeWeight, pendingWeight = utils.GetWeightsFromHTTPRoute(httpRoute, rayServiceInstance) - - return activeWeight, pendingWeight, nil + return existingHTTPRoute, nil } // `reconcileRayCluster` reconciles the active and pending Ray clusters. 
There are 4 possible cases: diff --git a/ray-operator/controllers/ray/rayservice_controller_unit_test.go b/ray-operator/controllers/ray/rayservice_controller_unit_test.go index 85891b9d95f..48703c6b3ea 100644 --- a/ray-operator/controllers/ray/rayservice_controller_unit_test.go +++ b/ray-operator/controllers/ray/rayservice_controller_unit_test.go @@ -1735,13 +1735,9 @@ func TestReconcileHTTPRoute(t *testing.T) { fakeClient := clientFake.NewClientBuilder().WithScheme(newScheme).WithRuntimeObjects(runtimeObjects...).Build() reconciler := RayServiceReconciler{Client: fakeClient, Scheme: newScheme, Recorder: record.NewFakeRecorder(10)} - err := reconciler.reconcileHTTPRoute(ctx, rayService, tt.isPendingClusterReady) + reconciledRoute, err := reconciler.reconcileHTTPRoute(ctx, rayService, tt.isPendingClusterReady) require.NoError(t, err) - reconciledRoute := &gwv1.HTTPRoute{} - err = fakeClient.Get(ctx, client.ObjectKey{Name: routeName, Namespace: namespace}, reconciledRoute) - require.NoError(t, err, "Failed to fetch the reconciled HTTPRoute") - require.Len(t, reconciledRoute.Spec.Rules, 1) rule := reconciledRoute.Spec.Rules[0] if tt.pendingClusterExists { @@ -2230,120 +2226,3 @@ func TestReconcilePerClusterServeService(t *testing.T) { }) } } - -func TestGetHTTPRouteTrafficWeights(t *testing.T) { - namespace := "test-ns" - rayServiceName := "test-rayservice" - activeClusterName := "rayservice-active" - pendingClusterName := "rayservice-pending" - routeName := "test-rayservice-httproute" - - baseRayService := &rayv1.RayService{ - ObjectMeta: metav1.ObjectMeta{Name: rayServiceName, Namespace: namespace}, - Status: rayv1.RayServiceStatuses{ - ActiveServiceStatus: rayv1.RayServiceStatus{RayClusterName: activeClusterName}, - PendingServiceStatus: rayv1.RayServiceStatus{RayClusterName: pendingClusterName}, - }, - } - - tests := []struct { - rayService *rayv1.RayService - httpRoute *gwv1.HTTPRoute - name string - expectedActiveWeight int32 - expectedPendingWeight int32 - expectError bool - }{ - { - name: "HTTPRoute does not exist", - rayService: baseRayService, - httpRoute: nil, - expectedActiveWeight: 100, - expectedPendingWeight: 0, - expectError: false, - }, - { - name: "HTTPRoute exists with active cluster backend", - rayService: baseRayService, - httpRoute: &gwv1.HTTPRoute{ - ObjectMeta: metav1.ObjectMeta{Name: routeName, Namespace: namespace}, - Spec: gwv1.HTTPRouteSpec{ - Rules: []gwv1.HTTPRouteRule{ - { - BackendRefs: []gwv1.HTTPBackendRef{ - { - BackendRef: gwv1.BackendRef{ - BackendObjectReference: gwv1.BackendObjectReference{Name: gwv1.ObjectName(utils.GenerateServeServiceName(activeClusterName))}, - Weight: ptr.To(int32(100)), - }, - }, - }, - }, - }, - }, - }, - expectedActiveWeight: 100, - expectedPendingWeight: -1, - expectError: false, - }, - { - name: "HTTPRoute exists with active and pending cluster backends", - rayService: baseRayService, - httpRoute: &gwv1.HTTPRoute{ - ObjectMeta: metav1.ObjectMeta{Name: routeName, Namespace: namespace}, - Spec: gwv1.HTTPRouteSpec{ - Rules: []gwv1.HTTPRouteRule{ - { - BackendRefs: []gwv1.HTTPBackendRef{ - { - BackendRef: gwv1.BackendRef{ - BackendObjectReference: gwv1.BackendObjectReference{Name: gwv1.ObjectName(utils.GenerateServeServiceName(activeClusterName))}, - Weight: ptr.To(int32(80)), - }, - }, - { - BackendRef: gwv1.BackendRef{ - BackendObjectReference: gwv1.BackendObjectReference{Name: gwv1.ObjectName(utils.GenerateServeServiceName(pendingClusterName))}, - Weight: ptr.To(int32(20)), - }, - }, - }, - }, - }, - }, - }, - 
expectedActiveWeight: 80, - expectedPendingWeight: 20, - expectError: false, - }, - } - - newScheme := runtime.NewScheme() - _ = rayv1.AddToScheme(newScheme) - _ = gwv1.AddToScheme(newScheme) - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - runtimeObjects := []runtime.Object{tt.rayService} - if tt.httpRoute != nil { - tt.httpRoute.Name = fmt.Sprintf("%s-httproute", tt.rayService.Name) - runtimeObjects = append(runtimeObjects, tt.httpRoute) - } - fakeClient := clientFake.NewClientBuilder().WithScheme(newScheme).WithRuntimeObjects(runtimeObjects...).Build() - - r := RayServiceReconciler{Client: fakeClient} - ctx := context.TODO() - - // Validates retried weights match what is expected. - activeWeight, pendingWeight, err := r.getHTTPRouteTrafficWeights(ctx, tt.rayService) - - if tt.expectError { - require.Error(t, err) - } else { - require.NoError(t, err) - assert.Equal(t, tt.expectedActiveWeight, activeWeight) - assert.Equal(t, tt.expectedPendingWeight, pendingWeight) - } - }) - } -} diff --git a/ray-operator/controllers/ray/utils/util.go b/ray-operator/controllers/ray/utils/util.go index 65df55a83d1..64707b074ea 100644 --- a/ray-operator/controllers/ray/utils/util.go +++ b/ray-operator/controllers/ray/utils/util.go @@ -764,7 +764,8 @@ func GetWeightsFromHTTPRoute(httpRoute *gwv1.HTTPRoute, rayServiceInstance *rayv pendingClusterName = rayServiceInstance.Status.PendingServiceStatus.RayClusterName } - // Defaults if weights can't be detected. + // Defaults if weights can't be detected. This is so that we avoid setting TrafficRoutedPercent + // before the HTTPRoute actually exists. activeWeight = -1 pendingWeight = -1 From f04fee1a2743250f68c01bd1ddae3b786614500a Mon Sep 17 00:00:00 2001 From: Ryan O'Leary Date: Thu, 23 Oct 2025 01:41:56 +0000 Subject: [PATCH 54/56] lint Signed-off-by: Ryan O'Leary --- ray-operator/pkg/features/features.go | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/ray-operator/pkg/features/features.go b/ray-operator/pkg/features/features.go index 0522280c368..16b23ab83ac 100644 --- a/ray-operator/pkg/features/features.go +++ b/ray-operator/pkg/features/features.go @@ -30,8 +30,8 @@ const ( // alpha: v1.0 // Enables multi-host worker indexing RayMultiHostIndexing featuregate.Feature = "RayMultiHostIndexing" - - // owner: @ryanaoleary + + // owner: @ryanaoleary // rep: N/A // alpha: v1.0 // @@ -44,10 +44,10 @@ func init() { } var defaultFeatureGates = map[featuregate.Feature]featuregate.FeatureSpec{ - RayClusterStatusConditions: {Default: true, PreRelease: featuregate.Beta}, - RayJobDeletionPolicy: {Default: false, PreRelease: featuregate.Alpha}, - RayMultiHostIndexing: {Default: false, PreRelease: featuregate.Alpha}, - RayServiceIncrementalUpgrade: {Default: false, PreRelease: featuregate.Alpha}, + RayClusterStatusConditions: {Default: true, PreRelease: featuregate.Beta}, + RayJobDeletionPolicy: {Default: false, PreRelease: featuregate.Alpha}, + RayMultiHostIndexing: {Default: false, PreRelease: featuregate.Alpha}, + RayServiceIncrementalUpgrade: {Default: false, PreRelease: featuregate.Alpha}, } // SetFeatureGateDuringTest is a helper method to override feature gates in tests. 
From 18be954dc05ccedb15bf2ebfb49c3eb6a00a3294 Mon Sep 17 00:00:00 2001 From: Ryan O'Leary <113500783+ryanaoleary@users.noreply.github.com> Date: Thu, 23 Oct 2025 12:52:11 -0700 Subject: [PATCH 55/56] Update ray-operator/controllers/ray/rayservice_controller.go Co-authored-by: Han-Ju Chen (Future-Outlier) Signed-off-by: Ryan O'Leary <113500783+ryanaoleary@users.noreply.github.com> --- ray-operator/controllers/ray/rayservice_controller.go | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/ray-operator/controllers/ray/rayservice_controller.go b/ray-operator/controllers/ray/rayservice_controller.go index 2d077258a28..26ff10a6373 100644 --- a/ray-operator/controllers/ray/rayservice_controller.go +++ b/ray-operator/controllers/ray/rayservice_controller.go @@ -649,8 +649,8 @@ func (r *RayServiceReconciler) calculateTrafficRoutedPercent(ctx context.Context pendingServiceStatus := &rayServiceInstance.Status.PendingServiceStatus // Default to 100% traffic on the active cluster. - activeClusterWeight = 100 - pendingClusterWeight = 0 + activeClusterWeight = ptr.Deref(activeServiceStatus.TrafficRoutedPercent, 100) + pendingClusterWeight = ptr.Deref(pendingServiceStatus.TrafficRoutedPercent, 0) if isPendingClusterReady { // Zero-downtime upgrade in progress. @@ -660,9 +660,7 @@ func (r *RayServiceReconciler) calculateTrafficRoutedPercent(ctx context.Context } // Check that target_capacity has been updated before migrating traffic. - pendingClusterWeight = ptr.Deref(pendingServiceStatus.TrafficRoutedPercent, 0) pendingClusterTargetCapacity := ptr.Deref(pendingServiceStatus.TargetCapacity, 0) - activeClusterWeight = ptr.Deref(activeServiceStatus.TrafficRoutedPercent, 100) if pendingClusterWeight == pendingClusterTargetCapacity { // Stop traffic migration because the pending cluster's current traffic weight has reached its target capacity limit. 
From 20736804317c94e7ef0b1f5a42f89f87a738546d Mon Sep 17 00:00:00 2001 From: Ryan O'Leary Date: Thu, 23 Oct 2025 23:35:33 +0000 Subject: [PATCH 56/56] Fix test after suggested fix Signed-off-by: Ryan O'Leary --- .../ray/rayservice_controller_unit_test.go | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/ray-operator/controllers/ray/rayservice_controller_unit_test.go b/ray-operator/controllers/ray/rayservice_controller_unit_test.go index 48703c6b3ea..169e6d2bc5d 100644 --- a/ray-operator/controllers/ray/rayservice_controller_unit_test.go +++ b/ray-operator/controllers/ray/rayservice_controller_unit_test.go @@ -1642,12 +1642,12 @@ func TestReconcileHTTPRoute(t *testing.T) { Status: rayv1.RayServiceStatuses{ ActiveServiceStatus: rayv1.RayServiceStatus{ RayClusterName: activeCluster.Name, - TrafficRoutedPercent: ptr.To(int32(80)), + TrafficRoutedPercent: ptr.To(int32(100)), TargetCapacity: ptr.To(int32(100)), }, PendingServiceStatus: rayv1.RayServiceStatus{ RayClusterName: pendingCluster.Name, - TrafficRoutedPercent: ptr.To(int32(20)), + TrafficRoutedPercent: ptr.To(int32(0)), TargetCapacity: ptr.To(int32(100)), }, }, @@ -1680,15 +1680,15 @@ func TestReconcileHTTPRoute(t *testing.T) { name: "Create new HTTPRoute with existing weights.", isPendingClusterReady: true, pendingClusterExists: true, - expectedActiveWeight: 70, - expectedPendingWeight: 30, + expectedActiveWeight: 90, + expectedPendingWeight: 10, }, { name: "Update HTTPRoute when pending cluster is ready.", isPendingClusterReady: true, pendingClusterExists: true, - expectedActiveWeight: 70, - expectedPendingWeight: 30, + expectedActiveWeight: 90, + expectedPendingWeight: 10, }, { name: "Existing HTTPRoute, time since LastTrafficMigratedTime >= IntervalSeconds so updates HTTPRoute.", @@ -1701,8 +1701,8 @@ func TestReconcileHTTPRoute(t *testing.T) { ObjectMeta: metav1.ObjectMeta{Name: routeName, Namespace: namespace}, Spec: gwv1.HTTPRouteSpec{}, }, - expectedActiveWeight: 70, - expectedPendingWeight: 30, + expectedActiveWeight: 90, + expectedPendingWeight: 10, }, { name: "Existing HTTPRoute, time since LastTrafficMigratedTime < IntervalSeconds so no update.", @@ -1711,8 +1711,8 @@ func TestReconcileHTTPRoute(t *testing.T) { modifier: func(rs *rayv1.RayService) { rs.Status.PendingServiceStatus.LastTrafficMigratedTime = &metav1.Time{Time: time.Now()} }, - expectedActiveWeight: 80, - expectedPendingWeight: 20, + expectedActiveWeight: 100, + expectedPendingWeight: 0, }, }