diff --git a/docs/reference/api.md b/docs/reference/api.md
index c7d9e46ffda..dc621718f0a 100644
--- a/docs/reference/api.md
+++ b/docs/reference/api.md
@@ -55,6 +55,25 @@ _Appears in:_
+#### ClusterUpgradeOptions
+
+
+
+These options are currently only supported for the NewClusterWithIncrementalUpgrade type.
+
+
+
+_Appears in:_
+- [RayServiceUpgradeStrategy](#rayserviceupgradestrategy)
+
+| Field | Description | Default | Validation |
+| --- | --- | --- | --- |
+| `maxSurgePercent` _integer_ | The percentage of Serve request capacity that the upgraded cluster should scale to handle each interval. Defaults to 100%. | 100 | |
+| `stepSizePercent` _integer_ | The percentage of traffic to switch to the upgraded RayCluster at a set interval after scaling by MaxSurgePercent. | | |
+| `intervalSeconds` _integer_ | The interval in seconds between transferring StepSizePercent traffic from the old to the new RayCluster. | | |
+| `gatewayClassName` _string_ | The name of the Gateway Class installed by the Kubernetes Cluster admin. | | |
+
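+An illustrative `upgradeStrategy` snippet (the values below are placeholders, and `istio` assumes a
+GatewayClass of that name has been installed by the cluster admin):
+
+```yaml
+upgradeStrategy:
+  type: NewClusterWithIncrementalUpgrade
+  clusterUpgradeOptions:
+    gatewayClassName: istio   # hypothetical GatewayClass name
+    maxSurgePercent: 20       # scale the new cluster's capacity by 20% per interval
+    stepSizePercent: 10       # shift 10% of traffic per interval
+    intervalSeconds: 30       # seconds between traffic shifts
+```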
+
#### DeletionCondition
@@ -377,6 +396,7 @@ _Appears in:_
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
-| `type` _[RayServiceUpgradeType](#rayserviceupgradetype)_ | Type represents the strategy used when upgrading the RayService. Currently supports `NewCluster` and `None`. | | |
+| `type` _[RayServiceUpgradeType](#rayserviceupgradetype)_ | Type represents the strategy used when upgrading the RayService. Currently supports `NewClusterWithIncrementalUpgrade`, `NewCluster`, and `None`. | | |
+| `clusterUpgradeOptions` _[ClusterUpgradeOptions](#clusterupgradeoptions)_ | ClusterUpgradeOptions defines the behavior of an upgrade using the NewClusterWithIncrementalUpgrade type. The RayServiceIncrementalUpgrade feature gate must be enabled to set ClusterUpgradeOptions. | | |
#### RayServiceUpgradeType
diff --git a/go.mod b/go.mod
index 472e6d593df..e93dc132eda 100644
--- a/go.mod
+++ b/go.mod
@@ -73,7 +73,7 @@ require (
github.com/liggitt/tabwriter v0.0.0-20181228230101-89fcab3d43de // indirect
github.com/mailru/easyjson v0.9.0 // indirect
github.com/mattn/go-colorable v0.1.13 // indirect
- github.com/mattn/go-isatty v0.0.19 // indirect
+ github.com/mattn/go-isatty v0.0.20 // indirect
github.com/mitchellh/go-wordwrap v1.0.1 // indirect
github.com/moby/spdystream v0.5.0 // indirect
github.com/moby/term v0.5.0 // indirect
@@ -95,12 +95,12 @@ require (
go.uber.org/automaxprocs v1.6.0 // indirect
go.uber.org/multierr v1.11.0 // indirect
go.uber.org/zap v1.27.0 // indirect
- golang.org/x/net v0.38.0 // indirect
+ golang.org/x/net v0.39.0 // indirect
golang.org/x/oauth2 v0.27.0 // indirect
- golang.org/x/sync v0.12.0 // indirect
+ golang.org/x/sync v0.13.0 // indirect
golang.org/x/sys v0.32.0 // indirect
- golang.org/x/term v0.30.0 // indirect
- golang.org/x/text v0.23.0 // indirect
+ golang.org/x/term v0.31.0 // indirect
+ golang.org/x/text v0.24.0 // indirect
golang.org/x/time v0.10.0 // indirect
golang.org/x/tools v0.31.0 // indirect
gomodules.xyz/jsonpatch/v2 v2.4.0 // indirect
@@ -112,6 +112,7 @@ require (
k8s.io/component-base v0.33.1 // indirect
k8s.io/component-helpers v0.33.1 // indirect
k8s.io/kube-openapi v0.0.0-20250318190949-c8a335a9a2ff // indirect
+ sigs.k8s.io/gateway-api v1.3.0 // indirect
sigs.k8s.io/json v0.0.0-20241014173422-cfa47c3a1cc8 // indirect
sigs.k8s.io/kustomize/api v0.19.0 // indirect
sigs.k8s.io/kustomize/kyaml v0.19.0 // indirect
diff --git a/go.sum b/go.sum
index dddab9f7e86..22e4f1113d9 100644
--- a/go.sum
+++ b/go.sum
@@ -139,8 +139,9 @@ github.com/mailru/easyjson v0.9.0/go.mod h1:1+xMtQp2MRNVL/V1bOzuP3aP8VNwRW55fQUt
github.com/mattn/go-colorable v0.1.13 h1:fFA4WZxdEF4tXPZVKMLwD8oUnCTTo08duU7wxecdEvA=
github.com/mattn/go-colorable v0.1.13/go.mod h1:7S9/ev0klgBDR4GtXTXX8a3vIGJpMovkB8vQcUbaXHg=
github.com/mattn/go-isatty v0.0.16/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM=
-github.com/mattn/go-isatty v0.0.19 h1:JITubQf0MOLdlGRuRq+jtsDlekdYPia9ZFsB8h/APPA=
github.com/mattn/go-isatty v0.0.19/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
+github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY=
+github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
github.com/mitchellh/go-wordwrap v1.0.1 h1:TLuKupo69TCn6TQSyGxwI1EblZZEsQ0vMlAFQflz0v0=
github.com/mitchellh/go-wordwrap v1.0.1/go.mod h1:R62XHJLzvMFRBbcrT7m7WgmE1eOyTSsCt+hzestvNj0=
github.com/moby/spdystream v0.5.0 h1:7r0J1Si3QO/kjRitvSLVVFUjxMEb/YLj6S9FF62JBCU=
@@ -263,8 +264,8 @@ golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLL
golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU=
golang.org/x/net v0.0.0-20210405180319-a5a99cb37ef4/go.mod h1:p54w0d4576C0XHj96bSt6lcn1PtDYWL6XObtHCRCNQM=
-golang.org/x/net v0.38.0 h1:vRMAPTMaeGqVhG5QyLJHqNDwecKTomGeqbnfZyKlBI8=
-golang.org/x/net v0.38.0/go.mod h1:ivrbrMbzFq5J41QOQh0siUuly180yBYtLp+CKbEaFx8=
+golang.org/x/net v0.39.0 h1:ZCu7HMWDxpXpaiKdhzIfaltL9Lp31x/3fCP11bc6/fY=
+golang.org/x/net v0.39.0/go.mod h1:X7NRbYVEA+ewNkCNyJ513WmMdQ3BineSwVtN2zD/d+E=
golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U=
golang.org/x/oauth2 v0.27.0 h1:da9Vo7/tDv5RH/7nZDz1eMGS/q1Vv1N/7FCrBhI9I3M=
golang.org/x/oauth2 v0.27.0/go.mod h1:onh5ek6nERTohokkhCD/y2cV4Do3fxFHFuAejCkRWT8=
@@ -274,8 +275,8 @@ golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJ
golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20210220032951-036812b2e83c/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
-golang.org/x/sync v0.12.0 h1:MHc5BpPuC30uJk597Ri8TV3CNZcTLu6B6z4lJy+g6Jw=
-golang.org/x/sync v0.12.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA=
+golang.org/x/sync v0.13.0 h1:AauUjRAJ9OSnvULf/ARrrVywoJDy0YS2AwQ98I37610=
+golang.org/x/sync v0.13.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA=
golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
@@ -292,12 +293,12 @@ golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.32.0 h1:s77OFDvIQeibCmezSnk/q6iAfkdiQaJi4VzroCFrN20=
golang.org/x/sys v0.32.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k=
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
-golang.org/x/term v0.30.0 h1:PQ39fJZ+mfadBm0y5WlL4vlM7Sx1Hgf13sMIY2+QS9Y=
-golang.org/x/term v0.30.0/go.mod h1:NYYFdzHoI5wRh/h5tDMdMqCqPJZEuNqVR5xJLd/n67g=
+golang.org/x/term v0.31.0 h1:erwDkOK1Msy6offm1mOgvspSkslFnIGsFnxOKoufg3o=
+golang.org/x/term v0.31.0/go.mod h1:R4BeIy7D95HzImkxGkTW1UQTtP54tio2RyHz7PwK0aw=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
-golang.org/x/text v0.23.0 h1:D71I7dUrlY+VX0gQShAThNGHFxZ13dGLBHQLVl1mJlY=
-golang.org/x/text v0.23.0/go.mod h1:/BLNzu4aZCJ1+kcD0DNRotWKage4q2rGVAg4o22unh4=
+golang.org/x/text v0.24.0 h1:dd5Bzh4yt5KYA8f9CJHCP4FB4D51c2c6JvN37xJJkJ0=
+golang.org/x/text v0.24.0/go.mod h1:L8rBsPeo2pSS+xqN0d5u2ikmjtmoJbDBT1b7nHvFCdU=
golang.org/x/time v0.10.0 h1:3usCWA8tQn0L8+hFJQNgzpWbd89begxN66o1Ojdn5L4=
golang.org/x/time v0.10.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
@@ -380,6 +381,8 @@ k8s.io/utils v0.0.0-20250502105355-0f33e8f1c979 h1:jgJW5IePPXLGB8e/1wvd0Ich9QE97
k8s.io/utils v0.0.0-20250502105355-0f33e8f1c979/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0=
sigs.k8s.io/controller-runtime v0.21.0 h1:CYfjpEuicjUecRk+KAeyYh+ouUBn4llGyDYytIGcJS8=
sigs.k8s.io/controller-runtime v0.21.0/go.mod h1:OSg14+F65eWqIu4DceX7k/+QRAbTTvxeQSNSOQpukWM=
+sigs.k8s.io/gateway-api v1.3.0 h1:q6okN+/UKDATola4JY7zXzx40WO4VISk7i9DIfOvr9M=
+sigs.k8s.io/gateway-api v1.3.0/go.mod h1:d8NV8nJbaRbEKem+5IuxkL8gJGOZ+FJ+NvOIltV8gDk=
sigs.k8s.io/json v0.0.0-20241014173422-cfa47c3a1cc8 h1:gBQPwqORJ8d8/YNZWEjoZs7npUVDpVXUUOFfW6CgAqE=
sigs.k8s.io/json v0.0.0-20241014173422-cfa47c3a1cc8/go.mod h1:mdzfpAEoE6DHQEN0uh9ZbOCuHbLK5wOm7dK4ctXE9Tg=
sigs.k8s.io/kustomize/api v0.19.0 h1:F+2HB2mU1MSiR9Hp1NEgoU2q9ItNOaBJl0I4Dlus5SQ=
diff --git a/helm-chart/kuberay-operator/README.md b/helm-chart/kuberay-operator/README.md
index 2a50677e9e1..ecc0f8cf988 100644
--- a/helm-chart/kuberay-operator/README.md
+++ b/helm-chart/kuberay-operator/README.md
@@ -174,6 +174,8 @@ spec:
| featureGates[1].enabled | bool | `false` | |
| featureGates[2].name | string | `"RayMultiHostIndexing"` | |
| featureGates[2].enabled | bool | `false` | |
+| featureGates[3].name | string | `"RayServiceIncrementalUpgrade"` | |
+| featureGates[3].enabled | bool | `false` | |
| metrics.enabled | bool | `true` | Whether KubeRay operator should emit control plane metrics. |
| metrics.serviceMonitor.enabled | bool | `false` | Enable a prometheus ServiceMonitor |
| metrics.serviceMonitor.interval | string | `"30s"` | Prometheus ServiceMonitor interval |
diff --git a/helm-chart/kuberay-operator/crds/ray.io_rayservices.yaml b/helm-chart/kuberay-operator/crds/ray.io_rayservices.yaml
index e2d61172a3c..267de9a20f8 100644
--- a/helm-chart/kuberay-operator/crds/ray.io_rayservices.yaml
+++ b/helm-chart/kuberay-operator/crds/ray.io_rayservices.yaml
@@ -8241,6 +8241,25 @@ spec:
type: integer
upgradeStrategy:
properties:
+ clusterUpgradeOptions:
+ properties:
+ gatewayClassName:
+ type: string
+ intervalSeconds:
+ format: int32
+ type: integer
+ maxSurgePercent:
+ default: 100
+ format: int32
+ type: integer
+ stepSizePercent:
+ format: int32
+ type: integer
+ required:
+ - gatewayClassName
+ - intervalSeconds
+ - stepSizePercent
+ type: object
type:
type: string
type: object
@@ -8269,6 +8288,9 @@ spec:
type: string
type: object
type: object
+ lastTrafficMigratedTime:
+ format: date-time
+ type: string
rayClusterName:
type: string
rayClusterStatus:
@@ -8383,6 +8405,12 @@ spec:
type: string
type: object
type: object
+ targetCapacity:
+ format: int32
+ type: integer
+ trafficRoutedPercent:
+ format: int32
+ type: integer
type: object
conditions:
items:
@@ -8452,6 +8480,9 @@ spec:
type: string
type: object
type: object
+ lastTrafficMigratedTime:
+ format: date-time
+ type: string
rayClusterName:
type: string
rayClusterStatus:
@@ -8566,6 +8597,12 @@ spec:
type: string
type: object
type: object
+ targetCapacity:
+ format: int32
+ type: integer
+ trafficRoutedPercent:
+ format: int32
+ type: integer
type: object
serviceStatus:
type: string
diff --git a/helm-chart/kuberay-operator/templates/_helpers.tpl b/helm-chart/kuberay-operator/templates/_helpers.tpl
index 5d14510a61b..d5e0e7352d0 100644
--- a/helm-chart/kuberay-operator/templates/_helpers.tpl
+++ b/helm-chart/kuberay-operator/templates/_helpers.tpl
@@ -222,6 +222,17 @@ rules:
- patch
- update
- watch
+- apiGroups:
+ - gateway.networking.k8s.io
+ resources:
+ - gateways
+ - httproutes
+ verbs:
+ - create
+ - get
+ - list
+ - update
+ - watch
- apiGroups:
- networking.k8s.io
resources:
diff --git a/helm-chart/kuberay-operator/values.yaml b/helm-chart/kuberay-operator/values.yaml
index 3bc1d2765c4..1b7b46020b0 100644
--- a/helm-chart/kuberay-operator/values.yaml
+++ b/helm-chart/kuberay-operator/values.yaml
@@ -119,6 +119,8 @@ featureGates:
enabled: false
- name: RayMultiHostIndexing
enabled: false
+- name: RayServiceIncrementalUpgrade
+ enabled: false
# Configurations for KubeRay operator metrics.
metrics:
diff --git a/ray-operator/Makefile b/ray-operator/Makefile
index faab31894b5..1ef2ad7e6db 100644
--- a/ray-operator/Makefile
+++ b/ray-operator/Makefile
@@ -75,8 +75,16 @@ test-e2e-autoscaler: WHAT ?= ./test/e2eautoscaler
test-e2e-autoscaler: manifests fmt vet ## Run e2e autoscaler tests.
go test -timeout 30m -v $(WHAT)
+test-e2e-rayservice: WHAT ?= ./test/e2erayservice
+test-e2e-rayservice: manifests fmt vet ## Run e2e RayService tests.
+ go test -timeout 30m -v $(WHAT)
+
test-e2e-upgrade: WHAT ?= ./test/e2eupgrade
-test-e2e-upgrade: manifests fmt vet ## Run e2e tests.
+test-e2e-upgrade: manifests fmt vet ## Run e2e operator upgrade tests.
+ go test -timeout 30m -v $(WHAT)
+
+test-e2e-incremental-upgrade: WHAT ?= ./test/e2eincrementalupgrade
+test-e2e-incremental-upgrade: manifests fmt vet ## Run e2e RayService incremental upgrade tests.
go test -timeout 30m -v $(WHAT)
test-e2e-rayjob-submitter: WHAT ?= ./test/e2erayjobsubmitter
diff --git a/ray-operator/apis/ray/v1/rayservice_types.go b/ray-operator/apis/ray/v1/rayservice_types.go
index e7d73e07d8e..b8fb29a81ae 100644
--- a/ray-operator/apis/ray/v1/rayservice_types.go
+++ b/ray-operator/apis/ray/v1/rayservice_types.go
@@ -22,6 +22,9 @@ const (
type RayServiceUpgradeType string
const (
+ // During upgrade, the NewClusterWithIncrementalUpgrade strategy creates an upgraded cluster and
+ // gradually scales it up and migrates traffic to it using the Gateway API.
+ NewClusterWithIncrementalUpgrade RayServiceUpgradeType = "NewClusterWithIncrementalUpgrade"
// During upgrade, NewCluster strategy will create new upgraded cluster and switch to it when it becomes ready
NewCluster RayServiceUpgradeType = "NewCluster"
// No new cluster will be created while the strategy is set to None
@@ -57,10 +60,27 @@ var DeploymentStatusEnum = struct {
UNHEALTHY: "UNHEALTHY",
}
+// These options are currently only supported for the NewClusterWithIncrementalUpgrade type.
+type ClusterUpgradeOptions struct {
+ // The percentage of Serve request capacity that the upgraded cluster should scale to handle each interval.
+ // Defaults to 100%.
+ // +kubebuilder:default:=100
+ MaxSurgePercent *int32 `json:"maxSurgePercent,omitempty"`
+ // The percentage of traffic to switch to the upgraded RayCluster at a set interval after scaling by MaxSurgePercent.
+ StepSizePercent *int32 `json:"stepSizePercent"`
+ // The interval in seconds between transferring StepSizePercent traffic from the old to the new RayCluster.
+ IntervalSeconds *int32 `json:"intervalSeconds"`
+ // The name of the Gateway Class installed by the Kubernetes Cluster admin.
+ GatewayClassName string `json:"gatewayClassName"`
+}
+
type RayServiceUpgradeStrategy struct {
- // Type represents the strategy used when upgrading the RayService. Currently supports `NewCluster` and `None`.
+ // Type represents the strategy used when upgrading the RayService. Currently supports `NewClusterWithIncrementalUpgrade`, `NewCluster`, and `None`.
// +optional
Type *RayServiceUpgradeType `json:"type,omitempty"`
+ // ClusterUpgradeOptions defines the behavior of an upgrade using the NewClusterWithIncrementalUpgrade type.
+ // The RayServiceIncrementalUpgrade feature gate must be enabled to set ClusterUpgradeOptions.
+ ClusterUpgradeOptions *ClusterUpgradeOptions `json:"clusterUpgradeOptions,omitempty"`
}
// RayServiceSpec defines the desired state of RayService
@@ -129,6 +149,20 @@ type RayServiceStatus struct {
// Important: Run "make" to regenerate code after modifying this file
// +optional
Applications map[string]AppStatus `json:"applicationStatuses,omitempty"`
+ // TargetCapacity is the `target_capacity` percentage for all Serve replicas
+ // across the cluster for this RayService. The `num_replicas`, `min_replicas`, `max_replicas`,
+ // and `initial_replicas` for each deployment will be scaled by this percentage.
+ // +optional
+ TargetCapacity *int32 `json:"targetCapacity,omitempty"`
+ // TrafficRoutedPercent is the percentage of traffic that is routed to the Serve service
+ // for this RayService. TrafficRoutedPercent is updated to reflect the weight on the HTTPRoute
+ // created for this RayService during incremental upgrades to a new cluster.
+ // +optional
+ TrafficRoutedPercent *int32 `json:"trafficRoutedPercent,omitempty"`
+ // LastTrafficMigratedTime is the last time that TrafficRoutedPercent was updated to a new value
+ // for this RayService.
+ // +optional
+ LastTrafficMigratedTime *metav1.Time `json:"lastTrafficMigratedTime,omitempty"`
// +optional
RayClusterName string `json:"rayClusterName,omitempty"`
// +optional
@@ -184,8 +218,7 @@ const (
type RayService struct {
metav1.TypeMeta `json:",inline"`
metav1.ObjectMeta `json:"metadata,omitempty"`
-
- Spec RayServiceSpec `json:"spec,omitempty"`
+ Spec RayServiceSpec `json:"spec,omitempty"`
// +optional
Status RayServiceStatuses `json:"status,omitempty"`
}
diff --git a/ray-operator/apis/ray/v1/zz_generated.deepcopy.go b/ray-operator/apis/ray/v1/zz_generated.deepcopy.go
index 5a6ce86bc10..8deb750000c 100644
--- a/ray-operator/apis/ray/v1/zz_generated.deepcopy.go
+++ b/ray-operator/apis/ray/v1/zz_generated.deepcopy.go
@@ -103,6 +103,36 @@ func (in *AutoscalerOptions) DeepCopy() *AutoscalerOptions {
return out
}
+// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
+func (in *ClusterUpgradeOptions) DeepCopyInto(out *ClusterUpgradeOptions) {
+ *out = *in
+ if in.MaxSurgePercent != nil {
+ in, out := &in.MaxSurgePercent, &out.MaxSurgePercent
+ *out = new(int32)
+ **out = **in
+ }
+ if in.StepSizePercent != nil {
+ in, out := &in.StepSizePercent, &out.StepSizePercent
+ *out = new(int32)
+ **out = **in
+ }
+ if in.IntervalSeconds != nil {
+ in, out := &in.IntervalSeconds, &out.IntervalSeconds
+ *out = new(int32)
+ **out = **in
+ }
+}
+
+// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ClusterUpgradeOptions.
+func (in *ClusterUpgradeOptions) DeepCopy() *ClusterUpgradeOptions {
+ if in == nil {
+ return nil
+ }
+ out := new(ClusterUpgradeOptions)
+ in.DeepCopyInto(out)
+ return out
+}
+
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *DeletionCondition) DeepCopyInto(out *DeletionCondition) {
*out = *in
@@ -721,6 +751,20 @@ func (in *RayServiceStatus) DeepCopyInto(out *RayServiceStatus) {
(*out)[key] = *val.DeepCopy()
}
}
+ if in.TargetCapacity != nil {
+ in, out := &in.TargetCapacity, &out.TargetCapacity
+ *out = new(int32)
+ **out = **in
+ }
+ if in.TrafficRoutedPercent != nil {
+ in, out := &in.TrafficRoutedPercent, &out.TrafficRoutedPercent
+ *out = new(int32)
+ **out = **in
+ }
+ if in.LastTrafficMigratedTime != nil {
+ in, out := &in.LastTrafficMigratedTime, &out.LastTrafficMigratedTime
+ *out = (*in).DeepCopy()
+ }
in.RayClusterStatus.DeepCopyInto(&out.RayClusterStatus)
}
@@ -770,6 +814,11 @@ func (in *RayServiceUpgradeStrategy) DeepCopyInto(out *RayServiceUpgradeStrategy
*out = new(RayServiceUpgradeType)
**out = **in
}
+ if in.ClusterUpgradeOptions != nil {
+ in, out := &in.ClusterUpgradeOptions, &out.ClusterUpgradeOptions
+ *out = new(ClusterUpgradeOptions)
+ (*in).DeepCopyInto(*out)
+ }
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new RayServiceUpgradeStrategy.
diff --git a/ray-operator/config/crd/bases/ray.io_rayservices.yaml b/ray-operator/config/crd/bases/ray.io_rayservices.yaml
index e2d61172a3c..267de9a20f8 100644
--- a/ray-operator/config/crd/bases/ray.io_rayservices.yaml
+++ b/ray-operator/config/crd/bases/ray.io_rayservices.yaml
@@ -8241,6 +8241,25 @@ spec:
type: integer
upgradeStrategy:
properties:
+ clusterUpgradeOptions:
+ properties:
+ gatewayClassName:
+ type: string
+ intervalSeconds:
+ format: int32
+ type: integer
+ maxSurgePercent:
+ default: 100
+ format: int32
+ type: integer
+ stepSizePercent:
+ format: int32
+ type: integer
+ required:
+ - gatewayClassName
+ - intervalSeconds
+ - stepSizePercent
+ type: object
type:
type: string
type: object
@@ -8269,6 +8288,9 @@ spec:
type: string
type: object
type: object
+ lastTrafficMigratedTime:
+ format: date-time
+ type: string
rayClusterName:
type: string
rayClusterStatus:
@@ -8383,6 +8405,12 @@ spec:
type: string
type: object
type: object
+ targetCapacity:
+ format: int32
+ type: integer
+ trafficRoutedPercent:
+ format: int32
+ type: integer
type: object
conditions:
items:
@@ -8452,6 +8480,9 @@ spec:
type: string
type: object
type: object
+ lastTrafficMigratedTime:
+ format: date-time
+ type: string
rayClusterName:
type: string
rayClusterStatus:
@@ -8566,6 +8597,12 @@ spec:
type: string
type: object
type: object
+ targetCapacity:
+ format: int32
+ type: integer
+ trafficRoutedPercent:
+ format: int32
+ type: integer
type: object
serviceStatus:
type: string
diff --git a/ray-operator/config/rbac/role.yaml b/ray-operator/config/rbac/role.yaml
index ba840f0c27f..9ea1db93190 100644
--- a/ray-operator/config/rbac/role.yaml
+++ b/ray-operator/config/rbac/role.yaml
@@ -107,6 +107,17 @@ rules:
- patch
- update
- watch
+- apiGroups:
+ - gateway.networking.k8s.io
+ resources:
+ - gateways
+ - httproutes
+ verbs:
+ - create
+ - get
+ - list
+ - update
+ - watch
- apiGroups:
- networking.k8s.io
resources:
diff --git a/ray-operator/controllers/ray/common/association.go b/ray-operator/controllers/ray/common/association.go
index 63eefa94bc4..1539e49aa88 100644
--- a/ray-operator/controllers/ray/common/association.go
+++ b/ray-operator/controllers/ray/common/association.go
@@ -203,3 +203,17 @@ func RayClusterNetworkResourcesOptions(instance *rayv1.RayCluster) AssociationOp
},
}
}
+
+func RayServiceGatewayNamespacedName(rayService *rayv1.RayService) types.NamespacedName {
+ return types.NamespacedName{
+ Name: fmt.Sprintf("%s-gateway", rayService.Name),
+ Namespace: rayService.Namespace,
+ }
+}
+
+func RayServiceHTTPRouteNamespacedName(rayService *rayv1.RayService) types.NamespacedName {
+ return types.NamespacedName{
+ Name: fmt.Sprintf("%s-httproute", rayService.Name),
+ Namespace: rayService.Namespace,
+ }
+}
diff --git a/ray-operator/controllers/ray/common/job.go b/ray-operator/controllers/ray/common/job.go
index 3cb070be168..05025a3e86e 100644
--- a/ray-operator/controllers/ray/common/job.go
+++ b/ray-operator/controllers/ray/common/job.go
@@ -91,7 +91,7 @@ func BuildJobSubmitCommand(rayJobInstance *rayv1.RayJob, submissionMode rayv1.Jo
// The sidecar submitter shares the same network namespace as the Ray dashboard,
// so it uses 127.0.0.1 to connect to the Ray dashboard.
rayHeadContainer := rayJobInstance.Spec.RayClusterSpec.HeadGroupSpec.Template.Spec.Containers[utils.RayContainerIndex]
- port = utils.FindContainerPort(&rayHeadContainer, utils.DashboardPortName, utils.DefaultDashboardPort)
+ port = int(utils.FindContainerPort(&rayHeadContainer, utils.DashboardPortName, utils.DefaultDashboardPort))
address = "http://127.0.0.1:" + strconv.Itoa(port)
case rayv1.K8sJobMode:
// Submitter is a separate K8s Job; use cluster dashboard address.
diff --git a/ray-operator/controllers/ray/common/service.go b/ray-operator/controllers/ray/common/service.go
index 71cea97c005..545b3a6ae98 100644
--- a/ray-operator/controllers/ray/common/service.go
+++ b/ray-operator/controllers/ray/common/service.go
@@ -10,6 +10,7 @@ import (
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
ctrl "sigs.k8s.io/controller-runtime"
+ gwv1 "sigs.k8s.io/gateway-api/apis/v1"
rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1"
"github.com/ray-project/kuberay/ray-operator/controllers/ray/utils"
@@ -184,7 +185,10 @@ func BuildServeService(ctx context.Context, rayService rayv1.RayService, rayClus
namespace := rayCluster.Namespace
crdType := utils.RayClusterCRD
if isRayService {
- name = rayService.Name
+ // For an incremental upgrade, the Serve service name is based on the unique RayCluster name.
+ if !utils.IsIncrementalUpgradeEnabled(&rayService.Spec) {
+ name = rayService.Name
+ }
namespace = rayService.Namespace
crdType = utils.RayServiceCRD
}
@@ -225,7 +229,7 @@ func BuildServeService(ctx context.Context, rayService rayv1.RayService, rayClus
"otherwise, the Kubernetes service for Ray Serve will not be created.")
}
- if rayService.Spec.ServeService != nil {
+ if rayService.Spec.ServeService != nil && !utils.IsIncrementalUpgradeEnabled(&rayService.Spec) {
// Use the provided "custom" ServeService.
// Deep copy the ServeService to avoid modifying the original object
serveService := rayService.Spec.ServeService.DeepCopy()
@@ -317,6 +321,26 @@ func BuildHeadlessServiceForRayCluster(rayCluster rayv1.RayCluster) *corev1.Serv
return headlessService
}
+// GetServePort finds the container port named "serve" in the RayCluster's head group spec.
+// It returns the default Ray Serve port 8000 if not explicitly defined.
+func GetServePort(cluster *rayv1.RayCluster) gwv1.PortNumber {
+ if cluster == nil || len(cluster.Spec.HeadGroupSpec.Template.Spec.Containers) == 0 {
+ return gwv1.PortNumber(utils.DefaultServingPort)
+ }
+
+ // Get the head container
+ headContainer := &cluster.Spec.HeadGroupSpec.Template.Spec.Containers[utils.RayContainerIndex]
+
+ // Find the port named "serve" in the head group's container spec.
+ port := utils.FindContainerPort(
+ headContainer,
+ utils.ServingPortName,
+ utils.DefaultServingPort,
+ )
+
+ return gwv1.PortNumber(port)
+}
+
func setServiceTypeForUserProvidedService(ctx context.Context, service *corev1.Service, defaultType corev1.ServiceType) {
log := ctrl.LoggerFrom(ctx)
// If the user has not specified a service type, use the default service type
diff --git a/ray-operator/controllers/ray/rayservice_controller.go b/ray-operator/controllers/ray/rayservice_controller.go
index 7a1a50a36f6..26ff10a6373 100644
--- a/ray-operator/controllers/ray/rayservice_controller.go
+++ b/ray-operator/controllers/ray/rayservice_controller.go
@@ -6,6 +6,7 @@ import (
"fmt"
"math"
"os"
+ "reflect"
"strconv"
"strings"
"time"
@@ -21,6 +22,7 @@ import (
"k8s.io/apimachinery/pkg/util/yaml"
"k8s.io/client-go/tools/record"
"k8s.io/utils/lru"
+ "k8s.io/utils/ptr"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/builder"
"sigs.k8s.io/controller-runtime/pkg/client"
@@ -28,6 +30,7 @@ import (
"sigs.k8s.io/controller-runtime/pkg/manager"
"sigs.k8s.io/controller-runtime/pkg/predicate"
"sigs.k8s.io/controller-runtime/pkg/reconcile"
+ gwv1 "sigs.k8s.io/gateway-api/apis/v1"
rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1"
"github.com/ray-project/kuberay/ray-operator/controllers/ray/common"
@@ -90,6 +93,8 @@ func NewRayServiceReconciler(_ context.Context, mgr manager.Manager, provider ut
// +kubebuilder:rbac:groups=core,resources=services/proxy,verbs=get;update;patch
// +kubebuilder:rbac:groups=coordination.k8s.io,resources=leases,verbs=get;list;create;update
// +kubebuilder:rbac:groups=core,resources=serviceaccounts,verbs=get;list;watch;create;delete
+// +kubebuilder:rbac:groups="gateway.networking.k8s.io",resources=gateways,verbs=get;list;watch;create;update;
+// +kubebuilder:rbac:groups="gateway.networking.k8s.io",resources=httproutes,verbs=get;list;watch;create;update;
// +kubebuilder:rbac:groups="rbac.authorization.k8s.io",resources=roles,verbs=get;list;watch;create;delete;update
// +kubebuilder:rbac:groups="rbac.authorization.k8s.io",resources=rolebindings,verbs=get;list;watch;create;delete
@@ -146,6 +151,8 @@ func (r *RayServiceReconciler) Reconcile(ctx context.Context, request ctrl.Reque
// 1. If there is a pending cluster, reconcile serve applications for the pending cluster.
// 2. If there are both active and pending clusters, reconcile serve applications for the pending cluster only.
// 3. If there is no pending cluster, reconcile serve applications for the active cluster.
+ // 4. During NewClusterWithIncrementalUpgrade, reconcileServe will reconcile either the pending or active cluster
+ // based on total TargetCapacity.
var isActiveClusterReady, isPendingClusterReady bool = false, false
var activeClusterServeApplications, pendingClusterServeApplications map[string]rayv1.AppStatus = nil, nil
if pendingRayClusterInstance != nil {
@@ -162,6 +169,35 @@ func (r *RayServiceReconciler) Reconcile(ctx context.Context, request ctrl.Reque
if isActiveClusterReady, activeClusterServeApplications, err = r.reconcileServe(ctx, rayServiceInstance, activeRayClusterInstance); err != nil {
return ctrl.Result{RequeueAfter: ServiceDefaultRequeueDuration}, err
}
+ } else if activeRayClusterInstance != nil && pendingRayClusterInstance != nil && utils.IsIncrementalUpgradeEnabled(&rayServiceInstance.Spec) {
+ logger.Info("Reconciling the Serve applications for active cluster during NewClusterWithIncrementalUpgrade", "clusterName", activeRayClusterInstance.Name)
+ if isActiveClusterReady, activeClusterServeApplications, err = r.reconcileServe(ctx, rayServiceInstance, activeRayClusterInstance); err != nil {
+ return ctrl.Result{RequeueAfter: ServiceDefaultRequeueDuration}, err
+ }
+ }
+
+ // Check if NewClusterWithIncrementalUpgrade is enabled; if so, reconcile Gateway objects.
+ var httpRouteInstance *gwv1.HTTPRoute
+ if utils.IsIncrementalUpgradeEnabled(&rayServiceInstance.Spec) {
+ // Ensure per-cluster Serve service exists for the active and pending RayClusters.
+ if err = r.reconcilePerClusterServeService(ctx, rayServiceInstance, activeRayClusterInstance); err != nil {
+ return ctrl.Result{RequeueAfter: ServiceDefaultRequeueDuration}, err
+ }
+ if err = r.reconcilePerClusterServeService(ctx, rayServiceInstance, pendingRayClusterInstance); err != nil {
+ return ctrl.Result{RequeueAfter: ServiceDefaultRequeueDuration}, err
+ }
+ // Creates or updates a Gateway CR that points to the Serve services of
+ // the active and pending (if it exists) RayClusters. For incremental upgrades,
+ // the Gateway endpoint is used rather than the Serve service.
+ err = r.reconcileGateway(ctx, rayServiceInstance)
+ if err != nil {
+ return ctrl.Result{RequeueAfter: ServiceDefaultRequeueDuration}, client.IgnoreNotFound(err)
+ }
+ // Create or update the HTTPRoute for the Gateway, passing in the pending cluster readiness status.
+ httpRouteInstance, err = r.reconcileHTTPRoute(ctx, rayServiceInstance, isPendingClusterReady)
+ if err != nil {
+ return ctrl.Result{RequeueAfter: ServiceDefaultRequeueDuration}, client.IgnoreNotFound(err)
+ }
}
// Reconcile K8s services and make sure it points to the correct RayCluster.
@@ -170,7 +206,10 @@ func (r *RayServiceReconciler) Reconcile(ctx context.Context, request ctrl.Reque
targetCluster := activeRayClusterInstance
logMsg := "Reconciling K8s services to point to the active Ray cluster."
- if isPendingClusterReady {
+ isIncrementalUpgradeInProgress := utils.IsIncrementalUpgradeEnabled(&rayServiceInstance.Spec) && meta.IsStatusConditionTrue(rayServiceInstance.Status.Conditions, string(rayv1.UpgradeInProgress))
+ if isPendingClusterReady && !isIncrementalUpgradeInProgress {
+ // This switch is skipped during an incremental upgrade: the pending cluster becomes ready while the
+ // upgrade is still in progress, and traffic reaches it through its own per-cluster Serve service.
targetCluster = pendingRayClusterInstance
logMsg = "Reconciling K8s services to point to the pending Ray cluster to switch traffic because it is ready."
}
@@ -197,6 +236,7 @@ func (r *RayServiceReconciler) Reconcile(ctx context.Context, request ctrl.Reque
pendingRayClusterInstance,
activeClusterServeApplications,
pendingClusterServeApplications,
+ httpRouteInstance,
); err != nil {
return ctrl.Result{RequeueAfter: ServiceDefaultRequeueDuration}, err
}
@@ -224,7 +264,70 @@ func (r *RayServiceReconciler) reconcileServicesToReadyCluster(ctx context.Conte
return headSvc, serveSvc, nil
}
-func (r *RayServiceReconciler) calculateStatus(ctx context.Context, rayServiceInstance *rayv1.RayService, headSvc, serveSvc *corev1.Service, activeCluster, pendingCluster *rayv1.RayCluster, activeClusterServeApplications, pendingClusterServeApplications map[string]rayv1.AppStatus) error {
+// reconcilePromotionAndServingStatus handles the promotion logic after an upgrade. It returns
+// isPendingClusterServing, which is true if the main Kubernetes services point to the pending cluster.
+func reconcilePromotionAndServingStatus(ctx context.Context, headSvc, serveSvc *corev1.Service, rayServiceInstance *rayv1.RayService, pendingCluster *rayv1.RayCluster) (isPendingClusterServing bool) {
+ logger := ctrl.LoggerFrom(ctx)
+
+ // Step 1: Service Consistency Check. Ensure head and serve services point to the
+ // same cluster (active or pending).
+ clusterSvcPointsTo := utils.GetRayClusterNameFromService(headSvc)
+ if clusterSvcPointsTo != utils.GetRayClusterNameFromService(serveSvc) {
+ // This indicates a broken state that the controller cannot recover from automatically.
+ panic("headSvc and serveSvc are not pointing to the same cluster")
+ }
+
+ // Step 2: Cluster Switching Logic. Determine which cluster the services are currently pointing to and
+ // determine if promotion should occur.
+ pendingClusterName := rayServiceInstance.Status.PendingServiceStatus.RayClusterName
+ activeClusterName := rayServiceInstance.Status.ActiveServiceStatus.RayClusterName
+
+ // Verify that the service points to a known cluster (either active or pending).
+ if clusterSvcPointsTo != pendingClusterName && clusterSvcPointsTo != activeClusterName {
+ panic("clusterName from services is not equal to pendingCluster or activeCluster")
+ }
+
+ var shouldPromote bool
+ if utils.IsIncrementalUpgradeEnabled(&rayServiceInstance.Spec) {
+ // An incremental upgrade is complete when the active cluster has 0% capacity and the pending cluster has
+ // 100% of the traffic. We can't promote the pending cluster until traffic has been fully migrated.
+ if meta.IsStatusConditionTrue(rayServiceInstance.Status.Conditions, string(rayv1.UpgradeInProgress)) {
+ if utils.IsIncrementalUpgradeComplete(rayServiceInstance, pendingCluster) {
+ shouldPromote = true
+ logger.Info("Incremental upgrade completed, triggering promotion.", "rayService", rayServiceInstance.Name)
+ }
+ } else if activeClusterName == "" && pendingClusterName != "" {
+ // The Active cluster is empty when the RayCluster is first scaling up.
+ shouldPromote = true
+ }
+ } else {
+ // For traditional blue/green upgrade, promotion is complete when the Service selector has switched.
+ if activeClusterName != clusterSvcPointsTo {
+ shouldPromote = true
+ }
+ }
+
+ // Step 3: Promote the pending cluster if prior conditions are met.
+ if shouldPromote {
+ logger.Info("Promoting pending cluster to active.",
+ "oldCluster", rayServiceInstance.Status.ActiveServiceStatus.RayClusterName,
+ "newCluster", rayServiceInstance.Status.PendingServiceStatus.RayClusterName)
+
+ rayServiceInstance.Status.ActiveServiceStatus = rayServiceInstance.Status.PendingServiceStatus
+ rayServiceInstance.Status.PendingServiceStatus = rayv1.RayServiceStatus{}
+ }
+
+ return (clusterSvcPointsTo == pendingClusterName)
+}
+
+func (r *RayServiceReconciler) calculateStatus(
+ ctx context.Context,
+ rayServiceInstance *rayv1.RayService,
+ headSvc, serveSvc *corev1.Service,
+ activeCluster, pendingCluster *rayv1.RayCluster,
+ activeClusterServeApplications, pendingClusterServeApplications map[string]rayv1.AppStatus,
+ httpRoute *gwv1.HTTPRoute,
+) error {
logger := ctrl.LoggerFrom(ctx)
rayServiceInstance.Status.ObservedGeneration = rayServiceInstance.ObjectMeta.Generation
@@ -244,32 +347,35 @@ func (r *RayServiceReconciler) calculateStatus(ctx context.Context, rayServiceIn
rayServiceInstance.Status.ActiveServiceStatus.Applications = activeClusterServeApplications
rayServiceInstance.Status.PendingServiceStatus.Applications = pendingClusterServeApplications
- isPendingClusterServing := false
+ var isPendingClusterServing bool
if headSvc != nil && serveSvc != nil {
- pendingClusterName := rayServiceInstance.Status.PendingServiceStatus.RayClusterName
- activeClusterName := rayServiceInstance.Status.ActiveServiceStatus.RayClusterName
-
- // Promote the pending cluster to the active cluster if both RayService's head and serve services
- // have already pointed to the pending cluster.
- clusterName := utils.GetRayClusterNameFromService(headSvc)
- if clusterName != utils.GetRayClusterNameFromService(serveSvc) {
- panic("headSvc and serveSvc are not pointing to the same cluster")
- }
- // Verify cluster name matches either pending or active cluster
- if clusterName != pendingClusterName && clusterName != activeClusterName {
- panic("clusterName is not equal to pendingCluster or activeCluster")
- }
- isPendingClusterServing = clusterName == pendingClusterName
-
- // If services point to a different cluster than the active one, promote pending to active
- logger.Info("calculateStatus", "clusterSvcPointingTo", clusterName, "pendingClusterName", pendingClusterName, "activeClusterName", activeClusterName)
- if activeClusterName != clusterName {
- logger.Info("Promoting pending cluster to active",
- "oldCluster", rayServiceInstance.Status.ActiveServiceStatus.RayClusterName,
- "newCluster", clusterName)
- rayServiceInstance.Status.ActiveServiceStatus = rayServiceInstance.Status.PendingServiceStatus
- rayServiceInstance.Status.PendingServiceStatus = rayv1.RayServiceStatus{}
+ if utils.IsIncrementalUpgradeEnabled(&rayServiceInstance.Spec) {
+ logger.Info("Processing NewClusterWithIncrementalUpgrade strategy.", "rayService", rayServiceInstance.Name)
+ oldActivePercent := ptr.Deref(rayServiceInstance.Status.ActiveServiceStatus.TrafficRoutedPercent, -1)
+ oldPendingPercent := ptr.Deref(rayServiceInstance.Status.PendingServiceStatus.TrafficRoutedPercent, -1)
+
+ // Update TrafficRoutedPercent to each RayService based on current weights from HTTPRoute.
+ activeWeight, pendingWeight := utils.GetWeightsFromHTTPRoute(httpRoute, rayServiceInstance)
+ now := metav1.Time{Time: time.Now()}
+ if activeWeight >= 0 {
+ rayServiceInstance.Status.ActiveServiceStatus.TrafficRoutedPercent = ptr.To(activeWeight)
+ logger.Info("Updated active TrafficRoutedPercent from HTTPRoute", "activeClusterWeight", activeWeight)
+ if activeWeight != oldActivePercent {
+ rayServiceInstance.Status.ActiveServiceStatus.LastTrafficMigratedTime = &now
+ logger.Info("Updated LastTrafficMigratedTime of Active Service.")
+ }
+ }
+ if pendingWeight >= 0 {
+ rayServiceInstance.Status.PendingServiceStatus.TrafficRoutedPercent = ptr.To(pendingWeight)
+ logger.Info("Updated pending TrafficRoutedPercent from HTTPRoute", "pendingClusterWeight", pendingWeight)
+ if pendingWeight != oldPendingPercent {
+ rayServiceInstance.Status.PendingServiceStatus.LastTrafficMigratedTime = &now
+ logger.Info("Updated LastTrafficMigratedTime of Pending Service.")
+ }
+ }
}
+ // Reconcile serving status and promotion logic for all upgrade strategies.
+ isPendingClusterServing = reconcilePromotionAndServingStatus(ctx, headSvc, serveSvc, rayServiceInstance, pendingCluster)
}
if shouldPrepareNewCluster(ctx, rayServiceInstance, activeCluster, pendingCluster, isPendingClusterServing) {
@@ -278,10 +384,34 @@ func (r *RayServiceReconciler) calculateStatus(ctx context.Context, rayServiceIn
}
logger.Info("Preparing a new pending RayCluster instance by setting RayClusterName",
"clusterName", rayServiceInstance.Status.PendingServiceStatus.RayClusterName)
+
+ if utils.IsIncrementalUpgradeEnabled(&rayServiceInstance.Spec) {
+ // Set IncrementalUpgrade related Status fields for new pending RayCluster if enabled.
+ if rayServiceInstance.Status.ActiveServiceStatus.RayClusterName == "" {
+ // If no active RayCluster exists, default to starting with 100% TargetCapacity.
+ // This is the case when a RayCluster is first starting for a RayService, so we should
+ // immediately scale it to full target capacity.
+ if rayServiceInstance.Status.ActiveServiceStatus.TargetCapacity == nil {
+ rayServiceInstance.Status.PendingServiceStatus.TargetCapacity = ptr.To(int32(100))
+ }
+ } else if meta.IsStatusConditionTrue(rayServiceInstance.Status.Conditions, string(rayv1.UpgradeInProgress)) {
+ // Pending RayCluster during an upgrade should start with 0% TargetCapacity, since
+ // traffic will be gradually migrated to the new cluster.
+ if rayServiceInstance.Status.PendingServiceStatus.TargetCapacity == nil {
+ rayServiceInstance.Status.PendingServiceStatus.TargetCapacity = ptr.To(int32(0))
+ }
+ }
+ }
}
serveEndPoints := &corev1.Endpoints{}
- if err := r.Get(ctx, common.RayServiceServeServiceNamespacedName(rayServiceInstance), serveEndPoints); err != nil && !errors.IsNotFound(err) {
+ serveServiceName := common.RayServiceServeServiceNamespacedName(rayServiceInstance)
+ if utils.IsIncrementalUpgradeEnabled(&rayServiceInstance.Spec) && activeCluster != nil {
+ // The Serve service name is based on the unique RayCluster name, since we use the
+ // per-cluster Serve services for traffic routing during an incremental upgrade.
+ serveServiceName.Name = utils.GenerateServeServiceName(activeCluster.Name)
+ }
+ if err := r.Get(ctx, serveServiceName, serveEndPoints); err != nil && !errors.IsNotFound(err) {
return err
}
@@ -291,9 +421,22 @@ func (r *RayServiceReconciler) calculateStatus(ctx context.Context, rayServiceIn
for _, subset := range serveEndPoints.Subsets {
numServeEndpoints += len(subset.Addresses)
}
+
+ // During NewClusterWithIncrementalUpgrade, the pending RayCluster is also serving.
+ if utils.IsIncrementalUpgradeEnabled(&rayServiceInstance.Spec) && pendingCluster != nil {
+ pendingServeServiceName := common.RayClusterServeServiceNamespacedName(pendingCluster)
+ if err := r.Get(ctx, pendingServeServiceName, serveEndPoints); err != nil && !errors.IsNotFound(err) {
+ return err
+ }
+ for _, subset := range serveEndPoints.Subsets {
+ numServeEndpoints += len(subset.Addresses)
+ }
+ }
+
if numServeEndpoints > math.MaxInt32 {
return errstd.New("numServeEndpoints exceeds math.MaxInt32")
}
+
rayServiceInstance.Status.NumServeEndpoints = int32(numServeEndpoints) //nolint:gosec // This is a false positive from gosec. See https://github.com/securego/gosec/issues/1212 for more details.
calculateConditions(rayServiceInstance)
@@ -302,6 +445,7 @@ func (r *RayServiceReconciler) calculateStatus(ctx context.Context, rayServiceIn
if meta.IsStatusConditionTrue(rayServiceInstance.Status.Conditions, string(rayv1.RayServiceReady)) {
rayServiceInstance.Status.ServiceStatus = rayv1.Running
}
+
return nil
}
@@ -392,7 +536,12 @@ func isZeroDowntimeUpgradeEnabled(ctx context.Context, upgradeStrategy *rayv1.Ra
if upgradeStrategy != nil {
upgradeType := upgradeStrategy.Type
if upgradeType != nil {
- if *upgradeType != rayv1.NewCluster {
+ if features.Enabled(features.RayServiceIncrementalUpgrade) {
+ if *upgradeType != rayv1.NewCluster && *upgradeType != rayv1.NewClusterWithIncrementalUpgrade {
+ logger.Info("Zero-downtime upgrade is disabled because UpgradeStrategy.Type is not set to %s or %s.", string(rayv1.NewCluster), string(rayv1.NewClusterWithIncrementalUpgrade))
+ return false
+ }
+ } else if *upgradeType != rayv1.NewCluster {
logger.Info("Zero-downtime upgrade is disabled because UpgradeStrategy.Type is not set to NewCluster.")
return false
}
@@ -407,6 +556,286 @@ func isZeroDowntimeUpgradeEnabled(ctx context.Context, upgradeStrategy *rayv1.Ra
return true
}
+// `createGateway` constructs the desired Gateway object for a RayService; `reconcileGateway` creates or updates it.
+func (r *RayServiceReconciler) createGateway(rayServiceInstance *rayv1.RayService) (*gwv1.Gateway, error) {
+ options := utils.GetRayServiceClusterUpgradeOptions(&rayServiceInstance.Spec)
+ if options == nil {
+ return nil, errstd.New("Missing RayService ClusterUpgradeOptions during upgrade.")
+ }
+
+ gatewayName := rayServiceInstance.Name + "-gateway"
+ // Define the desired Gateway object
+ rayServiceGateway := &gwv1.Gateway{
+ ObjectMeta: metav1.ObjectMeta{
+ Name: gatewayName,
+ Namespace: rayServiceInstance.Namespace,
+ },
+ Spec: gwv1.GatewaySpec{
+ GatewayClassName: gwv1.ObjectName(options.GatewayClassName),
+ Listeners: []gwv1.Listener{
+ {
+ Name: gwv1.SectionName(utils.GatewayListenerPortName),
+ Protocol: gwv1.HTTPProtocolType,
+ Port: utils.DefaultGatewayListenerPort,
+ },
+ },
+ },
+ }
+
+ return rayServiceGateway, nil
+}
+
+// `reconcileGateway` reconciles a Gateway resource for a RayService. The possible cases are:
+// (1) Create a new Gateway instance. (2) Update the Gateway instance if RayService has updated. (3) Do nothing.
+func (r *RayServiceReconciler) reconcileGateway(ctx context.Context, rayServiceInstance *rayv1.RayService) error {
+ logger := ctrl.LoggerFrom(ctx)
+ var err error
+
+ // Construct desired Gateway object for RayService
+ desiredGateway, err := r.createGateway(rayServiceInstance)
+ if err != nil {
+ logger.Error(err, "Failed to build Gateway object for Rayservice")
+ return err
+ }
+ if desiredGateway == nil {
+ logger.Info("Skipping Gateway reconciliation: desired Gateway is nil")
+ return nil
+ }
+
+ // Check for existing RayService Gateway, create the desired Gateway if none is found
+ existingGateway := &gwv1.Gateway{}
+ if err := r.Get(ctx, common.RayServiceGatewayNamespacedName(rayServiceInstance), existingGateway); err != nil {
+ if errors.IsNotFound(err) {
+ // Set the ownership in order to do the garbage collection by k8s.
+ if err := ctrl.SetControllerReference(rayServiceInstance, desiredGateway, r.Scheme); err != nil {
+ return err
+ }
+ logger.Info("Creating a new Gateway instance", "Gateway Listeners", desiredGateway.Spec.Listeners)
+ if err := r.Create(ctx, desiredGateway); err != nil {
+ r.Recorder.Eventf(rayServiceInstance, corev1.EventTypeWarning, string(utils.FailedToCreateGateway), "Failed to create Gateway for RayService %s/%s: %v", desiredGateway.Namespace, desiredGateway.Name, err)
+ return err
+ }
+ r.Recorder.Eventf(rayServiceInstance, corev1.EventTypeNormal, string(utils.CreatedGateway), "Created Gateway for RayService %s/%s", desiredGateway.Namespace, desiredGateway.Name)
+ return nil
+ }
+ return err
+ }
+
+ // If Gateway already exists, check if update is needed to reach desired state
+ if !reflect.DeepEqual(existingGateway.Spec, desiredGateway.Spec) {
+ logger.Info("Updating existing Gateway", "name", existingGateway.Name)
+ existingGateway.Spec = desiredGateway.Spec
+ if err := r.Update(ctx, existingGateway); err != nil {
+ r.Recorder.Eventf(rayServiceInstance, corev1.EventTypeWarning, string(utils.FailedToUpdateGateway), "Failed to update the Gateway %s/%s: %v", existingGateway.Namespace, existingGateway.Name, err)
+ return err
+ }
+ r.Recorder.Eventf(rayServiceInstance, corev1.EventTypeNormal, string(utils.UpdatedGateway), "Updated the Gateway %s/%s", existingGateway.Namespace, existingGateway.Name)
+ }
+
+ return nil
+}
+
+// calculateTrafficRoutedPercent determines the HTTPRoute traffic split between the active and pending RayClusters.
+//
+// The new weights are calculated using:
+// - Current TrafficRoutedPercent values
+// - Time-based migration using StepSizePercent and IntervalSeconds
+// - TargetCapacity constraints
+//
+// Returns the active cluster traffic weight, pending cluster traffic weight, and an error if any.
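+//
+// Illustrative walk-through (values are assumptions, not defaults): with StepSizePercent=25,
+// IntervalSeconds=60, and a pending TargetCapacity of 100, the pending weight advances
+// 0 -> 25 -> 50 -> 75 -> 100, stepping at most once per 60s interval, while the active
+// weight is always 100 minus the pending weight.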
+func (r *RayServiceReconciler) calculateTrafficRoutedPercent(ctx context.Context, rayServiceInstance *rayv1.RayService, isPendingClusterReady bool) (activeClusterWeight, pendingClusterWeight int32, err error) {
+ logger := ctrl.LoggerFrom(ctx)
+ activeServiceStatus := &rayServiceInstance.Status.ActiveServiceStatus
+ pendingServiceStatus := &rayServiceInstance.Status.PendingServiceStatus
+
+ // Default to 100% traffic on the active cluster.
+ activeClusterWeight = ptr.Deref(activeServiceStatus.TrafficRoutedPercent, 100)
+ pendingClusterWeight = ptr.Deref(pendingServiceStatus.TrafficRoutedPercent, 0)
+
+ if isPendingClusterReady {
+ // Zero-downtime upgrade in progress.
+ options := utils.GetRayServiceClusterUpgradeOptions(&rayServiceInstance.Spec)
+ if options == nil {
+ return 0, 0, errstd.New("ClusterUpgradeOptions are not set during upgrade")
+ }
+
+ // Check that target_capacity has been updated before migrating traffic.
+ pendingClusterTargetCapacity := ptr.Deref(pendingServiceStatus.TargetCapacity, 0)
+
+ if pendingClusterWeight == pendingClusterTargetCapacity {
+ // Stop traffic migration because the pending cluster's current traffic weight has reached its target capacity limit.
+ return activeClusterWeight, pendingClusterWeight, nil
+ }
+
+ // If IntervalSeconds has passed since LastTrafficMigratedTime, migrate StepSizePercent traffic
+ // from the active RayCluster to the pending RayCluster.
+ intervalSeconds := time.Duration(*options.IntervalSeconds) * time.Second
+ lastTrafficMigratedTime := pendingServiceStatus.LastTrafficMigratedTime
+ if lastTrafficMigratedTime == nil || time.Since(lastTrafficMigratedTime.Time) >= intervalSeconds {
+ // Gradually shift traffic from the active to the pending cluster.
+ logger.Info("Upgrade in progress. Migrating traffic by StepSizePercent.", "stepSize", *options.StepSizePercent)
+ proposedPendingWeight := pendingClusterWeight + *options.StepSizePercent
+ pendingClusterWeight = min(100, proposedPendingWeight, pendingClusterTargetCapacity)
+ activeClusterWeight = 100 - pendingClusterWeight
+ }
+ }
+
+ return activeClusterWeight, pendingClusterWeight, nil
+}
+
+// createHTTPRoute creates a desired HTTPRoute object for RayService incremental upgrade.
+//
+// The function performs the following operations:
+// 1. Retrieves Gateway instance to attach the HTTPRoute
+// 2. Gets active and pending RayCluster instances and their Serve services
+// 3. Calls `calculateTrafficRoutedPercent` to calculate the new traffic weights
+// 4. Configures HTTPRoute with appropriate backend references and weights
+//
+// Returns the configured HTTPRoute object or error if any step fails.
+func (r *RayServiceReconciler) createHTTPRoute(ctx context.Context, rayServiceInstance *rayv1.RayService, isPendingClusterReady bool) (*gwv1.HTTPRoute, error) {
+ logger := ctrl.LoggerFrom(ctx)
+
+ // Retrieve Gateway instance to attach this HTTPRoute to.
+ gatewayInstance := &gwv1.Gateway{}
+ if err := r.Get(ctx, common.RayServiceGatewayNamespacedName(rayServiceInstance), gatewayInstance); err != nil {
+ return nil, err
+ }
+
+ // Retrieve the active RayCluster
+ activeRayCluster, err := r.getRayClusterByNamespacedName(ctx, common.RayServiceActiveRayClusterNamespacedName(rayServiceInstance))
+ if err != nil && !errors.IsNotFound(err) {
+ logger.Error(err, "Failed to retrieve active RayCluster")
+ return nil, err
+ }
+ if activeRayCluster == nil {
+ logger.Info("Active RayCluster not found, skipping HTTPRoute creation.")
+ return nil, nil
+ }
+
+ // Attempt to retrieve pending RayCluster
+ pendingRayCluster, err := r.getRayClusterByNamespacedName(ctx, common.RayServicePendingRayClusterNamespacedName(rayServiceInstance))
+ if err != nil && !errors.IsNotFound(err) {
+ logger.Error(err, "Failed to retrieve pending RayCluster.")
+ return nil, err
+ }
+
+ activeClusterWeight, pendingClusterWeight, err := r.calculateTrafficRoutedPercent(ctx, rayServiceInstance, isPendingClusterReady)
+ if err != nil {
+ logger.Info("Failed to reconcile TrafficRoutedPercent for active and pending clusters.")
+ return nil, err
+ }
+
+ activeClusterServeSvcName := utils.GenerateServeServiceName(activeRayCluster.Name)
+ activeServePort := common.GetServePort(activeRayCluster)
+
+ backendRefs := []gwv1.HTTPBackendRef{
+ {
+ BackendRef: gwv1.BackendRef{
+ BackendObjectReference: gwv1.BackendObjectReference{
+ Name: gwv1.ObjectName(activeClusterServeSvcName),
+ Namespace: ptr.To(gwv1.Namespace(gatewayInstance.Namespace)),
+ Port: ptr.To(activeServePort),
+ },
+ Weight: ptr.To(activeClusterWeight),
+ },
+ },
+ }
+
+ if pendingRayCluster != nil {
+ logger.Info("Pending RayCluster exists. Including it in HTTPRoute.", "RayCluster", pendingRayCluster.Name)
+ pendingClusterServeSvcName := utils.GenerateServeServiceName(pendingRayCluster.Name)
+ pendingServePort := common.GetServePort(pendingRayCluster)
+
+ backendRefs = append(backendRefs, gwv1.HTTPBackendRef{
+ BackendRef: gwv1.BackendRef{
+ BackendObjectReference: gwv1.BackendObjectReference{
+ Name: gwv1.ObjectName(pendingClusterServeSvcName),
+ Namespace: ptr.To(gwv1.Namespace(gatewayInstance.Namespace)),
+ Port: ptr.To(pendingServePort),
+ },
+ Weight: ptr.To(pendingClusterWeight),
+ },
+ })
+ }
+
+ httpRouteName := rayServiceInstance.Name + "-httproute"
+ desiredHTTPRoute := &gwv1.HTTPRoute{
+ ObjectMeta: metav1.ObjectMeta{Name: httpRouteName, Namespace: gatewayInstance.Namespace},
+ Spec: gwv1.HTTPRouteSpec{
+ CommonRouteSpec: gwv1.CommonRouteSpec{
+ ParentRefs: []gwv1.ParentReference{
+ {
+ Name: gwv1.ObjectName(gatewayInstance.Name),
+ Namespace: ptr.To(gwv1.Namespace(gatewayInstance.Namespace)),
+ },
+ },
+ },
+ Rules: []gwv1.HTTPRouteRule{
+ {
+ Matches: []gwv1.HTTPRouteMatch{
+ {
+ Path: &gwv1.HTTPPathMatch{
+ Type: ptr.To(gwv1.PathMatchPathPrefix),
+ Value: ptr.To("/"),
+ },
+ },
+ },
+ BackendRefs: backendRefs,
+ },
+ },
+ },
+ }
+
+ return desiredHTTPRoute, nil
+}
+
+// reconcileHTTPRoute reconciles an HTTPRoute resource for a RayService to route traffic during a NewClusterWithIncrementalUpgrade.
+func (r *RayServiceReconciler) reconcileHTTPRoute(ctx context.Context, rayServiceInstance *rayv1.RayService, isPendingClusterReady bool) (*gwv1.HTTPRoute, error) {
+ logger := ctrl.LoggerFrom(ctx)
+ var err error
+
+ desiredHTTPRoute, err := r.createHTTPRoute(ctx, rayServiceInstance, isPendingClusterReady)
+ if err != nil {
+ logger.Error(err, "Failed to build HTTPRoute for RayService upgrade")
+ return nil, err
+ }
+ if desiredHTTPRoute == nil {
+ logger.Info("Skipping HTTPRoute reconciliation: desired HTTPRoute is nil")
+ return nil, nil
+ }
+
+ // Check for existing HTTPRoute for RayService
+ existingHTTPRoute := &gwv1.HTTPRoute{}
+ if err := r.Get(ctx, common.RayServiceHTTPRouteNamespacedName(rayServiceInstance), existingHTTPRoute); err != nil {
+ if errors.IsNotFound(err) {
+ // Set the ownership in order to do the garbage collection by k8s.
+ if err := ctrl.SetControllerReference(rayServiceInstance, desiredHTTPRoute, r.Scheme); err != nil {
+ return nil, err
+ }
+ if err = r.Create(ctx, desiredHTTPRoute); err != nil {
+ r.Recorder.Eventf(rayServiceInstance, corev1.EventTypeWarning, string(utils.FailedToCreateHTTPRoute), "Failed to create the HTTPRoute for RayService %s/%s: %v", desiredHTTPRoute.Namespace, desiredHTTPRoute.Name, err)
+ return nil, err
+ }
+ r.Recorder.Eventf(rayServiceInstance, corev1.EventTypeNormal, string(utils.CreatedHTTPRoute), "Created HTTPRoute for RayService %s/%s", desiredHTTPRoute.Namespace, desiredHTTPRoute.Name)
+ return desiredHTTPRoute, nil
+ }
+ return nil, err
+ }
+
+ // If HTTPRoute already exists, check if update is needed
+ if !reflect.DeepEqual(existingHTTPRoute.Spec, desiredHTTPRoute.Spec) {
+ logger.Info("Updating existing HTTPRoute", "name", desiredHTTPRoute.Name)
+ existingHTTPRoute.Spec = desiredHTTPRoute.Spec
+ if err := r.Update(ctx, existingHTTPRoute); err != nil {
+ r.Recorder.Eventf(rayServiceInstance, corev1.EventTypeWarning, string(utils.FailedToUpdateHTTPRoute), "Failed to update the HTTPRoute %s/%s: %v", existingHTTPRoute.Namespace, existingHTTPRoute.Name, err)
+ return nil, err
+ }
+ r.Recorder.Eventf(rayServiceInstance, corev1.EventTypeNormal, string(utils.UpdatedHTTPRoute), "Updated the HTTPRoute %s/%s", existingHTTPRoute.Namespace, existingHTTPRoute.Name)
+ }
+
+ return existingHTTPRoute, nil
+}
+
// `reconcileRayCluster` reconciles the active and pending Ray clusters. There are 4 possible cases:
// (1) Create a new pending cluster. (2) Update the active cluster. (3) Update the pending cluster. (4) Do nothing.
func (r *RayServiceReconciler) reconcileRayCluster(ctx context.Context, rayServiceInstance *rayv1.RayService) (*rayv1.RayCluster, *rayv1.RayCluster, error) {
@@ -700,6 +1129,17 @@ func constructRayClusterForRayService(rayService *rayv1.RayService, rayClusterNa
// set the KubeRay version used to create the RayCluster
rayClusterAnnotations[utils.KubeRayVersion] = utils.KUBERAY_VERSION
+ clusterSpec := rayService.Spec.RayClusterSpec.DeepCopy()
+ isPendingClusterForUpgrade := utils.IsIncrementalUpgradeEnabled(&rayService.Spec) &&
+ rayService.Status.ActiveServiceStatus.RayClusterName != ""
+ if isPendingClusterForUpgrade {
+ // For an incremental upgrade, start the pending cluster without a replicas value so
+ // that it autoscales up from MinReplicas based on the current value of target_capacity.
+ for i := range clusterSpec.WorkerGroupSpecs {
+ clusterSpec.WorkerGroupSpecs[i].Replicas = nil
+ }
+ }
+
rayCluster := &rayv1.RayCluster{
ObjectMeta: metav1.ObjectMeta{
Labels: rayClusterLabel,
@@ -707,7 +1147,7 @@ func constructRayClusterForRayService(rayService *rayv1.RayService, rayClusterNa
Name: rayClusterName,
Namespace: rayService.Namespace,
},
- Spec: rayService.Spec.RayClusterSpec,
+ Spec: *clusterSpec,
}
// Set the ownership in order to do the garbage collection by k8s.
@@ -748,6 +1188,24 @@ func (r *RayServiceReconciler) updateServeDeployment(ctx context.Context, raySer
return err
}
+ if utils.IsIncrementalUpgradeEnabled(&rayServiceInstance.Spec) {
+ // For incremental upgrades, set target_capacity (if specified) to avoid
+ // immediately scaling the initial Serve deployment to 100%.
+ var targetCapacity *int32
+ activeStatus := rayServiceInstance.Status.ActiveServiceStatus
+ pendingStatus := rayServiceInstance.Status.PendingServiceStatus
+
+ if clusterName == activeStatus.RayClusterName && activeStatus.TargetCapacity != nil {
+ targetCapacity = activeStatus.TargetCapacity
+ } else if clusterName == pendingStatus.RayClusterName && pendingStatus.TargetCapacity != nil {
+ targetCapacity = pendingStatus.TargetCapacity
+ }
+ if targetCapacity != nil {
+ logger.Info("Setting target_capacity from status in Serve config.", "target_capacity", *targetCapacity)
+ serveConfig["target_capacity"] = *targetCapacity
+ }
+ }
+
configJson, err := json.Marshal(serveConfig)
if err != nil {
return fmt.Errorf("failed to marshal converted serve config into bytes: %w", err)
@@ -767,6 +1225,194 @@ func (r *RayServiceReconciler) updateServeDeployment(ctx context.Context, raySer
return nil
}
+// checkIfNeedTargetCapacityUpdate returns whether the controller should adjust the target_capacity
+// of the Serve config associated with a RayCluster during NewClusterWithIncrementalUpgrade.
+//
+// This function implements the incremental upgrade state machine as defined in the design document:
+// https://github.com/ray-project/enhancements/blob/main/reps/2024-12-4-ray-service-incr-upgrade.md
+//
+// The upgrade process follows these phases:
+// 1. Phase 1 (Steps 7-8): New cluster scales up to target capacity
+// - pendingTargetCapacity: 0% → 100%
+// - Returns true: "Pending RayCluster has not finished scaling up."
+//
+// 2. Phase 2 (Step 9): Traffic gradually migrates to new cluster
+// - pendingTrafficRoutedPercent: 0% → 100%
+// - Returns true: "Pending RayCluster has not finished scaling up."
+//
+// 3. Phase 3 (Step 10): Old cluster scales down after new cluster is ready
+// - activeTargetCapacity: 100% → 0%
+// - Returns true: "Active RayCluster TargetCapacity has not finished scaling down."
+//
+// 4. Phase 4 (Step 11): Upgrade completion
+// - Both clusters reach final state: active=0%, pending=100%
+//   - Returns false: "All traffic has migrated to the upgraded cluster and NewClusterWithIncrementalUpgrade
+//     is complete."
+//
+// The function ensures that traffic migration only proceeds when the target cluster has reached
+// its capacity limit, preventing resource conflicts and ensuring upgrade stability.
+func (r *RayServiceReconciler) checkIfNeedTargetCapacityUpdate(ctx context.Context, rayServiceInstance *rayv1.RayService) (bool, string) {
+ activeRayServiceStatus := rayServiceInstance.Status.ActiveServiceStatus
+ pendingRayServiceStatus := rayServiceInstance.Status.PendingServiceStatus
+
+ if activeRayServiceStatus.RayClusterName == "" || pendingRayServiceStatus.RayClusterName == "" {
+ return false, "Both active and pending RayCluster instances are required for NewClusterWithIncrementalUpgrade."
+ }
+
+ // Validate Gateway and HTTPRoute objects are ready
+ gatewayInstance := &gwv1.Gateway{}
+ if err := r.Get(ctx, common.RayServiceGatewayNamespacedName(rayServiceInstance), gatewayInstance); err != nil {
+ return false, fmt.Sprintf("Failed to retrieve Gateway for RayService: %v", err)
+ }
+ if !utils.IsGatewayReady(gatewayInstance) {
+ return false, "Gateway for RayService NewClusterWithIncrementalUpgrade is not ready."
+ }
+
+ httpRouteInstance := &gwv1.HTTPRoute{}
+ if err := r.Get(ctx, common.RayServiceHTTPRouteNamespacedName(rayServiceInstance), httpRouteInstance); err != nil {
+ return false, fmt.Sprintf("Failed to retrieve HTTPRoute for RayService: %v", err)
+ }
+ if !utils.IsHTTPRouteReady(gatewayInstance, httpRouteInstance) {
+ return false, "HTTPRoute for RayService NewClusterWithIncrementalUpgrade is not ready."
+ }
+
+	// Retrieve the currently observed NewClusterWithIncrementalUpgrade status fields for the active and pending clusters.
+ if activeRayServiceStatus.TargetCapacity == nil || activeRayServiceStatus.TrafficRoutedPercent == nil {
+ return true, "Active RayServiceStatus missing TargetCapacity or TrafficRoutedPercent."
+ }
+ if pendingRayServiceStatus.TargetCapacity == nil || pendingRayServiceStatus.TrafficRoutedPercent == nil {
+ return true, "Pending RayServiceStatus missing TargetCapacity or TrafficRoutedPercent."
+ }
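+	// Returning true when these fields are missing lets reconcileServeTargetCapacity initialize them to their defaults.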
+ activeTargetCapacity := int(*activeRayServiceStatus.TargetCapacity)
+ pendingTargetCapacity := int(*pendingRayServiceStatus.TargetCapacity)
+ pendingTrafficRoutedPercent := int(*pendingRayServiceStatus.TrafficRoutedPercent)
+
+ if activeTargetCapacity == 0 && pendingTargetCapacity == 100 {
+ return false, "All traffic has migrated to the upgraded cluster and NewClusterWithIncrementalUpgrade is complete."
+ } else if pendingTargetCapacity < 100 || pendingTrafficRoutedPercent < 100 {
+ return true, "Pending RayCluster has not finished scaling up."
+ }
+ return true, "Active RayCluster TargetCapacity has not finished scaling down."
+}
+
+// applyServeTargetCapacity updates the target_capacity for a given RayCluster's Serve applications.
+func (r *RayServiceReconciler) applyServeTargetCapacity(ctx context.Context, rayServiceInstance *rayv1.RayService, rayClusterInstance *rayv1.RayCluster, rayDashboardClient dashboardclient.RayDashboardClientInterface, goalTargetCapacity int32) error {
+ logger := ctrl.LoggerFrom(ctx).WithValues("RayCluster", rayClusterInstance.Name)
+
+ // Retrieve cached ServeConfig from last reconciliation for cluster to update
+ cachedConfig := r.getServeConfigFromCache(rayServiceInstance, rayClusterInstance.Name)
+ if cachedConfig == "" {
+ cachedConfig = rayServiceInstance.Spec.ServeConfigV2
+ }
+
+ serveConfig := make(map[string]interface{})
+ if err := yaml.Unmarshal([]byte(cachedConfig), &serveConfig); err != nil {
+ return err
+ }
+
+ // Check if ServeConfig requires update
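+	// Note: unmarshalling decodes numeric values into float64, which is why the assertion below targets float64.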
+ if currentTargetCapacity, ok := serveConfig["target_capacity"].(float64); ok {
+ if int32(currentTargetCapacity) == goalTargetCapacity {
+ logger.Info("target_capacity already updated on RayCluster", "target_capacity", currentTargetCapacity)
+ // No update required, return early
+ return nil
+ }
+ }
+
+ serveConfig["target_capacity"] = goalTargetCapacity
+ configJson, err := json.Marshal(serveConfig)
+ if err != nil {
+ return fmt.Errorf("failed to marshal serve config: %w", err)
+ }
+
+ logger.Info("Applying new target_capacity to Ray cluster.", "goal", goalTargetCapacity)
+ if err := rayDashboardClient.UpdateDeployments(ctx, configJson); err != nil {
+ err = fmt.Errorf(
+ "fail to create / update Serve applications. If you observe this error consistently, "+
+ "please check \"Issue 5: Fail to create / update Serve applications.\" in "+
+ "https://docs.ray.io/en/master/cluster/kubernetes/troubleshooting/rayservice-troubleshooting.html#kuberay-raysvc-troubleshoot for more details. "+
+ "err: %v", err)
+ return err
+ }
+
+ // Update the status fields and cache new Serve config.
+ if rayClusterInstance.Name == rayServiceInstance.Status.ActiveServiceStatus.RayClusterName {
+ rayServiceInstance.Status.ActiveServiceStatus.TargetCapacity = ptr.To(goalTargetCapacity)
+ } else if rayClusterInstance.Name == rayServiceInstance.Status.PendingServiceStatus.RayClusterName {
+ rayServiceInstance.Status.PendingServiceStatus.TargetCapacity = ptr.To(goalTargetCapacity)
+ }
+ r.cacheServeConfig(rayServiceInstance, rayClusterInstance.Name)
+
+ return nil
+}
+
+// reconcileServeTargetCapacity reconciles the target_capacity of the ServeConfig for a given RayCluster during
+// a NewClusterWithIncrementalUpgrade while also updating the Status.TargetCapacity of the Active and Pending RayServices.
+func (r *RayServiceReconciler) reconcileServeTargetCapacity(ctx context.Context, rayServiceInstance *rayv1.RayService, rayClusterInstance *rayv1.RayCluster, rayDashboardClient dashboardclient.RayDashboardClientInterface) error {
+ logger := ctrl.LoggerFrom(ctx)
+ logger.Info("reconcileServeTargetCapacity", "RayService", rayServiceInstance.Name)
+
+ activeRayServiceStatus := &rayServiceInstance.Status.ActiveServiceStatus
+ pendingRayServiceStatus := &rayServiceInstance.Status.PendingServiceStatus
+
+ // Set initial TargetCapacity values if unset
+ if activeRayServiceStatus.TargetCapacity == nil {
+ activeRayServiceStatus.TargetCapacity = ptr.To(int32(100))
+ }
+ if pendingRayServiceStatus.TargetCapacity == nil {
+ pendingRayServiceStatus.TargetCapacity = ptr.To(int32(0))
+ }
+
+ // Retrieve the current observed Status fields for NewClusterWithIncrementalUpgrade
+ activeTargetCapacity := *activeRayServiceStatus.TargetCapacity
+ pendingTargetCapacity := *pendingRayServiceStatus.TargetCapacity
+ pendingTrafficRoutedPercent := ptr.Deref(pendingRayServiceStatus.TrafficRoutedPercent, 0)
+
+	// Retrieve MaxSurgePercent - the maximum amount by which to change TargetCapacity per step
+ options := utils.GetRayServiceClusterUpgradeOptions(&rayServiceInstance.Spec)
+ if options == nil {
+		return errstd.New("missing RayService ClusterUpgradeOptions during upgrade")
+ }
+ maxSurgePercent := ptr.Deref(options.MaxSurgePercent, 100)
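+	// MaxSurgePercent defaults to 100 (the API default), in which case capacity moves in a single step.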
+
+ // Defer updating the target_capacity until traffic weights are updated
+ if pendingTargetCapacity != pendingTrafficRoutedPercent {
+ logger.Info("Traffic is currently being migrated to pending cluster", "RayCluster", pendingRayServiceStatus.RayClusterName, "TargetCapacity", pendingTargetCapacity, "TrafficRoutedPercent", pendingTrafficRoutedPercent)
+ return nil
+ }
+
+ // There are two cases:
+ // 1. The total target_capacity is greater than 100. This means the pending RayCluster has
+ // scaled up traffic and the active RayCluster can be scaled down by MaxSurgePercent.
+ // 2. The total target_capacity is equal to 100. This means the pending RayCluster can
+ // increase its target_capacity by MaxSurgePercent.
+ // If the rayClusterInstance passed into this function is not the cluster to update based
+ // on the above conditions, we return without doing anything.
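+	// For example, with maxSurgePercent=20: active=100/pending=0 (total 100) scales pending up to 20;
+	// once 20% of traffic has been routed, the total becomes 120, so active scales down to 80, and so on.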
+ var goalTargetCapacity int32
+ shouldUpdate := false
+ if rayClusterInstance.Name == activeRayServiceStatus.RayClusterName {
+ if activeTargetCapacity+pendingTargetCapacity > 100 {
+ // Scale down the Active RayCluster TargetCapacity on this iteration.
+ goalTargetCapacity = max(int32(0), activeTargetCapacity-maxSurgePercent)
+ shouldUpdate = true
+			logger.Info("Setting target_capacity for active RayCluster", "RayCluster", rayClusterInstance.Name, "target_capacity", goalTargetCapacity)
+ }
+ } else if rayClusterInstance.Name == pendingRayServiceStatus.RayClusterName {
+ if activeTargetCapacity+pendingTargetCapacity <= 100 {
+ // Scale up the Pending RayCluster TargetCapacity on this iteration.
+ goalTargetCapacity = min(int32(100), pendingTargetCapacity+maxSurgePercent)
+ shouldUpdate = true
+			logger.Info("Setting target_capacity for pending RayCluster", "RayCluster", rayClusterInstance.Name, "target_capacity", goalTargetCapacity)
+ }
+ }
+
+ if !shouldUpdate {
+ return nil
+ }
+
+ return r.applyServeTargetCapacity(ctx, rayServiceInstance, rayClusterInstance, rayDashboardClient, goalTargetCapacity)
+}
+
// `getAndCheckServeStatus` gets Serve applications' and deployments' statuses and check whether the
// Serve applications are ready to serve incoming traffic or not. It returns three values:
//
@@ -950,6 +1596,18 @@ func (r *RayServiceReconciler) reconcileServe(ctx context.Context, rayServiceIns
return false, serveApplications, err
}
+ skipConfigUpdate := false
+ isActiveCluster := rayClusterInstance.Name == rayServiceInstance.Status.ActiveServiceStatus.RayClusterName
+ isIncrementalUpgradeInProgress := utils.IsIncrementalUpgradeEnabled(&rayServiceInstance.Spec) &&
+ meta.IsStatusConditionTrue(rayServiceInstance.Status.Conditions, string(rayv1.UpgradeInProgress))
+
+ if isActiveCluster && isIncrementalUpgradeInProgress {
+ // Skip updating the Serve config for the Active cluster during NewClusterWithIncrementalUpgrade. The updated
+ // Serve config is applied to the pending RayService's RayCluster.
+ skipConfigUpdate = true
+ logger.Info("Blocking new Serve config submission for Active cluster during upgrade.", "clusterName", rayClusterInstance.Name)
+ }
+
cachedServeConfigV2 := r.getServeConfigFromCache(rayServiceInstance, rayClusterInstance.Name)
isReady, serveApplications, err := getAndCheckServeStatus(ctx, rayDashboardClient)
if err != nil {
@@ -958,13 +1616,26 @@ func (r *RayServiceReconciler) reconcileServe(ctx context.Context, rayServiceIns
shouldUpdate, reason := checkIfNeedSubmitServeApplications(cachedServeConfigV2, rayServiceInstance.Spec.ServeConfigV2, serveApplications)
logger.Info("checkIfNeedSubmitServeApplications", "shouldUpdate", shouldUpdate, "reason", reason)
- if shouldUpdate {
+ if shouldUpdate && !skipConfigUpdate {
if err = r.updateServeDeployment(ctx, rayServiceInstance, rayDashboardClient, rayClusterInstance.Name); err != nil {
r.Recorder.Eventf(rayServiceInstance, corev1.EventTypeWarning, string(utils.FailedToUpdateServeApplications), "Failed to update serve applications to the RayCluster %s/%s: %v", rayClusterInstance.Namespace, rayClusterInstance.Name, err)
return false, serveApplications, err
}
r.Recorder.Eventf(rayServiceInstance, corev1.EventTypeNormal, string(utils.UpdatedServeApplications), "Updated serve applications to the RayCluster %s/%s", rayClusterInstance.Namespace, rayClusterInstance.Name)
}
+ if isIncrementalUpgradeInProgress {
+ incrementalUpgradeUpdate, reason := r.checkIfNeedTargetCapacityUpdate(ctx, rayServiceInstance)
+ logger.Info("checkIfNeedTargetCapacityUpdate", "incrementalUpgradeUpdate", incrementalUpgradeUpdate, "reason", reason)
+ if incrementalUpgradeUpdate {
+ if err := r.reconcileServeTargetCapacity(ctx, rayServiceInstance, rayClusterInstance, rayDashboardClient); err != nil {
+ r.Recorder.Eventf(rayServiceInstance, corev1.EventTypeWarning, string(utils.FailedToUpdateTargetCapacity), "Failed to update target_capacity of serve applications to the RayCluster %s/%s: %v", rayClusterInstance.Namespace, rayClusterInstance.Name, err)
+ return false, serveApplications, err
+ }
+ r.Recorder.Eventf(rayServiceInstance, corev1.EventTypeNormal, string(utils.UpdatedServeTargetCapacity),
+				"Updated target_capacity of serve applications to the RayCluster %s/%s", rayClusterInstance.Namespace, rayClusterInstance.Name)
+ }
+ }
+
return isReady, serveApplications, nil
}
@@ -986,7 +1657,7 @@ func (r *RayServiceReconciler) updateHeadPodServeLabel(ctx context.Context, rayS
}
rayContainer := headPod.Spec.Containers[utils.RayContainerIndex]
- servingPort := utils.FindContainerPort(&rayContainer, utils.ServingPortName, utils.DefaultServingPort)
+ servingPort := int(utils.FindContainerPort(&rayContainer, utils.ServingPortName, utils.DefaultServingPort))
client := r.httpProxyClientFunc(headPod.Status.PodIP, headPod.Namespace, headPod.Name, servingPort)
if headPod.Labels == nil {
@@ -1041,3 +1712,34 @@ func (r *RayServiceReconciler) isHeadPodRunningAndReady(ctx context.Context, ins
}
return utils.IsRunningAndReady(headPod), nil
}
+
+// reconcilePerClusterServeService reconciles a load-balancing serve service for a given RayCluster.
+func (r *RayServiceReconciler) reconcilePerClusterServeService(ctx context.Context, rayServiceInstance *rayv1.RayService, rayClusterInstance *rayv1.RayCluster) error {
+ if rayClusterInstance == nil {
+ return nil
+ }
+
+ logger := ctrl.LoggerFrom(ctx).WithValues("RayCluster", rayClusterInstance.Name)
+
+	logger.Info("Building per-cluster serve service")
+
+ // Create a serve service for the RayCluster associated with this RayService. During an incremental
+ // upgrade, this will be called for the pending RayCluster instance.
+ desiredSvc, err := common.BuildServeService(ctx, *rayServiceInstance, *rayClusterInstance, true)
+ if err != nil {
+ logger.Error(err, "Failed to build per-cluster serve service spec")
+ return err
+ }
+ if err := ctrl.SetControllerReference(rayClusterInstance, desiredSvc, r.Scheme); err != nil {
+ return err
+ }
+
+ existingSvc := &corev1.Service{}
+ err = r.Get(ctx, client.ObjectKey{Name: desiredSvc.Name, Namespace: desiredSvc.Namespace}, existingSvc)
+ if errors.IsNotFound(err) {
+ logger.Info("Creating new per-cluster serve service for incremental upgrade.", "Service", desiredSvc.Name)
+ return r.Create(ctx, desiredSvc)
+ }
+
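+	// If the Get succeeded, the service already exists and err is nil; any other error is surfaced to the caller.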
+ return err
+}
diff --git a/ray-operator/controllers/ray/rayservice_controller_unit_test.go b/ray-operator/controllers/ray/rayservice_controller_unit_test.go
index 638af6b26fb..169e6d2bc5d 100644
--- a/ray-operator/controllers/ray/rayservice_controller_unit_test.go
+++ b/ray-operator/controllers/ray/rayservice_controller_unit_test.go
@@ -13,13 +13,16 @@ import (
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
corev1 "k8s.io/api/core/v1"
+ "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/api/meta"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/client-go/tools/record"
+ "k8s.io/utils/lru"
"k8s.io/utils/ptr"
"sigs.k8s.io/controller-runtime/pkg/client"
clientFake "sigs.k8s.io/controller-runtime/pkg/client/fake"
+ gwv1 "sigs.k8s.io/gateway-api/apis/v1"
rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1"
"github.com/ray-project/kuberay/ray-operator/controllers/ray/common"
@@ -27,6 +30,7 @@ import (
"github.com/ray-project/kuberay/ray-operator/controllers/ray/utils/dashboardclient"
utiltypes "github.com/ray-project/kuberay/ray-operator/controllers/ray/utils/types"
"github.com/ray-project/kuberay/ray-operator/pkg/client/clientset/versioned/scheme"
+ "github.com/ray-project/kuberay/ray-operator/pkg/features"
"github.com/ray-project/kuberay/ray-operator/test/support"
)
@@ -1319,3 +1323,906 @@ func TestRayClusterDeletionDelaySeconds(t *testing.T) {
})
}
}
+
+// Helper function to create a RayService object undergoing an incremental upgrade.
+func makeIncrementalUpgradeRayService(
+ withOptions bool,
+ gatewayClassName string,
+ stepSizePercent *int32,
+ intervalSeconds *int32,
+ routedPercent *int32,
+ lastTrafficMigratedTime *metav1.Time,
+) *rayv1.RayService {
+ spec := rayv1.RayServiceSpec{
+ ServeService: &corev1.Service{
+ ObjectMeta: metav1.ObjectMeta{
+ Name: "serve-service",
+ Namespace: "test-ns",
+ },
+ Spec: corev1.ServiceSpec{
+ Ports: []corev1.ServicePort{
+ {
+ Name: "http",
+ Port: 8000,
+ },
+ },
+ },
+ },
+ }
+ if withOptions {
+ spec.UpgradeStrategy = &rayv1.RayServiceUpgradeStrategy{
+ Type: ptr.To(rayv1.NewClusterWithIncrementalUpgrade),
+ ClusterUpgradeOptions: &rayv1.ClusterUpgradeOptions{
+ GatewayClassName: gatewayClassName,
+ StepSizePercent: stepSizePercent,
+ IntervalSeconds: intervalSeconds,
+ },
+ }
+ }
+
+ return &rayv1.RayService{
+ ObjectMeta: metav1.ObjectMeta{
+ Name: "incremental-ray-service",
+ Namespace: "test-ns",
+ },
+ Spec: spec,
+ Status: rayv1.RayServiceStatuses{
+ ActiveServiceStatus: rayv1.RayServiceStatus{
+ RayClusterName: "active-ray-cluster",
+ RayClusterStatus: rayv1.RayClusterStatus{
+ Head: rayv1.HeadInfo{ServiceName: "active-service"},
+ },
+ TrafficRoutedPercent: routedPercent,
+ LastTrafficMigratedTime: lastTrafficMigratedTime,
+ },
+ PendingServiceStatus: rayv1.RayServiceStatus{
+ RayClusterName: "pending-ray-cluster",
+ RayClusterStatus: rayv1.RayClusterStatus{
+ Head: rayv1.HeadInfo{ServiceName: "pending-service"},
+ },
+ TrafficRoutedPercent: ptr.To(int32(100) - *routedPercent),
+ LastTrafficMigratedTime: lastTrafficMigratedTime,
+ },
+ },
+ }
+}
+
+func TestCreateGateway(t *testing.T) {
+ serveService := &corev1.Service{
+ ObjectMeta: metav1.ObjectMeta{
+ Name: "serve-service",
+ Namespace: "test-ns",
+ },
+ Spec: corev1.ServiceSpec{
+ Ports: []corev1.ServicePort{
+ {
+ Port: 8000,
+ },
+ },
+ },
+ }
+ newScheme := runtime.NewScheme()
+ _ = corev1.AddToScheme(newScheme)
+
+ fakeClient := clientFake.NewClientBuilder().WithScheme(newScheme).WithRuntimeObjects(serveService).Build()
+ reconciler := &RayServiceReconciler{
+ Client: fakeClient,
+ }
+
+ tests := []struct {
+ rayService *rayv1.RayService
+ name string
+ expectedGatewayName string
+ expectedClass string
+ expectedListeners int
+ expectErr bool
+ }{
+ {
+ name: "valid gateway creation",
+ expectedGatewayName: "incremental-ray-service-gateway",
+ rayService: makeIncrementalUpgradeRayService(true, "gateway-class", ptr.To(int32(50)), ptr.To(int32(10)), ptr.To(int32(80)), &metav1.Time{Time: time.Now()}),
+ expectErr: false,
+ expectedClass: "gateway-class",
+ expectedListeners: 1,
+ },
+ {
+ name: "missing ClusterUpgradeOptions",
+ rayService: makeIncrementalUpgradeRayService(false, "gateway-class", ptr.To(int32(0)), ptr.To(int32(0)), ptr.To(int32(0)), &metav1.Time{Time: time.Now()}),
+ expectErr: true,
+ },
+ }
+
+ for _, tt := range tests {
+ t.Run(tt.name, func(t *testing.T) {
+ gw, err := reconciler.createGateway(tt.rayService)
+ if tt.expectErr {
+ require.Error(t, err)
+ assert.Nil(t, gw)
+ } else {
+ require.NoError(t, err)
+ require.NotNil(t, gw)
+ assert.Equal(t, tt.expectedGatewayName, gw.Name)
+ assert.Equal(t, tt.rayService.Namespace, gw.Namespace)
+ assert.Equal(t, gwv1.ObjectName(tt.expectedClass), gw.Spec.GatewayClassName)
+ assert.Len(t, gw.Spec.Listeners, tt.expectedListeners)
+ }
+ })
+ }
+}
+
+func TestCreateHTTPRoute(t *testing.T) {
+ ctx := context.TODO()
+ namespace := "test-ns"
+ stepSize := int32(10)
+ interval := int32(30)
+
+ activeCluster := &rayv1.RayCluster{ObjectMeta: metav1.ObjectMeta{Name: "rayservice-active", Namespace: namespace}}
+ pendingCluster := &rayv1.RayCluster{ObjectMeta: metav1.ObjectMeta{Name: "rayservice-pending", Namespace: namespace}}
+ gateway := &gwv1.Gateway{ObjectMeta: metav1.ObjectMeta{Name: "test-rayservice-gateway", Namespace: namespace}}
+ activeServeService := &corev1.Service{ObjectMeta: metav1.ObjectMeta{Name: utils.GenerateServeServiceName(activeCluster.Name), Namespace: namespace}}
+ pendingServeService := &corev1.Service{ObjectMeta: metav1.ObjectMeta{Name: utils.GenerateServeServiceName(pendingCluster.Name), Namespace: namespace}}
+
+ baseRayService := &rayv1.RayService{
+ ObjectMeta: metav1.ObjectMeta{Name: "test-rayservice", Namespace: namespace},
+ Spec: rayv1.RayServiceSpec{
+ UpgradeStrategy: &rayv1.RayServiceUpgradeStrategy{
+ Type: ptr.To(rayv1.NewClusterWithIncrementalUpgrade),
+ ClusterUpgradeOptions: &rayv1.ClusterUpgradeOptions{
+ StepSizePercent: &stepSize,
+ IntervalSeconds: &interval,
+ GatewayClassName: "istio",
+ },
+ },
+ },
+ Status: rayv1.RayServiceStatuses{
+ ActiveServiceStatus: rayv1.RayServiceStatus{
+ RayClusterName: activeCluster.Name,
+ TrafficRoutedPercent: ptr.To(int32(100)),
+ TargetCapacity: ptr.To(int32(100)),
+ },
+ PendingServiceStatus: rayv1.RayServiceStatus{
+ RayClusterName: pendingCluster.Name,
+ TrafficRoutedPercent: ptr.To(int32(0)),
+ TargetCapacity: ptr.To(int32(30)),
+ },
+ },
+ }
+
+ tests := []struct {
+ name string
+ modifier func(rs *rayv1.RayService)
+ runtimeObjects []runtime.Object
+ expectError bool
+ expectedActiveWeight int32
+ expectedPendingWeight int32
+ isPendingClusterReady bool
+ }{
+ {
+ name: "NewClusterWithIncrementalUpgrade, but pending cluster is not ready, so no traffic shift.",
+ modifier: func(rs *rayv1.RayService) {
+ rs.Status.PendingServiceStatus.LastTrafficMigratedTime = &metav1.Time{Time: time.Now().Add(-time.Duration(interval+1) * time.Second)}
+ },
+ runtimeObjects: []runtime.Object{activeCluster, pendingCluster, gateway, activeServeService, pendingServeService},
+ isPendingClusterReady: false,
+ expectedActiveWeight: 100,
+ expectedPendingWeight: 0,
+ },
+ {
+ name: "NewClusterWithIncrementalUpgrade, time since LastTrafficMigratedTime < IntervalSeconds.",
+ modifier: func(rs *rayv1.RayService) {
+ rs.Status.PendingServiceStatus.LastTrafficMigratedTime = &metav1.Time{Time: time.Now()}
+ },
+ runtimeObjects: []runtime.Object{activeCluster, pendingCluster, gateway, activeServeService, pendingServeService},
+ isPendingClusterReady: true,
+ expectedActiveWeight: 100,
+ expectedPendingWeight: 0,
+ },
+ {
+ name: "NewClusterWithIncrementalUpgrade, time since LastTrafficMigratedTime >= IntervalSeconds.",
+ modifier: func(rs *rayv1.RayService) {
+ rs.Status.PendingServiceStatus.LastTrafficMigratedTime = &metav1.Time{Time: time.Now().Add(-time.Duration(interval+1) * time.Second)}
+ rs.Status.PendingServiceStatus.TargetCapacity = ptr.To(int32(60))
+ },
+ runtimeObjects: []runtime.Object{activeCluster, pendingCluster, gateway, activeServeService, pendingServeService},
+ isPendingClusterReady: true,
+ expectedActiveWeight: 90,
+ expectedPendingWeight: 10,
+ },
+ {
+ name: "NewClusterWithIncrementalUpgrade, TrafficRoutedPercent capped to pending TargetCapacity.",
+ modifier: func(rs *rayv1.RayService) {
+ rs.Status.PendingServiceStatus.LastTrafficMigratedTime = &metav1.Time{Time: time.Now().Add(-time.Duration(interval+1) * time.Second)}
+ rs.Status.PendingServiceStatus.TargetCapacity = ptr.To(int32(5))
+ },
+ runtimeObjects: []runtime.Object{activeCluster, pendingCluster, gateway, activeServeService, pendingServeService},
+ isPendingClusterReady: true,
+ expectedActiveWeight: 95,
+ expectedPendingWeight: 5, // can only migrate 5% to pending until TargetCapacity reached
+ },
+ {
+ name: "Create HTTPRoute called with missing ClusterUpgradeOptions.",
+ modifier: func(rs *rayv1.RayService) {
+ rs.Spec.UpgradeStrategy.ClusterUpgradeOptions = nil
+ },
+ runtimeObjects: []runtime.Object{activeCluster, pendingCluster, gateway, activeServeService, pendingServeService},
+ isPendingClusterReady: true,
+ expectError: true,
+ },
+ {
+ name: "No on-going upgrade, pending cluster does not exist.",
+ modifier: func(rs *rayv1.RayService) {
+ rs.Status.PendingServiceStatus = rayv1.RayServiceStatus{}
+ },
+ runtimeObjects: []runtime.Object{activeCluster, gateway, activeServeService},
+ isPendingClusterReady: false,
+ expectedActiveWeight: 100,
+ expectedPendingWeight: 0,
+ },
+ }
+
+ for _, tt := range tests {
+ t.Run(tt.name, func(t *testing.T) {
+ rayService := baseRayService.DeepCopy()
+ tt.modifier(rayService)
+ tt.runtimeObjects = append(tt.runtimeObjects, rayService)
+
+ newScheme := runtime.NewScheme()
+ _ = rayv1.AddToScheme(newScheme)
+ _ = corev1.AddToScheme(newScheme)
+ _ = gwv1.AddToScheme(newScheme)
+ fakeClient := clientFake.NewClientBuilder().WithScheme(newScheme).WithRuntimeObjects(tt.runtimeObjects...).Build()
+
+ reconciler := RayServiceReconciler{
+ Client: fakeClient,
+ Scheme: newScheme,
+ Recorder: record.NewFakeRecorder(1),
+ }
+
+ route, err := reconciler.createHTTPRoute(ctx, rayService, tt.isPendingClusterReady)
+
+ if tt.expectError {
+ require.Error(t, err)
+ assert.Nil(t, route)
+ } else {
+ require.NoError(t, err)
+ require.NotNil(t, route)
+
+ assert.Equal(t, "test-rayservice-httproute", route.Name)
+ assert.Equal(t, "test-ns", route.Namespace)
+
+ require.Len(t, route.Spec.Rules, 1)
+ rule := route.Spec.Rules[0]
+
+ require.GreaterOrEqual(t, len(rule.BackendRefs), 1)
+ assert.Equal(t, gwv1.ObjectName(activeServeService.Name), rule.BackendRefs[0].BackendRef.Name)
+ assert.Equal(t, tt.expectedActiveWeight, *rule.BackendRefs[0].Weight)
+
+ if len(rule.BackendRefs) > 1 {
+ assert.Equal(t, gwv1.ObjectName(pendingServeService.Name), rule.BackendRefs[1].BackendRef.Name)
+ assert.Equal(t, tt.expectedPendingWeight, *rule.BackendRefs[1].Weight)
+ } else {
+ assert.Equal(t, int32(0), tt.expectedPendingWeight)
+ }
+ }
+ })
+ }
+}
+
+func TestReconcileHTTPRoute(t *testing.T) {
+ newScheme := runtime.NewScheme()
+ _ = rayv1.AddToScheme(newScheme)
+ _ = corev1.AddToScheme(newScheme)
+ _ = gwv1.AddToScheme(newScheme)
+
+ ctx := context.TODO()
+ namespace := "test-ns"
+ stepSize := int32(10)
+ interval := int32(30)
+ gatewayName := "test-rayservice-gateway"
+ routeName := "test-rayservice-httproute"
+
+ activeCluster := &rayv1.RayCluster{ObjectMeta: metav1.ObjectMeta{Name: "active-ray-cluster", Namespace: namespace}}
+ pendingCluster := &rayv1.RayCluster{ObjectMeta: metav1.ObjectMeta{Name: "pending-ray-cluster", Namespace: namespace}}
+ activeServeService := &corev1.Service{ObjectMeta: metav1.ObjectMeta{Name: utils.GenerateServeServiceName(activeCluster.Name), Namespace: namespace}}
+ pendingServeService := &corev1.Service{ObjectMeta: metav1.ObjectMeta{Name: utils.GenerateServeServiceName(pendingCluster.Name), Namespace: namespace}}
+ gateway := &gwv1.Gateway{ObjectMeta: metav1.ObjectMeta{Name: gatewayName, Namespace: namespace}}
+
+ baseRayService := &rayv1.RayService{
+ ObjectMeta: metav1.ObjectMeta{Name: "test-rayservice", Namespace: namespace},
+ Spec: rayv1.RayServiceSpec{
+ UpgradeStrategy: &rayv1.RayServiceUpgradeStrategy{
+ Type: ptr.To(rayv1.NewClusterWithIncrementalUpgrade),
+ ClusterUpgradeOptions: &rayv1.ClusterUpgradeOptions{
+ StepSizePercent: &stepSize,
+ IntervalSeconds: &interval,
+ GatewayClassName: "istio",
+ },
+ },
+ },
+ Status: rayv1.RayServiceStatuses{
+ ActiveServiceStatus: rayv1.RayServiceStatus{
+ RayClusterName: activeCluster.Name,
+ TrafficRoutedPercent: ptr.To(int32(100)),
+ TargetCapacity: ptr.To(int32(100)),
+ },
+ PendingServiceStatus: rayv1.RayServiceStatus{
+ RayClusterName: pendingCluster.Name,
+ TrafficRoutedPercent: ptr.To(int32(0)),
+ TargetCapacity: ptr.To(int32(100)),
+ },
+ },
+ }
+
+ tests := []struct {
+ modifier func(rs *rayv1.RayService)
+ existingRoute *gwv1.HTTPRoute
+ name string
+ expectedActiveWeight int32
+ expectedPendingWeight int32
+ pendingClusterExists bool
+ isPendingClusterReady bool
+ }{
+ {
+ name: "Create HTTPRoute with no pending cluster.",
+ isPendingClusterReady: false,
+ pendingClusterExists: false,
+ expectedActiveWeight: 100,
+ expectedPendingWeight: 0,
+ },
+ {
+ name: "Create HTTPRoute when pending cluster exists, but is not ready.",
+ isPendingClusterReady: false,
+ pendingClusterExists: true,
+ expectedActiveWeight: 100,
+ expectedPendingWeight: 0,
+ },
+ {
+ name: "Create new HTTPRoute with existing weights.",
+ isPendingClusterReady: true,
+ pendingClusterExists: true,
+ expectedActiveWeight: 90,
+ expectedPendingWeight: 10,
+ },
+ {
+ name: "Update HTTPRoute when pending cluster is ready.",
+ isPendingClusterReady: true,
+ pendingClusterExists: true,
+ expectedActiveWeight: 90,
+ expectedPendingWeight: 10,
+ },
+ {
+ name: "Existing HTTPRoute, time since LastTrafficMigratedTime >= IntervalSeconds so updates HTTPRoute.",
+ isPendingClusterReady: true,
+ pendingClusterExists: true,
+ modifier: func(rs *rayv1.RayService) {
+ rs.Status.PendingServiceStatus.LastTrafficMigratedTime = &metav1.Time{Time: time.Now().Add(-time.Duration(interval+1) * time.Second)}
+ },
+ existingRoute: &gwv1.HTTPRoute{
+ ObjectMeta: metav1.ObjectMeta{Name: routeName, Namespace: namespace},
+ Spec: gwv1.HTTPRouteSpec{},
+ },
+ expectedActiveWeight: 90,
+ expectedPendingWeight: 10,
+ },
+ {
+ name: "Existing HTTPRoute, time since LastTrafficMigratedTime < IntervalSeconds so no update.",
+ isPendingClusterReady: true,
+ pendingClusterExists: true,
+ modifier: func(rs *rayv1.RayService) {
+ rs.Status.PendingServiceStatus.LastTrafficMigratedTime = &metav1.Time{Time: time.Now()}
+ },
+ expectedActiveWeight: 100,
+ expectedPendingWeight: 0,
+ },
+ }
+
+ for _, tt := range tests {
+ t.Run(tt.name, func(t *testing.T) {
+ rayService := baseRayService.DeepCopy()
+ if tt.modifier != nil {
+ tt.modifier(rayService)
+ }
+
+ if !tt.pendingClusterExists {
+ rayService.Status.PendingServiceStatus.RayClusterName = ""
+ }
+
+ runtimeObjects := []runtime.Object{rayService, activeCluster, pendingCluster, gateway, activeServeService, pendingServeService}
+ if tt.existingRoute != nil {
+ runtimeObjects = append(runtimeObjects, tt.existingRoute)
+ }
+
+ fakeClient := clientFake.NewClientBuilder().WithScheme(newScheme).WithRuntimeObjects(runtimeObjects...).Build()
+ reconciler := RayServiceReconciler{Client: fakeClient, Scheme: newScheme, Recorder: record.NewFakeRecorder(10)}
+
+ reconciledRoute, err := reconciler.reconcileHTTPRoute(ctx, rayService, tt.isPendingClusterReady)
+ require.NoError(t, err)
+
+ require.Len(t, reconciledRoute.Spec.Rules, 1)
+ rule := reconciledRoute.Spec.Rules[0]
+ if tt.pendingClusterExists {
+ require.Len(t, rule.BackendRefs, 2)
+ // Assert weights are set as expected.
+ assert.Equal(t, tt.expectedActiveWeight, *rule.BackendRefs[0].Weight)
+ assert.Equal(t, tt.expectedPendingWeight, *rule.BackendRefs[1].Weight)
+ } else {
+ require.Len(t, rule.BackendRefs, 1)
+ // Assert active weight is as expected.
+ assert.Equal(t, tt.expectedActiveWeight, *rule.BackendRefs[0].Weight)
+ }
+ // Assert ParentRef namespace is correctly set.
+ parent := reconciledRoute.Spec.ParentRefs[0]
+ assert.Equal(t, gwv1.ObjectName(gatewayName), parent.Name)
+ assert.Equal(t, ptr.To(gwv1.Namespace(namespace)), parent.Namespace)
+ })
+ }
+}
+
+func TestReconcileGateway(t *testing.T) {
+ newScheme := runtime.NewScheme()
+ _ = rayv1.AddToScheme(newScheme)
+ _ = corev1.AddToScheme(newScheme)
+ _ = gwv1.AddToScheme(newScheme)
+
+ ctx := context.TODO()
+ namespace := "test-ns"
+
+ rayService := makeIncrementalUpgradeRayService(
+ true,
+ "gateway-class",
+ ptr.To(int32(20)),
+ ptr.To(int32(30)),
+ ptr.To(int32(80)),
+ ptr.To(metav1.Now()),
+ )
+ gateway := makeGateway(fmt.Sprintf("%s-gateway", rayService.Name), rayService.Namespace, true)
+
+ tests := []struct {
+ name string
+ expectedGatewayName string
+ expectedClass string
+ runtimeObjects []runtime.Object
+ expectedNumListeners int
+ }{
+ {
+ name: "creates new Gateway if missing",
+ runtimeObjects: []runtime.Object{rayService},
+ expectedGatewayName: "incremental-ray-service-gateway",
+ expectedClass: "gateway-class",
+ expectedNumListeners: 1,
+ },
+ {
+ name: "updates Gateway if spec differs",
+ runtimeObjects: []runtime.Object{rayService, gateway},
+ expectedGatewayName: "incremental-ray-service-gateway",
+ expectedClass: "gateway-class",
+ expectedNumListeners: 1,
+ },
+ }
+
+ for _, tt := range tests {
+ t.Run(tt.name, func(t *testing.T) {
+ fakeClient := clientFake.NewClientBuilder().
+ WithScheme(newScheme).
+ WithRuntimeObjects(tt.runtimeObjects...).
+ Build()
+
+ reconciler := RayServiceReconciler{
+ Client: fakeClient,
+ Scheme: newScheme,
+ Recorder: record.NewFakeRecorder(10),
+ }
+
+ err := reconciler.reconcileGateway(ctx, rayService)
+ require.NoError(t, err)
+
+ reconciledGateway := &gwv1.Gateway{}
+ err = fakeClient.Get(ctx, client.ObjectKey{Name: tt.expectedGatewayName, Namespace: namespace}, reconciledGateway)
+ require.NoError(t, err, "Failed to get the reconciled Gateway")
+
+ assert.Equal(t, tt.expectedGatewayName, reconciledGateway.Name)
+ assert.Equal(t, namespace, reconciledGateway.Namespace)
+ assert.Equal(t, gwv1.ObjectName(tt.expectedClass), reconciledGateway.Spec.GatewayClassName)
+ assert.Len(t, reconciledGateway.Spec.Listeners, tt.expectedNumListeners)
+ })
+ }
+}
+
+func TestReconcileServeTargetCapacity(t *testing.T) {
+ features.SetFeatureGateDuringTest(t, features.RayServiceIncrementalUpgrade, true)
+
+ tests := []struct {
+ name string
+ updatedCluster string
+ activeCapacity int32
+ pendingCapacity int32
+ activeRoutedPercent int32
+ pendingRoutedPercent int32
+ maxSurgePercent int32
+ expectedActiveCapacity int32
+ expectedPendingCapacity int32
+ }{
+ {
+ name: "Scale up pending RayCluster when total TargetCapacity < 100",
+ pendingRoutedPercent: 10,
+ activeCapacity: 70,
+ pendingCapacity: 10,
+ maxSurgePercent: 20,
+ expectedActiveCapacity: 70,
+ expectedPendingCapacity: 30,
+ updatedCluster: "pending",
+ },
+ {
+ name: "Scale down active RayCluster when total TargetCapacity > 100",
+ pendingRoutedPercent: 30,
+ activeCapacity: 80,
+ pendingCapacity: 30,
+ maxSurgePercent: 20,
+ expectedActiveCapacity: 60,
+ expectedPendingCapacity: 30,
+ updatedCluster: "active",
+ },
+ }
+
+ for _, tt := range tests {
+ t.Run(tt.name, func(t *testing.T) {
+ ctx := context.TODO()
+ rayService := &rayv1.RayService{
+ Spec: rayv1.RayServiceSpec{
+ UpgradeStrategy: &rayv1.RayServiceUpgradeStrategy{
+ Type: ptr.To(rayv1.NewClusterWithIncrementalUpgrade),
+ ClusterUpgradeOptions: &rayv1.ClusterUpgradeOptions{
+ MaxSurgePercent: ptr.To(tt.maxSurgePercent),
+ },
+ },
+ ServeConfigV2: `{"target_capacity": 0}`,
+ },
+ Status: rayv1.RayServiceStatuses{
+ ActiveServiceStatus: rayv1.RayServiceStatus{
+ RayClusterName: "active",
+ TargetCapacity: ptr.To(tt.activeCapacity),
+ TrafficRoutedPercent: ptr.To(tt.activeRoutedPercent),
+ },
+ PendingServiceStatus: rayv1.RayServiceStatus{
+ RayClusterName: "pending",
+ TargetCapacity: ptr.To(tt.pendingCapacity),
+ TrafficRoutedPercent: ptr.To(tt.pendingRoutedPercent),
+ },
+ },
+ }
+
+ var rayCluster *rayv1.RayCluster
+ if tt.updatedCluster == "active" {
+ rayCluster = &rayv1.RayCluster{ObjectMeta: metav1.ObjectMeta{Name: "active"}}
+ } else {
+ rayCluster = &rayv1.RayCluster{ObjectMeta: metav1.ObjectMeta{Name: "pending"}}
+ }
+
+ fakeDashboard := &utils.FakeRayDashboardClient{}
+ reconciler := &RayServiceReconciler{
+ ServeConfigs: lru.New(10),
+ }
+
+ err := reconciler.reconcileServeTargetCapacity(ctx, rayService, rayCluster, fakeDashboard)
+ require.NoError(t, err)
+ require.NotEmpty(t, fakeDashboard.LastUpdatedConfig)
+
+ if tt.updatedCluster == "active" {
+ assert.Equal(t, tt.expectedActiveCapacity, *rayService.Status.ActiveServiceStatus.TargetCapacity)
+ assert.Equal(t, tt.pendingCapacity, *rayService.Status.PendingServiceStatus.TargetCapacity)
+ expectedServeConfig := `{"target_capacity":` + strconv.Itoa(int(tt.expectedActiveCapacity)) + `}`
+ assert.JSONEq(t, expectedServeConfig, string(fakeDashboard.LastUpdatedConfig))
+ } else {
+ assert.Equal(t, tt.expectedPendingCapacity, *rayService.Status.PendingServiceStatus.TargetCapacity)
+ assert.Equal(t, tt.activeCapacity, *rayService.Status.ActiveServiceStatus.TargetCapacity)
+ expectedServeConfig := `{"target_capacity":` + strconv.Itoa(int(tt.expectedPendingCapacity)) + `}`
+ assert.JSONEq(t, expectedServeConfig, string(fakeDashboard.LastUpdatedConfig))
+ }
+ })
+ }
+}
+
+// makeGateway is a helper function that returns a Gateway object with the given readiness conditions
+func makeGateway(name, namespace string, isReady bool) *gwv1.Gateway {
+ status := metav1.ConditionFalse
+ if isReady {
+ status = metav1.ConditionTrue
+ }
+ return &gwv1.Gateway{
+ ObjectMeta: metav1.ObjectMeta{
+ Name: name,
+ Namespace: namespace,
+ },
+ Status: gwv1.GatewayStatus{
+ Conditions: []metav1.Condition{
+ {
+ Type: string(gwv1.GatewayConditionAccepted),
+ Status: status,
+ },
+ {
+ Type: string(gwv1.GatewayConditionProgrammed),
+ Status: status,
+ },
+ },
+ },
+ }
+}
+
+// MakeHTTPRoute is a helper function to return an HTTPRoute object
+func makeHTTPRoute(name, namespace string, isReady bool) *gwv1.HTTPRoute {
+ status := metav1.ConditionFalse
+ if isReady {
+ status = metav1.ConditionTrue
+ }
+ return &gwv1.HTTPRoute{
+ ObjectMeta: metav1.ObjectMeta{
+ Name: name,
+ Namespace: namespace,
+ },
+ Status: gwv1.HTTPRouteStatus{
+ RouteStatus: gwv1.RouteStatus{
+ Parents: []gwv1.RouteParentStatus{
+ {
+ ParentRef: gwv1.ParentReference{
+ Name: gwv1.ObjectName("test-rayservice-gateway"),
+ Namespace: ptr.To(gwv1.Namespace(namespace)),
+ },
+ Conditions: []metav1.Condition{
+ {
+ Type: string(gwv1.RouteConditionAccepted),
+ Status: status,
+ },
+ {
+ Type: string(gwv1.RouteConditionResolvedRefs),
+ Status: status,
+ },
+ },
+ },
+ },
+ },
+ },
+ }
+}
+
+func TestCheckIfNeedTargetCapacityUpdate(t *testing.T) {
+ rayServiceName := "test-rayservice"
+ gatewayName := fmt.Sprintf("%s-%s", rayServiceName, "gateway")
+ httpRouteName := fmt.Sprintf("%s-%s", rayServiceName, "httproute")
+ namespace := "test-ns"
+
+ tests := []struct {
+ name string
+ expectedReason string
+ runtimeObjects []runtime.Object
+ activeStatus rayv1.RayServiceStatus
+ pendingStatus rayv1.RayServiceStatus
+ expectedNeedsUpdate bool
+ }{
+ {
+ name: "Missing RayClusterNames",
+ expectedNeedsUpdate: false,
+ expectedReason: "Both active and pending RayCluster instances are required for NewClusterWithIncrementalUpgrade.",
+ },
+ {
+ name: "Gateway not ready",
+ activeStatus: rayv1.RayServiceStatus{RayClusterName: "active"},
+ pendingStatus: rayv1.RayServiceStatus{RayClusterName: "pending"},
+ runtimeObjects: []runtime.Object{
+ makeGateway(gatewayName, namespace, false), makeHTTPRoute(httpRouteName, namespace, true),
+ },
+ expectedNeedsUpdate: false,
+ expectedReason: "Gateway for RayService NewClusterWithIncrementalUpgrade is not ready.",
+ },
+ {
+ name: "HTTPRoute not ready",
+ activeStatus: rayv1.RayServiceStatus{RayClusterName: "active"},
+ pendingStatus: rayv1.RayServiceStatus{RayClusterName: "pending"},
+ runtimeObjects: []runtime.Object{
+ makeGateway(gatewayName, namespace, true), makeHTTPRoute(httpRouteName, namespace, false),
+ },
+ expectedNeedsUpdate: false,
+ expectedReason: "HTTPRoute for RayService NewClusterWithIncrementalUpgrade is not ready.",
+ },
+ {
+ name: "NewClusterWithIncrementalUpgrade is complete",
+ activeStatus: rayv1.RayServiceStatus{
+ RayClusterName: "active",
+ TargetCapacity: ptr.To(int32(0)),
+ TrafficRoutedPercent: ptr.To(int32(0)),
+ },
+ pendingStatus: rayv1.RayServiceStatus{
+ RayClusterName: "pending",
+ TargetCapacity: ptr.To(int32(100)),
+ TrafficRoutedPercent: ptr.To(int32(100)),
+ },
+ runtimeObjects: []runtime.Object{
+ makeGateway(gatewayName, namespace, true), makeHTTPRoute(httpRouteName, namespace, true),
+ },
+ expectedNeedsUpdate: false,
+ expectedReason: "All traffic has migrated to the upgraded cluster and NewClusterWithIncrementalUpgrade is complete.",
+ },
+ {
+ name: "Pending RayCluster is still incrementally scaling",
+ activeStatus: rayv1.RayServiceStatus{
+ RayClusterName: "active",
+ TargetCapacity: ptr.To(int32(70)),
+ TrafficRoutedPercent: ptr.To(int32(70)),
+ },
+ pendingStatus: rayv1.RayServiceStatus{
+ RayClusterName: "pending",
+ TargetCapacity: ptr.To(int32(30)),
+ TrafficRoutedPercent: ptr.To(int32(30)),
+ },
+ runtimeObjects: []runtime.Object{
+ makeGateway(gatewayName, namespace, true), makeHTTPRoute(httpRouteName, namespace, true),
+ },
+ expectedNeedsUpdate: true,
+ expectedReason: "Pending RayCluster has not finished scaling up.",
+ },
+ }
+
+ for _, tt := range tests {
+ t.Run(tt.name, func(t *testing.T) {
+ newScheme := runtime.NewScheme()
+ _ = corev1.AddToScheme(newScheme)
+ _ = gwv1.AddToScheme(newScheme)
+ fakeClient := clientFake.NewClientBuilder().WithScheme(newScheme).WithRuntimeObjects(tt.runtimeObjects...).Build()
+ // Initialize RayService reconciler.
+ ctx := context.TODO()
+ r := RayServiceReconciler{
+ Client: fakeClient,
+ Recorder: &record.FakeRecorder{},
+ Scheme: scheme.Scheme,
+ }
+ rayService := &rayv1.RayService{
+ ObjectMeta: metav1.ObjectMeta{Name: rayServiceName, Namespace: namespace},
+ Status: rayv1.RayServiceStatuses{
+ ActiveServiceStatus: tt.activeStatus,
+ PendingServiceStatus: tt.pendingStatus,
+ },
+ }
+ needsUpdate, reason := r.checkIfNeedTargetCapacityUpdate(ctx, rayService)
+ assert.Equal(t, tt.expectedNeedsUpdate, needsUpdate)
+ assert.Equal(t, tt.expectedReason, reason)
+ })
+ }
+}
+
+func TestReconcilePerClusterServeService(t *testing.T) {
+ features.SetFeatureGateDuringTest(t, features.RayServiceIncrementalUpgrade, true)
+
+ ctx := context.TODO()
+ namespace := "test-ns"
+
+ // Minimal RayCluster with at least one container.
+ rayCluster := &rayv1.RayCluster{
+ ObjectMeta: metav1.ObjectMeta{
+ Name: "test-ray-cluster",
+ Namespace: namespace,
+ UID: "test-uid",
+ },
+ Spec: rayv1.RayClusterSpec{
+ HeadGroupSpec: rayv1.HeadGroupSpec{
+ Template: corev1.PodTemplateSpec{
+ Spec: corev1.PodSpec{
+ Containers: []corev1.Container{
+ {Name: "ray-head"},
+ },
+ },
+ },
+ },
+ },
+ }
+ rayService := makeIncrementalUpgradeRayService(
+ true,
+ "istio",
+ ptr.To(int32(20)),
+ ptr.To(int32(30)),
+ ptr.To(int32(80)),
+ ptr.To(metav1.Now()),
+ )
+
+ // The expected pending RayCluster serve service.
+ expectedServeSvcName := utils.GenerateServeServiceName(rayCluster.Name)
+ expectedServeService := &corev1.Service{
+ ObjectMeta: metav1.ObjectMeta{
+ Name: expectedServeSvcName,
+ Namespace: namespace,
+ },
+ Spec: corev1.ServiceSpec{
+ Selector: map[string]string{
+ utils.RayClusterLabelKey: rayCluster.Name,
+ utils.RayClusterServingServiceLabelKey: "true",
+ },
+ },
+ }
+
+ tests := []struct {
+ name string
+ rayCluster *rayv1.RayCluster
+ runtimeObjects []runtime.Object
+ expectServiceCreated bool
+ expectError bool
+ }{
+ {
+ name: "RayCluster is nil, no-op.",
+ rayCluster: nil,
+ runtimeObjects: []runtime.Object{rayService},
+ expectServiceCreated: false,
+ expectError: false,
+ },
+ {
+ name: "Create a new Serve service for the RayCluster.",
+ rayCluster: rayCluster,
+ runtimeObjects: []runtime.Object{rayService, rayCluster},
+ expectServiceCreated: true,
+ expectError: false,
+ },
+ {
+ name: "Pending RayCluster serve service already exists, no-op.",
+ rayCluster: rayCluster,
+ runtimeObjects: []runtime.Object{rayService, rayCluster, expectedServeService},
+ expectServiceCreated: false,
+ expectError: false,
+ },
+ }
+
+ for _, tt := range tests {
+ t.Run(tt.name, func(t *testing.T) {
+ newScheme := runtime.NewScheme()
+ _ = rayv1.AddToScheme(newScheme)
+ _ = corev1.AddToScheme(newScheme)
+
+ fakeClient := clientFake.NewClientBuilder().WithScheme(newScheme).WithRuntimeObjects(tt.runtimeObjects...).Build()
+ reconciler := RayServiceReconciler{
+ Client: fakeClient,
+ Scheme: newScheme,
+ Recorder: record.NewFakeRecorder(1),
+ }
+
+ err := reconciler.reconcilePerClusterServeService(ctx, rayService, tt.rayCluster)
+
+ if tt.expectError {
+ require.Error(t, err)
+ return
+ }
+ require.NoError(t, err)
+
+ reconciledSvc := &corev1.Service{}
+ err = fakeClient.Get(ctx, client.ObjectKey{Name: expectedServeSvcName, Namespace: namespace}, reconciledSvc)
+
+ // No-op case, no service should be created when RayCluster is nil.
+ if tt.rayCluster == nil {
+ assert.True(t, errors.IsNotFound(err))
+ return
+ }
+
+ // Otherwise, a valid serve service should be created for the RayCluster.
+ require.NoError(t, err, "The Serve service should exist in the client")
+
+ // Validate the expected Serve service exists for the RayCluster.
+ require.NotNil(t, reconciledSvc)
+ assert.Equal(t, expectedServeSvcName, reconciledSvc.Name)
+
+			// Verify the Serve service selector, reusing the service fetched above instead of
+			// issuing a second identical Get.
+			expectedSelector := map[string]string{
+				utils.RayClusterLabelKey:               rayCluster.Name,
+				utils.RayClusterServingServiceLabelKey: "true",
+			}
+			assert.Equal(t, expectedSelector, reconciledSvc.Spec.Selector)
+
+ // Validate owner ref is set to the expected RayCluster.
+ if tt.expectServiceCreated {
+				require.Len(t, reconciledSvc.OwnerReferences, 1)
+				ownerRef := reconciledSvc.OwnerReferences[0]
+ assert.Equal(t, rayCluster.Name, ownerRef.Name)
+ assert.Equal(t, "RayCluster", ownerRef.Kind)
+ assert.Equal(t, rayCluster.UID, ownerRef.UID)
+ }
+ })
+ }
+}
diff --git a/ray-operator/controllers/ray/utils/consistency.go b/ray-operator/controllers/ray/utils/consistency.go
index 2c2ba0fe616..4d04e9f5e3d 100644
--- a/ray-operator/controllers/ray/utils/consistency.go
+++ b/ray-operator/controllers/ray/utils/consistency.go
@@ -4,6 +4,7 @@ import (
"reflect"
rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1"
+ "github.com/ray-project/kuberay/ray-operator/pkg/features"
)
// Checks whether the old and new RayClusterStatus are inconsistent by comparing different fields. If the only
@@ -74,6 +75,15 @@ func inconsistentRayServiceStatus(oldStatus rayv1.RayServiceStatus, newStatus ra
}
}
+ if features.Enabled(features.RayServiceIncrementalUpgrade) {
+		// Also check for changes in IncrementalUpgrade related Status fields. These are pointer
+		// fields, so compare the values they point to rather than the pointer addresses.
+		if !reflect.DeepEqual(oldStatus.TrafficRoutedPercent, newStatus.TrafficRoutedPercent) ||
+			!reflect.DeepEqual(oldStatus.TargetCapacity, newStatus.TargetCapacity) ||
+			!reflect.DeepEqual(oldStatus.LastTrafficMigratedTime, newStatus.LastTrafficMigratedTime) {
+ return true
+ }
+ }
+
return false
}
diff --git a/ray-operator/controllers/ray/utils/constant.go b/ray-operator/controllers/ray/utils/constant.go
index 20e44e0f888..c4988850137 100644
--- a/ray-operator/controllers/ray/utils/constant.go
+++ b/ray-operator/controllers/ray/utils/constant.go
@@ -87,6 +87,10 @@ const (
MetricsPortName = "metrics"
ServingPortName = "serve"
+ // Gateway defaults for HTTP protocol
+ GatewayListenerPortName = "http"
+ DefaultGatewayListenerPort = 80
+
// The default AppProtocol for Kubernetes service
DefaultServiceAppProtocol = "tcp"
@@ -324,12 +328,22 @@ const (
RayClusterNotFound K8sEventType = "RayClusterNotFound"
// RayService event list
+ CreatedGateway K8sEventType = "CreatedGateway"
+ CreatedHTTPRoute K8sEventType = "CreatedHTTPRoute"
InvalidRayServiceSpec K8sEventType = "InvalidRayServiceSpec"
InvalidRayServiceMetadata K8sEventType = "InvalidRayServiceMetadata"
UpdatedHeadPodServeLabel K8sEventType = "UpdatedHeadPodServeLabel"
+ UpdatedGateway K8sEventType = "UpdatedGateway"
+ UpdatedHTTPRoute K8sEventType = "UpdatedHTTPRoute"
UpdatedServeApplications K8sEventType = "UpdatedServeApplications"
+ UpdatedServeTargetCapacity K8sEventType = "UpdatedServeTargetCapacity"
FailedToUpdateHeadPodServeLabel K8sEventType = "FailedToUpdateHeadPodServeLabel"
FailedToUpdateServeApplications K8sEventType = "FailedToUpdateServeApplications"
+ FailedToUpdateTargetCapacity K8sEventType = "FailedToUpdateTargetCapacity"
+ FailedToCreateGateway K8sEventType = "FailedToCreateGateway"
+ FailedToUpdateGateway K8sEventType = "FailedToUpdateGateway"
+ FailedToCreateHTTPRoute K8sEventType = "FailedToCreateHTTPRoute"
+ FailedToUpdateHTTPRoute K8sEventType = "FailedToUpdateHTTPRoute"
// Generic Pod event list
DeletedPod K8sEventType = "DeletedPod"
diff --git a/ray-operator/controllers/ray/utils/fake_serve_httpclient.go b/ray-operator/controllers/ray/utils/fake_serve_httpclient.go
index 21a3fdb91be..1bf0588c403 100644
--- a/ray-operator/controllers/ray/utils/fake_serve_httpclient.go
+++ b/ray-operator/controllers/ray/utils/fake_serve_httpclient.go
@@ -12,9 +12,10 @@ import (
)
type FakeRayDashboardClient struct {
- multiAppStatuses map[string]*utiltypes.ServeApplicationStatus
- GetJobInfoMock atomic.Pointer[func(context.Context, string) (*utiltypes.RayJobInfo, error)]
- serveDetails utiltypes.ServeDetails
+ multiAppStatuses map[string]*utiltypes.ServeApplicationStatus
+ GetJobInfoMock atomic.Pointer[func(context.Context, string) (*utiltypes.RayJobInfo, error)]
+ serveDetails utiltypes.ServeDetails
+ LastUpdatedConfig []byte
}
var _ dashboardclient.RayDashboardClientInterface = (*FakeRayDashboardClient)(nil)
@@ -22,7 +23,8 @@ var _ dashboardclient.RayDashboardClientInterface = (*FakeRayDashboardClient)(ni
func (r *FakeRayDashboardClient) InitClient(_ *http.Client, _ string) {
}
-func (r *FakeRayDashboardClient) UpdateDeployments(_ context.Context, _ []byte) error {
+func (r *FakeRayDashboardClient) UpdateDeployments(_ context.Context, configJson []byte) error {
+ r.LastUpdatedConfig = configJson
fmt.Print("UpdateDeployments fake succeeds.")
return nil
}
diff --git a/ray-operator/controllers/ray/utils/util.go b/ray-operator/controllers/ray/utils/util.go
index e7406b44c99..540162bf9f4 100644
--- a/ray-operator/controllers/ray/utils/util.go
+++ b/ray-operator/controllers/ray/utils/util.go
@@ -16,17 +16,21 @@ import (
batchv1 "k8s.io/api/batch/v1"
corev1 "k8s.io/api/core/v1"
+ meta "k8s.io/apimachinery/pkg/api/meta"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/json"
"k8s.io/apimachinery/pkg/util/rand"
"k8s.io/client-go/discovery"
+ "k8s.io/utils/ptr"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/manager"
+ gwv1 "sigs.k8s.io/gateway-api/apis/v1"
rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1"
"github.com/ray-project/kuberay/ray-operator/controllers/ray/utils/dashboardclient"
+ "github.com/ray-project/kuberay/ray-operator/pkg/features"
)
const (
@@ -605,10 +609,10 @@ func GenerateJsonHash(obj interface{}) (string, error) {
// FindContainerPort searches for a specific port $portName in the container.
// If the port is found in the container, the corresponding port is returned.
// If the port is not found, the $defaultPort is returned instead.
-func FindContainerPort(container *corev1.Container, portName string, defaultPort int) int {
+func FindContainerPort(container *corev1.Container, portName string, defaultPort int32) int32 {
for _, port := range container.Ports {
if port.Name == portName {
- return int(port.ContainerPort)
+ return port.ContainerPort
}
}
return defaultPort
@@ -681,6 +685,115 @@ func GetRayClusterNameFromService(svc *corev1.Service) string {
return svc.Spec.Selector[RayClusterLabelKey]
}
+// IsGatewayReady checks if a Gateway is considered "ready".
+//
+// A Gateway is "ready" only if both the `Accepted` and `Programmed` conditions
+// are set to 'True'.
+//
+// 1. 'Accepted': Signifies that the Gateway controller understands and accepts
+// the Gateway resource. If 'False', it often indicates a conflict or an invalid
+// specification.
+//
+// 2. 'Programmed': Signifies that the underlying network infrastructure for the Gateway
+// (e.g. load balancer) has been successfully provisioned and configured.
+func IsGatewayReady(gatewayInstance *gwv1.Gateway) bool {
+ if gatewayInstance == nil {
+ return false
+ }
+
+ hasAccepted := meta.IsStatusConditionTrue(gatewayInstance.Status.Conditions, string(gwv1.GatewayConditionAccepted))
+ hasProgrammed := meta.IsStatusConditionTrue(gatewayInstance.Status.Conditions, string(gwv1.GatewayConditionProgrammed))
+
+ return hasAccepted && hasProgrammed
+}
+
+// IsHTTPRouteReady checks if an HTTPRoute is considered ready for a given Gateway.
+//
+// It returns true only if the route's parent status entry matching the Gateway has both
+// the 'Accepted' and 'ResolvedRefs' conditions set to 'True'.
+//
+// 1. 'Accepted': Signifies that the Gateway controller has validated the HTTPRoute's
+// configuration (e.g. syntax, filters, matching rules). An 'Accepted' status of
+// 'False' means the route's specification is invalid.
+//
+// 2. 'ResolvedRefs': Signifies that all references within the route are valid, exist,
+// and are resolvable by the Gateway.
+func IsHTTPRouteReady(gatewayInstance *gwv1.Gateway, httpRouteInstance *gwv1.HTTPRoute) bool {
+ if httpRouteInstance == nil {
+ return false
+ }
+ for _, parent := range httpRouteInstance.Status.Parents {
+ if parent.ParentRef.Name != gwv1.ObjectName(gatewayInstance.Name) {
+ continue
+ }
+ if parent.ParentRef.Namespace != nil && *parent.ParentRef.Namespace != gwv1.Namespace(gatewayInstance.Namespace) {
+ continue
+ }
+ hasAccepted := meta.IsStatusConditionTrue(parent.Conditions, string(gwv1.RouteConditionAccepted))
+ hasResolved := meta.IsStatusConditionTrue(parent.Conditions, string(gwv1.RouteConditionResolvedRefs))
+
+ if hasAccepted && hasResolved {
+ return true
+ }
+ }
+ return false
+}
+
+func IsIncrementalUpgradeEnabled(spec *rayv1.RayServiceSpec) bool {
+ if !features.Enabled(features.RayServiceIncrementalUpgrade) {
+ return false
+ }
+	return spec != nil && spec.UpgradeStrategy != nil && spec.UpgradeStrategy.Type != nil &&
+		*spec.UpgradeStrategy.Type == rayv1.NewClusterWithIncrementalUpgrade
+}
+
+func GetRayServiceClusterUpgradeOptions(spec *rayv1.RayServiceSpec) *rayv1.ClusterUpgradeOptions {
+ if spec != nil && spec.UpgradeStrategy != nil {
+ return spec.UpgradeStrategy.ClusterUpgradeOptions
+ }
+ return nil
+}
+
+// IsIncrementalUpgradeComplete checks if the conditions for completing an incremental upgrade are met.
+func IsIncrementalUpgradeComplete(rayServiceInstance *rayv1.RayService, pendingCluster *rayv1.RayCluster) bool {
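+	// ptr.Deref with a -1 default ensures that unset status fields can never satisfy the completion checks.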
+ return pendingCluster != nil &&
+ ptr.Deref(rayServiceInstance.Status.ActiveServiceStatus.TargetCapacity, -1) == 0 &&
+ ptr.Deref(rayServiceInstance.Status.PendingServiceStatus.TrafficRoutedPercent, -1) == 100
+}
+
+// GetWeightsFromHTTPRoute parses a given HTTPRoute object and extracts the traffic weights
+// for the active and pending clusters (if present) of a RayService.
+func GetWeightsFromHTTPRoute(httpRoute *gwv1.HTTPRoute, rayServiceInstance *rayv1.RayService) (activeWeight int32, pendingWeight int32) {
+ var activeClusterName, pendingClusterName string
+ if rayServiceInstance != nil {
+ activeClusterName = rayServiceInstance.Status.ActiveServiceStatus.RayClusterName
+ pendingClusterName = rayServiceInstance.Status.PendingServiceStatus.RayClusterName
+ }
+
+	// Default to -1 when the weights can't be detected, so that TrafficRoutedPercent is not
+	// set before the HTTPRoute actually exists.
+ activeWeight = -1
+ pendingWeight = -1
+
+ if httpRoute == nil || len(httpRoute.Spec.Rules) == 0 || len(httpRoute.Spec.Rules[0].BackendRefs) == 0 {
+ return
+ }
+
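+	// Backend service names are generated from the owning cluster's name (see GenerateServeServiceName),
+	// so a substring match is enough to attribute each backendRef's weight to a cluster.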
+ for _, backendRef := range httpRoute.Spec.Rules[0].BackendRefs {
+ backendName := string(backendRef.Name)
+ weight := ptr.Deref(backendRef.Weight, 0)
+
+ if activeClusterName != "" && strings.Contains(backendName, activeClusterName) {
+ activeWeight = weight
+ }
+ if pendingClusterName != "" && strings.Contains(backendName, pendingClusterName) {
+ pendingWeight = weight
+ }
+ }
+
+ return
+}
+
// Check where we are running. We are trying to distinguish here whether
// this is vanilla kubernetes cluster or Openshift
func GetClusterType() bool {
diff --git a/ray-operator/controllers/ray/utils/util_test.go b/ray-operator/controllers/ray/utils/util_test.go
index 851e37af3ea..8bd37a2e7f8 100644
--- a/ray-operator/controllers/ray/utils/util_test.go
+++ b/ray-operator/controllers/ray/utils/util_test.go
@@ -12,9 +12,11 @@ import (
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/utils/ptr"
+ gwv1 "sigs.k8s.io/gateway-api/apis/v1"
rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1"
"github.com/ray-project/kuberay/ray-operator/controllers/ray/utils/dashboardclient"
+ "github.com/ray-project/kuberay/ray-operator/pkg/features"
)
func TestGetClusterDomainName(t *testing.T) {
@@ -486,11 +488,11 @@ func TestFindContainerPort(t *testing.T) {
},
}
port := FindContainerPort(&container, "port1", -1)
- assert.NotEqual(t, port, -1, "expect port1 found")
+ assert.NotEqual(t, port, int32(-1), "expect port1 found")
port = FindContainerPort(&container, "port2", -1)
- assert.NotEqual(t, port, -1, "expect port2 found")
+ assert.NotEqual(t, port, int32(-1), "expect port2 found")
port = FindContainerPort(&container, "port3", -1)
- assert.Equal(t, port, -1, "expect port3 not found")
+ assert.Equal(t, port, int32(-1), "expect port3 not found")
}
func TestGenerateHeadServiceName(t *testing.T) {
@@ -1248,6 +1250,235 @@ func TestCalculateResources(t *testing.T) {
}
}
+// makeGatewayWithCondition returns a Gateway whose status carries the requested conditions, for testing.
+func makeGatewayWithCondition(accepted bool, programmed bool) *gwv1.Gateway {
+ var conditions []metav1.Condition
+
+ if accepted {
+ conditions = append(conditions, metav1.Condition{
+ Type: string(gwv1.GatewayConditionAccepted),
+ Status: metav1.ConditionTrue,
+ })
+ }
+
+ if programmed {
+ conditions = append(conditions, metav1.Condition{
+ Type: string(gwv1.GatewayConditionProgrammed),
+ Status: metav1.ConditionTrue,
+ })
+ }
+
+ return &gwv1.Gateway{
+ Status: gwv1.GatewayStatus{
+ Conditions: conditions,
+ },
+ }
+}
+
+func TestIsGatewayReady(t *testing.T) {
+ tests := []struct {
+ gateway *gwv1.Gateway
+ name string
+ expected bool
+ }{
+ {
+ name: "missing Gateway instance",
+ gateway: nil,
+ expected: false,
+ },
+ {
+ name: "Gateway created with Programmed condition only",
+ gateway: makeGatewayWithCondition(false, true),
+ expected: false,
+ },
+ {
+ name: "Gateway created with Accepted condition only",
+ gateway: makeGatewayWithCondition(true, false),
+ expected: false,
+ },
+ {
+ name: "Gateway created with both Accepted and Programmed conditions",
+ gateway: makeGatewayWithCondition(true, true),
+ expected: true,
+ },
+ }
+
+ for _, tt := range tests {
+ t.Run(tt.name, func(t *testing.T) {
+ assert.Equal(t, tt.expected, IsGatewayReady(tt.gateway))
+ })
+ }
+}
+
+// makeHTTPRouteWithParentRef returns an HTTPRoute whose status has a parent entry with the given conditions, for testing.
+func makeHTTPRouteWithParentRef(
+ parentRefName string,
+ namespace string,
+ accepted bool,
+ resolvedRefs bool,
+) *gwv1.HTTPRoute {
+ var acceptedStatus, resolvedRefsStatus metav1.ConditionStatus
+ if accepted {
+ acceptedStatus = metav1.ConditionTrue
+ } else {
+ acceptedStatus = metav1.ConditionFalse
+ }
+ if resolvedRefs {
+ resolvedRefsStatus = metav1.ConditionTrue
+ } else {
+ resolvedRefsStatus = metav1.ConditionFalse
+ }
+
+ return &gwv1.HTTPRoute{
+ Status: gwv1.HTTPRouteStatus{
+ RouteStatus: gwv1.RouteStatus{
+ Parents: []gwv1.RouteParentStatus{
+ {
+ ParentRef: gwv1.ParentReference{
+ Name: gwv1.ObjectName(parentRefName),
+ Namespace: ptr.To(gwv1.Namespace(namespace)),
+ },
+ Conditions: []metav1.Condition{
+ {
+ Type: string(gwv1.RouteConditionAccepted),
+ Status: acceptedStatus,
+ },
+ {
+ Type: string(gwv1.RouteConditionResolvedRefs),
+ Status: resolvedRefsStatus,
+ },
+ },
+ },
+ },
+ },
+ },
+ }
+}
+
+func TestIsHTTPRouteReady(t *testing.T) {
+ gateway := &gwv1.Gateway{
+ ObjectMeta: metav1.ObjectMeta{Name: "test-gateway", Namespace: "test-ns"},
+ }
+
+ tests := []struct {
+ httpRoute *gwv1.HTTPRoute
+ name string
+ expected bool
+ }{
+ {
+ name: "missing HTTPRoute",
+ httpRoute: nil,
+ expected: false,
+ },
+ {
+ name: "ParentRef does not match",
+ httpRoute: makeHTTPRouteWithParentRef("not-a-match", "other-test-ns", true, true),
+ expected: false,
+ },
+ {
+ name: "matching ParentRef with Accepted condition but without ResolvedRefs",
+ httpRoute: makeHTTPRouteWithParentRef("test-gateway", "test-ns", true, false),
+ expected: false,
+ },
+ {
+ name: "matching ParentRef with ResolvedRefs but without Accepted",
+ httpRoute: makeHTTPRouteWithParentRef("test-gateway", "test-ns", false, true),
+ expected: false,
+ },
+ {
+ name: "ready HTTPRoute with all required conditions",
+ httpRoute: makeHTTPRouteWithParentRef("test-gateway", "test-ns", true, true),
+ expected: true,
+ },
+ }
+
+ for _, tt := range tests {
+ t.Run(tt.name, func(t *testing.T) {
+ assert.Equal(t, tt.expected, IsHTTPRouteReady(gateway, tt.httpRoute))
+ })
+ }
+}
+
+func TestIsIncrementalUpgradeEnabled(t *testing.T) {
+ tests := []struct {
+ spec *rayv1.RayServiceSpec
+ name string
+ featureEnabled bool
+ expected bool
+ }{
+ {
+ name: "missing UpgradeStrategy Type",
+ spec: &rayv1.RayServiceSpec{},
+ featureEnabled: true,
+ expected: false,
+ },
+ {
+ name: "UpgradeStrategy Type is NewClusterWithIncrementalUpgrade but feature disabled",
+ spec: &rayv1.RayServiceSpec{
+ UpgradeStrategy: &rayv1.RayServiceUpgradeStrategy{
+ Type: ptr.To(rayv1.NewClusterWithIncrementalUpgrade),
+ },
+ },
+ featureEnabled: false,
+ expected: false,
+ },
+ {
+ name: "UpgradeStrategy Type is NewClusterWithIncrementalUpgrade and feature enabled",
+ spec: &rayv1.RayServiceSpec{
+ UpgradeStrategy: &rayv1.RayServiceUpgradeStrategy{
+ Type: ptr.To(rayv1.NewClusterWithIncrementalUpgrade),
+ },
+ },
+ featureEnabled: true,
+ expected: true,
+ },
+ }
+
+ for _, tc := range tests {
+ t.Run(tc.name, func(t *testing.T) {
+ features.SetFeatureGateDuringTest(t, features.RayServiceIncrementalUpgrade, tc.featureEnabled)
+ assert.Equal(t, tc.expected, IsIncrementalUpgradeEnabled(tc.spec))
+ })
+ }
+}
+
+func TestGetRayServiceClusterUpgradeOptions(t *testing.T) {
+ upgradeOptions := &rayv1.ClusterUpgradeOptions{GatewayClassName: "gateway-class"}
+
+ tests := []struct {
+ rayServiceSpec *rayv1.RayServiceSpec
+ expectedOptions *rayv1.ClusterUpgradeOptions
+ name string
+ }{
+ {
+ name: "RayServiceSpec is nil, return nil ClusterUpgradeOptions",
+ rayServiceSpec: nil,
+ expectedOptions: nil,
+ },
+ {
+ name: "UpgradeStrategy is nil, return nil ClusterUpgradeOptions",
+ rayServiceSpec: &rayv1.RayServiceSpec{},
+ expectedOptions: nil,
+ },
+ {
+ name: "Valid ClusterUpgradeOptions",
+ rayServiceSpec: &rayv1.RayServiceSpec{
+ UpgradeStrategy: &rayv1.RayServiceUpgradeStrategy{
+ ClusterUpgradeOptions: upgradeOptions,
+ },
+ },
+ expectedOptions: upgradeOptions,
+ },
+ }
+
+ for _, tt := range tests {
+ t.Run(tt.name, func(t *testing.T) {
+ actualOptions := GetRayServiceClusterUpgradeOptions(tt.rayServiceSpec)
+ assert.Equal(t, tt.expectedOptions, actualOptions)
+ })
+ }
+}
+
func TestGetContainerCommand(t *testing.T) {
tests := []struct {
name string
@@ -1291,3 +1522,96 @@ func TestGetContainerCommand(t *testing.T) {
})
}
}
+
+func TestGetWeightsFromHTTPRoute(t *testing.T) {
+ activeClusterName := "rayservice-active"
+ pendingClusterName := "rayservice-pending"
+
+ // Helper to create a RayService with specified cluster names in its status.
+ makeRayService := func(activeName, pendingName string) *rayv1.RayService {
+ return &rayv1.RayService{
+ Status: rayv1.RayServiceStatuses{
+ ActiveServiceStatus: rayv1.RayServiceStatus{RayClusterName: activeName},
+ PendingServiceStatus: rayv1.RayServiceStatus{RayClusterName: pendingName},
+ },
+ }
+ }
+
+ // Helper to create an HTTPRoute with specified backend weights.
+ makeHTTPRoute := func(activeWeight, pendingWeight *int32) *gwv1.HTTPRoute {
+ backends := []gwv1.HTTPBackendRef{}
+ if activeWeight != nil {
+ backends = append(backends, gwv1.HTTPBackendRef{
+ BackendRef: gwv1.BackendRef{
+ BackendObjectReference: gwv1.BackendObjectReference{Name: gwv1.ObjectName(GenerateServeServiceName(activeClusterName))},
+ Weight: activeWeight,
+ },
+ })
+ }
+ if pendingWeight != nil {
+ backends = append(backends, gwv1.HTTPBackendRef{
+ BackendRef: gwv1.BackendRef{
+ BackendObjectReference: gwv1.BackendObjectReference{Name: gwv1.ObjectName(GenerateServeServiceName(pendingClusterName))},
+ Weight: pendingWeight,
+ },
+ })
+ }
+ return &gwv1.HTTPRoute{
+ Spec: gwv1.HTTPRouteSpec{
+ Rules: []gwv1.HTTPRouteRule{{BackendRefs: backends}},
+ },
+ }
+ }
+
+ tests := []struct {
+ httpRoute *gwv1.HTTPRoute
+ rayService *rayv1.RayService
+ name string
+ expectedActive int32
+ expectedPending int32
+ }{
+ {
+ name: "No HTTPRoute, return defaults for both weights",
+ httpRoute: nil,
+ rayService: makeRayService(activeClusterName, ""),
+ expectedActive: -1,
+ expectedPending: -1,
+ },
+ {
+ name: "HTTPRoute with missing backends, return defaults for both weights",
+ httpRoute: &gwv1.HTTPRoute{Spec: gwv1.HTTPRouteSpec{Rules: []gwv1.HTTPRouteRule{{}}}},
+ rayService: makeRayService(activeClusterName, pendingClusterName),
+ expectedActive: -1,
+ expectedPending: -1,
+ },
+ {
+ name: "Valid weights returned for both active and pending clusters",
+ httpRoute: makeHTTPRoute(ptr.To(int32(80)), ptr.To(int32(20))),
+ rayService: makeRayService(activeClusterName, pendingClusterName),
+ expectedActive: 80,
+ expectedPending: 20,
+ },
+ {
+ name: "Valid HTTPRoute with only active cluster backend",
+ httpRoute: makeHTTPRoute(ptr.To(int32(100)), nil),
+ rayService: makeRayService(activeClusterName, ""),
+ expectedActive: 100,
+ expectedPending: -1,
+ },
+ {
+ name: "Valid HTTPRoute with only pending cluster backend",
+ httpRoute: makeHTTPRoute(nil, ptr.To(int32(100))),
+ rayService: makeRayService("", pendingClusterName),
+ expectedActive: -1,
+ expectedPending: 100,
+ },
+ }
+
+ for _, tt := range tests {
+ t.Run(tt.name, func(t *testing.T) {
+ active, pending := GetWeightsFromHTTPRoute(tt.httpRoute, tt.rayService)
+ assert.Equal(t, tt.expectedActive, active, "Active weight mismatch")
+ assert.Equal(t, tt.expectedPending, pending, "Pending weight mismatch")
+ })
+ }
+}
diff --git a/ray-operator/controllers/ray/utils/validation.go b/ray-operator/controllers/ray/utils/validation.go
index 7e9097dd846..e03bbd319b8 100644
--- a/ray-operator/controllers/ray/utils/validation.go
+++ b/ray-operator/controllers/ray/utils/validation.go
@@ -306,12 +306,13 @@ func ValidateRayServiceSpec(rayService *rayv1.RayService) error {
return fmt.Errorf("spec.rayClusterConfig.headGroupSpec.headService.metadata.name should not be set")
}
- // only NewCluster and None are valid upgradeType
+	// only NewClusterWithIncrementalUpgrade, NewCluster, and None are valid upgrade types
if rayService.Spec.UpgradeStrategy != nil &&
rayService.Spec.UpgradeStrategy.Type != nil &&
*rayService.Spec.UpgradeStrategy.Type != rayv1.None &&
- *rayService.Spec.UpgradeStrategy.Type != rayv1.NewCluster {
- return fmt.Errorf("Spec.UpgradeStrategy.Type value %s is invalid, valid options are %s or %s", *rayService.Spec.UpgradeStrategy.Type, rayv1.NewCluster, rayv1.None)
+ *rayService.Spec.UpgradeStrategy.Type != rayv1.NewCluster &&
+ *rayService.Spec.UpgradeStrategy.Type != rayv1.NewClusterWithIncrementalUpgrade {
+ return fmt.Errorf("Spec.UpgradeStrategy.Type value %s is invalid, valid options are %s, %s, or %s", *rayService.Spec.UpgradeStrategy.Type, rayv1.NewClusterWithIncrementalUpgrade, rayv1.NewCluster, rayv1.None)
}
if rayService.Spec.RayClusterDeletionDelaySeconds != nil &&
@@ -319,6 +320,41 @@ func ValidateRayServiceSpec(rayService *rayv1.RayService) error {
return fmt.Errorf("Spec.RayClusterDeletionDelaySeconds should be a non-negative integer, got %d", *rayService.Spec.RayClusterDeletionDelaySeconds)
}
+ // If type is NewClusterWithIncrementalUpgrade, validate the ClusterUpgradeOptions
+ if IsIncrementalUpgradeEnabled(&rayService.Spec) {
+ return ValidateClusterUpgradeOptions(rayService)
+ }
+
+ return nil
+}
+
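+// ValidateClusterUpgradeOptions validates the ClusterUpgradeOptions of a RayService using the
+// NewClusterWithIncrementalUpgrade strategy, which requires autoscaling to be enabled.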
+func ValidateClusterUpgradeOptions(rayService *rayv1.RayService) error {
+ if !IsAutoscalingEnabled(&rayService.Spec.RayClusterSpec) {
+ return fmt.Errorf("Ray Autoscaler is required for NewClusterWithIncrementalUpgrade")
+ }
+
+ options := rayService.Spec.UpgradeStrategy.ClusterUpgradeOptions
+ if options == nil {
+ return fmt.Errorf("ClusterUpgradeOptions are required for NewClusterWithIncrementalUpgrade")
+ }
+
+	// MaxSurgePercent defaults to 100% if unset, so only validate it when set.
+	if options.MaxSurgePercent != nil && (*options.MaxSurgePercent < 0 || *options.MaxSurgePercent > 100) {
+		return fmt.Errorf("maxSurgePercent must be between 0 and 100")
+ }
+
+	if options.StepSizePercent == nil || *options.StepSizePercent < 0 || *options.StepSizePercent > 100 {
+		return fmt.Errorf("stepSizePercent is required and must be between 0 and 100")
+ }
+
+	if options.IntervalSeconds == nil || *options.IntervalSeconds <= 0 {
+		return fmt.Errorf("intervalSeconds is required and must be greater than 0")
+ }
+
+ if options.GatewayClassName == "" {
+ return fmt.Errorf("gatewayClassName is required for NewClusterWithIncrementalUpgrade")
+ }
+
return nil
}
diff --git a/ray-operator/controllers/ray/utils/validation_test.go b/ray-operator/controllers/ray/utils/validation_test.go
index dbee9e612e7..2f3f7a64502 100644
--- a/ray-operator/controllers/ray/utils/validation_test.go
+++ b/ray-operator/controllers/ray/utils/validation_test.go
@@ -1664,3 +1664,112 @@ func createBasicRayClusterSpec() *rayv1.RayClusterSpec {
},
}
}
+
+func TestValidateClusterUpgradeOptions(t *testing.T) {
+ tests := []struct {
+ maxSurgePercent *int32
+ stepSizePercent *int32
+ intervalSeconds *int32
+ name string
+ gatewayClassName string
+ spec rayv1.RayServiceSpec
+ enableAutoscaling bool
+ expectError bool
+ }{
+ {
+ name: "valid config",
+ maxSurgePercent: ptr.To(int32(50)),
+ stepSizePercent: ptr.To(int32(50)),
+ intervalSeconds: ptr.To(int32(10)),
+ gatewayClassName: "istio",
+ enableAutoscaling: true,
+ expectError: false,
+ },
+ {
+ name: "missing autoscaler",
+ maxSurgePercent: ptr.To(int32(50)),
+ stepSizePercent: ptr.To(int32(50)),
+ intervalSeconds: ptr.To(int32(10)),
+ gatewayClassName: "istio",
+ enableAutoscaling: false,
+ expectError: true,
+ },
+ {
+ name: "missing options",
+ enableAutoscaling: true,
+ expectError: true,
+ },
+ {
+ name: "invalid MaxSurgePercent",
+ maxSurgePercent: ptr.To(int32(200)),
+ stepSizePercent: ptr.To(int32(50)),
+ intervalSeconds: ptr.To(int32(10)),
+ gatewayClassName: "istio",
+ enableAutoscaling: true,
+ expectError: true,
+ },
+ {
+ name: "missing StepSizePercent",
+ maxSurgePercent: ptr.To(int32(50)),
+ intervalSeconds: ptr.To(int32(10)),
+ gatewayClassName: "istio",
+ enableAutoscaling: true,
+ expectError: true,
+ },
+ {
+ name: "invalid IntervalSeconds",
+ maxSurgePercent: ptr.To(int32(50)),
+ stepSizePercent: ptr.To(int32(50)),
+ intervalSeconds: ptr.To(int32(0)),
+ gatewayClassName: "istio",
+ enableAutoscaling: true,
+ expectError: true,
+ },
+ {
+ name: "missing GatewayClassName",
+ maxSurgePercent: ptr.To(int32(50)),
+ stepSizePercent: ptr.To(int32(50)),
+ intervalSeconds: ptr.To(int32(10)),
+ enableAutoscaling: true,
+ expectError: true,
+ },
+ }
+
+ for _, tt := range tests {
+ t.Run(tt.name, func(t *testing.T) {
+ var upgradeStrategy *rayv1.RayServiceUpgradeStrategy
+ if tt.maxSurgePercent != nil || tt.stepSizePercent != nil || tt.intervalSeconds != nil || tt.gatewayClassName != "" {
+ upgradeStrategy = &rayv1.RayServiceUpgradeStrategy{
+ Type: ptr.To(rayv1.NewClusterWithIncrementalUpgrade),
+ ClusterUpgradeOptions: &rayv1.ClusterUpgradeOptions{
+ MaxSurgePercent: tt.maxSurgePercent,
+ StepSizePercent: tt.stepSizePercent,
+ IntervalSeconds: tt.intervalSeconds,
+ GatewayClassName: tt.gatewayClassName,
+ },
+ }
+ } else if tt.expectError {
+ upgradeStrategy = &rayv1.RayServiceUpgradeStrategy{
+ Type: ptr.To(rayv1.NewClusterWithIncrementalUpgrade),
+ }
+ }
+
+ rayClusterSpec := *createBasicRayClusterSpec()
+ rayClusterSpec.EnableInTreeAutoscaling = ptr.To(tt.enableAutoscaling)
+
+ rayService := &rayv1.RayService{
+ Spec: rayv1.RayServiceSpec{
+ RayClusterSpec: rayClusterSpec,
+ UpgradeStrategy: upgradeStrategy,
+ },
+ }
+
+ err := ValidateClusterUpgradeOptions(rayService)
+ if tt.expectError {
+ require.Error(t, err, tt.name)
+ } else {
+ require.NoError(t, err, tt.name)
+ }
+ })
+ }
+}
diff --git a/ray-operator/go.mod b/ray-operator/go.mod
index 94d155da29f..78f3870ae24 100644
--- a/ray-operator/go.mod
+++ b/ray-operator/go.mod
@@ -4,22 +4,21 @@ go 1.24.0
require (
github.com/Masterminds/semver/v3 v3.3.1
+ github.com/coder/websocket v1.8.13
github.com/go-logr/logr v1.4.3
github.com/go-logr/zapr v1.3.0
- github.com/google/go-cmp v0.7.0
github.com/jarcoal/httpmock v1.4.0
github.com/onsi/ginkgo/v2 v2.23.4
github.com/onsi/gomega v1.37.0
github.com/openshift/api v0.0.0-20250602203052-b29811a290c7
github.com/orcaman/concurrent-map/v2 v2.0.1
- github.com/pkg/errors v0.9.1
github.com/prometheus/client_golang v1.22.0
+ github.com/spf13/pflag v1.0.6
github.com/stretchr/testify v1.10.0
go.uber.org/mock v0.5.2
go.uber.org/zap v1.27.0
gopkg.in/natefinch/lumberjack.v2 v2.2.1
k8s.io/api v0.33.1
- k8s.io/apiextensions-apiserver v0.33.1
k8s.io/apimachinery v0.33.1
k8s.io/apiserver v0.33.1
k8s.io/client-go v0.33.1
@@ -28,6 +27,7 @@ require (
k8s.io/klog/v2 v2.130.1
k8s.io/utils v0.0.0-20250502105355-0f33e8f1c979
sigs.k8s.io/controller-runtime v0.21.0
+ sigs.k8s.io/gateway-api v1.3.0
sigs.k8s.io/scheduler-plugins v0.31.8
sigs.k8s.io/structured-merge-diff/v4 v4.7.0
sigs.k8s.io/yaml v1.4.0
@@ -38,19 +38,19 @@ require (
github.com/beorn7/perks v1.0.1 // indirect
github.com/blang/semver/v4 v4.0.0 // indirect
github.com/cespare/xxhash/v2 v2.3.0 // indirect
- github.com/coder/websocket v1.8.13 // indirect
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect
- github.com/emicklei/go-restful/v3 v3.11.0 // indirect
+ github.com/emicklei/go-restful/v3 v3.12.0 // indirect
github.com/evanphx/json-patch/v5 v5.9.11 // indirect
github.com/fsnotify/fsnotify v1.7.0 // indirect
github.com/fxamacker/cbor/v2 v2.7.0 // indirect
github.com/go-openapi/jsonpointer v0.21.0 // indirect
- github.com/go-openapi/jsonreference v0.20.2 // indirect
+ github.com/go-openapi/jsonreference v0.21.0 // indirect
github.com/go-openapi/swag v0.23.0 // indirect
github.com/go-task/slim-sprig/v3 v3.0.0 // indirect
github.com/gogo/protobuf v1.3.2 // indirect
github.com/google/btree v1.1.3 // indirect
github.com/google/gnostic-models v0.6.9 // indirect
+ github.com/google/go-cmp v0.7.0 // indirect
github.com/google/pprof v0.0.0-20250403155104-27863c87afa6 // indirect
github.com/google/uuid v1.6.0 // indirect
github.com/gorilla/websocket v1.5.4-0.20250319132907-e064f32e3674 // indirect
@@ -62,11 +62,11 @@ require (
github.com/modern-go/reflect2 v1.0.2 // indirect
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f // indirect
+ github.com/pkg/errors v0.9.1 // indirect
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
github.com/prometheus/client_model v0.6.1 // indirect
github.com/prometheus/common v0.62.0 // indirect
github.com/prometheus/procfs v0.15.1 // indirect
- github.com/spf13/pflag v1.0.5 // indirect
github.com/stretchr/objx v0.5.2 // indirect
github.com/x448/float16 v0.8.4 // indirect
go.opentelemetry.io/otel v1.33.0 // indirect
@@ -74,19 +74,20 @@ require (
go.uber.org/automaxprocs v1.6.0 // indirect
go.uber.org/multierr v1.11.0 // indirect
golang.org/x/mod v0.24.0 // indirect
- golang.org/x/net v0.38.0 // indirect
+ golang.org/x/net v0.39.0 // indirect
golang.org/x/oauth2 v0.27.0 // indirect
- golang.org/x/sync v0.12.0 // indirect
+ golang.org/x/sync v0.13.0 // indirect
golang.org/x/sys v0.32.0 // indirect
- golang.org/x/term v0.30.0 // indirect
- golang.org/x/text v0.23.0 // indirect
+ golang.org/x/term v0.31.0 // indirect
+ golang.org/x/text v0.24.0 // indirect
golang.org/x/time v0.9.0 // indirect
golang.org/x/tools v0.31.0 // indirect
gomodules.xyz/jsonpatch/v2 v2.4.0 // indirect
- google.golang.org/protobuf v1.36.5 // indirect
+ google.golang.org/protobuf v1.36.6 // indirect
gopkg.in/evanphx/json-patch.v4 v4.12.0 // indirect
gopkg.in/inf.v0 v0.9.1 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
+ k8s.io/apiextensions-apiserver v0.33.1 // indirect
k8s.io/gengo/v2 v2.0.0-20250207200755-1244d31929d7 // indirect
k8s.io/kube-openapi v0.0.0-20250318190949-c8a335a9a2ff // indirect
sigs.k8s.io/json v0.0.0-20241010143419-9aa6b5e7a4b3 // indirect
diff --git a/ray-operator/go.sum b/ray-operator/go.sum
index 6d6e0b27493..2d1825ab836 100644
--- a/ray-operator/go.sum
+++ b/ray-operator/go.sum
@@ -10,13 +10,12 @@ github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UF
github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
github.com/coder/websocket v1.8.13 h1:f3QZdXy7uGVz+4uCJy2nTZyM0yTBj8yANEHhqlXZ9FE=
github.com/coder/websocket v1.8.13/go.mod h1:LNVeNrXQZfe5qhS9ALED3uA+l5pPqvwXg3CKoDBB2gs=
-github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM=
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
-github.com/emicklei/go-restful/v3 v3.11.0 h1:rAQeMHw1c7zTmncogyy8VvRZwtkmkZ4FxERmMY4rD+g=
-github.com/emicklei/go-restful/v3 v3.11.0/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc=
+github.com/emicklei/go-restful/v3 v3.12.0 h1:y2DdzBAURM29NFF94q6RaY4vjIH1rtwDapwQtU84iWk=
+github.com/emicklei/go-restful/v3 v3.12.0/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc=
github.com/evanphx/json-patch v5.6.0+incompatible h1:jBYDEEiFBPxA0v50tFdvOzQQTCvpL6mnFh5mB2/l16U=
github.com/evanphx/json-patch v5.6.0+incompatible/go.mod h1:50XU6AFN0ol/bzJsmQLiYLvXMP4fmwYFNcr97nuDLSk=
github.com/evanphx/json-patch/v5 v5.9.11 h1:/8HVnzMq13/3x9TPvjG08wUGqBTmZBsCWzjTM0wiaDU=
@@ -29,12 +28,10 @@ github.com/go-logr/logr v1.4.3 h1:CjnDlHq8ikf6E492q6eKboGOC0T8CDaOvkHCIg8idEI=
github.com/go-logr/logr v1.4.3/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY=
github.com/go-logr/zapr v1.3.0 h1:XGdV8XW8zdwFiwOA2Dryh1gj2KRQyOOoNmBy4EplIcQ=
github.com/go-logr/zapr v1.3.0/go.mod h1:YKepepNBd1u/oyhd/yQmtjVXmm9uML4IXUgMOwR8/Gg=
-github.com/go-openapi/jsonpointer v0.19.6/go.mod h1:osyAmYz/mB/C3I+WsTTSgw1ONzaLJoLCyoi6/zppojs=
github.com/go-openapi/jsonpointer v0.21.0 h1:YgdVicSA9vH5RiHs9TZW5oyafXZFc6+2Vc1rr/O9oNQ=
github.com/go-openapi/jsonpointer v0.21.0/go.mod h1:IUyH9l/+uyhIYQ/PXVA41Rexl+kOkAPDdXEYns6fzUY=
-github.com/go-openapi/jsonreference v0.20.2 h1:3sVjiK66+uXK/6oQ8xgcRKcFgQ5KXa2KvnJRumpMGbE=
-github.com/go-openapi/jsonreference v0.20.2/go.mod h1:Bl1zwGIM8/wsvqjsOQLJ/SH+En5Ap4rVB5KVcIDZG2k=
-github.com/go-openapi/swag v0.22.3/go.mod h1:UzaqsxGiab7freDnrUUra0MwWfN/q7tE4j+VcZ0yl14=
+github.com/go-openapi/jsonreference v0.21.0 h1:Rs+Y7hSXT83Jacb7kFyjn4ijOuVGSvOdF2+tg1TRrwQ=
+github.com/go-openapi/jsonreference v0.21.0/go.mod h1:LmZmgsrTkVg9LG4EaHeY8cBDslNPMo06cago5JNLkm4=
github.com/go-openapi/swag v0.23.0 h1:vsEVJDUo2hPJ2tu0/Xc+4noaxyEffXNIs3cOULZ+GrE=
github.com/go-openapi/swag v0.23.0/go.mod h1:esZ8ITTYEsH1V2trKHjAN8Ai7xHb8RV+YSZ577vPjgQ=
github.com/go-task/slim-sprig/v3 v3.0.0 h1:sUs3vkvUymDpBKi3qH1YSqBQk9+9D/8M2mN1vB6EwHI=
@@ -67,11 +64,8 @@ github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI
github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck=
github.com/klauspost/compress v1.18.0 h1:c/Cqfb0r+Yi+JtIEq73FWXVkRonBlf0CRNYc8Zttxdo=
github.com/klauspost/compress v1.18.0/go.mod h1:2Pp+KzxcywXVXMr50+X0Q/Lsb43OQHYWRCY2AiWywWQ=
-github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI=
github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE=
github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk=
-github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
-github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc=
@@ -116,17 +110,12 @@ github.com/prometheus/procfs v0.15.1 h1:YagwOFzUgYfKKHX6Dr+sHT7km/hxC76UB0leargg
github.com/prometheus/procfs v0.15.1/go.mod h1:fB45yRUv8NstnjriLhBQLuOUt+WW4BsoGhij/e3PBqk=
github.com/rogpeppe/go-internal v1.13.1 h1:KvO1DLK/DRN07sQ1LQKScxyZJuNnedQ5/wKSR38lUII=
github.com/rogpeppe/go-internal v1.13.1/go.mod h1:uMEvuHeurkdAXX61udpOXGD/AzZDWNMNyH2VO9fmH0o=
-github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA=
-github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg=
+github.com/spf13/pflag v1.0.6 h1:jFzHGLGAlb3ruxLB8MhbI6A8+AQX/2eW4qeyNZXNp2o=
+github.com/spf13/pflag v1.0.6/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
-github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw=
-github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo=
github.com/stretchr/objx v0.5.2 h1:xuMeJ0Sdp5ZMRXx/aWO6RZxdr3beISkG5/G/aIRr3pY=
github.com/stretchr/objx v0.5.2/go.mod h1:FRsXN1f5AsAjCGJKqEizvkpNtU+EGNCLh3NxZ/8L+MA=
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
-github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
-github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU=
-github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA=
github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
github.com/x448/float16 v0.8.4 h1:qLwI1I70+NjRFUR3zs1JPUCgaCXSh3SW62uAKT1mSBM=
@@ -158,26 +147,26 @@ golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU=
-golang.org/x/net v0.38.0 h1:vRMAPTMaeGqVhG5QyLJHqNDwecKTomGeqbnfZyKlBI8=
-golang.org/x/net v0.38.0/go.mod h1:ivrbrMbzFq5J41QOQh0siUuly180yBYtLp+CKbEaFx8=
+golang.org/x/net v0.39.0 h1:ZCu7HMWDxpXpaiKdhzIfaltL9Lp31x/3fCP11bc6/fY=
+golang.org/x/net v0.39.0/go.mod h1:X7NRbYVEA+ewNkCNyJ513WmMdQ3BineSwVtN2zD/d+E=
golang.org/x/oauth2 v0.27.0 h1:da9Vo7/tDv5RH/7nZDz1eMGS/q1Vv1N/7FCrBhI9I3M=
golang.org/x/oauth2 v0.27.0/go.mod h1:onh5ek6nERTohokkhCD/y2cV4Do3fxFHFuAejCkRWT8=
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
-golang.org/x/sync v0.12.0 h1:MHc5BpPuC30uJk597Ri8TV3CNZcTLu6B6z4lJy+g6Jw=
-golang.org/x/sync v0.12.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA=
+golang.org/x/sync v0.13.0 h1:AauUjRAJ9OSnvULf/ARrrVywoJDy0YS2AwQ98I37610=
+golang.org/x/sync v0.13.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.32.0 h1:s77OFDvIQeibCmezSnk/q6iAfkdiQaJi4VzroCFrN20=
golang.org/x/sys v0.32.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k=
-golang.org/x/term v0.30.0 h1:PQ39fJZ+mfadBm0y5WlL4vlM7Sx1Hgf13sMIY2+QS9Y=
-golang.org/x/term v0.30.0/go.mod h1:NYYFdzHoI5wRh/h5tDMdMqCqPJZEuNqVR5xJLd/n67g=
+golang.org/x/term v0.31.0 h1:erwDkOK1Msy6offm1mOgvspSkslFnIGsFnxOKoufg3o=
+golang.org/x/term v0.31.0/go.mod h1:R4BeIy7D95HzImkxGkTW1UQTtP54tio2RyHz7PwK0aw=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
-golang.org/x/text v0.23.0 h1:D71I7dUrlY+VX0gQShAThNGHFxZ13dGLBHQLVl1mJlY=
-golang.org/x/text v0.23.0/go.mod h1:/BLNzu4aZCJ1+kcD0DNRotWKage4q2rGVAg4o22unh4=
+golang.org/x/text v0.24.0 h1:dd5Bzh4yt5KYA8f9CJHCP4FB4D51c2c6JvN37xJJkJ0=
+golang.org/x/text v0.24.0/go.mod h1:L8rBsPeo2pSS+xqN0d5u2ikmjtmoJbDBT1b7nHvFCdU=
golang.org/x/time v0.9.0 h1:EsRrnYcQiGH+5FfbgvV4AP7qEZstoyrHB0DzarOQ4ZY=
golang.org/x/time v0.9.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
@@ -192,8 +181,8 @@ golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8T
golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
gomodules.xyz/jsonpatch/v2 v2.4.0 h1:Ci3iUJyx9UeRx7CeFN8ARgGbkESwJK+KB9lLcWxY/Zw=
gomodules.xyz/jsonpatch/v2 v2.4.0/go.mod h1:AH3dM2RI6uoBZxn3LVrfvJ3E0/9dG4cSrbuBJT4moAY=
-google.golang.org/protobuf v1.36.5 h1:tPhr+woSbjfYvY6/GPufUoYizxw1cF/yFoxJ2fmpwlM=
-google.golang.org/protobuf v1.36.5/go.mod h1:9fA7Ob0pmnwhb644+1+CVWFRbNajQ6iRojtC/QF5bRE=
+google.golang.org/protobuf v1.36.6 h1:z1NpPI8ku2WgiWnf+t9wTPsn6eP1L7ksHUlkfLvd9xY=
+google.golang.org/protobuf v1.36.6/go.mod h1:jduwjTPXsFjZGTmRluh+L6NjiWu7pchiJ2/5YcXBHnY=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk=
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q=
@@ -203,7 +192,6 @@ gopkg.in/inf.v0 v0.9.1 h1:73M5CoZyi3ZLMOyDlQh031Cx6N9NDJ2Vvfl76EDAgDc=
gopkg.in/inf.v0 v0.9.1/go.mod h1:cWUDdTG/fYaXco+Dcufb5Vnc6Gp2YChqWtbxRZE0mXw=
gopkg.in/natefinch/lumberjack.v2 v2.2.1 h1:bBRl1b0OH9s/DuPhuXpNl+VtCaJXFZ5/uEFST95x9zc=
gopkg.in/natefinch/lumberjack.v2 v2.2.1/go.mod h1:YD8tP3GAjkrDg1eZH7EGmyESg/lsYskCTPBJVb9jqSc=
-gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
k8s.io/api v0.33.1 h1:tA6Cf3bHnLIrUK4IqEgb2v++/GYUtqiu9sRVk3iBXyw=
@@ -230,6 +218,8 @@ k8s.io/utils v0.0.0-20250502105355-0f33e8f1c979 h1:jgJW5IePPXLGB8e/1wvd0Ich9QE97
k8s.io/utils v0.0.0-20250502105355-0f33e8f1c979/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0=
sigs.k8s.io/controller-runtime v0.21.0 h1:CYfjpEuicjUecRk+KAeyYh+ouUBn4llGyDYytIGcJS8=
sigs.k8s.io/controller-runtime v0.21.0/go.mod h1:OSg14+F65eWqIu4DceX7k/+QRAbTTvxeQSNSOQpukWM=
+sigs.k8s.io/gateway-api v1.3.0 h1:q6okN+/UKDATola4JY7zXzx40WO4VISk7i9DIfOvr9M=
+sigs.k8s.io/gateway-api v1.3.0/go.mod h1:d8NV8nJbaRbEKem+5IuxkL8gJGOZ+FJ+NvOIltV8gDk=
sigs.k8s.io/json v0.0.0-20241010143419-9aa6b5e7a4b3 h1:/Rv+M11QRah1itp8VhT6HoVx1Ray9eB4DBr+K+/sCJ8=
sigs.k8s.io/json v0.0.0-20241010143419-9aa6b5e7a4b3/go.mod h1:18nIHnGi6636UCz6m8i4DhaJ65T6EruyzmoQqI2BVDo=
sigs.k8s.io/randfill v0.0.0-20250304075658-069ef1bbf016/go.mod h1:XeLlZ/jmk4i1HRopwe7/aU3H5n1zNUcX6TM94b3QxOY=
diff --git a/ray-operator/main.go b/ray-operator/main.go
index 5666a438733..ceba7d4772e 100644
--- a/ray-operator/main.go
+++ b/ray-operator/main.go
@@ -27,6 +27,7 @@ import (
k8szap "sigs.k8s.io/controller-runtime/pkg/log/zap"
ctrlmetrics "sigs.k8s.io/controller-runtime/pkg/metrics"
metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server"
+ gwv1 "sigs.k8s.io/gateway-api/apis/v1"
configapi "github.com/ray-project/kuberay/ray-operator/apis/config/v1alpha1"
rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1"
@@ -191,6 +192,10 @@ func main() {
}
features.LogFeatureGates(setupLog)
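+	// Gateway API types are only used by the incremental upgrade flow, so register
+	// them with the scheme only when the feature gate is enabled.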
+ if features.Enabled(features.RayServiceIncrementalUpgrade) {
+ utilruntime.Must(gwv1.AddToScheme(scheme))
+ }
+
// Manager options
options := ctrl.Options{
Cache: cache.Options{
diff --git a/ray-operator/pkg/client/applyconfiguration/ray/v1/clusterupgradeoptions.go b/ray-operator/pkg/client/applyconfiguration/ray/v1/clusterupgradeoptions.go
new file mode 100644
index 00000000000..1e43d339716
--- /dev/null
+++ b/ray-operator/pkg/client/applyconfiguration/ray/v1/clusterupgradeoptions.go
@@ -0,0 +1,50 @@
+// Code generated by applyconfiguration-gen. DO NOT EDIT.
+
+package v1
+
+// ClusterUpgradeOptionsApplyConfiguration represents a declarative configuration of the ClusterUpgradeOptions type for use
+// with apply.
+type ClusterUpgradeOptionsApplyConfiguration struct {
+ MaxSurgePercent *int32 `json:"maxSurgePercent,omitempty"`
+ StepSizePercent *int32 `json:"stepSizePercent,omitempty"`
+ IntervalSeconds *int32 `json:"intervalSeconds,omitempty"`
+ GatewayClassName *string `json:"gatewayClassName,omitempty"`
+}
+
+// ClusterUpgradeOptionsApplyConfiguration constructs a declarative configuration of the ClusterUpgradeOptions type for use with
+// apply.
+func ClusterUpgradeOptions() *ClusterUpgradeOptionsApplyConfiguration {
+ return &ClusterUpgradeOptionsApplyConfiguration{}
+}
+
+// WithMaxSurgePercent sets the MaxSurgePercent field in the declarative configuration to the given value
+// and returns the receiver, so that objects can be built by chaining "With" function invocations.
+// If called multiple times, the MaxSurgePercent field is set to the value of the last call.
+func (b *ClusterUpgradeOptionsApplyConfiguration) WithMaxSurgePercent(value int32) *ClusterUpgradeOptionsApplyConfiguration {
+ b.MaxSurgePercent = &value
+ return b
+}
+
+// WithStepSizePercent sets the StepSizePercent field in the declarative configuration to the given value
+// and returns the receiver, so that objects can be built by chaining "With" function invocations.
+// If called multiple times, the StepSizePercent field is set to the value of the last call.
+func (b *ClusterUpgradeOptionsApplyConfiguration) WithStepSizePercent(value int32) *ClusterUpgradeOptionsApplyConfiguration {
+ b.StepSizePercent = &value
+ return b
+}
+
+// WithIntervalSeconds sets the IntervalSeconds field in the declarative configuration to the given value
+// and returns the receiver, so that objects can be built by chaining "With" function invocations.
+// If called multiple times, the IntervalSeconds field is set to the value of the last call.
+func (b *ClusterUpgradeOptionsApplyConfiguration) WithIntervalSeconds(value int32) *ClusterUpgradeOptionsApplyConfiguration {
+ b.IntervalSeconds = &value
+ return b
+}
+
+// WithGatewayClassName sets the GatewayClassName field in the declarative configuration to the given value
+// and returns the receiver, so that objects can be built by chaining "With" function invocations.
+// If called multiple times, the GatewayClassName field is set to the value of the last call.
+func (b *ClusterUpgradeOptionsApplyConfiguration) WithGatewayClassName(value string) *ClusterUpgradeOptionsApplyConfiguration {
+ b.GatewayClassName = &value
+ return b
+}
diff --git a/ray-operator/pkg/client/applyconfiguration/ray/v1/rayservicestatus.go b/ray-operator/pkg/client/applyconfiguration/ray/v1/rayservicestatus.go
index b0fcd8032bb..2d7f2984cef 100644
--- a/ray-operator/pkg/client/applyconfiguration/ray/v1/rayservicestatus.go
+++ b/ray-operator/pkg/client/applyconfiguration/ray/v1/rayservicestatus.go
@@ -2,12 +2,19 @@
package v1
+import (
+ metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+)
+
// RayServiceStatusApplyConfiguration represents a declarative configuration of the RayServiceStatus type for use
// with apply.
type RayServiceStatusApplyConfiguration struct {
- Applications map[string]AppStatusApplyConfiguration `json:"applicationStatuses,omitempty"`
- RayClusterName *string `json:"rayClusterName,omitempty"`
- RayClusterStatus *RayClusterStatusApplyConfiguration `json:"rayClusterStatus,omitempty"`
+ Applications map[string]AppStatusApplyConfiguration `json:"applicationStatuses,omitempty"`
+ TargetCapacity *int32 `json:"targetCapacity,omitempty"`
+ TrafficRoutedPercent *int32 `json:"trafficRoutedPercent,omitempty"`
+ LastTrafficMigratedTime *metav1.Time `json:"lastTrafficMigratedTime,omitempty"`
+ RayClusterName *string `json:"rayClusterName,omitempty"`
+ RayClusterStatus *RayClusterStatusApplyConfiguration `json:"rayClusterStatus,omitempty"`
}
// RayServiceStatusApplyConfiguration constructs a declarative configuration of the RayServiceStatus type for use with
@@ -30,6 +37,30 @@ func (b *RayServiceStatusApplyConfiguration) WithApplications(entries map[string
return b
}
+// WithTargetCapacity sets the TargetCapacity field in the declarative configuration to the given value
+// and returns the receiver, so that objects can be built by chaining "With" function invocations.
+// If called multiple times, the TargetCapacity field is set to the value of the last call.
+func (b *RayServiceStatusApplyConfiguration) WithTargetCapacity(value int32) *RayServiceStatusApplyConfiguration {
+ b.TargetCapacity = &value
+ return b
+}
+
+// WithTrafficRoutedPercent sets the TrafficRoutedPercent field in the declarative configuration to the given value
+// and returns the receiver, so that objects can be built by chaining "With" function invocations.
+// If called multiple times, the TrafficRoutedPercent field is set to the value of the last call.
+func (b *RayServiceStatusApplyConfiguration) WithTrafficRoutedPercent(value int32) *RayServiceStatusApplyConfiguration {
+ b.TrafficRoutedPercent = &value
+ return b
+}
+
+// WithLastTrafficMigratedTime sets the LastTrafficMigratedTime field in the declarative configuration to the given value
+// and returns the receiver, so that objects can be built by chaining "With" function invocations.
+// If called multiple times, the LastTrafficMigratedTime field is set to the value of the last call.
+func (b *RayServiceStatusApplyConfiguration) WithLastTrafficMigratedTime(value metav1.Time) *RayServiceStatusApplyConfiguration {
+ b.LastTrafficMigratedTime = &value
+ return b
+}
+
// WithRayClusterName sets the RayClusterName field in the declarative configuration to the given value
// and returns the receiver, so that objects can be built by chaining "With" function invocations.
// If called multiple times, the RayClusterName field is set to the value of the last call.
diff --git a/ray-operator/pkg/client/applyconfiguration/ray/v1/rayserviceupgradestrategy.go b/ray-operator/pkg/client/applyconfiguration/ray/v1/rayserviceupgradestrategy.go
index 361a98f6ac9..c8cfc02aed6 100644
--- a/ray-operator/pkg/client/applyconfiguration/ray/v1/rayserviceupgradestrategy.go
+++ b/ray-operator/pkg/client/applyconfiguration/ray/v1/rayserviceupgradestrategy.go
@@ -9,7 +9,8 @@ import (
// RayServiceUpgradeStrategyApplyConfiguration represents a declarative configuration of the RayServiceUpgradeStrategy type for use
// with apply.
type RayServiceUpgradeStrategyApplyConfiguration struct {
- Type *rayv1.RayServiceUpgradeType `json:"type,omitempty"`
+ Type *rayv1.RayServiceUpgradeType `json:"type,omitempty"`
+ ClusterUpgradeOptions *ClusterUpgradeOptionsApplyConfiguration `json:"clusterUpgradeOptions,omitempty"`
}
// RayServiceUpgradeStrategyApplyConfiguration constructs a declarative configuration of the RayServiceUpgradeStrategy type for use with
@@ -25,3 +26,11 @@ func (b *RayServiceUpgradeStrategyApplyConfiguration) WithType(value rayv1.RaySe
b.Type = &value
return b
}
+
+// WithClusterUpgradeOptions sets the ClusterUpgradeOptions field in the declarative configuration to the given value
+// and returns the receiver, so that objects can be built by chaining "With" function invocations.
+// If called multiple times, the ClusterUpgradeOptions field is set to the value of the last call.
+func (b *RayServiceUpgradeStrategyApplyConfiguration) WithClusterUpgradeOptions(value *ClusterUpgradeOptionsApplyConfiguration) *RayServiceUpgradeStrategyApplyConfiguration {
+ b.ClusterUpgradeOptions = value
+ return b
+}
diff --git a/ray-operator/pkg/client/applyconfiguration/utils.go b/ray-operator/pkg/client/applyconfiguration/utils.go
index 050733b0c5e..feecbde7f06 100644
--- a/ray-operator/pkg/client/applyconfiguration/utils.go
+++ b/ray-operator/pkg/client/applyconfiguration/utils.go
@@ -20,6 +20,8 @@ func ForKind(kind schema.GroupVersionKind) interface{} {
return &rayv1.AppStatusApplyConfiguration{}
case v1.SchemeGroupVersion.WithKind("AutoscalerOptions"):
return &rayv1.AutoscalerOptionsApplyConfiguration{}
+ case v1.SchemeGroupVersion.WithKind("ClusterUpgradeOptions"):
+ return &rayv1.ClusterUpgradeOptionsApplyConfiguration{}
case v1.SchemeGroupVersion.WithKind("DeletionCondition"):
return &rayv1.DeletionConditionApplyConfiguration{}
case v1.SchemeGroupVersion.WithKind("DeletionPolicy"):
diff --git a/ray-operator/pkg/features/features.go b/ray-operator/pkg/features/features.go
index 5aedc155c81..16b23ab83ac 100644
--- a/ray-operator/pkg/features/features.go
+++ b/ray-operator/pkg/features/features.go
@@ -30,6 +30,13 @@ const (
// alpha: v1.0
// Enables multi-host worker indexing
RayMultiHostIndexing featuregate.Feature = "RayMultiHostIndexing"
+
+ // owner: @ryanaoleary
+ // rep: N/A
+ // alpha: v1.0
+ //
+	// Enables the NewClusterWithIncrementalUpgrade type for RayService zero-downtime upgrades.
+ RayServiceIncrementalUpgrade featuregate.Feature = "RayServiceIncrementalUpgrade"
)
func init() {
@@ -37,9 +44,10 @@ func init() {
}
var defaultFeatureGates = map[featuregate.Feature]featuregate.FeatureSpec{
- RayClusterStatusConditions: {Default: true, PreRelease: featuregate.Beta},
- RayJobDeletionPolicy: {Default: false, PreRelease: featuregate.Alpha},
- RayMultiHostIndexing: {Default: false, PreRelease: featuregate.Alpha},
+ RayClusterStatusConditions: {Default: true, PreRelease: featuregate.Beta},
+ RayJobDeletionPolicy: {Default: false, PreRelease: featuregate.Alpha},
+ RayMultiHostIndexing: {Default: false, PreRelease: featuregate.Alpha},
+ RayServiceIncrementalUpgrade: {Default: false, PreRelease: featuregate.Alpha},
}
// SetFeatureGateDuringTest is a helper method to override feature gates in tests.
diff --git a/ray-operator/test/e2eincrementalupgrade/rayservice_incremental_upgrade_test.go b/ray-operator/test/e2eincrementalupgrade/rayservice_incremental_upgrade_test.go
new file mode 100644
index 00000000000..e9290bedda3
--- /dev/null
+++ b/ray-operator/test/e2eincrementalupgrade/rayservice_incremental_upgrade_test.go
@@ -0,0 +1,218 @@
+package e2eincrementalupgrade
+
+import (
+ "fmt"
+ "strings"
+	"testing"
+	"time"
+
+ . "github.com/onsi/gomega"
+ corev1 "k8s.io/api/core/v1"
+ "k8s.io/apimachinery/pkg/api/resource"
+ metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+ "k8s.io/utils/ptr"
+
+ rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1"
+ "github.com/ray-project/kuberay/ray-operator/controllers/ray/utils"
+ rayv1ac "github.com/ray-project/kuberay/ray-operator/pkg/client/applyconfiguration/ray/v1"
+ "github.com/ray-project/kuberay/ray-operator/pkg/features"
+ . "github.com/ray-project/kuberay/ray-operator/test/support"
+)
+
+// GetHeadServiceExternalIP returns the external IP of a RayCluster's head service, used to poll the RayService.
+func GetHeadServiceExternalIP(t *testing.T, clusterName, namespace string) (string, error) {
+ test := With(t)
+
+ svc, err := test.Client().Core().CoreV1().Services(namespace).Get(test.Ctx(), clusterName+"-head-svc", metav1.GetOptions{})
+ if err != nil {
+ return "", err
+ }
+ if len(svc.Status.LoadBalancer.Ingress) == 0 {
+ return "", fmt.Errorf("no ingress for service %s", svc.Name)
+ }
+ return svc.Status.LoadBalancer.Ingress[0].IP, nil
+}
+
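+// TestRayServiceIncrementalUpgrade exercises NewClusterWithIncrementalUpgrade end to end: it
+// creates a RayService behind a Gateway, triggers an upgrade, and verifies that traffic and
+// capacity shift to the new cluster stepwise without dropping requests.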
+func TestRayServiceIncrementalUpgrade(t *testing.T) {
+ features.SetFeatureGateDuringTest(t, features.RayServiceIncrementalUpgrade, true)
+
+ test := With(t)
+ g := NewWithT(t)
+
+ namespace := test.NewTestNamespace()
+ rayServiceName := "incremental-rayservice"
+
+ // Create a RayService with IncrementalUpgrade enabled
+ stepSize := ptr.To(int32(25))
+ interval := ptr.To(int32(5))
+ maxSurge := ptr.To(int32(50))
+
+ rayServiceAC := rayv1ac.RayService(rayServiceName, namespace.Name).
+ WithSpec(IncrementalUpgradeRayServiceApplyConfiguration(stepSize, interval, maxSurge))
+ rayService, err := test.Client().Ray().RayV1().RayServices(namespace.Name).Apply(test.Ctx(), rayServiceAC, TestApplyOptions)
+ g.Expect(err).NotTo(HaveOccurred())
+ g.Expect(rayService).NotTo(BeNil())
+
+ LogWithTimestamp(test.T(), "Waiting for RayService %s/%s to be ready", rayService.Namespace, rayService.Name)
+ g.Eventually(RayService(test, rayService.Namespace, rayService.Name), TestTimeoutMedium).
+ Should(WithTransform(IsRayServiceReady, BeTrue()))
+
+ rayService, err = GetRayService(test, namespace.Name, rayServiceName)
+ g.Expect(err).NotTo(HaveOccurred())
+
+ // Validate Gateway and HTTPRoute objects have been created for incremental upgrade.
+ gatewayName := fmt.Sprintf("%s-%s", rayServiceName, "gateway")
+ LogWithTimestamp(test.T(), "Waiting for Gateway %s/%s to be ready", rayService.Namespace, gatewayName)
+ g.Eventually(Gateway(test, rayService.Namespace, gatewayName), TestTimeoutMedium).
+ Should(WithTransform(utils.IsGatewayReady, BeTrue()))
+
+ // Get the Gateway endpoint to send requests to
+	gateway, err := GetGateway(test, namespace.Name, gatewayName)
+ g.Expect(err).NotTo(HaveOccurred())
+ g.Expect(gateway).NotTo(BeNil())
+
+ httpRouteName := fmt.Sprintf("%s-%s", rayServiceName, "httproute")
+ LogWithTimestamp(test.T(), "Waiting for HTTPRoute %s/%s to be ready", rayService.Namespace, httpRouteName)
+ g.Eventually(HTTPRoute(test, rayService.Namespace, httpRouteName), TestTimeoutMedium).
+ Should(Not(BeNil()))
+
+ httpRoute, err := GetHTTPRoute(test, namespace.Name, httpRouteName)
+ g.Expect(err).NotTo(HaveOccurred())
+ g.Expect(utils.IsHTTPRouteReady(gateway, httpRoute)).To(BeTrue())
+
+ // Create curl pod to test traffic routing through Gateway to RayService
+ curlPodName := "curl-pod"
+ curlContainerName := "curl-container"
+ curlPod, err := CreateCurlPod(g, test, curlPodName, curlContainerName, namespace.Name)
+ g.Expect(err).NotTo(HaveOccurred())
+
+ LogWithTimestamp(test.T(), "Waiting for Curl Pod %s to be ready", curlPodName)
+ g.Eventually(func(g Gomega) *corev1.Pod {
+ updatedPod, err := test.Client().Core().CoreV1().Pods(curlPod.Namespace).Get(test.Ctx(), curlPod.Name, metav1.GetOptions{})
+ g.Expect(err).NotTo(HaveOccurred())
+ return updatedPod
+ }, TestTimeoutShort).Should(WithTransform(IsPodRunningAndReady, BeTrue()))
+
+ gatewayIP := GetGatewayIP(gateway)
+ g.Expect(gatewayIP).NotTo(BeEmpty())
+
+ LogWithTimestamp(test.T(), "Verifying RayService is serving traffic")
+ stdout, _ := CurlRayServiceGateway(test, gatewayIP, curlPod, curlContainerName, "/fruit", `["MANGO", 2]`)
+ g.Expect(stdout.String()).To(Equal("6"))
+ stdout, _ = CurlRayServiceGateway(test, gatewayIP, curlPod, curlContainerName, "/calc", `["MUL", 3]`)
+ g.Expect(stdout.String()).To(Equal("15 pizzas please!"))
+
+ // Attempt to trigger NewClusterWithIncrementalUpgrade by updating RayService serve config and RayCluster spec
+ g.Eventually(func() error {
+ latestRayService, err := GetRayService(test, namespace.Name, rayServiceName)
+ if err != nil {
+ return err
+ }
+ latestRayService.Spec.RayClusterSpec.WorkerGroupSpecs[0].Template.Spec.Containers[0].Resources.Requests[corev1.ResourceCPU] = resource.MustParse("500m")
+ serveConfig := latestRayService.Spec.ServeConfigV2
+ serveConfig = strings.Replace(serveConfig, "price: 3", "price: 4", -1)
+ serveConfig = strings.Replace(serveConfig, "factor: 5", "factor: 3", -1)
+ latestRayService.Spec.ServeConfigV2 = serveConfig
+
+ _, err = test.Client().Ray().RayV1().RayServices(namespace.Name).Update(
+ test.Ctx(),
+ latestRayService,
+ metav1.UpdateOptions{},
+ )
+ return err
+ }, TestTimeoutShort).Should(Succeed(), "Failed to update RayService to trigger upgrade")
+
+ LogWithTimestamp(test.T(), "Waiting for RayService %s/%s UpgradeInProgress condition to be true", rayService.Namespace, rayService.Name)
+ g.Eventually(RayService(test, rayService.Namespace, rayService.Name), TestTimeoutShort).Should(WithTransform(IsRayServiceUpgrading, BeTrue()))
+
+ LogWithTimestamp(test.T(), "Verifying temporary service creation and HTTPRoute backends")
+ upgradingRaySvc, err := GetRayService(test, namespace.Name, rayServiceName)
+ g.Expect(err).NotTo(HaveOccurred())
+ activeClusterName := upgradingRaySvc.Status.ActiveServiceStatus.RayClusterName
+ g.Expect(activeClusterName).NotTo(BeEmpty(), "The active cluster should be set when a RayService is ready.")
+ pendingClusterName := upgradingRaySvc.Status.PendingServiceStatus.RayClusterName
+ g.Expect(pendingClusterName).NotTo(BeEmpty(), "The controller should have created a pending cluster.")
+
+ // Validate serve service for the active cluster exists.
+ activeServeSvcName := utils.GenerateServeServiceName(activeClusterName)
+ _, err = test.Client().Core().CoreV1().Services(namespace.Name).Get(test.Ctx(), activeServeSvcName, metav1.GetOptions{})
+ g.Expect(err).NotTo(HaveOccurred(), "The serve service for the active cluster should be created.")
+
+ // Validate serve service for the pending cluster has been created for the upgrade.
+ pendingServeSvcName := utils.GenerateServeServiceName(pendingClusterName)
+ g.Eventually(func(g Gomega) {
+ _, err = test.Client().Core().CoreV1().Services(namespace.Name).Get(test.Ctx(), pendingServeSvcName, metav1.GetOptions{})
+ g.Expect(err).NotTo(HaveOccurred(), "The serve service for the pending cluster should be created.")
+ }, TestTimeoutShort).Should(Succeed())
+
+ LogWithTimestamp(test.T(), "Waiting for pending RayCluster %s to have a ready head pod", pendingClusterName)
+ g.Eventually(RayCluster(test, namespace.Name, pendingClusterName), TestTimeoutMedium).
+ Should(WithTransform(StatusCondition(rayv1.HeadPodReady), MatchCondition(metav1.ConditionTrue, rayv1.HeadPodRunningAndReady)))
+
+ // Wait for the HTTPRoute to reflect the two backends.
+ LogWithTimestamp(test.T(), "Waiting for HTTPRoute to have two backends")
+ g.Eventually(func(g Gomega) {
+ route, err := GetHTTPRoute(test, namespace.Name, httpRouteName)
+ g.Expect(err).NotTo(HaveOccurred())
+ g.Expect(route.Spec.Rules).To(HaveLen(1))
+ g.Expect(route.Spec.Rules[0].BackendRefs).To(HaveLen(2))
+ g.Expect(string(route.Spec.Rules[0].BackendRefs[1].Name)).To(Equal(pendingServeSvcName))
+ }, TestTimeoutShort).Should(Succeed())
+
+ LogWithTimestamp(test.T(), "Validating stepwise traffic and capacity migration")
+ intervalSeconds := *interval
+ var lastMigratedTime *metav1.Time
+ oldVersionServed := false
+ newVersionServed := false
+
+	// Validate expected behavior during an IncrementalUpgrade. The following checks ensure
+	// that no requests are dropped throughout the upgrade process.
+ upgradeSteps := generateUpgradeSteps(*stepSize, *maxSurge)
+ for _, step := range upgradeSteps {
+ LogWithTimestamp(test.T(), "%s", step.name)
+ g.Eventually(func(g Gomega) int32 {
+ // Fetch updated RayService.
+ svc, err := GetRayService(test, namespace.Name, rayServiceName)
+ g.Expect(err).NotTo(HaveOccurred())
+ return step.getValue(svc)
+ }, TestTimeoutShort).Should(Equal(step.expectedValue))
+
+ // Send a request to the RayService to validate no requests are dropped. Check that
+ // both endpoints are serving requests.
+ stdout, _ := CurlRayServiceGateway(test, gatewayIP, curlPod, curlContainerName, "/fruit", `["MANGO", 2]`)
+ response := stdout.String()
+ g.Expect(response).To(Or(Equal("6"), Equal("8")), "Response should be from the old or new app version during the upgrade")
+ if response == "6" {
+ oldVersionServed = true
+ }
+ if response == "8" {
+ newVersionServed = true
+ }
+
+ if strings.Contains(step.name, "pending traffic to shift") {
+ svc, err := GetRayService(test, namespace.Name, rayServiceName)
+ g.Expect(err).NotTo(HaveOccurred())
+
+ currentMigratedTime := svc.Status.PendingServiceStatus.LastTrafficMigratedTime
+ g.Expect(currentMigratedTime).NotTo(BeNil())
+
+			// Verify at least IntervalSeconds have elapsed since the last TrafficRoutedPercent update.
+			if lastMigratedTime != nil {
+				duration := currentMigratedTime.Sub(lastMigratedTime.Time)
+				g.Expect(duration).To(BeNumerically(">=", time.Duration(intervalSeconds)*time.Second),
+					"Time between traffic steps should be >= IntervalSeconds")
+ }
+ lastMigratedTime = currentMigratedTime
+ }
+ }
+ LogWithTimestamp(test.T(), "Verifying both old and new versions served traffic during the upgrade")
+ g.Expect(oldVersionServed).To(BeTrue(), "The old version of the service should have served traffic during the upgrade.")
+ g.Expect(newVersionServed).To(BeTrue(), "The new version of the service should have served traffic during the upgrade.")
+
+ // Check that RayService completed upgrade
+ LogWithTimestamp(test.T(), "Waiting for RayService %s/%s UpgradeInProgress condition to be false", rayService.Namespace, rayService.Name)
+ g.Eventually(RayService(test, rayService.Namespace, rayService.Name), TestTimeoutShort).Should(WithTransform(IsRayServiceUpgrading, BeFalse()))
+
+ LogWithTimestamp(test.T(), "Verifying RayService uses updated ServeConfig after upgrade completes")
+ stdout, _ = CurlRayServiceGateway(test, gatewayIP, curlPod, curlContainerName, "/fruit", `["MANGO", 2]`)
+ g.Expect(stdout.String()).To(Equal("8"))
+}
diff --git a/ray-operator/test/e2eincrementalupgrade/support.go b/ray-operator/test/e2eincrementalupgrade/support.go
new file mode 100644
index 00000000000..b5e6293f491
--- /dev/null
+++ b/ray-operator/test/e2eincrementalupgrade/support.go
@@ -0,0 +1,247 @@
+package e2eincrementalupgrade
+
+import (
+ "bytes"
+ "fmt"
+
+ corev1 "k8s.io/api/core/v1"
+ "k8s.io/apimachinery/pkg/api/resource"
+ metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+ corev1ac "k8s.io/client-go/applyconfigurations/core/v1"
+ "k8s.io/utils/ptr"
+ gwv1 "sigs.k8s.io/gateway-api/apis/v1"
+
+ rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1"
+ "github.com/ray-project/kuberay/ray-operator/controllers/ray/utils"
+ rayv1ac "github.com/ray-project/kuberay/ray-operator/pkg/client/applyconfiguration/ray/v1"
+ . "github.com/ray-project/kuberay/ray-operator/test/support"
+)
+
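+// CurlRayServiceGateway sends a POST request with the given body to the RayService path
+// through the Gateway IP, executed from inside the curl pod.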
+func CurlRayServiceGateway(
+ t Test,
+ gatewayIP string,
+ curlPod *corev1.Pod,
+ curlPodContainerName,
+ rayServicePath,
+ body string,
+) (bytes.Buffer, bytes.Buffer) {
+ cmd := []string{
+ "curl",
+ "--max-time", "10",
+ "-X", "POST",
+		"-H", "Connection: close", // avoid reusing the same connection across requests
+ "-H", "Content-Type: application/json",
+ fmt.Sprintf("http://%s%s", gatewayIP, rayServicePath),
+ "-d", body,
+ }
+
+ return ExecPodCmd(t, curlPod, curlPodContainerName, cmd)
+}
+
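+// IncrementalUpgradeRayServiceApplyConfiguration returns a RayService spec configured for
+// NewClusterWithIncrementalUpgrade with the given step size, interval, and max surge.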
+func IncrementalUpgradeRayServiceApplyConfiguration(
+ stepSizePercent, intervalSeconds, maxSurgePercent *int32,
+) *rayv1ac.RayServiceSpecApplyConfiguration {
+ return rayv1ac.RayServiceSpec().
+ WithUpgradeStrategy(rayv1ac.RayServiceUpgradeStrategy().
+ WithType(rayv1.NewClusterWithIncrementalUpgrade).
+ WithClusterUpgradeOptions(
+ rayv1ac.ClusterUpgradeOptions().
+ WithGatewayClassName("istio").
+ WithStepSizePercent(*stepSizePercent).
+ WithIntervalSeconds(*intervalSeconds).
+ WithMaxSurgePercent(*maxSurgePercent),
+ )).
+ WithServeConfigV2(`applications:
+ - name: fruit_app
+ import_path: fruit.deployment_graph
+ route_prefix: /fruit
+ runtime_env:
+ working_dir: "https://github.com/ray-project/test_dag/archive/78b4a5da38796123d9f9ffff59bab2792a043e95.zip"
+ deployments:
+ - name: MangoStand
+ num_replicas: 1
+ user_config:
+ price: 3
+ ray_actor_options:
+ num_cpus: 0.1
+ - name: OrangeStand
+ num_replicas: 1
+ user_config:
+ price: 2
+ ray_actor_options:
+ num_cpus: 0.1
+ - name: FruitMarket
+ num_replicas: 1
+ ray_actor_options:
+ num_cpus: 0.1
+ - name: math_app
+ import_path: conditional_dag.serve_dag
+ route_prefix: /calc
+ runtime_env:
+ working_dir: "https://github.com/ray-project/test_dag/archive/78b4a5da38796123d9f9ffff59bab2792a043e95.zip"
+ deployments:
+ - name: Adder
+ num_replicas: 1
+ user_config:
+ increment: 3
+ ray_actor_options:
+ num_cpus: 0.1
+ - name: Multiplier
+ num_replicas: 1
+ user_config:
+ factor: 5
+ ray_actor_options:
+ num_cpus: 0.1
+ - name: Router
+ ray_actor_options:
+ num_cpus: 0.1
+ num_replicas: 1`).
+ WithRayClusterSpec(rayv1ac.RayClusterSpec().
+ WithRayVersion(GetRayVersion()).
+ WithEnableInTreeAutoscaling(true).
+ WithHeadGroupSpec(rayv1ac.HeadGroupSpec().
+ WithRayStartParams(map[string]string{"dashboard-host": "0.0.0.0"}).
+ WithTemplate(corev1ac.PodTemplateSpec().
+ WithSpec(corev1ac.PodSpec().
+ WithRestartPolicy(corev1.RestartPolicyNever).
+ WithContainers(corev1ac.Container().
+ WithName("ray-head").
+ WithImage(GetRayImage()).
+ WithEnv(corev1ac.EnvVar().WithName(utils.RAY_ENABLE_AUTOSCALER_V2).WithValue("1")).
+ WithPorts(
+ corev1ac.ContainerPort().WithName(utils.GcsServerPortName).WithContainerPort(utils.DefaultGcsServerPort),
+ corev1ac.ContainerPort().WithName(utils.ServingPortName).WithContainerPort(utils.DefaultServingPort),
+ corev1ac.ContainerPort().WithName(utils.DashboardPortName).WithContainerPort(utils.DefaultDashboardPort),
+ corev1ac.ContainerPort().WithName(utils.ClientPortName).WithContainerPort(utils.DefaultClientPort),
+ ).
+ WithResources(corev1ac.ResourceRequirements().
+ WithRequests(corev1.ResourceList{
+ corev1.ResourceCPU: resource.MustParse("2"),
+ corev1.ResourceMemory: resource.MustParse("3Gi"),
+ }).
+ WithLimits(corev1.ResourceList{
+ corev1.ResourceCPU: resource.MustParse("2"),
+ corev1.ResourceMemory: resource.MustParse("3Gi"),
+ })))))).
+ WithWorkerGroupSpecs(rayv1ac.WorkerGroupSpec().
+ WithReplicas(1).
+ WithMinReplicas(1).
+ WithMaxReplicas(4).
+ WithRayStartParams(map[string]string{"num-cpus": "1"}).
+ WithGroupName("small-group").
+ WithTemplate(corev1ac.PodTemplateSpec().
+ WithSpec(corev1ac.PodSpec().
+ WithRestartPolicy(corev1.RestartPolicyNever).
+ WithContainers(corev1ac.Container().
+ WithName("ray-worker").
+ WithImage(GetRayImage()).
+ WithResources(corev1ac.ResourceRequirements().
+ WithRequests(corev1.ResourceList{
+ corev1.ResourceCPU: resource.MustParse("300m"),
+ corev1.ResourceMemory: resource.MustParse("1G"),
+ }).
+ WithLimits(corev1.ResourceList{
+ corev1.ResourceCPU: resource.MustParse("500m"),
+ corev1.ResourceMemory: resource.MustParse("1G"),
+ })))))),
+ )
+}
+
+// GetGatewayIP returns the first IP address reported in the Gateway's status
+// (addresses with no type set are treated as IPs), or an empty string if none is found.
+func GetGatewayIP(gateway *gwv1.Gateway) string {
+ if gateway == nil {
+ return ""
+ }
+ for _, addr := range gateway.Status.Addresses {
+ if addr.Type == nil || *addr.Type == gwv1.IPAddressType {
+ return addr.Value
+ }
+ }
+
+ return ""
+}
+
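+// GetPendingCapacity returns the pending cluster's TargetCapacity, defaulting to 0 when unset.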
+func GetPendingCapacity(rs *rayv1.RayService) int32 {
+ return ptr.Deref(rs.Status.PendingServiceStatus.TargetCapacity, 0)
+}
+
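+// GetPendingTraffic returns the pending cluster's TrafficRoutedPercent, defaulting to 0 when unset.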
+func GetPendingTraffic(rs *rayv1.RayService) int32 {
+ return ptr.Deref(rs.Status.PendingServiceStatus.TrafficRoutedPercent, 0)
+}
+
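+// GetActiveCapacity returns the active cluster's TargetCapacity, defaulting to 100 when unset.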
+func GetActiveCapacity(rs *rayv1.RayService) int32 {
+ return ptr.Deref(rs.Status.ActiveServiceStatus.TargetCapacity, 100)
+}
+
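+// GetActiveTraffic returns the active cluster's TrafficRoutedPercent, defaulting to 100 when unset.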
+func GetActiveTraffic(rs *rayv1.RayService) int32 {
+ return ptr.Deref(rs.Status.ActiveServiceStatus.TrafficRoutedPercent, 100)
+}
+
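+// GetLastTrafficMigratedTime returns when traffic was last migrated away from the
+// active cluster, or nil if no traffic has been migrated yet.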
+func GetLastTrafficMigratedTime(rs *rayv1.RayService) *metav1.Time {
+ return rs.Status.ActiveServiceStatus.LastTrafficMigratedTime
+}
+
+// testStep defines a validation condition to wait for during the upgrade.
+type testStep struct {
+ getValue func(rs *rayv1.RayService) int32
+ name string
+ expectedValue int32
+}
+
+// generateUpgradeSteps builds the sequence of TargetCapacity and TrafficRoutedPercent
+// updates the controller is expected to apply during an incremental upgrade with the
+// given step size and max surge.
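+//
+// For example, with stepSize=50 and maxSurge=50 the expected sequence is:
+// pending capacity 0->50, traffic split 50/50, active capacity 100->50,
+// pending capacity 50->100, and finally traffic 100/0 before promotion.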
+func generateUpgradeSteps(stepSize, maxSurge int32) []testStep {
+ var steps []testStep
+
+ pendingCapacity := int32(0)
+ pendingTraffic := int32(0)
+ activeCapacity := int32(100)
+ activeTraffic := int32(100)
+
+ for pendingTraffic < 100 {
+ // Scale up the pending cluster's TargetCapacity.
+ if pendingTraffic == pendingCapacity {
+ nextPendingCapacity := min(pendingCapacity+maxSurge, 100)
+ if nextPendingCapacity > pendingCapacity {
+ steps = append(steps, testStep{
+ name: fmt.Sprintf("Waiting for pending capacity to scale up to %d", nextPendingCapacity),
+ getValue: GetPendingCapacity,
+ expectedValue: nextPendingCapacity,
+ })
+ pendingCapacity = nextPendingCapacity
+ }
+ }
+
+ // Shift traffic from the active to the pending cluster in StepSizePercent increments.
+ for pendingTraffic < pendingCapacity {
+ nextPendingTraffic := min(pendingTraffic+stepSize, 100)
+ steps = append(steps, testStep{
+ name: fmt.Sprintf("Waiting for pending traffic to shift to %d", nextPendingTraffic),
+ getValue: GetPendingTraffic,
+ expectedValue: nextPendingTraffic,
+ })
+ pendingTraffic = nextPendingTraffic
+
+ nextActiveTraffic := max(activeTraffic-stepSize, 0)
+ steps = append(steps, testStep{
+ name: fmt.Sprintf("Waiting for active traffic to shift down to %d", nextActiveTraffic),
+ getValue: GetActiveTraffic,
+ expectedValue: nextActiveTraffic,
+ })
+ activeTraffic = nextActiveTraffic
+ }
+
+ // Scale down the active cluster's TargetCapacity. The final scale-down to 0 is
+ // skipped here because it happens when the pending cluster is promoted to active.
+ nextActiveCapacity := max(activeCapacity-maxSurge, 0)
+ if nextActiveCapacity < activeCapacity && nextActiveCapacity > 0 {
+ steps = append(steps, testStep{
+ name: fmt.Sprintf("Waiting for active capacity to scale down to %d", nextActiveCapacity),
+ getValue: GetActiveCapacity,
+ expectedValue: nextActiveCapacity,
+ })
+ activeCapacity = nextActiveCapacity
+ }
+ }
+ return steps
+}
diff --git a/ray-operator/test/support/client.go b/ray-operator/test/support/client.go
index 2e313483966..4925184d46b 100644
--- a/ray-operator/test/support/client.go
+++ b/ray-operator/test/support/client.go
@@ -8,6 +8,7 @@ import (
_ "k8s.io/client-go/plugin/pkg/client/auth"
"k8s.io/client-go/rest"
"k8s.io/client-go/tools/clientcmd"
+ gatewayclient "sigs.k8s.io/gateway-api/pkg/client/clientset/versioned"
rayclient "github.com/ray-project/kuberay/ray-operator/pkg/client/clientset/versioned"
)
@@ -17,6 +18,7 @@ type Client interface {
Ray() rayclient.Interface
Dynamic() dynamic.Interface
Config() rest.Config
+ Gateway() gatewayclient.Interface
}
type testClient struct {
@@ -24,6 +26,7 @@ type testClient struct {
ray rayclient.Interface
dynamic dynamic.Interface
config rest.Config
+ gateway gatewayclient.Interface
}
var _ Client = (*testClient)(nil)
@@ -44,6 +47,10 @@ func (t *testClient) Config() rest.Config {
return t.config
}
+func (t *testClient) Gateway() gatewayclient.Interface {
+ return t.gateway
+}
+
func newTestClient() (Client, error) {
cfg, err := clientcmd.NewNonInteractiveDeferredLoadingClientConfig(
clientcmd.NewDefaultClientConfigLoadingRules(),
@@ -68,10 +75,16 @@ func newTestClient() (Client, error) {
return nil, err
}
+ gatewayClient, err := gatewayclient.NewForConfig(cfg)
+ if err != nil {
+ return nil, err
+ }
+
return &testClient{
core: kubeClient,
ray: rayClient,
dynamic: dynamicClient,
config: *cfg,
+ gateway: gatewayClient,
}, nil
}
diff --git a/ray-operator/test/support/ray.go b/ray-operator/test/support/ray.go
index ffea3c75d87..0b5c525abcf 100644
--- a/ray-operator/test/support/ray.go
+++ b/ray-operator/test/support/ray.go
@@ -9,6 +9,7 @@ import (
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/meta"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+ gwv1 "sigs.k8s.io/gateway-api/apis/v1"
rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1"
"github.com/ray-project/kuberay/ray-operator/controllers/ray/common"
@@ -226,3 +227,23 @@ func GetRayClusterWorkerGroupReplicaSum(cluster *rayv1.RayCluster) int32 {
}
return replicas
}
+
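+// GetHTTPRoute fetches the HTTPRoute with the given namespace and name.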
+func GetHTTPRoute(t Test, namespace, name string) (*gwv1.HTTPRoute, error) {
+ return t.Client().Gateway().GatewayV1().HTTPRoutes(namespace).Get(t.Ctx(), name, metav1.GetOptions{})
+}
+
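+// HTTPRoute returns a function that fetches the HTTPRoute, for use with gomega's Eventually.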
+func HTTPRoute(t Test, namespace, name string) func() (*gwv1.HTTPRoute, error) {
+ return func() (*gwv1.HTTPRoute, error) {
+ return GetHTTPRoute(t, namespace, name)
+ }
+}
+
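+// GetGateway fetches the Gateway with the given namespace and name.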
+func GetGateway(t Test, namespace, name string) (*gwv1.Gateway, error) {
+ return t.Client().Gateway().GatewayV1().Gateways(namespace).Get(t.Ctx(), name, metav1.GetOptions{})
+}
+
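+// Gateway returns a function that fetches the Gateway, for use with gomega's Eventually.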
+func Gateway(t Test, namespace, name string) func() (*gwv1.Gateway, error) {
+ return func() (*gwv1.Gateway, error) {
+ return GetGateway(t, namespace, name)
+ }
+}