Skip to content
Merged
Show file tree
Hide file tree
Changes from 50 commits
Commits
Show all changes
57 commits
Select commit Hold shift + click to select a range
5b9bc1d
Add incremental upgrade API changes to KubeRay
ryanaoleary Mar 4, 2025
6e880ff
Fix some tests and create Gateway for pending cluster
ryanaoleary Jun 4, 2025
d6781f8
Fix merge errors
ryanaoleary Jun 4, 2025
21bc32a
Manually sync rbac for gateway
ryanaoleary Jun 4, 2025
f66b3a3
Fix bugs and e2e test
ryanaoleary Jun 4, 2025
0c2e82c
Add Makefile command
ryanaoleary Jun 4, 2025
f72e6e8
Run 'make sync'
ryanaoleary Jun 4, 2025
c25a3fe
Run 'make generate'
ryanaoleary Jun 4, 2025
5bd9ac1
Fix comments
ryanaoleary Jun 4, 2025
9551928
Run 'make api-docs'
ryanaoleary Jun 4, 2025
14e73c5
Fix tests after merge conflicts
ryanaoleary Sep 16, 2025
424d4a0
Update ray-operator/controllers/ray/rayservice_controller.go
ryanaoleary Sep 23, 2025
9d6070b
Update ray-operator/controllers/ray/rayservice_controller.go
ryanaoleary Sep 23, 2025
5ceae50
Update ray-operator/controllers/ray/rayservice_controller.go
ryanaoleary Sep 23, 2025
dc5018f
Fix error return
ryanaoleary Sep 29, 2025
e7af14b
Add RayServiceIncrementalUpgrade feature gate option to helm
ryanaoleary Sep 29, 2025
bdcd401
Remove unnecessary perms
ryanaoleary Sep 29, 2025
0d145bc
Remove delete perm and run lint
ryanaoleary Sep 29, 2025
ebbd280
Fix helm roles
ryanaoleary Sep 29, 2025
7ac3371
add back required perms
ryanaoleary Sep 30, 2025
fd5a657
Update ray-operator/controllers/ray/utils/validation.go
ryanaoleary Oct 1, 2025
a87fb1e
Update ray-operator/controllers/ray/utils/util.go
ryanaoleary Oct 1, 2025
5e09293
Update ray-operator/controllers/ray/rayservice_controller.go
ryanaoleary Oct 1, 2025
3bc8ab5
Change controller to use two serve services during upgrade
ryanaoleary Oct 1, 2025
df4e4fd
Remove Gateway and HTTPRoute API fields
ryanaoleary Oct 1, 2025
f665353
Fix port errors
ryanaoleary Oct 1, 2025
f5fb7ae
Fix comments and build issues
ryanaoleary Oct 2, 2025
a553b1e
fix helm-chart-verify-rbac
ryanaoleary Oct 2, 2025
7e231db
Refactor tests and create HTTPRoute to be clearer
ryanaoleary Oct 3, 2025
acdcc8a
Use time &now
ryanaoleary Oct 3, 2025
44faa8e
Update ray-operator/controllers/ray/rayservice_controller.go
ryanaoleary Oct 3, 2025
cbb1f25
Add function comments
ryanaoleary Oct 3, 2025
3cd620f
Fix bad merge
ryanaoleary Oct 3, 2025
64661fe
Add more comments
ryanaoleary Oct 3, 2025
33060da
Update ray-operator/controllers/ray/rayservice_controller.go
ryanaoleary Oct 4, 2025
e89c1b4
Add Ray Serve hostname and serve port logic
ryanaoleary Oct 5, 2025
023fd6c
Update ray-operator/controllers/ray/rayservice_controller.go
ryanaoleary Oct 7, 2025
629d0b6
Update ray-operator/controllers/ray/common/service.go
ryanaoleary Oct 7, 2025
65156a6
Update ray-operator/controllers/ray/common/service.go
ryanaoleary Oct 7, 2025
c19068b
Update ray-operator/controllers/ray/rayservice_controller.go
ryanaoleary Oct 7, 2025
5d953a8
Update ray-operator/controllers/ray/rayservice_controller.go
ryanaoleary Oct 7, 2025
3af80d6
Update ray-operator/controllers/ray/rayservice_controller.go
ryanaoleary Oct 7, 2025
c68c4cf
Update ray-operator/controllers/ray/rayservice_controller.go
ryanaoleary Oct 7, 2025
8e1ade4
Fix dropped requests and old cluster config not being served
ryanaoleary Oct 9, 2025
f94b996
Resolve readability comments and improve structure
ryanaoleary Oct 10, 2025
7f20ecb
Refactor based on comments
ryanaoleary Oct 15, 2025
e380bcc
Update ray-operator/controllers/ray/common/service.go
ryanaoleary Oct 18, 2025
4736990
Remove hostname from listener
ryanaoleary Oct 16, 2025
638cbea
ensure pending cluster scales from 0 target_capacity
ryanaoleary Oct 20, 2025
7f88d2f
Run make generate after rebase
ryanaoleary Oct 20, 2025
b61b91f
rename upgrade type
ryanaoleary Oct 20, 2025
71f19a9
Clean up utils and add more comments
ryanaoleary Oct 22, 2025
c23f901
reconcileHTTPRoute should pass created object to calculate status
ryanaoleary Oct 22, 2025
e935156
Merge branch 'master' into incremental-upgrade
ryanaoleary Oct 23, 2025
f04fee1
lint
ryanaoleary Oct 23, 2025
18be954
Update ray-operator/controllers/ray/rayservice_controller.go
ryanaoleary Oct 23, 2025
2073680
Fix test after suggested fix
ryanaoleary Oct 23, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions docs/reference/api.md
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,25 @@ _Appears in:_



#### ClusterUpgradeOptions



These options are currently only supported for the IncrementalUpgrade type.



_Appears in:_
- [RayServiceUpgradeStrategy](#rayserviceupgradestrategy)

| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `maxSurgePercent` _integer_ | The capacity of serve requests the upgraded cluster should scale to handle each interval.<br />Defaults to 100%. | 100 | |
| `stepSizePercent` _integer_ | The percentage of traffic to switch to the upgraded RayCluster at a set interval after scaling by MaxSurgePercent. | | |
| `intervalSeconds` _integer_ | The interval in seconds between transferring StepSize traffic from the old to new RayCluster. | | |
| `gatewayClassName` _string_ | The name of the Gateway Class installed by the Kubernetes Cluster admin. | | |


#### DeletionCondition


Expand Down Expand Up @@ -377,6 +396,7 @@ _Appears in:_
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `type` _[RayServiceUpgradeType](#rayserviceupgradetype)_ | Type represents the strategy used when upgrading the RayService. Currently supports `IncrementalUpgrade`, `NewCluster` and `None`. | | |
| `clusterUpgradeOptions` _[ClusterUpgradeOptions](#clusterupgradeoptions)_ | ClusterUpgradeOptions defines the behavior of an IncrementalUpgrade.<br />RayServiceIncrementalUpgrade feature gate must be enabled to set ClusterUpgradeOptions. | | |


#### RayServiceUpgradeType
Expand Down
11 changes: 6 additions & 5 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ require (
github.com/liggitt/tabwriter v0.0.0-20181228230101-89fcab3d43de // indirect
github.com/mailru/easyjson v0.9.0 // indirect
github.com/mattn/go-colorable v0.1.13 // indirect
github.com/mattn/go-isatty v0.0.19 // indirect
github.com/mattn/go-isatty v0.0.20 // indirect
github.com/mitchellh/go-wordwrap v1.0.1 // indirect
github.com/moby/spdystream v0.5.0 // indirect
github.com/moby/term v0.5.0 // indirect
Expand All @@ -95,12 +95,12 @@ require (
go.uber.org/automaxprocs v1.6.0 // indirect
go.uber.org/multierr v1.11.0 // indirect
go.uber.org/zap v1.27.0 // indirect
golang.org/x/net v0.38.0 // indirect
golang.org/x/net v0.39.0 // indirect
golang.org/x/oauth2 v0.27.0 // indirect
golang.org/x/sync v0.12.0 // indirect
golang.org/x/sync v0.13.0 // indirect
golang.org/x/sys v0.32.0 // indirect
golang.org/x/term v0.30.0 // indirect
golang.org/x/text v0.23.0 // indirect
golang.org/x/term v0.31.0 // indirect
golang.org/x/text v0.24.0 // indirect
golang.org/x/time v0.10.0 // indirect
golang.org/x/tools v0.31.0 // indirect
gomodules.xyz/jsonpatch/v2 v2.4.0 // indirect
Expand All @@ -112,6 +112,7 @@ require (
k8s.io/component-base v0.33.1 // indirect
k8s.io/component-helpers v0.33.1 // indirect
k8s.io/kube-openapi v0.0.0-20250318190949-c8a335a9a2ff // indirect
sigs.k8s.io/gateway-api v1.3.0 // indirect
sigs.k8s.io/json v0.0.0-20241014173422-cfa47c3a1cc8 // indirect
sigs.k8s.io/kustomize/api v0.19.0 // indirect
sigs.k8s.io/kustomize/kyaml v0.19.0 // indirect
Expand Down
21 changes: 12 additions & 9 deletions go.sum

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions helm-chart/kuberay-operator/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,8 @@ spec:
| featureGates[0].enabled | bool | `true` | |
| featureGates[1].name | string | `"RayJobDeletionPolicy"` | |
| featureGates[1].enabled | bool | `false` | |
| featureGates[2].name | string | `"RayServiceIncrementalUpgrade"` | |
| featureGates[2].enabled | bool | `false` | |
| metrics.enabled | bool | `true` | Whether KubeRay operator should emit control plane metrics. |
| metrics.serviceMonitor.enabled | bool | `false` | Enable a prometheus ServiceMonitor |
| metrics.serviceMonitor.interval | string | `"30s"` | Prometheus ServiceMonitor interval |
Expand Down
37 changes: 37 additions & 0 deletions helm-chart/kuberay-operator/crds/ray.io_rayservices.yaml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

11 changes: 11 additions & 0 deletions helm-chart/kuberay-operator/templates/_helpers.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -222,6 +222,17 @@ rules:
- patch
- update
- watch
- apiGroups:
- gateway.networking.k8s.io
resources:
- gateways
- httproutes
verbs:
- create
- get
- list
- update
- watch
- apiGroups:
- networking.k8s.io
resources:
Expand Down
2 changes: 2 additions & 0 deletions helm-chart/kuberay-operator/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,8 @@ featureGates:
enabled: true
- name: RayJobDeletionPolicy
enabled: false
- name: RayServiceIncrementalUpgrade
enabled: false

# Configurations for KubeRay operator metrics.
metrics:
Expand Down
10 changes: 9 additions & 1 deletion ray-operator/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -75,8 +75,16 @@ test-e2e-autoscaler: WHAT ?= ./test/e2eautoscaler
test-e2e-autoscaler: manifests fmt vet ## Run e2e autoscaler tests.
go test -timeout 30m -v $(WHAT)

test-e2e-rayservice: WHAT ?= ./test/e2erayservice
test-e2e-rayservice: manifests fmt vet ## Run e2e RayService tests.
go test -timeout 30m -v $(WHAT)

test-e2e-upgrade: WHAT ?= ./test/e2eupgrade
test-e2e-upgrade: manifests fmt vet ## Run e2e tests.
test-e2e-upgrade: manifests fmt vet ## Run e2e operator upgrade tests.
go test -timeout 30m -v $(WHAT)

test-e2e-incremental-upgrade: WHAT ?= ./test/e2eincrementalupgrade
test-e2e-incremental-upgrade: manifests fmt vet ## Run e2e RayService incremental upgrade tests.
go test -timeout 30m -v $(WHAT)

test-e2e-rayjob-submitter: WHAT ?= ./test/e2erayjobsubmitter
Expand Down
29 changes: 27 additions & 2 deletions ray-operator/apis/ray/v1/rayservice_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,9 @@ const (
type RayServiceUpgradeType string

const (
// During upgrade, IncrementalUpgrade strategy will create an upgraded cluster, gradually scale it up,
// and migrate traffic to it using Gateway API.
IncrementalUpgrade RayServiceUpgradeType = "IncrementalUpgrade"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe too late to change this, but wondering if RollingUpgrade would be a more appropriate name? I assume most people are more familiar with this term. WDYT @ryanaoleary @kevin85421 @MortalHappiness

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(not blocking this PR, we can change it during alpha phase)

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Late to reply to this, but I have no strong preference either way. IncrementalUpgrade is what was used in the feature request and REP so that's why I stuck with it, but if there's a preference from any KubeRay maintainers or users I'm down to go through and change the feature name / all the related variable names.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

cc @rueian for sharing opinion.
I think RollingUpgrade is a more straightforward name for me too

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

cc: @kevin85421 since from offline discussion you seemed to have a preference against using RollingUpgrade here

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@kevin85421 what do you think about ClusterUpgrade and ClusterUpgradeOptions? I prefer to keep the upgrade term generic as the exact behavior could be changed in the future.

@Future-Outlier was also wondering about the history of why we called it "incremental" upgrades.

Copy link
Member

@andrewsykim andrewsykim Oct 20, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If we're still nitpicking on the naming here, I think the new strategy type should be NewClusterWithUpgrade or NewClusterWithIncrementalUpgrade. This makes it more obvious that we are still upgrading a new cluster. IncrementalUpgrade makes it ambiguous if the existing cluster or new cluster is being upgraded

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm ok with NewClusterWithIncrementalUpgrade, since this is a more intuitive name.

also would love to hear from @rueian and @kevin85421

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

+1 for NewClusterWithIncrementalUpgrade.

Copy link
Collaborator Author

@ryanaoleary ryanaoleary Oct 20, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sounds good to me too, I'll make the change to NewClusterWithIncrementalUpgrade. Since the name still includes "incremental upgrade" I think most of the comments / internal helper functions can stay the same and I'll just update the public API.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

// During upgrade, NewCluster strategy will create new upgraded cluster and switch to it when it becomes ready
NewCluster RayServiceUpgradeType = "NewCluster"
// No new cluster will be created while the strategy is set to None
Expand Down Expand Up @@ -57,10 +60,27 @@ var DeploymentStatusEnum = struct {
UNHEALTHY: "UNHEALTHY",
}

// These options are currently only supported for the IncrementalUpgrade type.
type ClusterUpgradeOptions struct {
// The capacity of serve requests the upgraded cluster should scale to handle each interval.
// Defaults to 100%.
// +kubebuilder:default:=100
MaxSurgePercent *int32 `json:"maxSurgePercent,omitempty"`
// The percentage of traffic to switch to the upgraded RayCluster at a set interval after scaling by MaxSurgePercent.
StepSizePercent *int32 `json:"stepSizePercent"`
Copy link
Member

@andrewsykim andrewsykim Oct 2, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What's the reason for stepSizePercent and IntervalSeconds not having defaults?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wanted to require users to have to specify StepSizePercent and IntervalSeconds themselves if they enabled IncrementalUpgrade, since I wasn't sure what a standard default would be and we expect users to configure it themselves based on their workload.

I could set them to some safe values like 25% and 10 seconds respectively if that's preferred just for ease of use.

// The interval in seconds between transferring StepSize traffic from the old to new RayCluster.
IntervalSeconds *int32 `json:"intervalSeconds"`
// The name of the Gateway Class installed by the Kubernetes Cluster admin.
GatewayClassName string `json:"gatewayClassName"`
}

// RayServiceUpgradeStrategy configures how a RayService is upgraded.
type RayServiceUpgradeStrategy struct {
// Type represents the strategy used when upgrading the RayService. Currently supports `IncrementalUpgrade`, `NewCluster` and `None`.
// +optional
Type *RayServiceUpgradeType `json:"type,omitempty"`
// ClusterUpgradeOptions defines the behavior of an IncrementalUpgrade.
// RayServiceIncrementalUpgrade feature gate must be enabled to set ClusterUpgradeOptions.
ClusterUpgradeOptions *ClusterUpgradeOptions `json:"clusterUpgradeOptions,omitempty"`
}

// RayServiceSpec defines the desired state of RayService
Expand Down Expand Up @@ -130,6 +150,12 @@ type RayServiceStatus struct {
// +optional
Applications map[string]AppStatus `json:"applicationStatuses,omitempty"`
// +optional
TargetCapacity *int32 `json:"targetCapacity,omitempty"`
// +optional
TrafficRoutedPercent *int32 `json:"trafficRoutedPercent,omitempty"`
// +optional
LastTrafficMigratedTime *metav1.Time `json:"lastTrafficMigratedTime,omitempty"`
// +optional
RayClusterName string `json:"rayClusterName,omitempty"`
// +optional
RayClusterStatus RayClusterStatus `json:"rayClusterStatus,omitempty"`
Expand Down Expand Up @@ -184,8 +210,7 @@ const (
type RayService struct {
metav1.TypeMeta `json:",inline"`
metav1.ObjectMeta `json:"metadata,omitempty"`

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why this change?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's the result of running make fmt

Spec RayServiceSpec `json:"spec,omitempty"`
Spec RayServiceSpec `json:"spec,omitempty"`
// +optional
Status RayServiceStatuses `json:"status,omitempty"`
}
Expand Down
49 changes: 49 additions & 0 deletions ray-operator/apis/ray/v1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading
Loading