Skip to content
12 changes: 9 additions & 3 deletions ray-operator/controllers/ray/utils/util.go
Original file line number Diff line number Diff line change
Expand Up @@ -393,15 +393,21 @@ func CalculateMinReplicas(cluster *rayv1.RayCluster) int32 {

// CalculateMaxReplicas calculates max worker replicas at the cluster level
func CalculateMaxReplicas(cluster *rayv1.RayCluster) int32 {
count := int32(0)
count := int64(0)
for _, nodeGroup := range cluster.Spec.WorkerGroupSpecs {
if nodeGroup.Suspend != nil && *nodeGroup.Suspend {
continue
}
count += (*nodeGroup.MaxReplicas * nodeGroup.NumOfHosts)
count += int64(*nodeGroup.MaxReplicas) * int64(nodeGroup.NumOfHosts)

// early return if an overflow happens
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we still need this early return if we will do a safe type casting?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think that's a decision to be made. The early return is a non-critical optimization
but not as clean as Option 1. I'm happy to go with either way.

Option 1

func CalculateMaxReplicas(cluster *rayv1.RayCluster) int32 {
	count := int64(0)
	for _, nodeGroup := range cluster.Spec.WorkerGroupSpecs {
		if nodeGroup.Suspend != nil && *nodeGroup.Suspend {
			continue
		}
		count += int64(*nodeGroup.MaxReplicas) * int64(nodeGroup.NumOfHosts)
	}
	return SafeInt64ToInt32(count)

Option 2

func CalculateMaxReplicas(cluster *rayv1.RayCluster) int32 {
	count := int64(0)
	for _, nodeGroup := range cluster.Spec.WorkerGroupSpecs {
		if nodeGroup.Suspend != nil && *nodeGroup.Suspend {
			continue
		}
		count += int64(*nodeGroup.MaxReplicas) * int64(nodeGroup.NumOfHosts)

		// early return if an overflow happens
		if count > math.MaxInt32 {
			return math.MaxInt32
		}
	}
	return SafeInt64ToInt32(count)
}

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'll go for option 1 but if others think option 2 is better then I am fine with it.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see. Now with the SafeInt64ToInt32 doing the overflow/underflow prevention. I think Option 1 is much cleaner.

if count > math.MaxInt32 {
return math.MaxInt32
}
}

return count
// #nosec G115 -- safe: count is capped at math.MaxInt32 above
return int32(count)
}

// CalculateReadyReplicas calculates ready worker replicas at the cluster level
Expand Down
94 changes: 94 additions & 0 deletions ray-operator/controllers/ray/utils/util_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -805,6 +805,100 @@ func TestCalculateDesiredReplicas(t *testing.T) {
}
}

func TestCalculateMaxReplicasOverflow(t *testing.T) {
tests := []struct {
name string
specs []rayv1.WorkerGroupSpec
expected int32
}{
{
name: "Bug reproduction: issue report with replicas=1, minReplicas=3, numOfHosts=4",
specs: []rayv1.WorkerGroupSpec{
{
GroupName: "workergroup",
Replicas: ptr.To[int32](1),
MinReplicas: ptr.To[int32](3),
MaxReplicas: ptr.To[int32](2147483647), // Default max int32
NumOfHosts: 4,
},
},
expected: 2147483647, // Was -4 before fix, should be capped at max int32
},
{
name: "Single group overflow with default maxReplicas and numOfHosts=4",
specs: []rayv1.WorkerGroupSpec{
{
NumOfHosts: 4,
MinReplicas: ptr.To[int32](3),
MaxReplicas: ptr.To[int32](2147483647),
},
},
expected: 2147483647, // Should be capped at max int32
},
{
name: "Single group overflow with large values",
specs: []rayv1.WorkerGroupSpec{
{
NumOfHosts: 1000,
MinReplicas: ptr.To[int32](1),
MaxReplicas: ptr.To[int32](2147483647),
},
},
expected: 2147483647, // Should be capped
},
{
name: "Multiple groups causing overflow when summed",
specs: []rayv1.WorkerGroupSpec{
{
NumOfHosts: 2,
MinReplicas: ptr.To[int32](1),
MaxReplicas: ptr.To[int32](1500000000),
},
{
NumOfHosts: 1,
MinReplicas: ptr.To[int32](1),
MaxReplicas: ptr.To[int32](1000000000),
},
},
expected: 2147483647, // 3B + 1B > max int32, should be capped
},
{
name: "No overflow with reasonable values",
specs: []rayv1.WorkerGroupSpec{
{
NumOfHosts: 4,
MinReplicas: ptr.To[int32](2),
MaxReplicas: ptr.To[int32](100),
},
},
expected: 400, // 100 * 4 = 400, no overflow
},
{
name: "Edge case: exactly at max int32",
specs: []rayv1.WorkerGroupSpec{
{
NumOfHosts: 1,
MinReplicas: ptr.To[int32](1),
MaxReplicas: ptr.To[int32](2147483647),
},
},
expected: 2147483647, // Exactly at limit
},
}

for _, tc := range tests {
t.Run(tc.name, func(t *testing.T) {
cluster := &rayv1.RayCluster{
Spec: rayv1.RayClusterSpec{
WorkerGroupSpecs: tc.specs,
},
}
result := CalculateMaxReplicas(cluster)
assert.Equal(t, tc.expected, result)
})
}
}

func TestUnmarshalRuntimeEnv(t *testing.T) {
tests := []struct {
name string
Expand Down
Loading