From c914ea83a902c64bb0a0700f891088574b17e8a7 Mon Sep 17 00:00:00 2001 From: justinyeh1995 Date: Thu, 30 Oct 2025 18:10:46 +0800 Subject: [PATCH 1/6] [Fix] Resolve int32 overflow by having the calculation in int64 and cap it if the count is over math.MaxInt32 Signed-off-by: justinyeh1995 --- ray-operator/controllers/ray/utils/util.go | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/ray-operator/controllers/ray/utils/util.go b/ray-operator/controllers/ray/utils/util.go index 540162bf9f4..55bdd384822 100644 --- a/ray-operator/controllers/ray/utils/util.go +++ b/ray-operator/controllers/ray/utils/util.go @@ -393,15 +393,20 @@ func CalculateMinReplicas(cluster *rayv1.RayCluster) int32 { // CalculateMaxReplicas calculates max worker replicas at the cluster level func CalculateMaxReplicas(cluster *rayv1.RayCluster) int32 { - count := int32(0) + count := int64(0) for _, nodeGroup := range cluster.Spec.WorkerGroupSpecs { if nodeGroup.Suspend != nil && *nodeGroup.Suspend { continue } - count += (*nodeGroup.MaxReplicas * nodeGroup.NumOfHosts) + count += int64(*nodeGroup.MaxReplicas) * int64(nodeGroup.NumOfHosts) + + // early return if an overflow happens + if count > math.MaxInt32 { + return math.MaxInt32 + } } - return count + return int32(count) } // CalculateReadyReplicas calculates ready worker replicas at the cluster level From 4e28956f57090c3fc8d887c0c129be0d66ed1e2b Mon Sep 17 00:00:00 2001 From: justinyeh1995 Date: Thu, 30 Oct 2025 18:27:27 +0800 Subject: [PATCH 2/6] [Test] Add unit tests for CalculateReadyReplicas Signed-off-by: justinyeh1995 --- .../controllers/ray/utils/util_test.go | 94 +++++++++++++++++++ 1 file changed, 94 insertions(+) diff --git a/ray-operator/controllers/ray/utils/util_test.go b/ray-operator/controllers/ray/utils/util_test.go index 8bd37a2e7f8..9a9371acd05 100644 --- a/ray-operator/controllers/ray/utils/util_test.go +++ b/ray-operator/controllers/ray/utils/util_test.go @@ -805,6 +805,100 @@ func TestCalculateDesiredReplicas(t *testing.T) { } } +func TestCalculateMaxReplicasOverflow(t *testing.T) { + tests := []struct { + name string + specs []rayv1.WorkerGroupSpec + expected int32 + }{ + { + name: "Bug reproduction: issue report with replicas=1, minReplicas=3, numOfHosts=4", + specs: []rayv1.WorkerGroupSpec{ + { + GroupName: "workergroup", + Replicas: ptr.To[int32](1), + MinReplicas: ptr.To[int32](3), + MaxReplicas: ptr.To[int32](2147483647), // Default max int32 + NumOfHosts: 4, + }, + }, + expected: 2147483647, // Was -4 before fix, should be capped at max int32 + }, + { + name: "Single group overflow with default maxReplicas and numOfHosts=4", + specs: []rayv1.WorkerGroupSpec{ + { + NumOfHosts: 4, + MinReplicas: ptr.To[int32](3), + MaxReplicas: ptr.To[int32](2147483647), + }, + }, + expected: 2147483647, // Should be capped at max int32 + }, + { + name: "Single group overflow with large values", + specs: []rayv1.WorkerGroupSpec{ + { + NumOfHosts: 1000, + MinReplicas: ptr.To[int32](1), + MaxReplicas: ptr.To[int32](2147483647), + }, + }, + expected: 2147483647, // Should be capped + }, + { + name: "Multiple groups causing overflow when summed", + specs: []rayv1.WorkerGroupSpec{ + { + NumOfHosts: 2, + MinReplicas: ptr.To[int32](1), + MaxReplicas: ptr.To[int32](1500000000), + }, + { + NumOfHosts: 1, + MinReplicas: ptr.To[int32](1), + MaxReplicas: ptr.To[int32](1000000000), + }, + }, + expected: 2147483647, // 3B + 1B > max int32, should be capped + }, + { + name: "No overflow with reasonable values", + specs: []rayv1.WorkerGroupSpec{ + { + NumOfHosts: 4, + MinReplicas: ptr.To[int32](2), + MaxReplicas: ptr.To[int32](100), + }, + }, + expected: 400, // 100 * 4 = 400, no overflow + }, + { + name: "Edge case: exactly at max int32", + specs: []rayv1.WorkerGroupSpec{ + { + NumOfHosts: 1, + MinReplicas: ptr.To[int32](1), + MaxReplicas: ptr.To[int32](2147483647), + }, + }, + expected: 2147483647, // Exactly at limit + }, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + cluster := &rayv1.RayCluster{ + Spec: rayv1.RayClusterSpec{ + WorkerGroupSpecs: tc.specs, + }, + } + result := CalculateMaxReplicas(cluster) + assert.Equal(t, tc.expected, result) + }) + } +} + func TestUnmarshalRuntimeEnv(t *testing.T) { tests := []struct { name string From ae526b37a10f3786002c9c5c522f304b96f1ed08 Mon Sep 17 00:00:00 2001 From: justinyeh1995 Date: Thu, 30 Oct 2025 23:19:30 +0800 Subject: [PATCH 3/6] [Fix] Add a nosec comment to pass the Lint (pre-commit) test Signed-off-by: justinyeh1995 --- ray-operator/controllers/ray/utils/util.go | 1 + 1 file changed, 1 insertion(+) diff --git a/ray-operator/controllers/ray/utils/util.go b/ray-operator/controllers/ray/utils/util.go index 55bdd384822..60acf6c0798 100644 --- a/ray-operator/controllers/ray/utils/util.go +++ b/ray-operator/controllers/ray/utils/util.go @@ -406,6 +406,7 @@ func CalculateMaxReplicas(cluster *rayv1.RayCluster) int32 { } } + // #nosec G115 -- safe: count is capped at math.MaxInt32 above return int32(count) } From f8fab1ca7e67c8a48eb68d4c54145a86688c9498 Mon Sep 17 00:00:00 2001 From: justinyeh1995 Date: Thu, 6 Nov 2025 00:00:14 +0800 Subject: [PATCH 4/6] [Refactor] Add CapInt64ToInt32 to replace #nosec directives Signed-off-by: justinyeh1995 --- ray-operator/controllers/ray/utils/util.go | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/ray-operator/controllers/ray/utils/util.go b/ray-operator/controllers/ray/utils/util.go index 60acf6c0798..0fb9069adab 100644 --- a/ray-operator/controllers/ray/utils/util.go +++ b/ray-operator/controllers/ray/utils/util.go @@ -253,6 +253,14 @@ func SafeUint64ToInt64(n uint64) int64 { return int64(n) } +// CapInt64ToInt32 converts int64 to int32, capping at math.MaxInt32 if the value exceeds it. +func CapInt64ToInt32(n int64) int32 { + if n > math.MaxInt32 { + return math.MaxInt32 + } + return int32(n) +} + // GetNamespace return namespace func GetNamespace(metaData metav1.ObjectMeta) string { if metaData.Namespace == "" { @@ -406,8 +414,7 @@ func CalculateMaxReplicas(cluster *rayv1.RayCluster) int32 { } } - // #nosec G115 -- safe: count is capped at math.MaxInt32 above - return int32(count) + return CapInt64ToInt32(count) } // CalculateReadyReplicas calculates ready worker replicas at the cluster level From 74d67203d3ba805a3d18223b05d801ddce0b08fc Mon Sep 17 00:00:00 2001 From: justinyeh1995 Date: Thu, 6 Nov 2025 11:39:06 +0800 Subject: [PATCH 5/6] [Refactor] Rename function to SafeInt64ToInt32 and add a underflowing prevention (it also help pass the lint test) Signed-off-by: justinyeh1995 --- ray-operator/controllers/ray/utils/util.go | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/ray-operator/controllers/ray/utils/util.go b/ray-operator/controllers/ray/utils/util.go index 0fb9069adab..1692bf35ca2 100644 --- a/ray-operator/controllers/ray/utils/util.go +++ b/ray-operator/controllers/ray/utils/util.go @@ -253,11 +253,15 @@ func SafeUint64ToInt64(n uint64) int64 { return int64(n) } -// CapInt64ToInt32 converts int64 to int32, capping at math.MaxInt32 if the value exceeds it. -func CapInt64ToInt32(n int64) int32 { +// SafeInt64ToInt32 converts int64 to int32, preventing overflow/underflow by +// bounding the value between [math.MinInt32, math.MaxInt32] +func SafeInt64ToInt32(n int64) int32 { if n > math.MaxInt32 { return math.MaxInt32 } + if n < math.MinInt32 { + return math.MinInt32 + } return int32(n) } @@ -414,7 +418,7 @@ func CalculateMaxReplicas(cluster *rayv1.RayCluster) int32 { } } - return CapInt64ToInt32(count) + return SafeInt64ToInt32(count) } // CalculateReadyReplicas calculates ready worker replicas at the cluster level From 56c28b0da6189552bcf8273b3dd3cf600aafdc38 Mon Sep 17 00:00:00 2001 From: justinyeh1995 Date: Thu, 6 Nov 2025 11:42:56 +0800 Subject: [PATCH 6/6] [Refactor] Remove the early return as SafeInt64ToInt32 handles the int32 overflow and underflow checking. Signed-off-by: justinyeh1995 --- ray-operator/controllers/ray/utils/util.go | 5 ----- 1 file changed, 5 deletions(-) diff --git a/ray-operator/controllers/ray/utils/util.go b/ray-operator/controllers/ray/utils/util.go index 1692bf35ca2..7544cb31387 100644 --- a/ray-operator/controllers/ray/utils/util.go +++ b/ray-operator/controllers/ray/utils/util.go @@ -411,11 +411,6 @@ func CalculateMaxReplicas(cluster *rayv1.RayCluster) int32 { continue } count += int64(*nodeGroup.MaxReplicas) * int64(nodeGroup.NumOfHosts) - - // early return if an overflow happens - if count > math.MaxInt32 { - return math.MaxInt32 - } } return SafeInt64ToInt32(count)