From d8270b288196bbce16a70bde556d37910aacb4a6 Mon Sep 17 00:00:00 2001
From: win5923
Date: Thu, 30 Oct 2025 17:23:56 +0000
Subject: [PATCH 1/2] [Docs] Add Volcano RayJob gang scheduling example

Signed-off-by: win5923
---
 .../kubernetes/k8s-ecosystem/volcano.md       | 143 ++++++++++++++++++
 1 file changed, 143 insertions(+)

diff --git a/doc/source/cluster/kubernetes/k8s-ecosystem/volcano.md b/doc/source/cluster/kubernetes/k8s-ecosystem/volcano.md
index 8c2a7f1436f0..19933bbc159b 100644
--- a/doc/source/cluster/kubernetes/k8s-ecosystem/volcano.md
+++ b/doc/source/cluster/kubernetes/k8s-ecosystem/volcano.md
@@ -324,3 +324,146 @@ Finally, clean up the remaining cluster and queue:
 kubectl delete raycluster test-cluster-1
 kubectl delete queue kuberay-test-queue
 ```
+
+### Step 6: Use Volcano for RayJob gang scheduling
+
+Starting with KubeRay 1.5.0, KubeRay supports gang scheduling for RayJob custom resources.
+
+First, create a queue with a capacity of 4 CPUs and 6Gi of RAM, and a RayJob with a head node (1 CPU + 2Gi of RAM), two workers (1 CPU + 1Gi of RAM each), and a submitter pod (0.5 CPU + 200Mi of RAM), for a total of 3500m CPU and 4296Mi of RAM.
+
+```shell
+curl -LO https://raw.githubusercontent.com/ray-project/kuberay/master/ray-operator/config/samples/ray-job.volcano-scheduler-queue.yaml
+kubectl apply -f ray-job.volcano-scheduler-queue.yaml
+```
+
+Wait until all pods are in the Running state:
+```shell
+kubectl get pod
+
+# NAME                                             READY   STATUS      RESTARTS   AGE
+# rayjob-sample-0-k449j-head-rlgxj                 1/1     Running     0          93s
+# rayjob-sample-0-k449j-small-group-worker-c6dt8   1/1     Running     0          93s
+# rayjob-sample-0-k449j-small-group-worker-cq6xn   1/1     Running     0          93s
+# rayjob-sample-0-qmm8s                            0/1     Completed   0          32s
+```
+
+Next, create a second RayJob with the same configuration but a different name:
+```shell
+sed 's/rayjob-sample-0/rayjob-sample-1/' ray-job.volcano-scheduler-queue.yaml | kubectl apply -f-
+```
+
+All the pods of the new RayJob are stuck in the Pending state, because the queue doesn't have enough remaining capacity for the whole gang:
+```shell
+kubectl get pod
+
+# NAME                                             READY   STATUS      RESTARTS   AGE
+# rayjob-sample-0-k449j-head-rlgxj                 1/1     Running     0          3m27s
+# rayjob-sample-0-k449j-small-group-worker-c6dt8   1/1     Running     0          3m27s
+# rayjob-sample-0-k449j-small-group-worker-cq6xn   1/1     Running     0          3m27s
+# rayjob-sample-0-qmm8s                            0/1     Completed   0          2m26s
+# rayjob-sample-1-mvgqf-head-qb7wm                 0/1     Pending     0          21s
+# rayjob-sample-1-mvgqf-small-group-worker-jfzt5   0/1     Pending     0          21s
+# rayjob-sample-1-mvgqf-small-group-worker-ng765   0/1     Pending     0          21s
+```
+
+Check the status of the new RayJob's PodGroup to see that its phase is `Pending` and its latest condition is `Unschedulable`:
+```shell
+kubectl get podgroup ray-rayjob-sample-1-pg -o yaml
+
+# apiVersion: scheduling.volcano.sh/v1beta1
+# kind: PodGroup
+# metadata:
+#   creationTimestamp: "2025-10-30T17:10:18Z"
+#   generation: 2
+#   name: ray-rayjob-sample-1-pg
+#   namespace: default
+#   ownerReferences:
+#   - apiVersion: ray.io/v1
+#     blockOwnerDeletion: true
+#     controller: true
+#     kind: RayJob
+#     name: rayjob-sample-1
+#     uid: 5835c896-c75d-4692-b10a-2871a79f141a
+#   resourceVersion: "3226"
+#   uid: 9fd55cbd-ba69-456d-b305-f61ffd6d935d
+# spec:
+#   minMember: 3
+#   minResources:
+#     cpu: 3500m
+#     memory: 4296Mi
+#   queue: kuberay-test-queue
+# status:
+#   conditions:
+#   - lastTransitionTime: "2025-10-30T17:10:18Z"
+#     message: '3/3 tasks in gang unschedulable: pod group is not ready, 3 Pending,
+#       3 minAvailable; Pending: 3 Unschedulable'
+#     reason: NotEnoughResources
+#     status: "True"
+#     transitionID: 7866f533-6590-4a4d-83cf-8f1db0214609
+#     type: Unschedulable
+#   phase: Pending
+```
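+
+To poll only the gang's scheduling phase rather than dumping the whole object, a standard `kubectl` JSONPath query also works. This is just a convenience sketch; `ray-rayjob-sample-1-pg` is the PodGroup that KubeRay created for `rayjob-sample-1` above:
+```shell
+# Prints "Pending" while the gang is blocked, then "Running" once every pod fits
+kubectl get podgroup ray-rayjob-sample-1-pg -o jsonpath='{.status.phase}'
+```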
+
+Delete the first RayJob to make space in the queue
+```shell
+kubectl delete rayjob rayjob-sample-0
+```
+
+The PodGroup for the second RayJob changed to the Running state, because enough resources are now available to schedule the entire set of pods
+```shell
+kubectl get podgroup ray-rayjob-sample-1-pg -o yaml
+
+# apiVersion: scheduling.volcano.sh/v1beta1
+# kind: PodGroup
+# metadata:
+#   creationTimestamp: "2025-10-30T17:10:18Z"
+#   generation: 7
+#   name: ray-rayjob-sample-1-pg
+#   namespace: default
+#   ownerReferences:
+#   - apiVersion: ray.io/v1
+#     blockOwnerDeletion: true
+#     controller: true
+#     kind: RayJob
+#     name: rayjob-sample-1
+#     uid: 5835c896-c75d-4692-b10a-2871a79f141a
+#   resourceVersion: "3724"
+#   uid: 9fd55cbd-ba69-456d-b305-f61ffd6d935d
+# spec:
+#   minMember: 3
+#   minResources:
+#     cpu: 3500m
+#     memory: 4296Mi
+#   queue: kuberay-test-queue
+# status:
+#   conditions:
+#   - lastTransitionTime: "2025-10-30T17:10:18Z"
+#     message: '3/3 tasks in gang unschedulable: pod group is not ready, 3 Pending,
+#       3 minAvailable; Pending: 3 Unschedulable'
+#     reason: NotEnoughResources
+#     status: "True"
+#     transitionID: 7866f533-6590-4a4d-83cf-8f1db0214609
+#     type: Unschedulable
+#   - lastTransitionTime: "2025-10-30T17:14:44Z"
+#     reason: tasks in gang are ready to be scheduled
+#     status: "True"
+#     transitionID: 36e0222d-eee3-444a-9889-5b9c255f41af
+#     type: Scheduled
+#   phase: Running
+#   running: 4
+```
+
+Check the pods again to see that the second RayJob is now up and running:
+```shell
+kubectl get pod
+# NAME                                             READY   STATUS      RESTARTS   AGE
+# rayjob-sample-1-mvgqf-head-qb7wm                 1/1     Running     0          5m47s
+# rayjob-sample-1-mvgqf-small-group-worker-jfzt5   1/1     Running     0          5m47s
+# rayjob-sample-1-mvgqf-small-group-worker-ng765   1/1     Running     0          5m47s
+# rayjob-sample-1-tcd4m                            0/1     Completed   0          84s
+```
+
+Finally, clean up the remaining RayJob, queue, and ConfigMap:
+```shell
+kubectl delete rayjob rayjob-sample-1
+kubectl delete queue kuberay-test-queue
+kubectl delete configmap ray-job-code-sample
+```

From b58c6836e46f491e171685f1cd3cd21a199c7a68 Mon Sep 17 00:00:00 2001
From: win5923
Date: Fri, 31 Oct 2025 17:02:10 +0000
Subject: [PATCH 2/2] Addressed Future-Outlier's comments

Signed-off-by: win5923
---
 doc/source/cluster/kubernetes/k8s-ecosystem/volcano.md | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/doc/source/cluster/kubernetes/k8s-ecosystem/volcano.md b/doc/source/cluster/kubernetes/k8s-ecosystem/volcano.md
index 19933bbc159b..ec13376d6a97 100644
--- a/doc/source/cluster/kubernetes/k8s-ecosystem/volcano.md
+++ b/doc/source/cluster/kubernetes/k8s-ecosystem/volcano.md
@@ -325,14 +325,14 @@ kubectl delete raycluster test-cluster-1
 kubectl delete queue kuberay-test-queue
 ```
 
-### Step 6: Use Volcano for RayJob gang scheduling
+### Use Volcano for RayJob gang scheduling
 
 Starting with KubeRay 1.5.0, KubeRay supports gang scheduling for RayJob custom resources.
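+
+Gang scheduling for a RayJob is wired up the same way as for the RayCluster examples earlier on this page: through labels on the custom resource. The following is a minimal sketch of the relevant fields only, assuming the queue and names used in this walkthrough; the full sample manifest downloaded below contains the complete spec:
+```yaml
+apiVersion: ray.io/v1
+kind: RayJob
+metadata:
+  name: rayjob-sample-0
+  labels:
+    ray.io/scheduler-name: volcano             # let Volcano schedule this RayJob's pods as one gang
+    volcano.sh/queue-name: kuberay-test-queue  # queue whose capacity bounds the gang
+```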
 
 First, create a queue with a capacity of 4 CPUs and 6Gi of RAM, and a RayJob with a head node (1 CPU + 2Gi of RAM), two workers (1 CPU + 1Gi of RAM each), and a submitter pod (0.5 CPU + 200Mi of RAM), for a total of 3500m CPU and 4296Mi of RAM.
 
 ```shell
-curl -LO https://raw.githubusercontent.com/ray-project/kuberay/master/ray-operator/config/samples/ray-job.volcano-scheduler-queue.yaml
+curl -LO https://raw.githubusercontent.com/ray-project/kuberay/refs/tags/v1.5.0-rc.0/ray-operator/config/samples/ray-job.volcano-scheduler-queue.yaml
 kubectl apply -f ray-job.volcano-scheduler-queue.yaml
 ```
 
@@ -402,12 +402,12 @@ kubectl get podgroup ray-rayjob-sample-1-pg -o yaml
 # phase: Pending
 ```
 
-Delete the first RayJob to make space in the queue
+Delete the first RayJob to make space in the queue.
 ```shell
 kubectl delete rayjob rayjob-sample-0
 ```
 
-The PodGroup for the second RayJob changed to the Running state, because enough resources are now available to schedule the entire set of pods
+The PodGroup for the second RayJob changed to the Running state, because enough resources are now available to schedule the entire set of pods.
 ```shell
 kubectl get podgroup ray-rayjob-sample-1-pg -o yaml