sgl-project · Kangyan-Zhou · Jan 26, 2026 · Jan 22, 2026 · Jan 23, 2026
@@ -274,19 +274,14 @@ jobs:
             const pollIntervalSeconds = 120;  // 2 minutes to reduce GH API calls
             const maxAttempts = (maxWaitMinutes * 60) / pollIntervalSeconds;
 
-            // Stage-b jobs to wait for (all stage-b tests including performance and accuracy)
+            // Stage-b jobs to wait for
             const stageBJobs = [
               { prefix: 'stage-b-test-small-1-gpu', expectedCount: 8 },              // partitions 0-7
-              { prefix: 'stage-b-test-large-1-gpu', expectedCount: 12 },             // partitions 0-11
-              { prefix: 'stage-b-test-large-2-gpu', expectedCount: 2 },              // partitions 0-1
+              { prefix: 'stage-b-test-large-1-gpu', expectedCount: 14 },             // partitions 0-13
+              { prefix: 'stage-b-test-large-2-gpu', expectedCount: 4 },              // partitions 0-3
               { prefix: 'stage-b-test-4-gpu-b200', expectedCount: 1 },
-              { prefix: 'stage-b-test-small-1-gpu-performance', expectedCount: 1 },
-              { prefix: 'stage-b-test-large-1-gpu-performance', expectedCount: 2 },  // partitions 0-1
-              { prefix: 'stage-b-test-large-2-gpu-performance', expectedCount: 1 },
-              { prefix: 'stage-b-test-small-1-gpu-accuracy', expectedCount: 1 },
-              { prefix: 'stage-b-test-large-2-gpu-accuracy', expectedCount: 1 }
             ];
-            const totalExpectedJobs = stageBJobs.reduce((sum, j) => sum + j.expectedCount, 0);  // 29 total
+            const totalExpectedJobs = stageBJobs.reduce((sum, j) => sum + j.expectedCount, 0);  // 27 total
 
             // Helper to match job names exactly (prefix alone or prefix + " (N)" for matrix jobs)
             const matchesPrefix = (jobName, prefix) => {
@@ -841,6 +836,9 @@ jobs:
         run: |
           source /etc/profile.d/sglang-ci.sh
           CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/cuda/ci_install_dependency.sh
+          git clone https://github.com/merrymercy/human-eval.git
+          cd human-eval
+          pip install -e .
 
       - name: Run test
         timeout-minutes: 30
@@ -874,7 +872,7 @@ jobs:
       fail-fast: false
       max-parallel: ${{ fromJson(needs.check-changes.outputs.max_parallel) }}
       matrix:
-        partition: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
+        partition: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]
     steps:
       - name: Checkout code
         uses: actions/checkout@v4
@@ -902,7 +900,7 @@ jobs:
           if [[ "${{ needs.check-changes.outputs.continue_on_error }}" == "true" ]]; then
             CONTINUE_ON_ERROR_FLAG="--continue-on-error"
           fi
-          python3 run_suite.py --hw cuda --suite stage-b-test-large-1-gpu --auto-partition-id ${{ matrix.partition }} --auto-partition-size 12 $CONTINUE_ON_ERROR_FLAG
+          python3 run_suite.py --hw cuda --suite stage-b-test-large-1-gpu --auto-partition-id ${{ matrix.partition }} --auto-partition-size 14 --timeout-per-file 1800 $CONTINUE_ON_ERROR_FLAG
 
   stage-b-test-large-2-gpu:
     needs: [check-changes, call-gate, wait-for-stage-a, sgl-kernel-build-wheels]
@@ -923,245 +921,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        partition: [0, 1]
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v4
-        with:
-          ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}
-
-      - name: Download artifacts
-        if: needs.check-changes.outputs.sgl_kernel == 'true'
-        uses: actions/download-artifact@v4
-        with:
-          path: sgl-kernel/dist/
-          merge-multiple: true
-          pattern: wheel-python3.10-cuda12.9
-
-      - name: Install dependencies
-        timeout-minutes: 10
-        run: |
-          CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/cuda/ci_install_dependency.sh
-
-      - name: Run test
-        timeout-minutes: 30
-        run: |
-          cd test/
-          CONTINUE_ON_ERROR_FLAG=""
-          if [[ "${{ needs.check-changes.outputs.continue_on_error }}" == "true" ]]; then
-            CONTINUE_ON_ERROR_FLAG="--continue-on-error"
-          fi
-          python3 run_suite.py --hw cuda --suite stage-b-test-large-2-gpu --auto-partition-id ${{ matrix.partition }} --auto-partition-size 2 $CONTINUE_ON_ERROR_FLAG
-
-  stage-b-test-small-1-gpu-performance:
-    needs: [check-changes, call-gate, wait-for-stage-a, sgl-kernel-build-wheels]
-    if: |
-      always() &&
-      (
-        (inputs.target_stage == 'stage-b-test-small-1-gpu-performance') ||
-        (
-          !inputs.target_stage &&
-          ((github.event_name == 'schedule' || inputs.test_parallel_dispatch == true) || (!failure() && !cancelled())) &&
-          ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
-        )
-      )
-    runs-on: 1-gpu-5090
-    timeout-minutes: 240
-    env:
-      RUNNER_LABELS: 1-gpu-5090
-      IS_BLACKWELL: "1"
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v4
-        with:
-          ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}
-
-      - name: Download artifacts
-        if: needs.check-changes.outputs.sgl_kernel == 'true'
-        uses: actions/download-artifact@v4
-        with:
-          path: sgl-kernel/dist/
-          merge-multiple: true
-          pattern: wheel-python3.10-cuda12.9
-
-      - name: Install dependencies
-        timeout-minutes: 10
-        run: |
-          source /etc/profile.d/sglang-ci.sh
-          CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/cuda/ci_install_dependency.sh
-
-      - name: Run test
-        timeout-minutes: 30
-        run: |
-          source /etc/profile.d/sglang-ci.sh
-          cd test/
-          CONTINUE_ON_ERROR_FLAG=""
-          if [[ "${{ needs.check-changes.outputs.continue_on_error }}" == "true" ]]; then
-            CONTINUE_ON_ERROR_FLAG="--continue-on-error"
-          fi
-          python3 run_suite.py --hw cuda --suite stage-b-test-small-1-gpu-performance $CONTINUE_ON_ERROR_FLAG
-
-  stage-b-test-large-1-gpu-performance:
-    needs: [check-changes, call-gate, wait-for-stage-a, sgl-kernel-build-wheels]
-    if: |
-      always() &&
-      (
-        (inputs.target_stage == 'stage-b-test-large-1-gpu-performance') ||
-        (
-          !inputs.target_stage &&
-          ((github.event_name == 'schedule' || inputs.test_parallel_dispatch == true) || (!failure() && !cancelled())) &&
-          ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
-        )
-      )
-    runs-on: 1-gpu-runner
-    timeout-minutes: 240
-    env:
-      RUNNER_LABELS: 1-gpu-runner
-    strategy:
-      fail-fast: false
-      matrix:
-        partition: [0, 1]
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v4
-        with:
-          ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}
-
-      - name: Download artifacts
-        if: needs.check-changes.outputs.sgl_kernel == 'true'
-        uses: actions/download-artifact@v4
-        with:
-          path: sgl-kernel/dist/
-          merge-multiple: true
-          pattern: wheel-python3.10-cuda12.9
-
-      - name: Install dependencies
-        timeout-minutes: 10
-        run: |
-          CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/cuda/ci_install_dependency.sh
-
-      - name: Run test
-        timeout-minutes: 40
-        run: |
-          cd test/
-          CONTINUE_ON_ERROR_FLAG=""
-          if [[ "${{ needs.check-changes.outputs.continue_on_error }}" == "true" ]]; then
-            CONTINUE_ON_ERROR_FLAG="--continue-on-error"
-          fi
-          python3 run_suite.py --hw cuda --suite stage-b-test-large-1-gpu-performance --auto-partition-id ${{ matrix.partition }} --auto-partition-size 2 --timeout-per-file 1800 $CONTINUE_ON_ERROR_FLAG
-
-  stage-b-test-large-2-gpu-performance:
-    needs: [check-changes, call-gate, wait-for-stage-a, sgl-kernel-build-wheels]
-    if: |
-      always() &&
-      (
-        (inputs.target_stage == 'stage-b-test-large-2-gpu-performance') ||
-        (
-          !inputs.target_stage &&
-          ((github.event_name == 'schedule' || inputs.test_parallel_dispatch == true) || (!failure() && !cancelled())) &&
-          ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
-        )
-      )
-    runs-on: 2-gpu-runner
-    timeout-minutes: 240
-    env:
-      RUNNER_LABELS: 2-gpu-runner
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v4
-        with:
-          ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}
-
-      - name: Download artifacts
-        if: needs.check-changes.outputs.sgl_kernel == 'true'
-        uses: actions/download-artifact@v4
-        with:
-          path: sgl-kernel/dist/
-          merge-multiple: true
-          pattern: wheel-python3.10-cuda12.9
-
-      - name: Install dependencies
-        timeout-minutes: 10
-        run: |
-          CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/cuda/ci_install_dependency.sh
-
-      - name: Run test
-        timeout-minutes: 30
-        run: |
-          cd test/
-          CONTINUE_ON_ERROR_FLAG=""
-          if [[ "${{ needs.check-changes.outputs.continue_on_error }}" == "true" ]]; then
-            CONTINUE_ON_ERROR_FLAG="--continue-on-error"
-          fi
-          python3 run_suite.py --hw cuda --suite stage-b-test-large-2-gpu-performance $CONTINUE_ON_ERROR_FLAG
-
-  stage-b-test-small-1-gpu-accuracy:
-    needs: [check-changes, call-gate, wait-for-stage-a, sgl-kernel-build-wheels]
-    if: |
-      always() &&
-      (
-        (inputs.target_stage == 'stage-b-test-small-1-gpu-accuracy') ||
-        (
-          !inputs.target_stage &&
-          ((github.event_name == 'schedule' || inputs.test_parallel_dispatch == true) || (!failure() && !cancelled())) &&
-          ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
-        )
-      )
-    runs-on: 1-gpu-5090
-    timeout-minutes: 240
-    env:
-      RUNNER_LABELS: 1-gpu-5090
-      IS_BLACKWELL: "1"
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v4
-        with:
-          ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}
-
-      - name: Download artifacts
-        if: needs.check-changes.outputs.sgl_kernel == 'true'
-        uses: actions/download-artifact@v4
-        with:
-          path: sgl-kernel/dist/
-          merge-multiple: true
-          pattern: wheel-python3.10-cuda12.9
-
-      - name: Install dependencies
-        timeout-minutes: 10
-        run: |
-          source /etc/profile.d/sglang-ci.sh
-          CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/cuda/ci_install_dependency.sh
-          git clone https://github.com/merrymercy/human-eval.git
-          cd human-eval
-          pip install -e .
-
-      - name: Run test
-        timeout-minutes: 25
-        run: |
-          source /etc/profile.d/sglang-ci.sh
-          cd test/
-          CONTINUE_ON_ERROR_FLAG=""
-          if [[ "${{ needs.check-changes.outputs.continue_on_error }}" == "true" ]]; then
-            CONTINUE_ON_ERROR_FLAG="--continue-on-error"
-          fi
-          python3 run_suite.py --hw cuda --suite stage-b-test-small-1-gpu-accuracy $CONTINUE_ON_ERROR_FLAG
-
-  stage-b-test-large-2-gpu-accuracy:
-    needs: [check-changes, call-gate, wait-for-stage-a, sgl-kernel-build-wheels]
-    if: |
-      always() &&
-      (
-        (inputs.target_stage == 'stage-b-test-large-2-gpu-accuracy') ||
-        (
-          !inputs.target_stage &&
-          ((github.event_name == 'schedule' || inputs.test_parallel_dispatch == true) || (!failure() && !cancelled())) &&
-          ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
-        )
-      )
-    runs-on: 2-gpu-runner
-    timeout-minutes: 240
-    env:
-      RUNNER_LABELS: 2-gpu-runner
+        partition: [0, 1, 2, 3]
     steps:
       - name: Checkout code
         uses: actions/checkout@v4
@@ -1185,14 +945,14 @@ jobs:
           pip install -e .
 
       - name: Run test
-        timeout-minutes: 25
+        timeout-minutes: 30
         run: |
           cd test/
           CONTINUE_ON_ERROR_FLAG=""
           if [[ "${{ needs.check-changes.outputs.continue_on_error }}" == "true" ]]; then
             CONTINUE_ON_ERROR_FLAG="--continue-on-error"
           fi
-          python3 run_suite.py --hw cuda --suite stage-b-test-large-2-gpu-accuracy $CONTINUE_ON_ERROR_FLAG
+          python3 run_suite.py --hw cuda --suite stage-b-test-large-2-gpu --auto-partition-id ${{ matrix.partition }} --auto-partition-size 4 $CONTINUE_ON_ERROR_FLAG
 
   stage-b-test-4-gpu-b200:
     needs: [check-changes, call-gate, wait-for-stage-a, sgl-kernel-build-wheels]
@@ -1829,11 +1589,6 @@ jobs:
         stage-b-test-small-1-gpu,
         stage-b-test-large-1-gpu,
         stage-b-test-large-2-gpu,
-        stage-b-test-small-1-gpu-performance,
-        stage-b-test-large-1-gpu-performance,
-        stage-b-test-large-2-gpu-performance,
-        stage-b-test-small-1-gpu-accuracy,
-        stage-b-test-large-2-gpu-accuracy,
         stage-c-test-large-4-gpu,
         stage-b-test-4-gpu-b200,
         unit-test-backend-4-gpu,

diff --git a/python/sglang/test/test_utils.py b/python/sglang/test/test_utils.py
@@ -1495,6 +1495,10 @@ def run_bench_one_batch(model, other_args):
         command += ["--model-path", model]
     process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
 
+    prefill_latency = None
+    decode_throughput = None
+    decode_latency = None
+
     try:
         stdout, stderr = process.communicate()
         output = stdout.decode(errors="backslashreplace")
@@ -1517,6 +1521,12 @@ def run_bench_one_batch(model, other_args):
     finally:
         kill_process_tree(process.pid)
 
+    if prefill_latency is None or decode_throughput is None or decode_latency is None:
+        raise RuntimeError(
+            f"Failed to parse benchmark output. "
+            f"prefill_latency={prefill_latency}, decode_throughput={decode_throughput}, decode_latency={decode_latency}"
+        )
+
     return prefill_latency, decode_throughput, decode_latency
 
 

diff --git a/test/registered/eval/test_eval_accuracy_large.py b/test/registered/eval/test_eval_accuracy_large.py
@@ -19,8 +19,8 @@
     write_github_step_summary,
 )
 
-register_cuda_ci(est_time=300, suite="stage-b-test-small-1-gpu-accuracy")
-register_amd_ci(est_time=300, suite="stage-b-test-small-1-gpu-accuracy-amd")
+register_cuda_ci(est_time=300, suite="stage-b-test-small-1-gpu")
+register_amd_ci(est_time=300, suite="stage-b-test-small-1-gpu-amd")
 
 
 class TestEvalAccuracyLarge(CustomTestCase):

diff --git a/test/registered/eval/test_moe_eval_accuracy_large.py b/test/registered/eval/test_moe_eval_accuracy_large.py
@@ -19,8 +19,8 @@
     write_github_step_summary,
 )
 
-register_cuda_ci(est_time=500, suite="stage-b-test-large-2-gpu-accuracy")
-register_amd_ci(est_time=500, suite="stage-b-test-large-2-gpu-accuracy-amd")
+register_cuda_ci(est_time=500, suite="stage-b-test-large-2-gpu")
+register_amd_ci(est_time=500, suite="stage-b-test-large-2-gpu-amd")
 
 
 class TestMoEEvalAccuracyLarge(CustomTestCase):

diff --git a/test/registered/perf/test_bench_one_batch_1gpu.py b/test/registered/perf/test_bench_one_batch_1gpu.py
@@ -11,8 +11,8 @@
     write_github_step_summary,
 )
 
-register_cuda_ci(est_time=120, suite="stage-b-test-large-1-gpu-performance")
-register_amd_ci(est_time=120, suite="stage-b-test-large-1-gpu-performance-amd")
+register_cuda_ci(est_time=120, suite="stage-b-test-large-1-gpu")
+register_amd_ci(est_time=120, suite="stage-b-test-large-1-gpu-amd")
 
 
 class TestBenchOneBatch1GPU(CustomTestCase):

diff --git a/test/registered/perf/test_bench_one_batch_2gpu.py b/test/registered/perf/test_bench_one_batch_2gpu.py
@@ -11,8 +11,8 @@
     write_github_step_summary,
 )
 
-register_cuda_ci(est_time=180, suite="stage-b-test-large-2-gpu-performance")
-register_amd_ci(est_time=630, suite="stage-b-test-large-2-gpu-performance-amd")
+register_cuda_ci(est_time=180, suite="stage-b-test-large-2-gpu")
+register_amd_ci(est_time=630, suite="stage-b-test-large-2-gpu-amd")
 
 
 class TestBenchOneBatch2GPU(CustomTestCase):