diff --git a/.github/workflows/pr-test-amd.yml b/.github/workflows/pr-test-amd.yml
index 5b53e2ae255c..55c00d532349 100644
--- a/.github/workflows/pr-test-amd.yml
+++ b/.github/workflows/pr-test-amd.yml
@@ -713,23 +713,23 @@ jobs:
       - name: Benchmark single latency
         timeout-minutes: 20
         run: |
-          bash scripts/ci/amd_ci_exec.sh python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_bs1_small
-          bash scripts/ci/amd_ci_exec.sh python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_bs1_default
+          bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test/registered/perf python3 -m unittest test_bench_one_batch_1gpu.TestBenchOneBatch1GPU.test_bs1_small
+          bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test/registered/perf python3 -m unittest test_bench_one_batch_1gpu.TestBenchOneBatch1GPU.test_bs1_default
 
       - name: Benchmark online latency
         timeout-minutes: 15
         run: |
-          bash scripts/ci/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_online_latency_default
+          bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test/registered/perf python3 -m unittest test_bench_serving_1gpu_part1.TestBenchServing1GPUPart1.test_online_latency_default
 
       - name: Benchmark offline throughput
         timeout-minutes: 15
         run: |
-          bash scripts/ci/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_default
+          bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test/registered/perf python3 -m unittest test_bench_serving_1gpu_part1.TestBenchServing1GPUPart1.test_offline_throughput_default
 
       - name: Benchmark offline throughput (Non-streaming, small batch size)
         timeout-minutes: 15
         run: |
-          bash scripts/ci/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_non_stream_small_batch_size
+          bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test/registered/perf python3 -m unittest test_bench_serving_1gpu_part1.TestBenchServing1GPUPart1.test_offline_throughput_non_stream_small_batch_size
 
   performance-test-1-gpu-part-2-amd:
     needs: [check-changes, stage-a-test-1-amd]
@@ -768,17 +768,17 @@ jobs:
       - name: Benchmark offline throughput (w/o RadixAttention)
         timeout-minutes: 15
         run: |
-          bash scripts/ci/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_without_radix_cache
+          bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test/registered/perf python3 -m unittest test_bench_serving_1gpu_part1.TestBenchServing1GPUPart1.test_offline_throughput_without_radix_cache
 
       - name: Benchmark offline throughput (w/ Triton)
         timeout-minutes: 15
         run: |
-          bash scripts/ci/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_with_triton_attention_backend
+          bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test/registered/perf python3 -m unittest test_bench_serving_1gpu_part1.TestBenchServing1GPUPart1.test_offline_throughput_with_triton_attention_backend
 
       - name: Benchmark offline throughput (w/ FP8)
         timeout-minutes: 15
         run: |
-          bash scripts/ci/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_default_fp8
+          bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test/registered/perf python3 -m unittest test_bench_serving_1gpu_large.TestBenchServing1GPULarge.test_offline_throughput_default_fp8
 
   performance-test-2-gpu-amd:
     needs: [check-changes, stage-a-test-1-amd]
@@ -822,32 +822,32 @@ jobs:
       - name: Benchmark single latency (TP=2)
         timeout-minutes: 25
         run: |
-          bash scripts/ci/amd_ci_exec.sh python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_moe_tp2_bs1
+          bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test/registered/perf python3 -m unittest test_bench_one_batch_2gpu.TestBenchOneBatch2GPU.test_moe_tp2_bs1
 
       - name: Benchmark single latency + torch.compile (TP=2)
         timeout-minutes: 25
         run: |
-          bash scripts/ci/amd_ci_exec.sh python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_torch_compile_tp2_bs1
+          bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test/registered/perf python3 -m unittest test_bench_one_batch_2gpu.TestBenchOneBatch2GPU.test_torch_compile_tp2_bs1
 
       - name: Benchmark offline throughput (TP=2)
         timeout-minutes: 25
         run: |
-          bash scripts/ci/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_moe_offline_throughput_default
+          bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test/registered/perf python3 -m unittest test_bench_serving_2gpu.TestBenchServing2GPU.test_moe_offline_throughput_default
 
       - name: Benchmark offline throughput (w/o RadixAttention) (TP=2)
         timeout-minutes: 25
         run: |
-          bash scripts/ci/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_moe_offline_throughput_without_radix_cache
+          bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test/registered/perf python3 -m unittest test_bench_serving_2gpu.TestBenchServing2GPU.test_moe_offline_throughput_without_radix_cache
 
       - name: Benchmark offline PP decode throughput (PP=2)
         timeout-minutes: 10
         run: |
-          bash scripts/ci/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_pp_offline_throughput_default_decode
+          bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test/registered/perf python3 -m unittest test_bench_serving_2gpu.TestBenchServing2GPU.test_pp_offline_throughput_default_decode
 
       - name: Benchmark offline PP prefill throughput (PP=2)
         timeout-minutes: 10
         run: |
-          bash scripts/ci/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_pp_long_context_prefill
+          bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test/registered/perf python3 -m unittest test_bench_serving_2gpu.TestBenchServing2GPU.test_pp_long_context_prefill
 
   accuracy-test-1-gpu-amd:
     needs: [check-changes, stage-a-test-1-amd]
@@ -886,7 +886,7 @@ jobs:
       - name: Evaluate Accuracy
         timeout-minutes: 30
         run: |
-          bash scripts/ci/amd_ci_exec.sh -e SGLANG_USE_AITER=0 python3 test_eval_accuracy_large.py
+          bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test/registered/eval -e SGLANG_USE_AITER=0 python3 test_eval_accuracy_large.py
 
   accuracy-test-2-gpu-amd:
     needs: [check-changes, accuracy-test-1-gpu-amd]
@@ -926,7 +926,7 @@ jobs:
       - name: Evaluate accuracy (TP=2)
         timeout-minutes: 30
         run: |
-          bash scripts/ci/amd_ci_exec.sh -e SGLANG_USE_AITER_AR=0 -e SGLANG_USE_AITER=0 -e HF_HUB_ENABLE_HF_TRANSFER=0 python3 test_moe_eval_accuracy_large.py
+          bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test/registered/eval -e SGLANG_USE_AITER_AR=0 -e SGLANG_USE_AITER=0 -e HF_HUB_ENABLE_HF_TRANSFER=0 python3 test_moe_eval_accuracy_large.py
 
   pr-test-amd-finish:
     needs:
diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml
index 1d71b283ec08..2441ac428764 100644
--- a/.github/workflows/pr-test.yml
+++ b/.github/workflows/pr-test.yml
@@ -186,6 +186,7 @@ jobs:
     needs: [check-changes, call-gate]
     if: needs.check-changes.outputs.sgl_kernel == 'true'
     runs-on: x64-kernel-build-node
+    timeout-minutes: 60
     strategy:
       matrix:
         include:
@@ -233,6 +234,7 @@ jobs:
     needs: [check-changes, call-gate]
     if: needs.check-changes.outputs.sgl_kernel == 'true'
     runs-on: arm-kernel-build-node
+    timeout-minutes: 60
     strategy:
       matrix:
         include:
@@ -283,6 +285,7 @@ jobs:
       !inputs.target_stage &&
       needs.check-changes.outputs.sgl_kernel == 'true'
     runs-on: 1-gpu-runner
+    timeout-minutes: 60
     env:
       RUNNER_LABELS: 1-gpu-runner
     steps:
@@ -319,6 +322,7 @@ jobs:
       !inputs.target_stage &&
       needs.check-changes.outputs.sgl_kernel == 'true'
     runs-on: 1-gpu-runner
+    timeout-minutes: 60
     env:
       RUNNER_LABELS: 1-gpu-runner
     steps:
@@ -355,6 +359,7 @@ jobs:
       !inputs.target_stage &&
       needs.check-changes.outputs.sgl_kernel == 'true'
     runs-on: 1-gpu-runner
+    timeout-minutes: 60
     env:
       CI: true
       RUNNER_LABELS: 1-gpu-runner
@@ -404,6 +409,7 @@ jobs:
       !inputs.target_stage &&
       needs.check-changes.outputs.sgl_kernel == 'true'
     runs-on: ${{ needs.check-changes.outputs.b200_runner }}
+    timeout-minutes: 60
     env:
       RUNNER_LABELS: ${{ needs.check-changes.outputs.b200_runner }}
     steps:
@@ -473,6 +479,7 @@ jobs:
       !inputs.target_stage &&
       needs.check-changes.outputs.jit_kernel == 'true'
     runs-on: 1-gpu-runner
+    timeout-minutes: 60
     env:
       RUNNER_LABELS: 1-gpu-runner
     steps:
@@ -506,6 +513,7 @@ jobs:
         )
       )
     runs-on: 1-gpu-runner
+    timeout-minutes: 60
     env:
       RUNNER_LABELS: 1-gpu-runner
     steps:
@@ -552,6 +560,7 @@ jobs:
         )
       )
     runs-on: ubuntu-latest
+    timeout-minutes: 60
     steps:
       - name: Free disk space
         run: |
@@ -597,6 +606,7 @@ jobs:
         )
       )
     runs-on: 1-gpu-5090
+    timeout-minutes: 60
     env:
       RUNNER_LABELS: 1-gpu-5090
       IS_BLACKWELL: "1"
@@ -650,6 +660,7 @@ jobs:
         )
       )
     runs-on: 1-gpu-runner
+    timeout-minutes: 60
     env:
       RUNNER_LABELS: 1-gpu-runner
     strategy:
@@ -699,6 +710,7 @@ jobs:
         )
       )
     runs-on: 2-gpu-runner
+    timeout-minutes: 60
     env:
       RUNNER_LABELS: 2-gpu-runner
     strategy:
@@ -734,24 +746,23 @@ jobs:
           fi
           python3 run_suite.py --hw cuda --suite stage-b-test-large-2-gpu --auto-partition-id ${{ matrix.partition }} --auto-partition-size 2 $CONTINUE_ON_ERROR_FLAG
 
-  stage-b-test-4-gpu-b200:
+  stage-b-test-small-1-gpu-performance:
     needs: [check-changes, call-gate, stage-a-test-1, sgl-kernel-build-wheels]
     if: |
       always() &&
       (
-        (inputs.target_stage == 'stage-b-test-4-gpu-b200') ||
+        (inputs.target_stage == 'stage-b-test-small-1-gpu-performance') ||
         (
           !inputs.target_stage &&
           (github.event_name == 'schedule' || (!failure() && !cancelled())) &&
           ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
         )
       )
-    runs-on: ${{ needs.check-changes.outputs.b200_runner }}
+    runs-on: 1-gpu-5090
+    timeout-minutes: 60
     env:
-      RUNNER_LABELS: ${{ needs.check-changes.outputs.b200_runner }}
-    strategy:
-      fail-fast: false
-
+      RUNNER_LABELS: 1-gpu-5090
+      IS_BLACKWELL: "1"
     steps:
       - name: Checkout code
         uses: actions/checkout@v4
@@ -760,7 +771,7 @@ jobs:
 
       - name: Download artifacts
         if: needs.check-changes.outputs.sgl_kernel == 'true'
-        uses: actions/download-artifact@v6
+        uses: actions/download-artifact@v4
         with:
           path: sgl-kernel/dist/
           merge-multiple: true
@@ -769,33 +780,40 @@ jobs:
       - name: Install dependencies
         timeout-minutes: 10
         run: |
-          CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} IS_BLACKWELL=1 bash scripts/ci/ci_install_dependency.sh
+          source /etc/profile.d/sglang-ci.sh
+          CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_dependency.sh
 
       - name: Run test
         timeout-minutes: 30
         run: |
-          cd test
+          source /etc/profile.d/sglang-ci.sh
+          cd test/
           CONTINUE_ON_ERROR_FLAG=""
           if [[ "${{ needs.check-changes.outputs.continue_on_error }}" == "true" ]]; then
             CONTINUE_ON_ERROR_FLAG="--continue-on-error"
           fi
-          IS_BLACKWELL=1 python3 run_suite.py --hw cuda --suite stage-b-test-4-gpu-b200 $CONTINUE_ON_ERROR_FLAG
+          python3 run_suite.py --hw cuda --suite stage-b-test-small-1-gpu-performance $CONTINUE_ON_ERROR_FLAG
 
-  stage-c-test-large-4-gpu:
-    needs: [check-changes, call-gate, stage-b-test-small-1-gpu, stage-b-test-large-1-gpu, stage-b-test-large-2-gpu, sgl-kernel-build-wheels]
+  stage-b-test-large-1-gpu-performance:
+    needs: [check-changes, call-gate, stage-a-test-1, sgl-kernel-build-wheels]
     if: |
       always() &&
       (
-        (inputs.target_stage == 'stage-c-test-large-4-gpu') ||
+        (inputs.target_stage == 'stage-b-test-large-1-gpu-performance') ||
         (
           !inputs.target_stage &&
           (github.event_name == 'schedule' || (!failure() && !cancelled())) &&
           ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
         )
       )
-    runs-on: 4-gpu-h100
+    runs-on: 1-gpu-runner
+    timeout-minutes: 60
     env:
-      RUNNER_LABELS: 4-gpu-h100
+      RUNNER_LABELS: 1-gpu-runner
+    strategy:
+      fail-fast: false
+      matrix:
+        partition: [0, 1]
     steps:
       - name: Checkout code
         uses: actions/checkout@v4
@@ -816,30 +834,31 @@ jobs:
           CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_dependency.sh
 
       - name: Run test
-        timeout-minutes: 30
+        timeout-minutes: 40
         run: |
           cd test/
           CONTINUE_ON_ERROR_FLAG=""
           if [[ "${{ needs.check-changes.outputs.continue_on_error }}" == "true" ]]; then
             CONTINUE_ON_ERROR_FLAG="--continue-on-error"
           fi
-          python3 run_suite.py --hw cuda --suite stage-c-test-large-4-gpu $CONTINUE_ON_ERROR_FLAG
+          python3 run_suite.py --hw cuda --suite stage-b-test-large-1-gpu-performance --auto-partition-id ${{ matrix.partition }} --auto-partition-size 2 --timeout-per-file 1800 $CONTINUE_ON_ERROR_FLAG
 
-  stage-c-test-large-4-gpu-b200:
-    needs: [check-changes, call-gate, stage-b-test-small-1-gpu, stage-b-test-large-1-gpu, stage-b-test-large-2-gpu, stage-b-test-4-gpu-b200, sgl-kernel-build-wheels]
+  stage-b-test-large-2-gpu-performance:
+    needs: [check-changes, call-gate, stage-a-test-1, sgl-kernel-build-wheels]
     if: |
       always() &&
       (
-        (inputs.target_stage == 'stage-c-test-large-4-gpu-b200') ||
+        (inputs.target_stage == 'stage-b-test-large-2-gpu-performance') ||
         (
           !inputs.target_stage &&
           (github.event_name == 'schedule' || (!failure() && !cancelled())) &&
           ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
         )
       )
-    runs-on: ${{ needs.check-changes.outputs.b200_runner }}
+    runs-on: 2-gpu-runner
+    timeout-minutes: 60
     env:
-      RUNNER_LABELS: ${{ needs.check-changes.outputs.b200_runner }}
+      RUNNER_LABELS: 2-gpu-runner
     steps:
       - name: Checkout code
         uses: actions/checkout@v4
@@ -848,7 +867,7 @@ jobs:
 
       - name: Download artifacts
         if: needs.check-changes.outputs.sgl_kernel == 'true'
-        uses: actions/download-artifact@v6
+        uses: actions/download-artifact@v4
         with:
           path: sgl-kernel/dist/
           merge-multiple: true
@@ -857,81 +876,35 @@ jobs:
       - name: Install dependencies
         timeout-minutes: 10
         run: |
-          CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} IS_BLACKWELL=1 bash scripts/ci/ci_install_dependency.sh
+          CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_dependency.sh
 
       - name: Run test
         timeout-minutes: 30
         run: |
           cd test/
-          IS_BLACKWELL=1 python3 run_suite.py --hw cuda --suite stage-c-test-large-4-gpu-b200
-
-  multimodal-gen-test-1-gpu:
-    needs: [check-changes, call-gate, sgl-kernel-build-wheels]
-    if: |
-      always() &&
-      (
-        (inputs.target_stage == 'multimodal-gen-test-1-gpu') ||
-        (
-          !inputs.target_stage &&
-          (github.event_name == 'schedule' || (!failure() && !cancelled())) &&
-          needs.check-changes.outputs.multimodal_gen == 'true'
-        )
-      )
-    runs-on: 1-gpu-runner
-    strategy:
-      fail-fast: false
-      matrix:
-        part: [0, 1]
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v4
-        with:
-          ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}
-
-      - name: Download artifacts
-        if: needs.check-changes.outputs.sgl_kernel == 'true'
-        uses: actions/download-artifact@v4
-        with:
-          path: sgl-kernel/dist/
-          merge-multiple: true
-          pattern: wheel-python3.10-cuda12.9
-
-      - name: Install dependencies
-        timeout-minutes: 10
-        run: |
-          CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_dependency.sh diffusion
-      - name: Run diffusion server tests
-        timeout-minutes: 60
-        run: |
-          cd python
           CONTINUE_ON_ERROR_FLAG=""
           if [[ "${{ needs.check-changes.outputs.continue_on_error }}" == "true" ]]; then
             CONTINUE_ON_ERROR_FLAG="--continue-on-error"
           fi
-          python3 sglang/multimodal_gen/test/run_suite.py \
-            --suite 1-gpu \
-            --partition-id ${{ matrix.part }} \
-            --total-partitions 2 \
-            $CONTINUE_ON_ERROR_FLAG
-
+          python3 run_suite.py --hw cuda --suite stage-b-test-large-2-gpu-performance $CONTINUE_ON_ERROR_FLAG
 
-  multimodal-gen-test-2-gpu:
-    needs: [check-changes, call-gate, sgl-kernel-build-wheels]
+  stage-b-test-small-1-gpu-accuracy:
+    needs: [check-changes, call-gate, stage-a-test-1, sgl-kernel-build-wheels]
     if: |
       always() &&
       (
-        (inputs.target_stage == 'multimodal-gen-test-2-gpu') ||
+        (inputs.target_stage == 'stage-b-test-small-1-gpu-accuracy') ||
         (
           !inputs.target_stage &&
           (github.event_name == 'schedule' || (!failure() && !cancelled())) &&
-          needs.check-changes.outputs.multimodal_gen == 'true'
+          ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
         )
       )
-    runs-on: 2-gpu-runner
-    strategy:
-      fail-fast: false
-      matrix:
-        part: [0, 1]
+    runs-on: 1-gpu-5090
+    timeout-minutes: 60
+    env:
+      RUNNER_LABELS: 1-gpu-5090
+      IS_BLACKWELL: "1"
     steps:
       - name: Checkout code
         uses: actions/checkout@v4
@@ -949,87 +922,39 @@ jobs:
       - name: Install dependencies
         timeout-minutes: 10
         run: |
-          CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_dependency.sh diffusion
+          source /etc/profile.d/sglang-ci.sh
+          CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_dependency.sh
+          git clone https://github.com/merrymercy/human-eval.git
+          cd human-eval
+          pip install -e .
 
-      - name: Run diffusion server tests
-        timeout-minutes: 60
+      - name: Run test
+        timeout-minutes: 25
         run: |
-          cd python
+          source /etc/profile.d/sglang-ci.sh
+          cd test/
           CONTINUE_ON_ERROR_FLAG=""
           if [[ "${{ needs.check-changes.outputs.continue_on_error }}" == "true" ]]; then
             CONTINUE_ON_ERROR_FLAG="--continue-on-error"
           fi
-          python3 sglang/multimodal_gen/test/run_suite.py \
-            --suite 2-gpu \
-            --partition-id ${{ matrix.part }} \
-            --total-partitions 2 \
-            $CONTINUE_ON_ERROR_FLAG
+          python3 run_suite.py --hw cuda --suite stage-b-test-small-1-gpu-accuracy $CONTINUE_ON_ERROR_FLAG
 
-  quantization-test:
-      needs: [check-changes, call-gate, stage-a-test-1]
-      if: |
-        always() &&
-        (
-          (inputs.target_stage == 'quantization-test') ||
-          (
-            !inputs.target_stage &&
-            (github.event_name == 'schedule' || (!failure() && !cancelled())) &&
-            ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
-          )
-        )
-      runs-on: 1-gpu-runner
-      steps:
-        - name: Checkout code
-          uses: actions/checkout@v4
-          with:
-            ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}
-
-        - name: Download artifacts
-          if: needs.check-changes.outputs.sgl_kernel == 'true'
-          uses: actions/download-artifact@v4
-          with:
-            path: sgl-kernel/dist/
-            merge-multiple: true
-            pattern: wheel-python3.10-cuda12.9
-
-        - name: Install dependencies
-          timeout-minutes: 10
-          run: |
-            CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_dependency.sh
-
-        - name: Run test
-          timeout-minutes: 30
-          run: |
-            cd test/srt
-            RETRY_FLAG=""
-            if [[ "${{ needs.check-changes.outputs.enable_retry }}" == "true" ]]; then
-              RETRY_FLAG="--enable-retry"
-            fi
-            CONTINUE_ON_ERROR_FLAG=""
-            if [[ "${{ needs.check-changes.outputs.continue_on_error }}" == "true" ]]; then
-              CONTINUE_ON_ERROR_FLAG="--continue-on-error"
-            fi
-            python3 run_suite.py --suite quantization_test $RETRY_FLAG $CONTINUE_ON_ERROR_FLAG
-
-  unit-test-backend-4-gpu:
-    needs: [check-changes, call-gate, stage-b-test-small-1-gpu, stage-b-test-large-1-gpu, stage-b-test-large-2-gpu, stage-b-test-4-gpu-b200]
+  stage-b-test-large-2-gpu-accuracy:
+    needs: [check-changes, call-gate, stage-a-test-1, sgl-kernel-build-wheels]
     if: |
       always() &&
       (
-        (inputs.target_stage == 'unit-test-backend-4-gpu') ||
+        (inputs.target_stage == 'stage-b-test-large-2-gpu-accuracy') ||
         (
           !inputs.target_stage &&
           (github.event_name == 'schedule' || (!failure() && !cancelled())) &&
           ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
         )
       )
-    runs-on: 4-gpu-h100
+    runs-on: 2-gpu-runner
+    timeout-minutes: 60
     env:
-      RUNNER_LABELS: 4-gpu-h100
-    strategy:
-      fail-fast: false
-      matrix:
-        part: [0, 1, 2]
+      RUNNER_LABELS: 2-gpu-runner
     steps:
       - name: Checkout code
         uses: actions/checkout@v4
@@ -1048,40 +973,39 @@ jobs:
         timeout-minutes: 10
         run: |
           CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_dependency.sh
+          git clone https://github.com/merrymercy/human-eval.git
+          cd human-eval
+          pip install -e .
 
       - name: Run test
-        timeout-minutes: 20
+        timeout-minutes: 25
         run: |
-          cd test/srt
-          RETRY_FLAG=""
-          if [[ "${{ needs.check-changes.outputs.enable_retry }}" == "true" ]]; then
-            RETRY_FLAG="--enable-retry"
-          fi
+          cd test/
           CONTINUE_ON_ERROR_FLAG=""
           if [[ "${{ needs.check-changes.outputs.continue_on_error }}" == "true" ]]; then
             CONTINUE_ON_ERROR_FLAG="--continue-on-error"
           fi
-          python3 run_suite.py --suite per-commit-4-gpu --auto-partition-id ${{ matrix.part }} --auto-partition-size 3 $RETRY_FLAG $CONTINUE_ON_ERROR_FLAG
+          python3 run_suite.py --hw cuda --suite stage-b-test-large-2-gpu-accuracy $CONTINUE_ON_ERROR_FLAG
 
-  unit-test-backend-8-gpu-h200:
-    needs: [check-changes, call-gate, stage-b-test-small-1-gpu, stage-b-test-large-1-gpu, stage-b-test-large-2-gpu, stage-b-test-4-gpu-b200]
+  stage-b-test-4-gpu-b200:
+    needs: [check-changes, call-gate, stage-a-test-1, sgl-kernel-build-wheels]
     if: |
       always() &&
       (
-        (inputs.target_stage == 'unit-test-backend-8-gpu-h200') ||
+        (inputs.target_stage == 'stage-b-test-4-gpu-b200') ||
         (
           !inputs.target_stage &&
           (github.event_name == 'schedule' || (!failure() && !cancelled())) &&
           ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
         )
       )
-    runs-on: 8-gpu-h200
+    runs-on: ${{ needs.check-changes.outputs.b200_runner }}
+    timeout-minutes: 60
     env:
-      RUNNER_LABELS: 8-gpu-h200
+      RUNNER_LABELS: ${{ needs.check-changes.outputs.b200_runner }}
     strategy:
       fail-fast: false
-      matrix:
-        part: [0, 1, 2, 3]
+
     steps:
       - name: Checkout code
         uses: actions/checkout@v4
@@ -1090,7 +1014,7 @@ jobs:
 
       - name: Download artifacts
         if: needs.check-changes.outputs.sgl_kernel == 'true'
-        uses: actions/download-artifact@v4
+        uses: actions/download-artifact@v6
         with:
           path: sgl-kernel/dist/
           merge-multiple: true
@@ -1099,48 +1023,34 @@ jobs:
       - name: Install dependencies
         timeout-minutes: 10
         run: |
-          CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_dependency.sh
-
-      # - name: Warmup Weights and JIT Compilation
-      #   timeout-minutes: 20
-      #   run: |
-      #     # An example command for testing the warmup. TODO: make this more general and move them to python scripts.
-      #     python3 -m sglang.compile_deep_gemm --model deepseek-ai/DeepSeek-V3-0324 --tp 8 --trust-remote-code
+          CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} IS_BLACKWELL=1 bash scripts/ci/ci_install_dependency.sh
 
       - name: Run test
-        timeout-minutes: 20
+        timeout-minutes: 30
         run: |
-          cd test/srt
-          RETRY_FLAG=""
-          if [[ "${{ needs.check-changes.outputs.enable_retry }}" == "true" ]]; then
-            RETRY_FLAG="--enable-retry"
-          fi
+          cd test
           CONTINUE_ON_ERROR_FLAG=""
           if [[ "${{ needs.check-changes.outputs.continue_on_error }}" == "true" ]]; then
             CONTINUE_ON_ERROR_FLAG="--continue-on-error"
           fi
-          python3 run_suite.py --suite per-commit-8-gpu-h200 --auto-partition-id ${{ matrix.part }} --auto-partition-size 4 $RETRY_FLAG $CONTINUE_ON_ERROR_FLAG
+          IS_BLACKWELL=1 python3 run_suite.py --hw cuda --suite stage-b-test-4-gpu-b200 $CONTINUE_ON_ERROR_FLAG
 
-  unit-test-backend-8-gpu-h20:
-    needs: [check-changes, call-gate, stage-b-test-small-1-gpu, stage-b-test-large-1-gpu, stage-b-test-large-2-gpu, stage-b-test-4-gpu-b200]
+  stage-c-test-large-4-gpu:
+    needs: [check-changes, call-gate, stage-b-test-small-1-gpu, stage-b-test-large-1-gpu, stage-b-test-large-2-gpu, sgl-kernel-build-wheels]
     if: |
       always() &&
       (
-        (inputs.target_stage == 'unit-test-backend-8-gpu-h20') ||
+        (inputs.target_stage == 'stage-c-test-large-4-gpu') ||
         (
           !inputs.target_stage &&
           (github.event_name == 'schedule' || (!failure() && !cancelled())) &&
           ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
         )
       )
-    runs-on: 8-gpu-h20
+    runs-on: 4-gpu-h100
+    timeout-minutes: 60
     env:
-      SGLANG_CI_RDMA_ALL_DEVICES: "mlx5_1,mlx5_2,mlx5_3,mlx5_4"
-      RUNNER_LABELS: 8-gpu-h20
-    strategy:
-      fail-fast: false
-      matrix:
-        part: [0, 1]
+      RUNNER_LABELS: 4-gpu-h100
     steps:
       - name: Checkout code
         uses: actions/checkout@v4
@@ -1158,37 +1068,34 @@ jobs:
       - name: Install dependencies
         timeout-minutes: 10
         run: |
-          CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_deepep.sh
+          CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_dependency.sh
 
       - name: Run test
-        timeout-minutes: 20
+        timeout-minutes: 30
         run: |
-          cd test/srt
-          RETRY_FLAG=""
-          if [[ "${{ needs.check-changes.outputs.enable_retry }}" == "true" ]]; then
-            RETRY_FLAG="--enable-retry"
-          fi
+          cd test/
           CONTINUE_ON_ERROR_FLAG=""
           if [[ "${{ needs.check-changes.outputs.continue_on_error }}" == "true" ]]; then
             CONTINUE_ON_ERROR_FLAG="--continue-on-error"
           fi
-          python3 run_suite.py --suite per-commit-8-gpu-h20 --auto-partition-id ${{ matrix.part }} --auto-partition-size 2 $RETRY_FLAG $CONTINUE_ON_ERROR_FLAG
+          python3 run_suite.py --hw cuda --suite stage-c-test-large-4-gpu $CONTINUE_ON_ERROR_FLAG
 
-  performance-test-1-gpu-part-1:
-    needs: [check-changes, call-gate, stage-a-test-1]
+  stage-c-test-large-4-gpu-b200:
+    needs: [check-changes, call-gate, stage-b-test-small-1-gpu, stage-b-test-large-1-gpu, stage-b-test-large-2-gpu, stage-b-test-4-gpu-b200, sgl-kernel-build-wheels]
     if: |
       always() &&
       (
-        (inputs.target_stage == 'performance-test-1-gpu-part-1') ||
+        (inputs.target_stage == 'stage-c-test-large-4-gpu-b200') ||
         (
           !inputs.target_stage &&
           (github.event_name == 'schedule' || (!failure() && !cancelled())) &&
           ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
         )
       )
-    runs-on: 1-gpu-runner
+    runs-on: ${{ needs.check-changes.outputs.b200_runner }}
+    timeout-minutes: 60
     env:
-      RUNNER_LABELS: 1-gpu-runner
+      RUNNER_LABELS: ${{ needs.check-changes.outputs.b200_runner }}
     steps:
       - name: Checkout code
         uses: actions/checkout@v4
@@ -1197,7 +1104,7 @@ jobs:
 
       - name: Download artifacts
         if: needs.check-changes.outputs.sgl_kernel == 'true'
-        uses: actions/download-artifact@v4
+        uses: actions/download-artifact@v6
         with:
           path: sgl-kernel/dist/
           merge-multiple: true
@@ -1206,61 +1113,32 @@ jobs:
       - name: Install dependencies
         timeout-minutes: 10
         run: |
-          CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_dependency.sh
-
-      - name: Benchmark single latency
-        timeout-minutes: 10
-        run: |
-          cd test/srt
-          python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_bs1_small
-          python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_bs1_default
-
-      - name: Benchmark online latency
-        timeout-minutes: 10
-        run: |
-          cd test/srt
-          python3 -m unittest test_bench_serving.TestBenchServing.test_online_latency_default
-
-      - name: Benchmark offline throughput
-        timeout-minutes: 10
-        run: |
-          cd test/srt
-          python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_default
-
-      - name: Benchmark offline throughput (Non-streaming, small batch size)
-        timeout-minutes: 10
-        run: |
-          cd test/srt
-          python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_non_stream_small_batch_size
-
-      - name: Benchmark online latency (EAGLE)
-        timeout-minutes: 10
-        run: |
-          cd test/srt
-          python3 -m unittest test_bench_serving.TestBenchServing.test_online_latency_eagle
+          CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} IS_BLACKWELL=1 bash scripts/ci/ci_install_dependency.sh
 
-      - name: Benchmark online latency (LoRA)
-        timeout-minutes: 10
+      - name: Run test
+        timeout-minutes: 30
         run: |
-          cd test/srt
-          python3 -m unittest test_bench_serving.TestBenchServing.test_lora_online_latency
-          python3 -m unittest test_bench_serving.TestBenchServing.test_lora_online_latency_with_concurrent_adapter_updates
+          cd test/
+          IS_BLACKWELL=1 python3 run_suite.py --hw cuda --suite stage-c-test-large-4-gpu-b200
 
-  performance-test-1-gpu-part-2:
-    needs: [check-changes, call-gate, stage-a-test-1]
+  multimodal-gen-test-1-gpu:
+    needs: [check-changes, call-gate, sgl-kernel-build-wheels]
     if: |
       always() &&
       (
-        (inputs.target_stage == 'performance-test-1-gpu-part-2') ||
+        (inputs.target_stage == 'multimodal-gen-test-1-gpu') ||
         (
           !inputs.target_stage &&
           (github.event_name == 'schedule' || (!failure() && !cancelled())) &&
-          ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
+          needs.check-changes.outputs.multimodal_gen == 'true'
         )
       )
     runs-on: 1-gpu-runner
-    env:
-      RUNNER_LABELS: 1-gpu-runner
+    timeout-minutes: 60
+    strategy:
+      fail-fast: false
+      matrix:
+        part: [0, 1]
     steps:
       - name: Checkout code
         uses: actions/checkout@v4
@@ -1278,53 +1156,40 @@ jobs:
       - name: Install dependencies
         timeout-minutes: 10
         run: |
-          CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_dependency.sh
-
-      - name: Benchmark offline throughput (w/o RadixAttention)
-        timeout-minutes: 10
-        run: |
-          cd test/srt
-          python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_without_radix_cache
-
-      - name: Benchmark offline throughput (w/ Triton)
-        timeout-minutes: 10
-        run: |
-          cd test/srt
-          python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_with_triton_attention_backend
-
-      - name: Benchmark offline throughput (w/ FP8)
-        timeout-minutes: 10
-        run: |
-          cd test/srt
-          python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_default_fp8
-
-      - name: Benchmark VLM offline throughput
-        timeout-minutes: 10
+          CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_dependency.sh diffusion
+      - name: Run diffusion server tests
+        timeout-minutes: 60
         run: |
-          cd test/srt
-          python3 -m unittest test_bench_serving.TestBenchServing.test_vlm_offline_throughput
+          cd python
+          CONTINUE_ON_ERROR_FLAG=""
+          if [[ "${{ needs.check-changes.outputs.continue_on_error }}" == "true" ]]; then
+            CONTINUE_ON_ERROR_FLAG="--continue-on-error"
+          fi
+          python3 sglang/multimodal_gen/test/run_suite.py \
+            --suite 1-gpu \
+            --partition-id ${{ matrix.part }} \
+            --total-partitions 2 \
+            $CONTINUE_ON_ERROR_FLAG
 
-      - name: Benchmark VLM online latency
-        timeout-minutes: 10
-        run: |
-          cd test/srt
-          python3 -m unittest test_bench_serving.TestBenchServing.test_vlm_online_latency
 
-  performance-test-1-gpu-part-3:
-    needs: [check-changes, call-gate, stage-a-test-1]
+  multimodal-gen-test-2-gpu:
+    needs: [check-changes, call-gate, sgl-kernel-build-wheels]
     if: |
       always() &&
       (
-        (inputs.target_stage == 'performance-test-1-gpu-part-3') ||
+        (inputs.target_stage == 'multimodal-gen-test-2-gpu') ||
         (
           !inputs.target_stage &&
           (github.event_name == 'schedule' || (!failure() && !cancelled())) &&
-          ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
+          needs.check-changes.outputs.multimodal_gen == 'true'
         )
       )
-    runs-on: 1-gpu-runner
-    env:
-      RUNNER_LABELS: 1-gpu-runner
+    runs-on: 2-gpu-runner
+    timeout-minutes: 60
+    strategy:
+      fail-fast: false
+      matrix:
+        part: [0, 1]
     steps:
       - name: Checkout code
         uses: actions/checkout@v4
@@ -1342,47 +1207,42 @@ jobs:
       - name: Install dependencies
         timeout-minutes: 10
         run: |
-          CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_dependency.sh
-
-      - name: Benchmark Scores online latency and throughput
-        timeout-minutes: 10
-        run: |
-          cd test/srt
-          python3 -m unittest test_bench_serving.TestBenchServing.test_score_api_latency_throughput
-
-      - name: Benchmark Scores online latency and throughput (batch size scaling)
-        timeout-minutes: 10
-        run: |
-          cd test/srt
-          python3 -m unittest test_bench_serving.TestBenchServing.test_score_api_batch_scaling
-
-      - name: Benchmark Embeddings online latency and throughput
-        timeout-minutes: 10
-        run: |
-          cd test/srt
-          python3 -m unittest test_bench_serving.TestBenchServing.test_embeddings_api_latency_throughput
+          CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_dependency.sh diffusion
 
-      - name: Benchmark Embeddings online latency and throughput (batch size scaling)
-        timeout-minutes: 10
+      - name: Run diffusion server tests
+        timeout-minutes: 60
         run: |
-          cd test/srt
-          python3 -m unittest test_bench_serving.TestBenchServing.test_embeddings_api_batch_scaling
+          cd python
+          CONTINUE_ON_ERROR_FLAG=""
+          if [[ "${{ needs.check-changes.outputs.continue_on_error }}" == "true" ]]; then
+            CONTINUE_ON_ERROR_FLAG="--continue-on-error"
+          fi
+          python3 sglang/multimodal_gen/test/run_suite.py \
+            --suite 2-gpu \
+            --partition-id ${{ matrix.part }} \
+            --total-partitions 2 \
+            $CONTINUE_ON_ERROR_FLAG
 
-  performance-test-2-gpu:
-    needs: [check-changes, call-gate, stage-b-test-4-gpu-b200]
+  unit-test-backend-4-gpu:
+    needs: [check-changes, call-gate, stage-b-test-small-1-gpu, stage-b-test-large-1-gpu, stage-b-test-large-2-gpu, stage-b-test-4-gpu-b200]
     if: |
       always() &&
       (
-        (inputs.target_stage == 'performance-test-2-gpu') ||
+        (inputs.target_stage == 'unit-test-backend-4-gpu') ||
         (
           !inputs.target_stage &&
           (github.event_name == 'schedule' || (!failure() && !cancelled())) &&
           ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
         )
       )
-    runs-on: 2-gpu-runner
+    runs-on: 4-gpu-h100
+    timeout-minutes: 60
     env:
-      RUNNER_LABELS: 2-gpu-runner
+      RUNNER_LABELS: 4-gpu-h100
+    strategy:
+      fail-fast: false
+      matrix:
+        part: [0, 1, 2]
     steps:
       - name: Checkout code
         uses: actions/checkout@v4
@@ -1402,57 +1262,40 @@ jobs:
         run: |
           CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_dependency.sh
 
-      - name: Benchmark single latency (TP=2)
-        timeout-minutes: 10
-        run: |
-          cd test/srt
-          python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_moe_tp2_bs1
-
-      - name: Benchmark single latency + torch.compile (TP=2)
-        timeout-minutes: 10
-        run: |
-          cd test/srt
-          python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_torch_compile_tp2_bs1
-
-      - name: Benchmark offline throughput (TP=2)
-        timeout-minutes: 10
-        run: |
-          cd test/srt
-          python3 -m unittest test_bench_serving.TestBenchServing.test_moe_offline_throughput_default
-
-      - name: Benchmark offline throughput (w/o RadixAttention) (TP=2)
-        timeout-minutes: 10
-        run: |
-          cd test/srt
-          python3 -m unittest test_bench_serving.TestBenchServing.test_moe_offline_throughput_without_radix_cache
-
-      - name: Benchmark offline PP decode throughput (PP=2)
-        timeout-minutes: 10
-        run: |
-          cd test/srt
-          python3 -m unittest test_bench_serving.TestBenchServing.test_pp_offline_throughput_default_decode
-
-      - name: Benchmark offline PP prefill throughput (PP=2)
-        timeout-minutes: 10
+      - name: Run test
+        timeout-minutes: 20
         run: |
           cd test/srt
-          python3 -m unittest test_bench_serving.TestBenchServing.test_pp_long_context_prefill
+          RETRY_FLAG=""
+          if [[ "${{ needs.check-changes.outputs.enable_retry }}" == "true" ]]; then
+            RETRY_FLAG="--enable-retry"
+          fi
+          CONTINUE_ON_ERROR_FLAG=""
+          if [[ "${{ needs.check-changes.outputs.continue_on_error }}" == "true" ]]; then
+            CONTINUE_ON_ERROR_FLAG="--continue-on-error"
+          fi
+          python3 run_suite.py --suite per-commit-4-gpu --auto-partition-id ${{ matrix.part }} --auto-partition-size 3 $RETRY_FLAG $CONTINUE_ON_ERROR_FLAG
 
-  accuracy-test-1-gpu:
-    needs: [check-changes, call-gate, stage-a-test-1]
+  unit-test-backend-8-gpu-h200:
+    needs: [check-changes, call-gate, stage-b-test-small-1-gpu, stage-b-test-large-1-gpu, stage-b-test-large-2-gpu, stage-b-test-4-gpu-b200]
     if: |
       always() &&
       (
-        (inputs.target_stage == 'accuracy-test-1-gpu') ||
+        (inputs.target_stage == 'unit-test-backend-8-gpu-h200') ||
         (
           !inputs.target_stage &&
           (github.event_name == 'schedule' || (!failure() && !cancelled())) &&
           ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
         )
       )
-    runs-on: 1-gpu-runner
+    runs-on: 8-gpu-h200
+    timeout-minutes: 60
     env:
-      RUNNER_LABELS: 1-gpu-runner
+      RUNNER_LABELS: 8-gpu-h200
+    strategy:
+      fail-fast: false
+      matrix:
+        part: [0, 1, 2, 3]
     steps:
       - name: Checkout code
         uses: actions/checkout@v4
@@ -1471,31 +1314,48 @@ jobs:
         timeout-minutes: 10
         run: |
           CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_dependency.sh
-          git clone https://github.com/merrymercy/human-eval.git
-          cd human-eval
-          pip install -e .
 
-      - name: Evaluate accuracy
-        timeout-minutes: 25
+      # - name: Warmup Weights and JIT Compilation
+      #   timeout-minutes: 20
+      #   run: |
+      #     # An example command for testing the warmup. TODO: make this more general and move them to python scripts.
+      #     python3 -m sglang.compile_deep_gemm --model deepseek-ai/DeepSeek-V3-0324 --tp 8 --trust-remote-code
+
+      - name: Run test
+        timeout-minutes: 20
         run: |
           cd test/srt
-          python3 -m sglang.test.ci.run_with_retry test_eval_accuracy_large.py
+          RETRY_FLAG=""
+          if [[ "${{ needs.check-changes.outputs.enable_retry }}" == "true" ]]; then
+            RETRY_FLAG="--enable-retry"
+          fi
+          CONTINUE_ON_ERROR_FLAG=""
+          if [[ "${{ needs.check-changes.outputs.continue_on_error }}" == "true" ]]; then
+            CONTINUE_ON_ERROR_FLAG="--continue-on-error"
+          fi
+          python3 run_suite.py --suite per-commit-8-gpu-h200 --auto-partition-id ${{ matrix.part }} --auto-partition-size 4 $RETRY_FLAG $CONTINUE_ON_ERROR_FLAG
 
-  accuracy-test-2-gpu:
-    needs: [check-changes, call-gate, accuracy-test-1-gpu]
+  unit-test-backend-8-gpu-h20:
+    needs: [check-changes, call-gate, stage-b-test-small-1-gpu, stage-b-test-large-1-gpu, stage-b-test-large-2-gpu, stage-b-test-4-gpu-b200]
     if: |
       always() &&
       (
-        (inputs.target_stage == 'accuracy-test-2-gpu') ||
+        (inputs.target_stage == 'unit-test-backend-8-gpu-h20') ||
         (
           !inputs.target_stage &&
           (github.event_name == 'schedule' || (!failure() && !cancelled())) &&
           ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
         )
       )
-    runs-on: 2-gpu-runner
+    runs-on: 8-gpu-h20
+    timeout-minutes: 60
     env:
-      RUNNER_LABELS: 2-gpu-runner
+      SGLANG_CI_RDMA_ALL_DEVICES: "mlx5_1,mlx5_2,mlx5_3,mlx5_4"
+      RUNNER_LABELS: 8-gpu-h20
+    strategy:
+      fail-fast: false
+      matrix:
+        part: [0, 1]
     steps:
       - name: Checkout code
         uses: actions/checkout@v4
@@ -1513,16 +1373,21 @@ jobs:
       - name: Install dependencies
         timeout-minutes: 10
         run: |
-          CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_dependency.sh
-          git clone https://github.com/merrymercy/human-eval.git
-          cd human-eval
-          pip install -e .
+          CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_deepep.sh
 
-      - name: Evaluate accuracy (TP=2)
-        timeout-minutes: 25
+      - name: Run test
+        timeout-minutes: 20
         run: |
           cd test/srt
-          python3 -m sglang.test.ci.run_with_retry test_moe_eval_accuracy_large.py
+          RETRY_FLAG=""
+          if [[ "${{ needs.check-changes.outputs.enable_retry }}" == "true" ]]; then
+            RETRY_FLAG="--enable-retry"
+          fi
+          CONTINUE_ON_ERROR_FLAG=""
+          if [[ "${{ needs.check-changes.outputs.continue_on_error }}" == "true" ]]; then
+            CONTINUE_ON_ERROR_FLAG="--continue-on-error"
+          fi
+          python3 run_suite.py --suite per-commit-8-gpu-h20 --auto-partition-id ${{ matrix.part }} --auto-partition-size 2 $RETRY_FLAG $CONTINUE_ON_ERROR_FLAG
 
   unit-test-deepep-4-gpu:
     needs: [check-changes, call-gate, stage-b-test-small-1-gpu, stage-b-test-large-1-gpu, stage-b-test-large-2-gpu, stage-b-test-4-gpu-b200]
@@ -1537,6 +1402,7 @@ jobs:
         )
       )
     runs-on: 4-gpu-h100
+    timeout-minutes: 60
     env:
       RUNNER_LABELS: 4-gpu-h100
     steps:
@@ -1634,6 +1500,7 @@ jobs:
         )
       )
     runs-on: ${{ needs.check-changes.outputs.b200_runner }}
+    timeout-minutes: 60
     env:
       RUNNER_LABELS: ${{ needs.check-changes.outputs.b200_runner }}
     strategy:
@@ -1687,6 +1554,7 @@ jobs:
         )
       )
     runs-on: 4-gpu-gb200
+    timeout-minutes: 60
     env:
       RUNNER_LABELS: 4-gpu-gb200
     strategy:
@@ -1746,18 +1614,16 @@ jobs:
         stage-b-test-small-1-gpu,
         stage-b-test-large-1-gpu,
         stage-b-test-large-2-gpu,
+        stage-b-test-small-1-gpu-performance,
+        stage-b-test-large-1-gpu-performance,
+        stage-b-test-large-2-gpu-performance,
+        stage-b-test-small-1-gpu-accuracy,
+        stage-b-test-large-2-gpu-accuracy,
         stage-c-test-large-4-gpu,
-        quantization-test,
         stage-b-test-4-gpu-b200,
         unit-test-backend-4-gpu,
         unit-test-backend-8-gpu-h20,
         unit-test-backend-8-gpu-h200,
-        performance-test-1-gpu-part-1,
-        performance-test-1-gpu-part-2,
-        performance-test-1-gpu-part-3,
-        performance-test-2-gpu,
-        accuracy-test-1-gpu,
-        accuracy-test-2-gpu,
         unit-test-deepep-4-gpu,
         # unit-test-deepep-8-gpu,  # Disabled, see #17175
         unit-test-backend-4-gpu-b200,
diff --git a/test/srt/test_eval_accuracy_large.py b/test/registered/eval/test_eval_accuracy_large.py
similarity index 94%
rename from test/srt/test_eval_accuracy_large.py
rename to test/registered/eval/test_eval_accuracy_large.py
index efb202463e19..4de8f61fa138 100644
--- a/test/srt/test_eval_accuracy_large.py
+++ b/test/registered/eval/test_eval_accuracy_large.py
@@ -7,6 +7,7 @@
 from types import SimpleNamespace
 
 from sglang.srt.utils import kill_process_tree
+from sglang.test.ci.ci_register import register_cuda_ci
 from sglang.test.run_eval import run_eval
 from sglang.test.test_utils import (
     DEFAULT_MODEL_NAME_FOR_TEST,
@@ -18,6 +19,8 @@
     write_github_step_summary,
 )
 
+register_cuda_ci(est_time=300, suite="stage-b-test-small-1-gpu-accuracy")
+
 
 class TestEvalAccuracyLarge(CustomTestCase):
     @classmethod
diff --git a/test/srt/test_moe_eval_accuracy_large.py b/test/registered/eval/test_moe_eval_accuracy_large.py
similarity index 94%
rename from test/srt/test_moe_eval_accuracy_large.py
rename to test/registered/eval/test_moe_eval_accuracy_large.py
index 26bbd247e1dd..76d1bf185526 100644
--- a/test/srt/test_moe_eval_accuracy_large.py
+++ b/test/registered/eval/test_moe_eval_accuracy_large.py
@@ -7,6 +7,7 @@
 from types import SimpleNamespace
 
 from sglang.srt.utils import kill_process_tree
+from sglang.test.ci.ci_register import register_cuda_ci
 from sglang.test.run_eval import run_eval
 from sglang.test.test_utils import (
     DEFAULT_MOE_MODEL_NAME_FOR_TEST,
@@ -18,6 +19,8 @@
     write_github_step_summary,
 )
 
+register_cuda_ci(est_time=500, suite="stage-b-test-large-2-gpu-accuracy")
+
 
 class TestMoEEvalAccuracyLarge(CustomTestCase):
     @classmethod
diff --git a/test/registered/perf/test_bench_one_batch_1gpu.py b/test/registered/perf/test_bench_one_batch_1gpu.py
new file mode 100644
index 000000000000..fbcc2ec05aec
--- /dev/null
+++ b/test/registered/perf/test_bench_one_batch_1gpu.py
@@ -0,0 +1,39 @@
+import unittest
+
+from sglang.test.ci.ci_register import register_cuda_ci
+from sglang.test.test_utils import (
+    DEFAULT_MODEL_NAME_FOR_TEST,
+    DEFAULT_SMALL_MODEL_NAME_FOR_TEST,
+    CustomTestCase,
+    is_in_ci,
+    run_bench_offline_throughput,
+    run_bench_one_batch,
+    write_github_step_summary,
+)
+
+register_cuda_ci(est_time=120, suite="stage-b-test-large-1-gpu-performance")
+
+
+class TestBenchOneBatch1GPU(CustomTestCase):
+
+    def test_bs1_small(self):
+        _, output_throughput, _ = run_bench_one_batch(
+            DEFAULT_SMALL_MODEL_NAME_FOR_TEST, ["--cuda-graph-max-bs", "2"]
+        )
+        self.assertGreater(output_throughput, 50)
+
+    def test_bs1_default(self):
+        output_throughput = run_bench_offline_throughput(
+            DEFAULT_MODEL_NAME_FOR_TEST, ["--cuda-graph-max-bs", "2"]
+        )
+
+        if is_in_ci():
+            write_github_step_summary(
+                f"### test_bs1_default (llama-3.1-8b)\n"
+                f"output_throughput: {output_throughput:.2f} token/s\n"
+            )
+            self.assertGreater(output_throughput, 135)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/test_bench_one_batch.py b/test/registered/perf/test_bench_one_batch_2gpu.py
similarity index 61%
rename from test/srt/test_bench_one_batch.py
rename to test/registered/perf/test_bench_one_batch_2gpu.py
index 8d14cd0b7098..9af873bea9a0 100644
--- a/test/srt/test_bench_one_batch.py
+++ b/test/registered/perf/test_bench_one_batch_2gpu.py
@@ -1,40 +1,20 @@
 import unittest
 
+from sglang.test.ci.ci_register import register_cuda_ci
 from sglang.test.test_utils import (
     DEFAULT_MODEL_NAME_FOR_TEST,
     DEFAULT_MOE_MODEL_NAME_FOR_TEST,
-    DEFAULT_SMALL_MODEL_NAME_FOR_TEST,
     CustomTestCase,
     is_in_amd_ci,
     is_in_ci,
     run_bench_offline_throughput,
-    run_bench_one_batch,
     write_github_step_summary,
 )
 
-# We use `run_bench_offline_throughput`` instead of `run_bench_one_batch` for most cases
-# because `run_bench_offline_throughput`` has overlap scheduler.
+register_cuda_ci(est_time=180, suite="stage-b-test-large-2-gpu-performance")
 
 
-class TestBenchOneBatch(CustomTestCase):
-
-    def test_bs1_small(self):
-        _, output_throughput, _ = run_bench_one_batch(
-            DEFAULT_SMALL_MODEL_NAME_FOR_TEST, ["--cuda-graph-max-bs", "2"]
-        )
-        self.assertGreater(output_throughput, 50)
-
-    def test_bs1_default(self):
-        output_throughput = run_bench_offline_throughput(
-            DEFAULT_MODEL_NAME_FOR_TEST, ["--cuda-graph-max-bs", "2"]
-        )
-
-        if is_in_ci():
-            write_github_step_summary(
-                f"### test_bs1_default (llama-3.1-8b)\n"
-                f"output_throughput: {output_throughput:.2f} token/s\n"
-            )
-            self.assertGreater(output_throughput, 135)
+class TestBenchOneBatch2GPU(CustomTestCase):
 
     def test_moe_tp2_bs1(self):
         output_throughput = run_bench_offline_throughput(
diff --git a/test/registered/perf/test_bench_serving_1gpu_large.py b/test/registered/perf/test_bench_serving_1gpu_large.py
new file mode 100644
index 000000000000..68860a015b7e
--- /dev/null
+++ b/test/registered/perf/test_bench_serving_1gpu_large.py
@@ -0,0 +1,81 @@
+"""
+Performance tests for single GPU that need 80GB+ memory (e.g. H200) - FP8 and EAGLE tests.
+"""
+
+import unittest
+
+from sglang.test.ci.ci_register import register_cuda_ci
+from sglang.test.test_utils import (
+    DEFAULT_DRAFT_MODEL_EAGLE,
+    DEFAULT_MODEL_NAME_FOR_TEST_FP8,
+    DEFAULT_TARGET_MODEL_EAGLE,
+    CustomTestCase,
+    is_in_amd_ci,
+    is_in_ci,
+    run_bench_serving,
+    write_github_step_summary,
+)
+
+register_cuda_ci(est_time=300, suite="stage-b-test-large-1-gpu-performance")
+
+
+class TestBenchServing1GPULarge(CustomTestCase):
+    def test_offline_throughput_default_fp8(self):
+        res = run_bench_serving(
+            model=DEFAULT_MODEL_NAME_FOR_TEST_FP8,
+            num_prompts=500,
+            request_rate=float("inf"),
+            other_server_args=[],
+        )
+
+        if is_in_ci():
+            write_github_step_summary(
+                f"### test_offline_throughput_default_fp8\n"
+                f"Output throughput: {res['output_throughput']:.2f} token/s\n"
+            )
+            if is_in_amd_ci():
+                self.assertGreater(res["output_throughput"], 3500)
+            else:
+                self.assertGreater(res["output_throughput"], 4300)
+
+    def test_online_latency_eagle(self):
+        res = run_bench_serving(
+            model=DEFAULT_TARGET_MODEL_EAGLE,
+            num_prompts=300,
+            request_rate=8,
+            sharegpt_context_len=3072,
+            disable_ignore_eos=True,
+            dataset_name="sharegpt",
+            other_server_args=[
+                "--speculative-algorithm",
+                "EAGLE",
+                "--speculative-draft-model-path",
+                DEFAULT_DRAFT_MODEL_EAGLE,
+                "--speculative-num-steps",
+                "5",
+                "--speculative-eagle-topk",
+                "4",
+                "--speculative-num-draft-tokens",
+                "16",
+                "--mem-fraction-static",
+                "0.7",
+            ],
+            need_warmup=True,
+            seed=42,
+        )
+
+        if is_in_ci():
+            write_github_step_summary(
+                f"### test_online_latency_eagle\n"
+                f"median_e2e_latency_ms: {res['median_e2e_latency_ms']:.2f} ms\n"
+                f"accept_length: {res['accept_length']:.2f} \n"
+            )
+            if is_in_amd_ci():
+                self.assertLess(res["median_e2e_latency_ms"], 1800)
+            else:
+                self.assertLess(res["median_e2e_latency_ms"], 900)
+            self.assertGreater(res["accept_length"], 3.0)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/registered/perf/test_bench_serving_1gpu_part1.py b/test/registered/perf/test_bench_serving_1gpu_part1.py
new file mode 100644
index 000000000000..76c7e7e6b83d
--- /dev/null
+++ b/test/registered/perf/test_bench_serving_1gpu_part1.py
@@ -0,0 +1,258 @@
+"""
+Performance tests for single GPU - LLM throughput/latency and LoRA tests.
+Works on 5090 (32GB).
+"""
+
+import asyncio
+import itertools
+import unittest
+
+import requests
+
+from sglang.test.ci.ci_register import register_cuda_ci
+from sglang.test.test_utils import (
+    DEFAULT_MODEL_NAME_FOR_TEST,
+    CustomTestCase,
+    is_in_amd_ci,
+    is_in_ci,
+    run_bench_serving,
+    write_github_step_summary,
+)
+
+register_cuda_ci(est_time=1000, suite="stage-b-test-large-1-gpu-performance")
+
+
+class TestBenchServing1GPUPart1(CustomTestCase):
+    def test_offline_throughput_default(self):
+        res = run_bench_serving(
+            model=DEFAULT_MODEL_NAME_FOR_TEST,
+            num_prompts=500,
+            request_rate=float("inf"),
+            other_server_args=[],
+        )
+
+        if is_in_ci():
+            write_github_step_summary(
+                f"### test_offline_throughput_default\n"
+                f"Output throughput: {res['output_throughput']:.2f} token/s\n"
+            )
+            if is_in_amd_ci():
+                self.assertGreater(res["output_throughput"], 3050)
+            else:
+                self.assertGreater(res["output_throughput"], 3800)
+
+    def test_offline_throughput_non_stream_small_batch_size(self):
+        res = run_bench_serving(
+            model=DEFAULT_MODEL_NAME_FOR_TEST,
+            num_prompts=200,
+            request_rate=float("inf"),
+            other_server_args=["--max-running-requests", "10"],
+            dataset_name="sharegpt",
+            random_input_len=None,
+            random_output_len=None,
+            disable_stream=True,
+            need_warmup=True,
+        )
+
+        if is_in_ci():
+            write_github_step_summary(
+                f"### test_offline_throughput_non_stream_small_batch_size\n"
+                f"Output throughput: {res['output_throughput']:.2f} token/s\n"
+            )
+            if is_in_amd_ci():
+                self.assertGreater(res["output_throughput"], 1000)
+            else:
+                self.assertGreater(res["output_throughput"], 1050)
+
+    def test_offline_throughput_without_radix_cache(self):
+        res = run_bench_serving(
+            model=DEFAULT_MODEL_NAME_FOR_TEST,
+            num_prompts=500,
+            request_rate=float("inf"),
+            other_server_args=["--disable-radix-cache"],
+        )
+
+        if is_in_ci():
+            write_github_step_summary(
+                f"### test_offline_throughput_without_radix_cache\n"
+                f"Output throughput: {res['output_throughput']:.2f} token/s\n"
+            )
+            if is_in_amd_ci():
+                self.assertGreater(res["output_throughput"], 3050)
+            else:
+                self.assertGreater(res["output_throughput"], 3800)
+
+    def test_offline_throughput_without_chunked_prefill(self):
+        res = run_bench_serving(
+            model=DEFAULT_MODEL_NAME_FOR_TEST,
+            num_prompts=500,
+            request_rate=float("inf"),
+            other_server_args=["--chunked-prefill-size", "-1"],
+        )
+
+        if is_in_ci():
+            write_github_step_summary(
+                f"### test_offline_throughput_without_chunked_prefill\n"
+                f"Output throughput: {res['output_throughput']:.2f} token/s\n"
+            )
+            self.assertGreater(res["output_throughput"], 2600)
+
+    def test_offline_throughput_with_triton_attention_backend(self):
+        res = run_bench_serving(
+            model=DEFAULT_MODEL_NAME_FOR_TEST,
+            num_prompts=500,
+            request_rate=float("inf"),
+            other_server_args=[
+                "--attention-backend",
+                "triton",
+                "--context-length",
+                "8192",
+            ],
+        )
+
+        if is_in_ci():
+            write_github_step_summary(
+                f"### test_offline_throughput_with_triton_attention_backend\n"
+                f"Output throughput: {res['output_throughput']:.2f} token/s\n"
+            )
+            if is_in_amd_ci():
+                self.assertGreater(res["output_throughput"], 3500)
+            else:
+                self.assertGreater(res["output_throughput"], 3700)
+
+    def test_online_latency_default(self):
+        res = run_bench_serving(
+            model=DEFAULT_MODEL_NAME_FOR_TEST,
+            num_prompts=100,
+            request_rate=1,
+            other_server_args=[],
+        )
+
+        if is_in_ci():
+            write_github_step_summary(
+                f"### test_online_latency_default\n"
+                f"median_e2e_latency_ms: {res['median_e2e_latency_ms']:.2f} ms\n"
+            )
+            self.assertLess(res["median_e2e_latency_ms"], 11000)
+            if is_in_amd_ci():
+                self.assertLess(res["median_ttft_ms"], 115)
+            else:
+                self.assertLess(res["median_ttft_ms"], 86)
+            self.assertLess(res["median_itl_ms"], 10)
+
+    def test_lora_online_latency(self):
+        if is_in_amd_ci():
+            pass  # NOTE(review): no-op; test still runs on AMD -- skip intended?
+
+        res = self._run_lora_latency_test(enable_background_task=False)
+
+        if is_in_ci():  # step summary and latency bounds are CI-only
+            write_github_step_summary(
+                f"### test_lora_online_latency\n"
+                f"median_e2e_latency_ms: {res['median_e2e_latency_ms']:.2f} ms\n"
+                f"median_ttft_ms: {res['median_ttft_ms']:.2f} ms\n"
+            )
+            self.assertLess(res["median_e2e_latency_ms"], 2400)
+            self.assertLess(res["median_ttft_ms"], 58)
+
+    def test_lora_online_latency_with_concurrent_adapter_updates(self):
+        if is_in_amd_ci():
+            pass  # NOTE(review): no-op; test still runs on AMD -- skip intended?
+
+        res = self._run_lora_latency_test(enable_background_task=True)
+
+        if is_in_ci():  # step summary and latency bounds are CI-only
+            write_github_step_summary(
+                f"### test_lora_online_latency_with_concurrent_adapter_updates\n"
+                f"median_e2e_latency_ms: {res['median_e2e_latency_ms']:.2f} ms\n"
+                f"median_ttft_ms: {res['median_ttft_ms']:.2f} ms\n"
+            )
+            self.assertLess(res["median_e2e_latency_ms"], 4000)
+            self.assertLess(res["median_ttft_ms"], 80)
+
+    def _run_lora_latency_test(self, enable_background_task: bool):
+        """
+        Benchmark LoRA serving, optionally with concurrent adapter load/unload.
+        """
+
+        async def lora_loader_unloader_task(
+            base_url: str,
+            start_event: asyncio.Event,
+            stop_event: asyncio.Event,
+        ):
+            """
+            A background task that repeatedly loads and unloads a LoRA adapter.
+            """
+            await start_event.wait()
+
+            path_cycler = itertools.cycle(
+                [
+                    "pbevan11/llama-3.1-8b-ocr-correction",
+                    "faridlazuarda/valadapt-llama-3.1-8B-it-chinese",
+                    "philschmid/code-llama-3-1-8b-text-to-sql-lora",
+                ]
+            )
+            load_url = f"{base_url}/load_lora_adapter"
+            unload_url = f"{base_url}/unload_lora_adapter"
+
+            # Alternate load/unload requests until stop_event is set.
+            while not stop_event.is_set():
+                lora_path = next(path_cycler)
+                response = await asyncio.to_thread(
+                    requests.post,
+                    load_url,
+                    json={"lora_name": lora_path, "lora_path": lora_path},
+                )
+                self.assertTrue(
+                    response.ok, f"Failed to load LoRA adapter: {response.text}"
+                )
+
+                # Stop requested during the load: exit without unloading.
+                if stop_event.is_set():
+                    break
+
+                await asyncio.sleep(1)
+
+                response = await asyncio.to_thread(
+                    requests.post,
+                    unload_url,
+                    json={"lora_name": lora_path},
+                )
+                self.assertTrue(
+                    response.ok, f"Failed to unload LoRA adapter: {response.text}"
+                )
+
+                # Pause one second before the next load/unload cycle.
+                await asyncio.sleep(1)
+
+        background_task = lora_loader_unloader_task if enable_background_task else None
+        res = run_bench_serving(
+            model=DEFAULT_MODEL_NAME_FOR_TEST,
+            num_prompts=400,
+            request_rate=8,
+            other_server_args=[
+                "--enable-lora",
+                "--max-loras-per-batch",
+                "1",
+                "--disable-radix-cache",
+                "--random-seed",
+                "42",
+                "--mem-fraction-static",
+                "0.8",
+                "--lora-paths",
+                "Nutanix/Meta-Llama-3.1-8B-Instruct_lora_4_alpha_16",
+                "--max-lora-rank",
+                "256",
+            ],
+            dataset_name="random",
+            random_input_len=256,
+            random_output_len=256,
+            lora_name=["Nutanix/Meta-Llama-3.1-8B-Instruct_lora_4_alpha_16"],
+            background_task=background_task,
+        )
+
+        return res
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/registered/perf/test_bench_serving_1gpu_part2.py b/test/registered/perf/test_bench_serving_1gpu_part2.py
new file mode 100644
index 000000000000..d33522233ca9
--- /dev/null
+++ b/test/registered/perf/test_bench_serving_1gpu_part2.py
@@ -0,0 +1,186 @@
+"""
+Performance tests for single GPU - VLM, Score API, and Embeddings API tests.
+Works on 5090 (32GB).
+"""
+
+import unittest
+
+from sglang.test.ci.ci_register import register_cuda_ci
+from sglang.test.test_utils import (
+    DEFAULT_SMALL_EMBEDDING_MODEL_NAME_FOR_TEST,
+    DEFAULT_SMALL_MODEL_NAME_FOR_TEST_SCORE,
+    DEFAULT_SMALL_VLM_MODEL_NAME_FOR_TEST,
+    CustomTestCase,
+    is_in_amd_ci,
+    is_in_ci,
+    run_bench_serving,
+    run_embeddings_benchmark,
+    run_score_benchmark,
+    write_github_step_summary,
+)
+
+register_cuda_ci(est_time=900, suite="stage-b-test-large-1-gpu-performance")
+
+
+class TestBenchServing1GPUPart2(CustomTestCase):
+    def test_vlm_offline_throughput(self):
+        """Benchmark VLM output throughput on the "mmmu" dataset."""
+        server_args = ["--mem-fraction-static", "0.7"]
+        res = run_bench_serving(
+            model=DEFAULT_SMALL_VLM_MODEL_NAME_FOR_TEST,
+            num_prompts=200,
+            request_rate=float("inf"),
+            other_server_args=server_args,
+            dataset_name="mmmu",
+        )
+        if not is_in_ci():
+            # The throughput floor is only enforced in CI.
+            return
+
+        throughput = res["output_throughput"]
+        write_github_step_summary(
+            f"### test_vlm_offline_throughput\n"
+            f"Output throughput: {throughput:.2f} token/s\n"
+        )
+        # AMD CI uses a lower floor than the other CI runners.
+        self.assertGreater(throughput, 2000 if is_in_amd_ci() else 2500)
+
+    def test_vlm_online_latency(self):
+        """Benchmark VLM median latencies on "mmmu" at request_rate=1."""
+        server_args = ["--mem-fraction-static", "0.7"]
+        res = run_bench_serving(
+            model=DEFAULT_SMALL_VLM_MODEL_NAME_FOR_TEST,
+            num_prompts=250,
+            request_rate=1,
+            other_server_args=server_args,
+            dataset_name="mmmu",
+        )
+        if not is_in_ci():
+            # The latency ceilings are only enforced in CI.
+            return
+
+        e2e_latency_ms = res["median_e2e_latency_ms"]
+        write_github_step_summary(
+            f"### test_vlm_online_latency\n"
+            f"median_e2e_latency_ms: {e2e_latency_ms:.2f} ms\n"
+        )
+        self.assertLess(e2e_latency_ms, 16500)
+        # AMD CI is allowed a higher median TTFT than the other CI runners.
+        self.assertLess(res["median_ttft_ms"], 150 if is_in_amd_ci() else 100)
+        self.assertLess(res["median_itl_ms"], 8)
+
+    def test_score_api_latency_throughput(self):
+        """Test score API latency and throughput performance"""
+        res = run_score_benchmark(
+            model=DEFAULT_SMALL_MODEL_NAME_FOR_TEST_SCORE,
+            num_requests=1000,
+            batch_size=10,
+            other_server_args=[],
+            need_warmup=True,
+        )
+
+        if is_in_ci():
+            write_github_step_summary(
+                f"### test_score_api_latency_throughput\n"
+                f"Average latency: {res['avg_latency_ms']:.2f} ms\n"
+                f"P95 latency: {res['p95_latency_ms']:.2f} ms\n"
+                f"Score API throughput: {res['throughput']:.2f} req/s\n"
+                f"Successful requests: {res['successful_requests']}/{res['total_requests']}\n"
+            )
+        # Only the summary is CI-only; the checks below are not gated on is_in_ci().
+        self.assertEqual(res["successful_requests"], res["total_requests"])
+        self.assertLess(res["avg_latency_ms"], 48)
+        self.assertLess(res["p95_latency_ms"], 50)
+        self.assertGreater(res["throughput"], 20)
+
+    def test_score_api_batch_scaling(self):
+        """Test score API performance with different batch sizes"""
+        # batch size -> (avg latency bound, p95 latency bound), both in ms
+        bounds = {
+            10: (45, 50),
+            25: (50, 60),
+            50: (60, 65),
+        }
+
+        for batch_size, (avg_latency_bound, p95_latency_bound) in bounds.items():
+            res = run_score_benchmark(
+                model=DEFAULT_SMALL_MODEL_NAME_FOR_TEST_SCORE,
+                num_requests=500,
+                batch_size=batch_size,
+            )
+
+            if is_in_ci():
+                write_github_step_summary(
+                    f"### test_score_api_batch_scaling_size_{batch_size}\n"
+                    f"Batch size: {batch_size}\n"
+                    f"Average latency: {res['avg_latency_ms']:.2f} ms\n"
+                    f"P95 latency: {res['p95_latency_ms']:.2f} ms\n"
+                    f"Throughput: {res['throughput']:.2f} req/s\n"
+                    f"Successful requests: {res['successful_requests']}/{res['total_requests']}\n"
+                )
+
+            # Every request must succeed before the latency bounds are checked.
+            self.assertEqual(res["successful_requests"], res["total_requests"])
+            self.assertLess(res["avg_latency_ms"], avg_latency_bound)
+            self.assertLess(res["p95_latency_ms"], p95_latency_bound)
+
+    def test_embeddings_api_latency_throughput(self):
+        """Test embeddings API latency and throughput performance"""
+        res = run_embeddings_benchmark(
+            model=DEFAULT_SMALL_EMBEDDING_MODEL_NAME_FOR_TEST,
+            num_requests=1000,
+            batch_size=1,
+            input_tokens=500,
+            other_server_args=[],
+            need_warmup=True,
+        )
+
+        if is_in_ci():
+            write_github_step_summary(
+                f"### test_embeddings_api_latency_throughput\n"
+                f"Average latency: {res['avg_latency_ms']:.2f} ms\n"
+                f"P95 latency: {res['p95_latency_ms']:.2f} ms\n"
+                f"Embeddings API throughput: {res['throughput']:.2f} req/s\n"
+                f"Successful requests: {res['successful_requests']}/{res['total_requests']}\n"
+            )
+        # Only the summary is CI-only; the checks below are not gated on is_in_ci().
+        self.assertEqual(res["successful_requests"], res["total_requests"])
+        self.assertLess(res["avg_latency_ms"], 20)
+        self.assertLess(res["p95_latency_ms"], 25)
+        self.assertGreater(res["throughput"], 60)
+
+    def test_embeddings_api_batch_scaling(self):
+        """Test embeddings API performance with different batch sizes"""
+        # batch size -> (avg latency bound, p95 latency bound), both in ms
+        bounds = {
+            10: (60, 65),
+            25: (115, 120),
+            50: (190, 195),
+        }
+
+        for batch_size, (avg_latency_bound, p95_latency_bound) in bounds.items():
+            res = run_embeddings_benchmark(
+                model=DEFAULT_SMALL_EMBEDDING_MODEL_NAME_FOR_TEST,
+                num_requests=500,
+                batch_size=batch_size,
+                input_tokens=500,
+            )
+
+            if is_in_ci():
+                write_github_step_summary(
+                    f"### test_embeddings_api_batch_scaling_size_{batch_size}\n"
+                    f"Batch size: {batch_size}\n"
+                    f"Average latency: {res['avg_latency_ms']:.2f} ms\n"
+                    f"P95 latency: {res['p95_latency_ms']:.2f} ms\n"
+                    f"Throughput: {res['throughput']:.2f} req/s\n"
+                    f"Successful requests: {res['successful_requests']}/{res['total_requests']}\n"
+                )
+
+            # Every request must succeed before the latency bounds are checked.
+            self.assertEqual(res["successful_requests"], res["total_requests"])
+            self.assertLess(res["avg_latency_ms"], avg_latency_bound)
+            self.assertLess(res["p95_latency_ms"], p95_latency_bound)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/registered/perf/test_bench_serving_2gpu.py b/test/registered/perf/test_bench_serving_2gpu.py
new file mode 100644
index 000000000000..51c7358f0402
--- /dev/null
+++ b/test/registered/perf/test_bench_serving_2gpu.py
@@ -0,0 +1,107 @@
+"""
+Performance tests for 2-GPU that need large GPUs (H200 80GB) - MoE and Pipeline Parallel tests.
+"""
+
+import unittest
+
+from sglang.test.ci.ci_register import register_cuda_ci
+from sglang.test.test_utils import (
+    DEFAULT_MOE_MODEL_NAME_FOR_TEST,
+    CustomTestCase,
+    is_in_amd_ci,
+    is_in_ci,
+    run_bench_serving,
+    write_github_step_summary,
+)
+
+register_cuda_ci(est_time=600, suite="stage-b-test-large-2-gpu-performance")
+
+
+class TestBenchServing2GPU(CustomTestCase):
+    def test_moe_offline_throughput_default(self):
+        """Benchmark MoE output throughput with "--tp 2"."""
+        res = run_bench_serving(
+            model=DEFAULT_MOE_MODEL_NAME_FOR_TEST,
+            num_prompts=300,
+            request_rate=float("inf"),
+            other_server_args=["--tp", "2"],
+        )
+        if not is_in_ci():
+            return  # the throughput floor is only enforced in CI
+
+        throughput = res["output_throughput"]
+        write_github_step_summary(
+            f"### test_moe_offline_throughput_default\n"
+            f"Output throughput: {throughput:.2f} token/s\n"
+        )
+        self.assertGreater(throughput, 2100 if is_in_amd_ci() else 2200)
+
+    def test_moe_offline_throughput_without_radix_cache(self):
+        """Benchmark MoE output throughput with "--tp 2 --disable-radix-cache"."""
+        res = run_bench_serving(
+            model=DEFAULT_MOE_MODEL_NAME_FOR_TEST,
+            num_prompts=300,
+            request_rate=float("inf"),
+            other_server_args=["--tp", "2", "--disable-radix-cache"],
+        )
+        if not is_in_ci():
+            return  # the throughput floor is only enforced in CI
+
+        throughput = res["output_throughput"]
+        write_github_step_summary(
+            f"### test_moe_offline_throughput_without_radix_cache\n"
+            f"Output throughput: {throughput:.2f} token/s\n"
+        )
+        self.assertGreater(throughput, 2100 if is_in_amd_ci() else 2200)
+
+    def test_pp_offline_throughput_default_decode(self):
+        result = run_bench_serving(
+            model=DEFAULT_MOE_MODEL_NAME_FOR_TEST,
+            num_prompts=1000,
+            request_rate=float("inf"),
+            random_input_len=1,
+            random_output_len=1024,
+            other_server_args=["--pp-size", "2"],
+            need_warmup=True,
+            seed=42,
+        )
+        if not is_in_ci():
+            return  # the throughput floor is only enforced in CI
+        write_github_step_summary(
+            "### test_pp_offline_throughput_default_decode\n"
+            f"Output throughput: {result['output_throughput']:.2f} token/s\n"
+        )
+        self.assertGreater(result["output_throughput"], 6700)
+
+    def test_pp_long_context_prefill(self):
+        """Benchmark long-context prefill input throughput with "--pp-size 2"."""
+        res = run_bench_serving(
+            model="meta-llama/Llama-3.3-70B-Instruct",
+            num_prompts=4,
+            request_rate=float("inf"),
+            random_input_len=128000,
+            random_output_len=1,
+            dataset_name="random",
+            other_server_args=[
+                "--quantization",
+                "fp8",
+                "--pp-size",
+                "2",
+            ]
+            + (["--mem-fraction-static", "0.7"] if is_in_amd_ci() else []),
+            need_warmup=False,
+            seed=42,
+        )
+
+        if is_in_ci():
+            write_github_step_summary(
+                f"### test_pp_long_context_prefill\n"
+                f"input_throughput: {res['input_throughput']:.2f} token/s\n"
+            )
+            # AMD CI uses a lower throughput floor than the other CI runners.
+            min_throughput = 3000 if is_in_amd_ci() else 4000
+            self.assertGreater(res["input_throughput"], min_throughput)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/registered/perf/test_vlm_perf_5090.py b/test/registered/perf/test_vlm_perf_5090.py
new file mode 100644
index 000000000000..772dc90d046f
--- /dev/null
+++ b/test/registered/perf/test_vlm_perf_5090.py
@@ -0,0 +1,62 @@
+"""
+VLM Performance tests that work on 5090 (32GB) - VLM offline throughput and online latency tests.
+"""
+
+import unittest
+
+from sglang.test.ci.ci_register import register_cuda_ci
+from sglang.test.test_utils import (
+    DEFAULT_SMALL_VLM_MODEL_NAME_FOR_TEST,
+    CustomTestCase,
+    is_in_ci,
+    run_bench_serving,
+    write_github_step_summary,
+)
+
+register_cuda_ci(est_time=600, suite="stage-b-test-small-1-gpu-performance")
+
+
+class TestVLMPerf5090(CustomTestCase):
+    def test_vlm_offline_throughput(self):
+        """Benchmark VLM output throughput on the "mmmu" dataset (5090)."""
+        server_args = ["--mem-fraction-static", "0.7"]
+        res = run_bench_serving(
+            model=DEFAULT_SMALL_VLM_MODEL_NAME_FOR_TEST,
+            num_prompts=200,
+            request_rate=float("inf"),
+            other_server_args=server_args,
+            dataset_name="mmmu",
+        )
+        if not is_in_ci():
+            return  # the throughput floor is only enforced in CI
+        throughput = res["output_throughput"]
+        write_github_step_summary(
+            f"### test_vlm_offline_throughput (5090)\n"
+            f"Output throughput: {throughput:.2f} token/s\n"
+        )
+        self.assertGreater(throughput, 2000)
+
+    def test_vlm_online_latency(self):
+        """Benchmark VLM median latencies on "mmmu" at request_rate=1 (5090)."""
+        server_args = ["--mem-fraction-static", "0.7"]
+        res = run_bench_serving(
+            model=DEFAULT_SMALL_VLM_MODEL_NAME_FOR_TEST,
+            num_prompts=250,
+            request_rate=1,
+            other_server_args=server_args,
+            dataset_name="mmmu",
+        )
+        if not is_in_ci():
+            return  # the latency ceilings are only enforced in CI
+        e2e_latency_ms = res["median_e2e_latency_ms"]
+        write_github_step_summary(
+            f"### test_vlm_online_latency (5090)\n"
+            f"median_e2e_latency_ms: {e2e_latency_ms:.2f} ms\n"
+        )
+        self.assertLess(e2e_latency_ms, 16500)
+        self.assertLess(res["median_ttft_ms"], 150)
+        self.assertLess(res["median_itl_ms"], 8)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/srt/quant/test_awq.py b/test/registered/quant/test_awq.py
similarity index 96%
rename from test/srt/quant/test_awq.py
rename to test/registered/quant/test_awq.py
index 87e126adb048..42d2e7f523bb 100644
--- a/test/srt/quant/test_awq.py
+++ b/test/registered/quant/test_awq.py
@@ -2,6 +2,7 @@
 from types import SimpleNamespace
 
 from sglang.srt.utils import kill_process_tree
+from sglang.test.ci.ci_register import register_cuda_ci
 from sglang.test.run_eval import run_eval
 from sglang.test.test_utils import (
     DEFAULT_AWQ_MOE_MODEL_NAME_FOR_TEST,
@@ -11,6 +12,8 @@
     popen_launch_server,
 )
 
+register_cuda_ci(est_time=163, suite="stage-b-test-large-1-gpu")
+
 
 class TestAWQ(CustomTestCase):
     @classmethod
diff --git a/test/srt/test_bnb.py b/test/registered/quant/test_bnb.py
similarity index 98%
rename from test/srt/test_bnb.py
rename to test/registered/quant/test_bnb.py
index 4328d56be965..814ec6a5e1b1 100644
--- a/test/srt/test_bnb.py
+++ b/test/registered/quant/test_bnb.py
@@ -12,6 +12,7 @@
 import openai
 
 from sglang.srt.utils import kill_process_tree
+from sglang.test.ci.ci_register import register_cuda_ci
 from sglang.test.run_eval import run_eval
 from sglang.test.test_utils import (
     DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
@@ -21,6 +22,8 @@
     popen_launch_server,
 )
 
+register_cuda_ci(est_time=5, suite="stage-b-test-small-1-gpu")
+
 VISION_MODELS = [
     "unsloth/Qwen2.5-VL-7B-Instruct-bnb-4bit",
     "unsloth/Qwen2-VL-7B-Instruct-bnb-4bit",
diff --git a/test/srt/test_gguf.py b/test/registered/quant/test_gguf.py
similarity index 86%
rename from test/srt/test_gguf.py
rename to test/registered/quant/test_gguf.py
index e9776067ca9d..14448bf9b149 100644
--- a/test/srt/test_gguf.py
+++ b/test/registered/quant/test_gguf.py
@@ -3,8 +3,11 @@
 from huggingface_hub import hf_hub_download
 
 import sglang as sgl
+from sglang.test.ci.ci_register import register_cuda_ci
 from sglang.test.test_utils import CustomTestCase
 
+register_cuda_ci(est_time=96, suite="stage-b-test-small-1-gpu")
+
 
 class TestGGUF(CustomTestCase):
     def test_models(self):
diff --git a/test/srt/test_gptqmodel_dynamic.py b/test/registered/quant/test_gptqmodel_dynamic.py
similarity index 98%
rename from test/srt/test_gptqmodel_dynamic.py
rename to test/registered/quant/test_gptqmodel_dynamic.py
index ea141df3e377..dd8fd51c989c 100644
--- a/test/srt/test_gptqmodel_dynamic.py
+++ b/test/registered/quant/test_gptqmodel_dynamic.py
@@ -6,6 +6,7 @@
 
 from sglang.srt.server_args import set_global_server_args_for_scheduler
 from sglang.srt.utils import kill_process_tree
+from sglang.test.ci.ci_register import register_cuda_ci
 from sglang.test.test_utils import (
     DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
     DEFAULT_URL_FOR_TEST,
@@ -13,6 +14,8 @@
     popen_launch_server,
 )
 
+register_cuda_ci(est_time=102, suite="stage-b-test-large-1-gpu")
+
 
 def check_quant_method(model_path: str, use_marlin_kernel: bool):
     from sglang.srt.configs.device_config import DeviceConfig
diff --git a/test/srt/quant/test_marlin_moe.py b/test/registered/quant/test_marlin_moe.py
similarity index 99%
rename from test/srt/quant/test_marlin_moe.py
rename to test/registered/quant/test_marlin_moe.py
index b1eb9c2da1e1..a37cf9fd191c 100644
--- a/test/srt/quant/test_marlin_moe.py
+++ b/test/registered/quant/test_marlin_moe.py
@@ -8,9 +8,12 @@
 from sglang.srt.layers.activation import SiluAndMul
 from sglang.srt.layers.moe.fused_moe_triton.fused_marlin_moe import fused_marlin_moe
 from sglang.srt.server_args import ServerArgs, set_global_server_args_for_scheduler
+from sglang.test.ci.ci_register import register_cuda_ci
 from sglang.test.test_marlin_utils import awq_marlin_quantize, marlin_quantize
 from sglang.test.test_utils import CustomTestCase
 
+register_cuda_ci(est_time=200, suite="stage-b-test-small-1-gpu")
+
 set_global_server_args_for_scheduler(object.__new__(ServerArgs))
 
 
diff --git a/test/srt/test_quantization.py b/test/registered/quant/test_quantization.py
similarity index 97%
rename from test/srt/test_quantization.py
rename to test/registered/quant/test_quantization.py
index a38dd61ff2b0..770b3855ab35 100644
--- a/test/srt/test_quantization.py
+++ b/test/registered/quant/test_quantization.py
@@ -4,6 +4,7 @@
 from types import SimpleNamespace
 
 from sglang.srt.utils import kill_process_tree
+from sglang.test.ci.ci_register import register_cuda_ci
 from sglang.test.run_eval import run_eval
 from sglang.test.test_utils import (
     DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_QUANT_TP1,
@@ -15,6 +16,8 @@
     write_results_to_json,
 )
 
+register_cuda_ci(est_time=185, suite="stage-b-test-large-1-gpu")
+
 MODEL_SCORE_THRESHOLDS = {
     "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4": 0.825,
     "hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4": 0.825,
diff --git a/test/run_suite.py b/test/run_suite.py
index e12db55dbf18..815eead31102 100644
--- a/test/run_suite.py
+++ b/test/run_suite.py
@@ -28,8 +28,12 @@
     HWBackend.CUDA: [
         "stage-a-test-1",
         "stage-b-test-small-1-gpu",
+        "stage-b-test-small-1-gpu-performance",
+        "stage-b-test-small-1-gpu-accuracy",
         "stage-b-test-large-1-gpu",
+        "stage-b-test-large-1-gpu-performance",
         "stage-b-test-large-2-gpu",
+        "stage-b-test-large-2-gpu-performance",
         "stage-c-test-large-4-gpu",
         "stage-b-test-4-gpu-b200",
         "stage-c-test-large-4-gpu-b200",
diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py
index fbc7c8154476..bf47a18e998d 100644
--- a/test/srt/run_suite.py
+++ b/test/srt/run_suite.py
@@ -53,21 +53,10 @@
     # "per-commit-8-gpu-h200-deepep": [
     #     TestFile("ep/test_deepep_large.py", 563),
     # ],
-    "quantization_test": [
-        TestFile("quant/test_awq.py", 163),
-        TestFile("quant/test_marlin_moe.py", 200),
-        TestFile("test_bnb.py", 5),
-        TestFile("test_gptqmodel_dynamic.py", 102),
-        TestFile("test_quantization.py", 185),
-        TestFile("test_gguf.py", 96),
-    ],
+    # quantization_test suite migrated to test/registered/quant/
     "__not_in_ci__": [
         TestFile("test_release_memory_occupation.py", 200),  # Temporarily disabled
         TestFile("models/test_dummy_grok_models.py"),
-        TestFile("test_bench_one_batch.py"),
-        TestFile("test_bench_serving.py"),
-        TestFile("test_eval_accuracy_large.py"),
-        TestFile("test_moe_eval_accuracy_large.py"),
         TestFile("test_profile_v2.py"),
         TestFile("models/test_ministral3_models.py"),
         TestFile("test_mistral_large3_basic.py"),
diff --git a/test/srt/test_bench_serving.py b/test/srt/test_bench_serving.py
deleted file mode 100644
index 9411a839695c..000000000000
--- a/test/srt/test_bench_serving.py
+++ /dev/null
@@ -1,566 +0,0 @@
-import asyncio
-import itertools
-import unittest
-
-import requests
-
-from sglang.test.test_utils import (
-    DEFAULT_DRAFT_MODEL_EAGLE,
-    DEFAULT_MODEL_NAME_FOR_TEST,
-    DEFAULT_MODEL_NAME_FOR_TEST_FP8,
-    DEFAULT_MOE_MODEL_NAME_FOR_TEST,
-    DEFAULT_SMALL_EMBEDDING_MODEL_NAME_FOR_TEST,
-    DEFAULT_SMALL_MODEL_NAME_FOR_TEST_SCORE,
-    DEFAULT_SMALL_VLM_MODEL_NAME_FOR_TEST,
-    DEFAULT_TARGET_MODEL_EAGLE,
-    CustomTestCase,
-    is_in_amd_ci,
-    is_in_ci,
-    run_bench_serving,
-    run_embeddings_benchmark,
-    run_score_benchmark,
-    write_github_step_summary,
-)
-
-
-class TestBenchServing(CustomTestCase):
-    def test_offline_throughput_default(self):
-        res = run_bench_serving(
-            model=DEFAULT_MODEL_NAME_FOR_TEST,
-            num_prompts=500,
-            request_rate=float("inf"),
-            other_server_args=[],
-        )
-
-        if is_in_ci():
-            write_github_step_summary(
-                f"### test_offline_throughput_default\n"
-                f"Output throughput: {res['output_throughput']:.2f} token/s\n"
-            )
-            if is_in_amd_ci():
-                self.assertGreater(res["output_throughput"], 3050)
-            else:
-                self.assertGreater(res["output_throughput"], 3800)
-
-    def test_offline_throughput_non_stream_small_batch_size(self):
-        res = run_bench_serving(
-            model=DEFAULT_MODEL_NAME_FOR_TEST,
-            num_prompts=200,
-            request_rate=float("inf"),
-            other_server_args=["--max-running-requests", "10"],
-            dataset_name="sharegpt",
-            random_input_len=None,
-            random_output_len=None,
-            disable_stream=True,
-            need_warmup=True,
-        )
-
-        if is_in_ci():
-            write_github_step_summary(
-                f"### test_offline_throughput_non_stream_small_batch_size\n"
-                f"Output throughput: {res['output_throughput']:.2f} token/s\n"
-            )
-            if is_in_amd_ci():
-                self.assertGreater(res["output_throughput"], 1000)
-            else:
-                self.assertGreater(res["output_throughput"], 1050)
-
-    def test_offline_throughput_without_radix_cache(self):
-        res = run_bench_serving(
-            model=DEFAULT_MODEL_NAME_FOR_TEST,
-            num_prompts=500,
-            request_rate=float("inf"),
-            other_server_args=["--disable-radix-cache"],
-        )
-
-        if is_in_ci():
-            write_github_step_summary(
-                f"### test_offline_throughput_without_radix_cache\n"
-                f"Output throughput: {res['output_throughput']:.2f} token/s\n"
-            )
-            if is_in_amd_ci():
-                self.assertGreater(res["output_throughput"], 3050)
-            else:
-                self.assertGreater(res["output_throughput"], 3800)
-
-    def test_offline_throughput_without_chunked_prefill(self):
-        res = run_bench_serving(
-            model=DEFAULT_MODEL_NAME_FOR_TEST,
-            num_prompts=500,
-            request_rate=float("inf"),
-            other_server_args=["--chunked-prefill-size", "-1"],
-        )
-
-        if is_in_ci():
-            write_github_step_summary(
-                f"### test_offline_throughput_without_chunked_prefill\n"
-                f"Output throughput: {res['output_throughput']:.2f} token/s\n"
-            )
-            self.assertGreater(res["output_throughput"], 2600)
-
-    def test_offline_throughput_with_triton_attention_backend(self):
-        res = run_bench_serving(
-            model=DEFAULT_MODEL_NAME_FOR_TEST,
-            num_prompts=500,
-            request_rate=float("inf"),
-            other_server_args=[
-                "--attention-backend",
-                "triton",
-                "--context-length",
-                "8192",
-            ],
-        )
-
-        if is_in_ci():
-            write_github_step_summary(
-                f"### test_offline_throughput_with_triton_attention_backend\n"
-                f"Output throughput: {res['output_throughput']:.2f} token/s\n"
-            )
-            if is_in_amd_ci():
-                self.assertGreater(res["output_throughput"], 3500)
-            else:
-                self.assertGreater(res["output_throughput"], 3700)
-
-    def test_offline_throughput_default_fp8(self):
-        res = run_bench_serving(
-            model=DEFAULT_MODEL_NAME_FOR_TEST_FP8,
-            num_prompts=500,
-            request_rate=float("inf"),
-            other_server_args=[],
-        )
-
-        if is_in_ci():
-            write_github_step_summary(
-                f"### test_offline_throughput_default_fp8\n"
-                f"Output throughput: {res['output_throughput']:.2f} token/s\n"
-            )
-            if is_in_amd_ci():
-                self.assertGreater(res["output_throughput"], 3500)
-            else:
-                self.assertGreater(res["output_throughput"], 4300)
-
-    def test_online_latency_default(self):
-        res = run_bench_serving(
-            model=DEFAULT_MODEL_NAME_FOR_TEST,
-            num_prompts=100,
-            request_rate=1,
-            other_server_args=[],
-        )
-
-        if is_in_ci():
-            write_github_step_summary(
-                f"### test_online_latency_default\n"
-                f"median_e2e_latency_ms: {res['median_e2e_latency_ms']:.2f} ms\n"
-            )
-            self.assertLess(res["median_e2e_latency_ms"], 11000)
-            if is_in_amd_ci():
-                self.assertLess(res["median_ttft_ms"], 115)
-            else:
-                self.assertLess(res["median_ttft_ms"], 86)
-            self.assertLess(res["median_itl_ms"], 10)
-
-    def test_vlm_offline_throughput(self):
-        res = run_bench_serving(
-            model=DEFAULT_SMALL_VLM_MODEL_NAME_FOR_TEST,
-            num_prompts=200,
-            request_rate=float("inf"),
-            other_server_args=[
-                "--mem-fraction-static",
-                "0.7",
-            ],
-            dataset_name="mmmu",
-        )
-
-        if is_in_ci():
-            write_github_step_summary(
-                f"### test_vlm_offline_throughput\n"
-                f"Output throughput: {res['output_throughput']:.2f} token/s\n"
-            )
-            if is_in_amd_ci():
-                self.assertGreater(res["output_throughput"], 2000)
-                # TODO: not set yet, need AMD machine
-            else:
-                self.assertGreater(res["output_throughput"], 2500)
-
-    def test_vlm_online_latency(self):
-        res = run_bench_serving(
-            model=DEFAULT_SMALL_VLM_MODEL_NAME_FOR_TEST,
-            num_prompts=250,
-            request_rate=1,
-            other_server_args=[
-                "--mem-fraction-static",
-                "0.7",
-            ],
-            dataset_name="mmmu",
-        )
-
-        if is_in_ci():
-            write_github_step_summary(
-                f"### test_vlm_online_latency\n"
-                f"median_e2e_latency_ms: {res['median_e2e_latency_ms']:.2f} ms\n"
-            )
-            self.assertLess(res["median_e2e_latency_ms"], 16500)
-            if is_in_amd_ci():
-                self.assertLess(res["median_ttft_ms"], 150)
-                # TODO: not set yet, need AMD machine
-            else:
-                self.assertLess(res["median_ttft_ms"], 100)
-            self.assertLess(res["median_itl_ms"], 8)
-
-    def test_lora_online_latency(self):
-        # TODO (lifuhuang): verify LoRA support in AMD.
-        if is_in_amd_ci():
-            pass
-
-        res = self._run_lora_latency_test(enable_background_task=False)
-
-        if is_in_ci():
-            write_github_step_summary(
-                f"### test_lora_online_latency\n"
-                f"median_e2e_latency_ms: {res['median_e2e_latency_ms']:.2f} ms\n"
-                f"median_ttft_ms: {res['median_ttft_ms']:.2f} ms\n"
-            )
-            self.assertLess(res["median_e2e_latency_ms"], 2400)
-            self.assertLess(res["median_ttft_ms"], 58)
-
-    def test_lora_online_latency_with_concurrent_adapter_updates(self):
-        # TODO (lifuhuang): verify LoRA support in AMD.
-        if is_in_amd_ci():
-            pass
-
-        res = self._run_lora_latency_test(enable_background_task=True)
-
-        if is_in_ci():
-            write_github_step_summary(
-                f"### test_lora_online_latency\n"
-                f"median_e2e_latency_ms: {res['median_e2e_latency_ms']:.2f} ms\n"
-                f"median_ttft_ms: {res['median_ttft_ms']:.2f} ms\n"
-            )
-            self.assertLess(res["median_e2e_latency_ms"], 4000)
-            self.assertLess(res["median_ttft_ms"], 80)
-
-    def _run_lora_latency_test(self, enable_background_task: bool):
-        """
-        Run a latency test for LoRA with the specified background task setting.
-        """
-
-        async def lora_loader_unloader_task(
-            base_url: str,
-            start_event: asyncio.Event,
-            stop_event: asyncio.Event,
-        ):
-            """
-            A background task that repeatedly loads and unloads a LoRA adapter.
-            """
-            await start_event.wait()
-
-            path_cycler = itertools.cycle(
-                [
-                    "pbevan11/llama-3.1-8b-ocr-correction",
-                    "faridlazuarda/valadapt-llama-3.1-8B-it-chinese",
-                    "philschmid/code-llama-3-1-8b-text-to-sql-lora",
-                ]
-            )
-            load_url = f"{base_url}/load_lora_adapter"
-            unload_url = f"{base_url}/unload_lora_adapter"
-            num_updates = 0
-
-            while not stop_event.is_set():
-                # 1. Load the LoRA adapter
-                lora_path = next(path_cycler)
-                response = await asyncio.to_thread(
-                    requests.post,
-                    load_url,
-                    json={"lora_name": lora_path, "lora_path": lora_path},
-                )
-                self.assertTrue(
-                    response.ok, f"Failed to load LoRA adapter: {response.text}"
-                )
-                num_updates += 1
-
-                if stop_event.is_set():
-                    break
-
-                # Yield control to allow other tasks to run.
-                await asyncio.sleep(1)
-
-                # 2. Unload the LoRA adapter
-                response = await asyncio.to_thread(
-                    requests.post,
-                    unload_url,
-                    json={"lora_name": lora_path},
-                )
-                self.assertTrue(
-                    response.ok, f"Failed to unload LoRA adapter: {response.text}"
-                )
-                num_updates += 1
-
-                # Yield control to allow other tasks to run.
-                await asyncio.sleep(1)
-
-        background_task = lora_loader_unloader_task if enable_background_task else None
-        res = run_bench_serving(
-            model=DEFAULT_MODEL_NAME_FOR_TEST,
-            num_prompts=400,
-            request_rate=8,
-            other_server_args=[
-                "--enable-lora",
-                "--max-loras-per-batch",
-                "1",
-                "--disable-radix-cache",
-                "--random-seed",
-                "42",
-                "--mem-fraction-static",
-                "0.8",
-                "--lora-paths",
-                "Nutanix/Meta-Llama-3.1-8B-Instruct_lora_4_alpha_16",
-                "--max-lora-rank",
-                "256",
-            ],
-            dataset_name="random",
-            random_input_len=256,
-            random_output_len=256,
-            lora_name=["Nutanix/Meta-Llama-3.1-8B-Instruct_lora_4_alpha_16"],
-            background_task=background_task,
-        )
-
-        return res
-
-    def test_online_latency_eagle(self):
-        res = run_bench_serving(
-            model=DEFAULT_TARGET_MODEL_EAGLE,
-            num_prompts=300,
-            request_rate=8,
-            sharegpt_context_len=3072,
-            disable_ignore_eos=True,
-            dataset_name="sharegpt",
-            other_server_args=[
-                "--speculative-algorithm",
-                "EAGLE",
-                "--speculative-draft-model-path",
-                DEFAULT_DRAFT_MODEL_EAGLE,
-                "--speculative-num-steps",
-                "5",
-                "--speculative-eagle-topk",
-                "4",
-                "--speculative-num-draft-tokens",
-                "16",
-                "--mem-fraction-static",
-                "0.7",
-            ],
-            need_warmup=True,
-            seed=42,
-        )
-
-        if is_in_ci():
-            write_github_step_summary(
-                f"### test_online_latency_eagle\n"
-                f"median_e2e_latency_ms: {res['median_e2e_latency_ms']:.2f} ms\n"
-                f"accept_length: {res['accept_length']:.2f} \n"
-            )
-            if is_in_amd_ci():
-                self.assertLess(res["median_e2e_latency_ms"], 1800)
-            else:
-                self.assertLess(res["median_e2e_latency_ms"], 900)
-            self.assertGreater(res["accept_length"], 3.0)
-
-    def test_moe_offline_throughput_default(self):
-        res = run_bench_serving(
-            model=DEFAULT_MOE_MODEL_NAME_FOR_TEST,
-            num_prompts=300,
-            request_rate=float("inf"),
-            other_server_args=["--tp", "2"],
-        )
-
-        if is_in_ci():
-            write_github_step_summary(
-                f"### test_moe_offline_throughput_default\n"
-                f"Output throughput: {res['output_throughput']:.2f} token/s\n"
-            )
-            if is_in_amd_ci():
-                self.assertGreater(res["output_throughput"], 2100)
-            else:
-                self.assertGreater(res["output_throughput"], 2200)
-
-    def test_moe_offline_throughput_without_radix_cache(self):
-        res = run_bench_serving(
-            model=DEFAULT_MOE_MODEL_NAME_FOR_TEST,
-            num_prompts=300,
-            request_rate=float("inf"),
-            other_server_args=["--tp", "2", "--disable-radix-cache"],
-        )
-
-        if is_in_ci():
-            write_github_step_summary(
-                f"### test_moe_offline_throughput_without_radix_cache\n"
-                f"Output throughput: {res['output_throughput']:.2f} token/s\n"
-            )
-            if is_in_amd_ci():
-                self.assertGreater(res["output_throughput"], 2100)
-            else:
-                self.assertGreater(res["output_throughput"], 2200)
-
-    def test_pp_offline_throughput_default_decode(self):
-        res = run_bench_serving(
-            model=DEFAULT_MOE_MODEL_NAME_FOR_TEST,
-            num_prompts=1000,
-            request_rate=float("inf"),
-            random_input_len=1,
-            random_output_len=1024,
-            other_server_args=["--pp-size", "2"],
-            need_warmup=True,
-            seed=42,
-        )
-
-        if is_in_ci():
-            write_github_step_summary(
-                f"### test_pp_offline_throughput_default_decode\n"
-                f"Output throughput: {res['output_throughput']:.2f} token/s\n"
-            )
-            self.assertGreater(res["output_throughput"], 6700)
-
-    def test_pp_long_context_prefill(self):
-        res = run_bench_serving(
-            model="meta-llama/Llama-3.3-70B-Instruct",
-            num_prompts=4,
-            request_rate=float("inf"),
-            random_input_len=128000,
-            random_output_len=1,
-            dataset_name="random",
-            other_server_args=[
-                "--quantization",
-                "fp8",
-                "--pp-size",
-                "2",
-            ]
-            + (["--mem-fraction-static", "0.7"] if is_in_amd_ci() else []),
-            need_warmup=False,
-            seed=42,
-        )
-
-        if is_in_ci():
-            write_github_step_summary(
-                f"### test_pp_long_context_latency_prefill\n"
-                f"input_throughput: {res['input_throughput']:.2f} ms\n"
-            )
-            if is_in_amd_ci():
-                self.assertGreater(res["input_throughput"], 3000)
-            else:
-                self.assertGreater(res["input_throughput"], 4000)
-
-    def test_score_api_latency_throughput(self):
-        """Test score API latency and throughput performance"""
-        res = run_score_benchmark(
-            model=DEFAULT_SMALL_MODEL_NAME_FOR_TEST_SCORE,
-            num_requests=1000,
-            batch_size=10,
-            other_server_args=[],
-            need_warmup=True,
-        )
-
-        if is_in_ci():
-            write_github_step_summary(
-                f"### test_score_api_throughput\n"
-                f"Average latency: {res['avg_latency_ms']:.2f} ms\n"
-                f"P95 latency: {res['p95_latency_ms']:.2f} ms\n"
-                f"Score API throughput: {res['throughput']:.2f} req/s\n"
-                f"Successful requests: {res['successful_requests']}/{res['total_requests']}\n"
-            )
-
-        self.assertEqual(res["successful_requests"], res["total_requests"])
-        self.assertLess(res["avg_latency_ms"], 48)
-        self.assertLess(res["p95_latency_ms"], 50)
-        self.assertGreater(res["throughput"], 20)
-
-    def test_score_api_batch_scaling(self):
-        """Test score API performance with different batch sizes"""
-        batch_sizes = [10, 25, 50]
-
-        for batch_size in batch_sizes:
-            res = run_score_benchmark(
-                model=DEFAULT_SMALL_MODEL_NAME_FOR_TEST_SCORE,
-                num_requests=500,
-                batch_size=batch_size,
-            )
-
-            if is_in_ci():
-                write_github_step_summary(
-                    f"### test_score_api_batch_scaling_size_{batch_size}\n"
-                    f"Batch size: {batch_size}\n"
-                    f"Average latency: {res['avg_latency_ms']:.2f} ms\n"
-                    f"P95 latency: {res['p95_latency_ms']:.2f} ms\n"
-                    f"Throughput: {res['throughput']:.2f} req/s\n"
-                    f"Successful requests: {res['successful_requests']}/{res['total_requests']}\n"
-                )
-
-            self.assertEqual(res["successful_requests"], res["total_requests"])
-            bounds = {
-                10: (45, 50),
-                25: (50, 60),
-                50: (60, 65),
-            }
-            avg_latency_bound, p95_latency_bound = bounds.get(batch_size, (60, 65))
-            self.assertLess(res["avg_latency_ms"], avg_latency_bound)
-            self.assertLess(res["p95_latency_ms"], p95_latency_bound)
-
-    def test_embeddings_api_latency_throughput(self):
-        """Test embeddings API latency and throughput performance"""
-        res = run_embeddings_benchmark(
-            model=DEFAULT_SMALL_EMBEDDING_MODEL_NAME_FOR_TEST,
-            num_requests=1000,
-            batch_size=1,
-            input_tokens=500,
-            other_server_args=[],
-            need_warmup=True,
-        )
-
-        if is_in_ci():
-            write_github_step_summary(
-                f"### test_embeddings_api_throughput\n"
-                f"Average latency: {res['avg_latency_ms']:.2f} ms\n"
-                f"P95 latency: {res['p95_latency_ms']:.2f} ms\n"
-                f"Embeddings API throughput: {res['throughput']:.2f} req/s\n"
-                f"Successful requests: {res['successful_requests']}/{res['total_requests']}\n"
-            )
-
-        self.assertEqual(res["successful_requests"], res["total_requests"])
-        # Bounds based on actual performance on 1xH100: avg=15ms, p95=15ms, throughput=67req/s
-        self.assertLess(res["avg_latency_ms"], 20)
-        self.assertLess(res["p95_latency_ms"], 25)
-        self.assertGreater(res["throughput"], 60)
-
-    def test_embeddings_api_batch_scaling(self):
-        """Test embeddings API performance with different batch sizes"""
-        batch_sizes = [10, 25, 50]
-
-        for batch_size in batch_sizes:
-            res = run_embeddings_benchmark(
-                model=DEFAULT_SMALL_EMBEDDING_MODEL_NAME_FOR_TEST,
-                num_requests=500,
-                batch_size=batch_size,
-                input_tokens=500,
-            )
-
-            if is_in_ci():
-                write_github_step_summary(
-                    f"### test_embeddings_api_batch_scaling_size_{batch_size}\n"
-                    f"Batch size: {batch_size}\n"
-                    f"Average latency: {res['avg_latency_ms']:.2f} ms\n"
-                    f"P95 latency: {res['p95_latency_ms']:.2f} ms\n"
-                    f"Throughput: {res['throughput']:.2f} req/s\n"
-                    f"Successful requests: {res['successful_requests']}/{res['total_requests']}\n"
-                )
-
-            self.assertEqual(res["successful_requests"], res["total_requests"])
-            bounds = {
-                10: (60, 65),
-                25: (115, 120),
-                50: (190, 195),
-            }
-            avg_latency_bound, p95_latency_bound = bounds.get(batch_size, (250, 250))
-            self.assertLess(res["avg_latency_ms"], avg_latency_bound)
-            self.assertLess(res["p95_latency_ms"], p95_latency_bound)
-
-
-if __name__ == "__main__":
-    unittest.main()