diff --git a/.github/workflows/nightly-test-amd-rocm720.yml b/.github/workflows/nightly-test-amd-rocm720.yml index bc6328a2a2db..80d6a1afa324 100644 --- a/.github/workflows/nightly-test-amd-rocm720.yml +++ b/.github/workflows/nightly-test-amd-rocm720.yml @@ -21,46 +21,10 @@ on: type: boolean default: true job_filter: - description: 'Select which job to run (leave empty or "all" to run all jobs)' + description: 'Comma-separated list of jobs to run (e.g. "nightly-8-gpu-grok2-rocm720,nightly-8-gpu-deepseek-v31-rocm720"). Leave empty or "all" to run all jobs.' required: false - type: choice + type: string default: 'all' - options: - - 'all' - # MI30x ROCm 7.2 Unit Tests - - 'nightly-test-1-gpu-unit-rocm720' - # MI30x ROCm 7.2 Accuracy Tests (GSM8K / MMMU) - - 'nightly-accuracy-2-gpu-rocm720' - - 'nightly-accuracy-2-gpu-vlm-rocm720' - - 'nightly-perf-2-gpu-text-rocm720' - - 'nightly-perf-2-gpu-vlm-rocm720' - - 'nightly-accuracy-8-gpu-rocm720' - # MI30x ROCm 7.2 Accuracy + Performance Tests (combined) - - 'nightly-8-gpu-grok1-int4-rocm720' - - 'nightly-8-gpu-grok2-rocm720' - - 'nightly-8-gpu-deepseek-v31-rocm720' - - 'nightly-8-gpu-deepseek-v32-rocm720' - - 'nightly-8-gpu-deepseek-v32-mtp-rocm720' - - 'nightly-8-gpu-kimi-k25-rocm720' - - 'nightly-8-gpu-qwen3-235b-rocm720' - - 'nightly-8-gpu-qwen35-rocm720' - - 'nightly-8-gpu-glm5-rocm720' - - 'nightly-8-gpu-minimax-m25-rocm720' - # MI35x ROCm 7.2 jobs - - 'nightly-test-1-gpu-mi35x-rocm720' - - 'nightly-8-gpu-mi35x-qwen3-235b-mxfp4-rocm720' - - 'nightly-8-gpu-mi35x-qwen35-rocm720' - - 'nightly-accuracy-8-gpu-mi35x-rocm720' - - 'nightly-8-gpu-mi35x-grok1-int4-rocm720' - - 'nightly-8-gpu-mi35x-grok2-rocm720' - - 'nightly-8-gpu-mi35x-deepseek-r1-mxfp4-rocm720' - - 'nightly-accuracy-8-gpu-mi35x-deepseek-v32-rocm720' - - 'nightly-accuracy-8-gpu-mi35x-deepseek-v32-mtp-rocm720' - - 'nightly-perf-8-gpu-mi35x-deepseek-v32-basic-rocm720' - - 'nightly-perf-8-gpu-mi35x-deepseek-v32-mtp-rocm720' - - 
'nightly-8-gpu-mi35x-kimi-k25-rocm720' - - 'nightly-8-gpu-mi35x-glm5-rocm720' - - 'nightly-8-gpu-mi35x-minimax-m25-rocm720' workflow_call: inputs: ref: @@ -98,7 +62,7 @@ jobs: # ============================================== MI30x ROCm 7.2 Unit Tests ============================================== # 1-GPU Unit Tests - LoRA, debug utils, scheduler, etc. (MI30x ROCm 7.2) nightly-test-1-gpu-unit-rocm720: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-test-1-gpu-unit-rocm720') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-test-1-gpu-unit-rocm720,')) runs-on: linux-mi325-1gpu-sglang steps: - name: Checkout code @@ -127,7 +91,7 @@ jobs: # ============================================== MI30x ROCm 7.2 Accuracy Tests ============================================== # 2-GPU Accuracy Tests - GSM8K eval (MI30x ROCm 7.2) nightly-accuracy-2-gpu-rocm720: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-accuracy-2-gpu-rocm720') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-accuracy-2-gpu-rocm720,')) runs-on: linux-mi325-2gpu-sglang steps: - name: Checkout code @@ -155,7 +119,7 @@ jobs: # 2-GPU VLM Accuracy Tests - Vision-Language Models MMMU evaluation (ROCm 7.2) nightly-accuracy-2-gpu-vlm-rocm720: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-accuracy-2-gpu-vlm-rocm720') + if: 
(github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-accuracy-2-gpu-vlm-rocm720,')) runs-on: linux-mi325-2gpu-sglang steps: - name: Checkout code @@ -184,7 +148,7 @@ jobs: # 2-GPU Text Models Performance Tests (ROCm 7.2) nightly-perf-2-gpu-text-rocm720: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-perf-2-gpu-text-rocm720') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-perf-2-gpu-text-rocm720,')) runs-on: linux-mi325-2gpu-sglang steps: - name: Checkout code @@ -214,7 +178,7 @@ jobs: # 2-GPU VLM Performance Tests (ROCm 7.2) nightly-perf-2-gpu-vlm-rocm720: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-perf-2-gpu-vlm-rocm720') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-perf-2-gpu-vlm-rocm720,')) runs-on: linux-mi325-2gpu-sglang steps: - name: Checkout code @@ -244,7 +208,7 @@ jobs: # 8-GPU Accuracy Tests - GPT-OSS, Grok1-FP8 (ROCm 7.2) nightly-accuracy-8-gpu-rocm720: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-accuracy-8-gpu-rocm720') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', 
inputs.job_filter), ',nightly-accuracy-8-gpu-rocm720,')) runs-on: linux-mi325-8gpu-sglang steps: - name: Checkout code @@ -284,7 +248,7 @@ jobs: # ============================================== MI30x ROCm 7.2 Combined Accuracy + Performance Tests ============================================== # 8-GPU Grok1-INT4 (Accuracy + Performance) ROCm 7.2 nightly-8-gpu-grok1-int4-rocm720: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-grok1-int4-rocm720') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-8-gpu-grok1-int4-rocm720,')) runs-on: linux-mi325-8gpu-sglang steps: - name: Checkout code @@ -327,7 +291,7 @@ jobs: # 8-GPU Grok2 (Accuracy + Performance) ROCm 7.2 nightly-8-gpu-grok2-rocm720: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-grok2-rocm720') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-8-gpu-grok2-rocm720,')) runs-on: linux-mi325-8gpu-sglang steps: - name: Checkout code @@ -370,7 +334,7 @@ jobs: # 8-GPU DeepSeek-V3.1 (Accuracy + Performance) ROCm 7.2 nightly-8-gpu-deepseek-v31-rocm720: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-deepseek-v31-rocm720') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', 
inputs.job_filter), ',nightly-8-gpu-deepseek-v31-rocm720,')) runs-on: linux-mi325-8gpu-sglang steps: - name: Checkout code @@ -413,7 +377,7 @@ jobs: # 8-GPU DeepSeek-V3.2 (Basic Accuracy + Perf) ROCm 7.2 nightly-8-gpu-deepseek-v32-rocm720: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-deepseek-v32-rocm720') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-8-gpu-deepseek-v32-rocm720,')) runs-on: linux-mi325-8gpu-sglang steps: - name: Checkout code @@ -454,7 +418,7 @@ jobs: # 8-GPU DeepSeek-V3.2 MTP (MTP Accuracy + Perf) ROCm 7.2 nightly-8-gpu-deepseek-v32-mtp-rocm720: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-deepseek-v32-mtp-rocm720') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-8-gpu-deepseek-v32-mtp-rocm720,')) runs-on: linux-mi325-8gpu-sglang steps: - name: Checkout code @@ -493,9 +457,39 @@ jobs: echo "$(> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} + # 8-GPU DeepSeek-V3 KV FP8 (Basic + MTP with --kv-cache-dtype fp8_e4m3) ROCm 7.2 + nightly-8-gpu-deepseek-v3-kv-fp8-rocm720: + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-8-gpu-deepseek-v3-kv-fp8-rocm720,')) + runs-on: linux-mi325-8gpu-sglang + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ inputs.ref || github.ref }} + + - 
name: Setup docker (ROCm 7.2) + run: | + touch github_summary.md + bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720 + env: + GITHUB_WORKSPACE: ${{ github.workspace }} + + - name: Install dependencies + run: bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-test-time-deps + + - name: DeepSeek-V3 KV FP8 Test ROCm 7.2 (8-GPU Basic + MTP) + timeout-minutes: 120 + run: | + > github_summary.md # Clear summary file + bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ + -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ + python3 run_suite.py --hw amd --suite nightly-amd-8-gpu-deepseek-v3-kv-fp8 --nightly --timeout-per-file 3600 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? + echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true + exit ${TEST_EXIT_CODE:-0} + + # 8-GPU Kimi-K2.5 (Accuracy) ROCm 7.2 nightly-8-gpu-kimi-k25-rocm720: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-kimi-k25-rocm720') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-8-gpu-kimi-k25-rocm720,')) runs-on: linux-mi325-8gpu-sglang steps: - name: Checkout code @@ -525,7 +519,7 @@ jobs: # 8-GPU Qwen3-235B (Accuracy + Performance) ROCm 7.2 nightly-8-gpu-qwen3-235b-rocm720: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-qwen3-235b-rocm720') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-8-gpu-qwen3-235b-rocm720,')) runs-on: linux-mi325-8gpu-sglang steps: - 
name: Checkout code @@ -555,7 +549,7 @@ jobs: # 8-GPU Qwen 3.5 (Accuracy) ROCm 7.2 nightly-8-gpu-qwen35-rocm720: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-qwen35-rocm720') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-8-gpu-qwen35-rocm720,')) runs-on: linux-mi325-8gpu-sglang steps: - name: Checkout code @@ -587,7 +581,7 @@ jobs: # 8-GPU GLM-5 (Accuracy) ROCm 7.2 nightly-8-gpu-glm5-rocm720: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-glm5-rocm720') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-8-gpu-glm5-rocm720,')) runs-on: linux-mi325-8gpu-sglang steps: - name: Checkout code @@ -620,7 +614,7 @@ jobs: # 8-GPU MiniMax-M2.5 (Accuracy) ROCm 7.2 nightly-8-gpu-minimax-m25-rocm720: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-minimax-m25-rocm720') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-8-gpu-minimax-m25-rocm720,')) runs-on: linux-mi325-8gpu-sglang steps: - name: Checkout code @@ -652,7 +646,7 @@ jobs: # ============================================== MI35x ROCm 7.2 Tests ============================================== # MI35x 1-GPU ROCm 7.2 tests nightly-test-1-gpu-mi35x-rocm720: 
- if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-test-1-gpu-mi35x-rocm720') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-test-1-gpu-mi35x-rocm720,')) runs-on: linux-mi35x-gpu-1 steps: - name: Checkout code @@ -681,7 +675,7 @@ jobs: # MI35x 8-GPU Accuracy Tests - GPT-OSS (ROCm 7.2) nightly-accuracy-8-gpu-mi35x-rocm720: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-accuracy-8-gpu-mi35x-rocm720') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-accuracy-8-gpu-mi35x-rocm720,')) runs-on: linux-mi35x-gpu-8 steps: - name: Checkout code @@ -713,7 +707,7 @@ jobs: # MI35x 8-GPU Grok1-INT4 (Accuracy + Performance) ROCm 7.2 nightly-8-gpu-mi35x-grok1-int4-rocm720: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-mi35x-grok1-int4-rocm720') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-8-gpu-mi35x-grok1-int4-rocm720,')) runs-on: linux-mi35x-gpu-8 steps: - name: Checkout code @@ -759,7 +753,7 @@ jobs: # MI35x 8-GPU Grok2 (Accuracy + Performance) ROCm 7.2 nightly-8-gpu-mi35x-grok2-rocm720: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || 
inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-mi35x-grok2-rocm720') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-8-gpu-mi35x-grok2-rocm720,')) runs-on: linux-mi35x-gpu-8 steps: - name: Checkout code @@ -805,7 +799,7 @@ jobs: # MI35x 8-GPU DeepSeek-R1-MXFP4 (Accuracy + Performance) ROCm 7.2 nightly-8-gpu-mi35x-deepseek-r1-mxfp4-rocm720: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-mi35x-deepseek-r1-mxfp4-rocm720') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-8-gpu-mi35x-deepseek-r1-mxfp4-rocm720,')) runs-on: linux-mi35x-gpu-8 steps: - name: Checkout code @@ -847,9 +841,97 @@ jobs: echo "$(> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} + # MI35x 8-GPU DeepSeek-R1-MXFP4 KV FP8 (Accuracy + Performance) ROCm 7.2 + nightly-8-gpu-mi35x-deepseek-r1-mxfp4-kv-fp8-rocm720: + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-8-gpu-mi35x-deepseek-r1-mxfp4-kv-fp8-rocm720,')) + runs-on: linux-mi35x-gpu-8 + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ inputs.ref || github.ref }} + + - name: Setup docker (ROCm 7.2) + run: | + touch github_summary.md + bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720 + env: + GITHUB_WORKSPACE: ${{ github.workspace }} + + - name: Install dependencies + run: | + bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-test-time-deps + # Install tabulate for 
run_suite.py (missing in MI35x container) + bash scripts/ci/amd/amd_ci_exec.sh pip install tabulate + + - name: Accuracy Test MI35x ROCm 7.2 (8-GPU DeepSeek-R1-MXFP4 KV FP8) + timeout-minutes: 180 + run: | + > github_summary.md # Clear summary file + bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ + -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ + python3 run_suite.py --hw amd --suite nightly-amd-8-gpu-mi35x-deepseek-r1-mxfp4-kv-fp8 --nightly --timeout-per-file 7200 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? + echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true + exit ${TEST_EXIT_CODE:-0} + + - name: Performance Test MI35x ROCm 7.2 (8-GPU DeepSeek-R1-MXFP4 KV FP8) + timeout-minutes: 300 + continue-on-error: true + run: | + > github_summary.md # Clear summary file + bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ + -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ + python3 registered/amd/perf/mi35x/test_deepseek_r1_mxfp4_kv_fp8_perf_mi35x.py || TEST_EXIT_CODE=$? 
+ echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true + exit ${TEST_EXIT_CODE:-0} + + # MI35x 8-GPU DeepSeek-R1-MXFP4 AllReduce Fusion (Accuracy + Performance) ROCm 7.2 + nightly-8-gpu-mi35x-deepseek-r1-mxfp4-ar-fusion-rocm720: + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-8-gpu-mi35x-deepseek-r1-mxfp4-ar-fusion-rocm720,')) + runs-on: linux-mi35x-gpu-8 + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ inputs.ref || github.ref }} + + - name: Setup docker (ROCm 7.2) + run: | + touch github_summary.md + bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720 + env: + GITHUB_WORKSPACE: ${{ github.workspace }} + + - name: Install dependencies + run: | + bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-test-time-deps + # Install tabulate for run_suite.py (missing in MI35x container) + bash scripts/ci/amd/amd_ci_exec.sh pip install tabulate + + - name: Accuracy Test MI35x ROCm 7.2 (8-GPU DeepSeek-R1-MXFP4 AllReduce Fusion) + timeout-minutes: 180 + run: | + > github_summary.md # Clear summary file + bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ + -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ + python3 run_suite.py --hw amd --suite nightly-amd-8-gpu-mi35x-deepseek-r1-mxfp4-ar-fusion --nightly --timeout-per-file 7200 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? 
+ echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true + exit ${TEST_EXIT_CODE:-0} + + - name: Performance Test MI35x ROCm 7.2 (8-GPU DeepSeek-R1-MXFP4 AllReduce Fusion) + timeout-minutes: 300 + continue-on-error: true + run: | + > github_summary.md # Clear summary file + bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ + -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ + python3 registered/amd/perf/mi35x/test_deepseek_r1_mxfp4_ar_fusion_perf_mi35x.py || TEST_EXIT_CODE=$? + echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true + exit ${TEST_EXIT_CODE:-0} + # MI35x 8-GPU DeepSeek-V3.2 Accuracy Test (ROCm 7.2) nightly-accuracy-8-gpu-mi35x-deepseek-v32-rocm720: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-accuracy-8-gpu-mi35x-deepseek-v32-rocm720') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-accuracy-8-gpu-mi35x-deepseek-v32-rocm720,')) runs-on: linux-mi35x-gpu-8 steps: - name: Checkout code @@ -882,7 +964,7 @@ jobs: # MI35x 8-GPU DeepSeek-V3.2 TP+MTP Accuracy Test (ROCm 7.2) nightly-accuracy-8-gpu-mi35x-deepseek-v32-mtp-rocm720: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-accuracy-8-gpu-mi35x-deepseek-v32-mtp-rocm720') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-accuracy-8-gpu-mi35x-deepseek-v32-mtp-rocm720,')) runs-on: linux-mi35x-gpu-8 steps: - name: Checkout code @@ -915,7 +997,7 @@ jobs: # MI35x 8-GPU DeepSeek-V3.2 Performance Test (Basic) ROCm 7.2 
nightly-perf-8-gpu-mi35x-deepseek-v32-basic-rocm720: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-perf-8-gpu-mi35x-deepseek-v32-basic-rocm720') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-perf-8-gpu-mi35x-deepseek-v32-basic-rocm720,')) runs-on: linux-mi35x-gpu-8 steps: - name: Checkout code @@ -948,7 +1030,7 @@ jobs: # MI35x 8-GPU Kimi-K2.5 (Accuracy) ROCm 7.2 nightly-8-gpu-mi35x-kimi-k25-rocm720: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-mi35x-kimi-k25-rocm720') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-8-gpu-mi35x-kimi-k25-rocm720,')) runs-on: linux-mi35x-gpu-8 steps: - name: Checkout code @@ -981,7 +1063,7 @@ jobs: # MI35x 8-GPU Qwen3-235B-MXFP4 (Accuracy + Performance) ROCm 7.2 nightly-8-gpu-mi35x-qwen3-235b-mxfp4-rocm720: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-mi35x-qwen3-235b-mxfp4-rocm720') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-8-gpu-mi35x-qwen3-235b-mxfp4-rocm720,')) runs-on: linux-mi35x-gpu-8 steps: - name: Checkout code @@ -1014,7 +1096,7 @@ jobs: # MI35x 8-GPU Qwen 3.5 (Accuracy) ROCm 7.2 nightly-8-gpu-mi35x-qwen35-rocm720: - if: (github.repository == 
'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-mi35x-qwen35-rocm720') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-8-gpu-mi35x-qwen35-rocm720,')) runs-on: linux-mi35x-gpu-8 steps: - name: Checkout code @@ -1046,7 +1128,7 @@ jobs: exit ${TEST_EXIT_CODE:-0} nightly-8-gpu-mi35x-glm5-rocm720: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-mi35x-glm5-rocm720') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-8-gpu-mi35x-glm5-rocm720,')) runs-on: linux-mi35x-gpu-8 steps: - name: Checkout code @@ -1081,7 +1163,7 @@ jobs: # MI35x 8-GPU MiniMax-M2.5 (Accuracy) ROCm 7.2 nightly-8-gpu-mi35x-minimax-m25-rocm720: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-mi35x-minimax-m25-rocm720') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-8-gpu-mi35x-minimax-m25-rocm720,')) runs-on: linux-mi35x-gpu-8 steps: - name: Checkout code @@ -1114,7 +1196,7 @@ jobs: # MI35x 8-GPU DeepSeek-V3.2 Performance Test (MTP) ROCm 7.2 nightly-perf-8-gpu-mi35x-deepseek-v32-mtp-rocm720: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 
'nightly-perf-8-gpu-mi35x-deepseek-v32-mtp-rocm720') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-perf-8-gpu-mi35x-deepseek-v32-mtp-rocm720,')) runs-on: linux-mi35x-gpu-8 steps: - name: Checkout code @@ -1163,6 +1245,7 @@ jobs: - nightly-8-gpu-deepseek-v31-rocm720 - nightly-8-gpu-deepseek-v32-rocm720 - nightly-8-gpu-deepseek-v32-mtp-rocm720 + - nightly-8-gpu-deepseek-v3-kv-fp8-rocm720 - nightly-8-gpu-kimi-k25-rocm720 - nightly-8-gpu-qwen3-235b-rocm720 - nightly-8-gpu-qwen35-rocm720 @@ -1174,6 +1257,8 @@ jobs: - nightly-8-gpu-mi35x-grok1-int4-rocm720 - nightly-8-gpu-mi35x-grok2-rocm720 - nightly-8-gpu-mi35x-deepseek-r1-mxfp4-rocm720 + - nightly-8-gpu-mi35x-deepseek-r1-mxfp4-kv-fp8-rocm720 + - nightly-8-gpu-mi35x-deepseek-r1-mxfp4-ar-fusion-rocm720 - nightly-accuracy-8-gpu-mi35x-deepseek-v32-rocm720 - nightly-accuracy-8-gpu-mi35x-deepseek-v32-mtp-rocm720 - nightly-perf-8-gpu-mi35x-deepseek-v32-basic-rocm720 diff --git a/.github/workflows/nightly-test-amd.yml b/.github/workflows/nightly-test-amd.yml index 3dc8f1270707..54e0202ccc5a 100644 --- a/.github/workflows/nightly-test-amd.yml +++ b/.github/workflows/nightly-test-amd.yml @@ -21,46 +21,10 @@ on: type: boolean default: true job_filter: - description: 'Select which job to run (leave empty or "all" to run all jobs)' + description: 'Comma-separated list of jobs to run (e.g. "nightly-8-gpu-grok2,nightly-8-gpu-deepseek-v31"). Leave empty or "all" to run all jobs.' 
required: false - type: choice + type: string default: 'all' - options: - - 'all' - # MI30x Unit Tests - - 'nightly-test-1-gpu-unit' - # MI30x Accuracy Tests (GSM8K / MMMU) - - 'nightly-accuracy-2-gpu' - - 'nightly-accuracy-2-gpu-vlm' - - 'nightly-perf-2-gpu-text' - - 'nightly-perf-2-gpu-vlm' - - 'nightly-accuracy-8-gpu' - # MI30x Accuracy + Performance Tests (combined) - - 'nightly-8-gpu-grok1-int4' - - 'nightly-8-gpu-grok2' - - 'nightly-8-gpu-deepseek-v31' - - 'nightly-8-gpu-deepseek-v32' - - 'nightly-8-gpu-deepseek-v32-mtp' - - 'nightly-8-gpu-kimi-k25' - - 'nightly-8-gpu-qwen3-235b' - - 'nightly-8-gpu-qwen35' - - 'nightly-8-gpu-glm5' - - 'nightly-8-gpu-minimax-m25' - # MI35x jobs - - 'nightly-test-1-gpu-mi35x' - - 'nightly-8-gpu-mi35x-qwen3-235b-mxfp4' - - 'nightly-8-gpu-mi35x-qwen35' - - 'nightly-8-gpu-mi35x-kimi-k25' - - 'nightly-8-gpu-mi35x-glm5' - - 'nightly-8-gpu-mi35x-minimax-m25' - - 'nightly-accuracy-8-gpu-mi35x' - - 'nightly-8-gpu-mi35x-grok1-int4' - - 'nightly-8-gpu-mi35x-grok2' - - 'nightly-8-gpu-mi35x-deepseek-r1-mxfp4' - - 'nightly-accuracy-8-gpu-mi35x-deepseek-v32' - - 'nightly-accuracy-8-gpu-mi35x-deepseek-v32-mtp' - - 'nightly-perf-8-gpu-mi35x-deepseek-v32-basic' - - 'nightly-perf-8-gpu-mi35x-deepseek-v32-mtp' workflow_call: inputs: ref: @@ -98,7 +62,7 @@ jobs: # ============================================== MI30x Unit Tests ============================================== # 1-GPU Unit Tests - LoRA, debug utils, scheduler, etc. 
(MI30x only) nightly-test-1-gpu-unit: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-test-1-gpu-unit') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-test-1-gpu-unit,')) runs-on: linux-mi325-1gpu-sglang steps: - name: Checkout code @@ -128,7 +92,7 @@ jobs: # ============================================== MI30x Accuracy Tests ============================================== # 2-GPU Accuracy Tests - GSM8K eval (MI30x only) nightly-accuracy-2-gpu: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-accuracy-2-gpu') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-accuracy-2-gpu,')) runs-on: linux-mi325-2gpu-sglang steps: - name: Checkout code @@ -157,7 +121,7 @@ jobs: # 2-GPU VLM Accuracy Tests - Vision-Language Models MMMU evaluation nightly-accuracy-2-gpu-vlm: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-accuracy-2-gpu-vlm') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-accuracy-2-gpu-vlm,')) runs-on: linux-mi325-2gpu-sglang steps: - name: Checkout code @@ -187,7 +151,7 @@ jobs: # 2-GPU Text Models Performance Tests nightly-perf-2-gpu-text: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 
'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-perf-2-gpu-text') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-perf-2-gpu-text,')) runs-on: linux-mi325-2gpu-sglang steps: - name: Checkout code @@ -218,7 +182,7 @@ jobs: # 2-GPU VLM Performance Tests nightly-perf-2-gpu-vlm: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-perf-2-gpu-vlm') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-perf-2-gpu-vlm,')) runs-on: linux-mi325-2gpu-sglang steps: - name: Checkout code @@ -249,7 +213,7 @@ jobs: # 8-GPU Accuracy Tests - GPT-OSS, Grok1-FP8 (accuracy only) nightly-accuracy-8-gpu: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-accuracy-8-gpu') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-accuracy-8-gpu,')) runs-on: linux-mi325-8gpu-sglang steps: - name: Checkout code @@ -289,7 +253,7 @@ jobs: # ============================================== MI30x Combined Accuracy + Performance Tests ============================================== # 8-GPU Grok1-INT4 (Accuracy + Performance combined) nightly-8-gpu-grok1-int4: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 
'nightly-8-gpu-grok1-int4') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-8-gpu-grok1-int4,')) runs-on: linux-mi325-8gpu-sglang steps: - name: Checkout code @@ -332,7 +296,7 @@ jobs: # 8-GPU Grok2 (Accuracy + Performance combined) nightly-8-gpu-grok2: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-grok2') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-8-gpu-grok2,')) runs-on: linux-mi325-8gpu-sglang steps: - name: Checkout code @@ -375,7 +339,7 @@ jobs: # 8-GPU DeepSeek-V3.1 (Accuracy + Performance combined) nightly-8-gpu-deepseek-v31: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-deepseek-v31') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-8-gpu-deepseek-v31,')) runs-on: linux-mi325-8gpu-sglang steps: - name: Checkout code @@ -418,7 +382,7 @@ jobs: # 8-GPU DeepSeek-V3.2 (Basic Accuracy + Perf) nightly-8-gpu-deepseek-v32: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-deepseek-v32') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), 
',nightly-8-gpu-deepseek-v32,')) runs-on: linux-mi325-8gpu-sglang steps: - name: Checkout code @@ -459,7 +423,7 @@ jobs: # 8-GPU DeepSeek-V3.2 MTP (MTP Accuracy + Perf) nightly-8-gpu-deepseek-v32-mtp: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-deepseek-v32-mtp') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-8-gpu-deepseek-v32-mtp,')) runs-on: linux-mi325-8gpu-sglang steps: - name: Checkout code @@ -498,9 +462,39 @@ jobs: echo "$(> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} + # 8-GPU DeepSeek-V3 KV FP8 (Basic + MTP with --kv-cache-dtype fp8_e4m3) + nightly-8-gpu-deepseek-v3-kv-fp8: + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-8-gpu-deepseek-v3-kv-fp8,')) + runs-on: linux-mi325-8gpu-sglang + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ inputs.ref || github.ref }} + + - name: Setup docker + run: | + touch github_summary.md + bash scripts/ci/amd/amd_ci_start_container.sh + env: + GITHUB_WORKSPACE: ${{ github.workspace }} + + - name: Install dependencies + run: bash scripts/ci/amd/amd_ci_install_dependency.sh + + - name: DeepSeek-V3 KV FP8 Test (8-GPU Basic + MTP) + timeout-minutes: 120 + run: | + > github_summary.md # Clear summary file + bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ + -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ + python3 run_suite.py --hw amd --suite nightly-amd-8-gpu-deepseek-v3-kv-fp8 --nightly --timeout-per-file 3600 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? 
+ echo "$(> $GITHUB_STEP_SUMMARY || true + exit ${TEST_EXIT_CODE:-0} + # 8-GPU Kimi-K2.5 (Accuracy) nightly-8-gpu-kimi-k25: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-kimi-k25') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-8-gpu-kimi-k25,')) runs-on: linux-mi325-8gpu-sglang steps: - name: Checkout code @@ -529,7 +523,7 @@ jobs: exit ${TEST_EXIT_CODE:-0} nightly-8-gpu-qwen3-235b: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-qwen3-235b') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-8-gpu-qwen3-235b,')) runs-on: linux-mi325-8gpu-sglang steps: - name: Checkout code @@ -559,7 +553,7 @@ jobs: # 8-GPU Qwen 3.5 (Accuracy) nightly-8-gpu-qwen35: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-qwen35') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-8-gpu-qwen35,')) runs-on: linux-mi325-8gpu-sglang steps: - name: Checkout code @@ -590,7 +584,7 @@ jobs: exit ${TEST_EXIT_CODE:-0} nightly-8-gpu-glm5: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-glm5') + if: 
(github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-8-gpu-glm5,')) runs-on: linux-mi325-8gpu-sglang steps: - name: Checkout code @@ -623,7 +617,7 @@ jobs: # 8-GPU MiniMax-M2.5 (Accuracy) nightly-8-gpu-minimax-m25: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-minimax-m25') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-8-gpu-minimax-m25,')) runs-on: linux-mi325-8gpu-sglang steps: - name: Checkout code @@ -655,7 +649,7 @@ jobs: # ============================================== MI35x Tests ============================================== # MI35x 1-GPU tests - platform-agnostic tests that may work on CDNA4 (gfx950) nightly-test-1-gpu-mi35x: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-test-1-gpu-mi35x') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-test-1-gpu-mi35x,')) runs-on: linux-mi35x-gpu-1 steps: - name: Checkout code @@ -687,7 +681,7 @@ jobs: # MI35x 8-GPU Accuracy Tests - GPT-OSS (accuracy only) nightly-accuracy-8-gpu-mi35x: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-accuracy-8-gpu-mi35x') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' 
|| inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-accuracy-8-gpu-mi35x,')) runs-on: linux-mi35x-gpu-8 steps: - name: Checkout code @@ -719,7 +713,7 @@ jobs: # MI35x 8-GPU Grok1-INT4 (Accuracy + Performance combined) nightly-8-gpu-mi35x-grok1-int4: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-mi35x-grok1-int4') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-8-gpu-mi35x-grok1-int4,')) runs-on: linux-mi35x-gpu-8 steps: - name: Checkout code @@ -765,7 +759,7 @@ jobs: # MI35x 8-GPU Grok2 (Accuracy + Performance combined) nightly-8-gpu-mi35x-grok2: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-mi35x-grok2') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-8-gpu-mi35x-grok2,')) runs-on: linux-mi35x-gpu-8 steps: - name: Checkout code @@ -811,7 +805,7 @@ jobs: # MI35x 8-GPU DeepSeek-R1-MXFP4 (Accuracy + Performance combined) nightly-8-gpu-mi35x-deepseek-r1-mxfp4: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-mi35x-deepseek-r1-mxfp4') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-8-gpu-mi35x-deepseek-r1-mxfp4,')) runs-on: linux-mi35x-gpu-8 steps: - name: 
Checkout code @@ -853,9 +847,97 @@ jobs: echo "$(> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} + # MI35x 8-GPU DeepSeek-R1-MXFP4 KV FP8 (Accuracy + Performance combined) + nightly-8-gpu-mi35x-deepseek-r1-mxfp4-kv-fp8: + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-8-gpu-mi35x-deepseek-r1-mxfp4-kv-fp8,')) + runs-on: linux-mi35x-gpu-8 + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ inputs.ref || github.ref }} + + - name: Setup docker + run: | + touch github_summary.md + bash scripts/ci/amd/amd_ci_start_container.sh + env: + GITHUB_WORKSPACE: ${{ github.workspace }} + + - name: Install dependencies + run: | + bash scripts/ci/amd/amd_ci_install_dependency.sh + # Install tabulate for run_suite.py (missing in MI35x container) + bash scripts/ci/amd/amd_ci_exec.sh pip install tabulate + + - name: Accuracy Test MI35x (8-GPU DeepSeek-R1-MXFP4 KV FP8) + timeout-minutes: 180 + run: | + > github_summary.md # Clear summary file + bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ + -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ + python3 run_suite.py --hw amd --suite nightly-amd-8-gpu-mi35x-deepseek-r1-mxfp4-kv-fp8 --nightly --timeout-per-file 7200 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? + echo "$(> $GITHUB_STEP_SUMMARY || true + exit ${TEST_EXIT_CODE:-0} + + - name: Performance Test MI35x (8-GPU DeepSeek-R1-MXFP4 KV FP8) + timeout-minutes: 300 + continue-on-error: true # Perf test failure doesn't fail the job if accuracy passed + run: | + > github_summary.md # Clear summary file + bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ + -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ + python3 registered/amd/perf/mi35x/test_deepseek_r1_mxfp4_kv_fp8_perf_mi35x.py || TEST_EXIT_CODE=$? 
+ echo "$(> $GITHUB_STEP_SUMMARY || true + exit ${TEST_EXIT_CODE:-0} + + # MI35x 8-GPU DeepSeek-R1-MXFP4 AllReduce Fusion (Accuracy + Performance combined) + nightly-8-gpu-mi35x-deepseek-r1-mxfp4-ar-fusion: + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-8-gpu-mi35x-deepseek-r1-mxfp4-ar-fusion,')) + runs-on: linux-mi35x-gpu-8 + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ inputs.ref || github.ref }} + + - name: Setup docker + run: | + touch github_summary.md + bash scripts/ci/amd/amd_ci_start_container.sh + env: + GITHUB_WORKSPACE: ${{ github.workspace }} + + - name: Install dependencies + run: | + bash scripts/ci/amd/amd_ci_install_dependency.sh + # Install tabulate for run_suite.py (missing in MI35x container) + bash scripts/ci/amd/amd_ci_exec.sh pip install tabulate + + - name: Accuracy Test MI35x (8-GPU DeepSeek-R1-MXFP4 AllReduce Fusion) + timeout-minutes: 180 + run: | + > github_summary.md # Clear summary file + bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ + -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ + python3 run_suite.py --hw amd --suite nightly-amd-8-gpu-mi35x-deepseek-r1-mxfp4-ar-fusion --nightly --timeout-per-file 7200 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? + echo "$(> $GITHUB_STEP_SUMMARY || true + exit ${TEST_EXIT_CODE:-0} + + - name: Performance Test MI35x (8-GPU DeepSeek-R1-MXFP4 AllReduce Fusion) + timeout-minutes: 300 + continue-on-error: true # Perf test failure doesn't fail the job if accuracy passed + run: | + > github_summary.md # Clear summary file + bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ + -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ + python3 registered/amd/perf/mi35x/test_deepseek_r1_mxfp4_ar_fusion_perf_mi35x.py || TEST_EXIT_CODE=$? 
+ echo "$(> $GITHUB_STEP_SUMMARY || true + exit ${TEST_EXIT_CODE:-0} + # MI35x 8-GPU DeepSeek-V3.2 Accuracy Test nightly-accuracy-8-gpu-mi35x-deepseek-v32: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-accuracy-8-gpu-mi35x-deepseek-v32') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-accuracy-8-gpu-mi35x-deepseek-v32,')) runs-on: linux-mi35x-gpu-8 steps: - name: Checkout code @@ -888,7 +970,7 @@ jobs: # MI35x 8-GPU DeepSeek-V3.2 TP+MTP Accuracy Test nightly-accuracy-8-gpu-mi35x-deepseek-v32-mtp: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-accuracy-8-gpu-mi35x-deepseek-v32-mtp') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-accuracy-8-gpu-mi35x-deepseek-v32-mtp,')) runs-on: linux-mi35x-gpu-8 steps: - name: Checkout code @@ -921,7 +1003,7 @@ jobs: # MI35x 8-GPU DeepSeek-V3.2 Performance Test (Basic) nightly-perf-8-gpu-mi35x-deepseek-v32-basic: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-perf-8-gpu-mi35x-deepseek-v32-basic') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-perf-8-gpu-mi35x-deepseek-v32-basic,')) runs-on: linux-mi35x-gpu-8 steps: - name: Checkout code @@ -954,7 +1036,7 @@ jobs: # MI35x 
8-GPU Kimi-K2.5 (Accuracy) nightly-8-gpu-mi35x-kimi-k25: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-mi35x-kimi-k25') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-8-gpu-mi35x-kimi-k25,')) runs-on: linux-mi35x-gpu-8 steps: - name: Checkout code @@ -987,7 +1069,7 @@ jobs: # MI35x 8-GPU Qwen3-235B-MXFP4 (Accuracy + Performance) nightly-8-gpu-mi35x-qwen3-235b-mxfp4: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-mi35x-qwen3-235b-mxfp4') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-8-gpu-mi35x-qwen3-235b-mxfp4,')) runs-on: linux-mi35x-gpu-8 steps: - name: Checkout code @@ -1020,7 +1102,7 @@ jobs: # MI35x 8-GPU Qwen 3.5 (Accuracy) nightly-8-gpu-mi35x-qwen35: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-mi35x-qwen35') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-8-gpu-mi35x-qwen35,')) runs-on: linux-mi35x-gpu-8 steps: - name: Checkout code @@ -1052,7 +1134,7 @@ jobs: exit ${TEST_EXIT_CODE:-0} nightly-8-gpu-mi35x-glm5: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 
'nightly-8-gpu-mi35x-glm5') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-8-gpu-mi35x-glm5,')) runs-on: linux-mi35x-gpu-8 steps: - name: Checkout code @@ -1087,7 +1169,7 @@ jobs: # MI35x 8-GPU MiniMax-M2.5 (Accuracy) nightly-8-gpu-mi35x-minimax-m25: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-mi35x-minimax-m25') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-8-gpu-mi35x-minimax-m25,')) runs-on: linux-mi35x-gpu-8 steps: - name: Checkout code @@ -1120,7 +1202,7 @@ jobs: # MI35x 8-GPU DeepSeek-V3.2 Performance Test (MTP) nightly-perf-8-gpu-mi35x-deepseek-v32-mtp: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-perf-8-gpu-mi35x-deepseek-v32-mtp') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-perf-8-gpu-mi35x-deepseek-v32-mtp,')) runs-on: linux-mi35x-gpu-8 steps: - name: Checkout code @@ -1169,6 +1251,7 @@ jobs: - nightly-8-gpu-deepseek-v31 - nightly-8-gpu-deepseek-v32 - nightly-8-gpu-deepseek-v32-mtp + - nightly-8-gpu-deepseek-v3-kv-fp8 - nightly-8-gpu-kimi-k25 - nightly-8-gpu-qwen3-235b - nightly-8-gpu-qwen35 @@ -1180,6 +1263,8 @@ jobs: - nightly-8-gpu-mi35x-grok1-int4 - nightly-8-gpu-mi35x-grok2 - nightly-8-gpu-mi35x-deepseek-r1-mxfp4 + - nightly-8-gpu-mi35x-deepseek-r1-mxfp4-kv-fp8 + - 
nightly-8-gpu-mi35x-deepseek-r1-mxfp4-ar-fusion - nightly-accuracy-8-gpu-mi35x-deepseek-v32 - nightly-accuracy-8-gpu-mi35x-deepseek-v32-mtp - nightly-8-gpu-mi35x-kimi-k25 diff --git a/.github/workflows/pr-test-amd-rocm720.yml b/.github/workflows/pr-test-amd-rocm720.yml index b566e1d84f5d..c8e848b99120 100644 --- a/.github/workflows/pr-test-amd-rocm720.yml +++ b/.github/workflows/pr-test-amd-rocm720.yml @@ -28,7 +28,7 @@ on: workflow_dispatch: inputs: target_stage: - description: "Specific stage to run (optional, for quick testing)" + description: "Specific stage(s) to run, comma-separated (e.g. 'stage-a-test-1-amd,stage-b-test-small-1-gpu-amd')" required: false type: string default: "" @@ -144,7 +144,7 @@ jobs: if: | always() && ( - (inputs.target_stage == 'sgl-kernel-unit-test-amd') || + (contains(format(',{0},', inputs.target_stage), ',sgl-kernel-unit-test-amd,')) || ( !inputs.target_stage && needs.check-changes.outputs.sgl_kernel == 'true' @@ -190,7 +190,7 @@ jobs: if: | always() && ( - (inputs.target_stage == 'sgl-kernel-unit-test-2-gpu-amd') || + (contains(format(',{0},', inputs.target_stage), ',sgl-kernel-unit-test-2-gpu-amd,')) || ( !inputs.target_stage && needs.check-changes.outputs.sgl_kernel == 'true' @@ -231,7 +231,7 @@ jobs: if: | always() && ( - (inputs.target_stage == 'stage-a-test-1-amd') || + (contains(format(',{0},', inputs.target_stage), ',stage-a-test-1-amd,')) || ( !inputs.target_stage && (!failure() && !cancelled()) && @@ -270,7 +270,7 @@ jobs: if: | always() && ( - (inputs.target_stage == 'jit-kernel-unit-test-amd') || + (contains(format(',{0},', inputs.target_stage), ',jit-kernel-unit-test-amd,')) || ( !inputs.target_stage && needs.check-changes.outputs.jit_kernel == 'true' @@ -308,7 +308,7 @@ jobs: if: | always() && ( - (inputs.target_stage == 'stage-b-test-small-1-gpu-amd') || + (contains(format(',{0},', inputs.target_stage), ',stage-b-test-small-1-gpu-amd,')) || ( !inputs.target_stage && (!failure() && !cancelled()) && @@ -319,7 +319,7 @@ 
jobs: fail-fast: false matrix: runner: [linux-mi325-1gpu-sglang] - part: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12] + part: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13] runs-on: ${{matrix.runner}} steps: - name: Checkout code @@ -340,14 +340,14 @@ jobs: - name: Run test timeout-minutes: 30 run: | - bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-small-1-gpu-amd --auto-partition-id ${{ matrix.part }} --auto-partition-size 13 --timeout-per-file 1800 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} + bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-small-1-gpu-amd --auto-partition-id ${{ matrix.part }} --auto-partition-size 14 --timeout-per-file 1800 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} stage-b-test-small-1-gpu-amd-nondeterministic: needs: [check-changes] if: | always() && ( - (inputs.target_stage == 'stage-b-test-small-1-gpu-amd-nondeterministic') || + (contains(format(',{0},', inputs.target_stage), ',stage-b-test-small-1-gpu-amd-nondeterministic,')) || ( !inputs.target_stage && (!failure() && !cancelled()) && @@ -385,7 +385,7 @@ jobs: if: | always() && ( - (inputs.target_stage == 'stage-b-test-small-1-gpu-amd-mi35x') || + (contains(format(',{0},', inputs.target_stage), ',stage-b-test-small-1-gpu-amd-mi35x,')) || ( !inputs.target_stage && (!failure() && !cancelled()) && @@ -423,7 +423,7 @@ jobs: if: | always() && ( - (inputs.target_stage == 'stage-b-test-large-1-gpu-amd') || + (contains(format(',{0},', inputs.target_stage), ',stage-b-test-large-1-gpu-amd,')) || ( !inputs.target_stage && (!failure() && !cancelled()) && @@ -462,7 +462,7 @@ jobs: if: | always() && ( - (inputs.target_stage == 'stage-b-test-large-2-gpu-amd') || + (contains(format(',{0},', inputs.target_stage), ',stage-b-test-large-2-gpu-amd,')) || ( !inputs.target_stage && (!failure() && !cancelled()) && @@ -501,7 +501,7 @@ jobs: if: | 
always() && ( - (inputs.target_stage == 'multimodal-gen-test-1-gpu-amd') || + (contains(format(',{0},', inputs.target_stage), ',multimodal-gen-test-1-gpu-amd,')) || ( !inputs.target_stage && (!failure() && !cancelled()) && @@ -631,7 +631,7 @@ jobs: if: | always() && ( - (inputs.target_stage == 'multimodal-gen-test-2-gpu-amd') || + (contains(format(',{0},', inputs.target_stage), ',multimodal-gen-test-2-gpu-amd,')) || ( !inputs.target_stage && (!failure() && !cancelled()) && @@ -760,7 +760,7 @@ jobs: if: | always() && ( - (inputs.target_stage == 'stage-c-test-large-8-gpu-amd') || + (contains(format(',{0},', inputs.target_stage), ',stage-c-test-large-8-gpu-amd,')) || ( !inputs.target_stage && (!failure() && !cancelled()) && @@ -807,7 +807,7 @@ jobs: if: | always() && ( - (inputs.target_stage == 'stage-c-test-large-8-gpu-amd-mi35x') || + (contains(format(',{0},', inputs.target_stage), ',stage-c-test-large-8-gpu-amd-mi35x,')) || ( !inputs.target_stage && (!failure() && !cancelled()) && @@ -841,6 +841,118 @@ jobs: run: | bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-c-test-large-8-gpu-amd-mi35x --auto-partition-id ${{ matrix.part }} --auto-partition-size 1 --timeout-per-file 3600 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} + # =============================================== Disaggregation ==================================================== + stage-b-test-large-8-gpu-35x-disaggregation-amd: + needs: [check-changes] + if: | + always() && + ( + (contains(format(',{0},', inputs.target_stage), ',stage-b-test-large-8-gpu-disaggregation-amd,')) || + ( + !inputs.target_stage && + (!failure() && !cancelled()) && + ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true')) + ) + ) + strategy: + fail-fast: false + matrix: + runner: [linux-mi35x-gpu-8.fabric] + + runs-on: ${{matrix.runner}} + + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ 
inputs.pr_head_sha || inputs.ref || github.sha }} + + - name: Ensure VRAM is clear + run: bash scripts/ensure_vram_clear.sh rocm + + - name: Check Host RDMA Environment + id: rdma_detect + run: | + set +e + echo "=== Checking Host RDMA Environment ===" + + echo "" + echo "=== 1. Ionic driver library check ===" + ls -l /usr/lib/x86_64-linux-gnu/libibverbs/libionic* 2>/dev/null || echo "libionic not found in standard path" + + echo "" + echo "=== 2. Infiniband devices ===" + ls -la /dev/infiniband/ 2>/dev/null || echo "/dev/infiniband not found" + ls -la /sys/class/infiniband/ 2>/dev/null || echo "/sys/class/infiniband not found" + + echo "" + echo "=== 3. ibv_devinfo ===" + which ibv_devinfo 2>/dev/null && ibv_devinfo 2>&1 || echo "ibv_devinfo not available" + + echo "" + echo "=== 4. Kernel modules ===" + lsmod 2>/dev/null | grep -E "ib_|rdma|ionic" || echo "No RDMA kernel modules loaded" + + echo "" + echo "=== 5. Detect RDMA Devices for test environment ===" + if [ -d "/sys/class/infiniband" ]; then + RDMA_DEVS=$(ls /sys/class/infiniband | paste -sd "," -) + echo "Detected RDMA Devices: $RDMA_DEVS" + echo "SGLANG_TEST_RDMA_DEVICE=$RDMA_DEVS" >> $GITHUB_ENV + else + echo "No RDMA devices found in /sys/class/infiniband" + echo "SGLANG_TEST_RDMA_DEVICE=" >> $GITHUB_ENV + fi + + echo "" + echo "=== Host RDMA Check Complete ===" + + - name: Start Special Container + run: bash scripts/ci/amd/amd_ci_start_container_disagg.sh --rocm-version rocm720 + env: + GITHUB_WORKSPACE: ${{ github.workspace }} + + - name: Install dependencies + run: bash scripts/ci/amd/amd_ci_install_dependency.sh + + - name: Verify RDMA in Container + run: | + docker exec -u root ci_sglang bash -c ' + echo "=== Container RDMA Verification ===" + echo "Device nodes:" + ls -la /dev/infiniband/ + echo "" + echo "Provider libraries:" + ls /usr/lib/x86_64-linux-gnu/libibverbs/ | grep -E "ionic|mlx" || echo "No Ionic/Mellanox providers" + echo "" + echo "HCA devices:" + HCA_COUNT=$(ibv_devinfo -list 2>&1 
| grep -oE "^[0-9]+ HCAs? found" | grep -oE "^[0-9]+" || echo "0") + ibv_devinfo -list + if [ "$HCA_COUNT" -gt 0 ]; then + echo "" + echo "=== SUCCESS: RDMA setup complete. Found $HCA_COUNT HCA(s) ===" + else + echo "" + echo "=== WARNING: No HCAs detected. RDMA tests may fail ===" + fi + ' + + - name: Run Aiter Op Test (RMSNorm) + timeout-minutes: 10 + run: | + echo "Running pre-check: test_rmsnorm2d.py" + docker exec \ + -e MAX_JOBS=192 \ + ci_sglang \ + python /sgl-workspace/aiter/op_tests/test_rmsnorm2d.py + + - name: Run test_disaggregation + timeout-minutes: 60 + run: | + bash scripts/ci/amd/amd_ci_exec.sh \ + -e SGLANG_TEST_RDMA_DEVICE="${{ env.SGLANG_TEST_RDMA_DEVICE }}" \ + -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-large-8-gpu-35x-disaggregation-amd --timeout-per-file 1800 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} + pr-test-amd-finish: needs: [ @@ -859,6 +971,7 @@ jobs: stage-b-test-small-1-gpu-amd-mi35x, stage-b-test-large-1-gpu-amd, stage-b-test-large-2-gpu-amd, + stage-b-test-large-8-gpu-35x-disaggregation-amd, stage-c-test-large-8-gpu-amd, stage-c-test-large-8-gpu-amd-mi35x, ] diff --git a/.github/workflows/pr-test-amd.yml b/.github/workflows/pr-test-amd.yml index a803a1ed1c45..415042b473e6 100644 --- a/.github/workflows/pr-test-amd.yml +++ b/.github/workflows/pr-test-amd.yml @@ -25,7 +25,7 @@ on: workflow_dispatch: inputs: target_stage: - description: "Specific stage to run (optional, for quick testing)" + description: "Specific stage(s) to run, comma-separated (e.g. 
'stage-a-test-1-amd,stage-b-test-small-1-gpu-amd')" required: false type: string default: "" @@ -141,7 +141,7 @@ jobs: if: | always() && ( - (inputs.target_stage == 'sgl-kernel-unit-test-amd') || + (contains(format(',{0},', inputs.target_stage), ',sgl-kernel-unit-test-amd,')) || ( !inputs.target_stage && needs.check-changes.outputs.sgl_kernel == 'true' @@ -188,7 +188,7 @@ jobs: if: | always() && ( - (inputs.target_stage == 'sgl-kernel-unit-test-2-gpu-amd') || + (contains(format(',{0},', inputs.target_stage), ',sgl-kernel-unit-test-2-gpu-amd,')) || ( !inputs.target_stage && needs.check-changes.outputs.sgl_kernel == 'true' @@ -230,7 +230,7 @@ jobs: if: | always() && ( - (inputs.target_stage == 'stage-a-test-1-amd') || + (contains(format(',{0},', inputs.target_stage), ',stage-a-test-1-amd,')) || ( !inputs.target_stage && (!failure() && !cancelled()) && @@ -270,7 +270,7 @@ jobs: if: | always() && ( - (inputs.target_stage == 'jit-kernel-unit-test-amd') || + (contains(format(',{0},', inputs.target_stage), ',jit-kernel-unit-test-amd,')) || ( !inputs.target_stage && needs.check-changes.outputs.jit_kernel == 'true' @@ -309,7 +309,7 @@ jobs: if: | always() && ( - (inputs.target_stage == 'stage-b-test-small-1-gpu-amd') || + (contains(format(',{0},', inputs.target_stage), ',stage-b-test-small-1-gpu-amd,')) || ( !inputs.target_stage && (!failure() && !cancelled()) && @@ -349,7 +349,7 @@ jobs: if: | always() && ( - (inputs.target_stage == 'stage-b-test-small-1-gpu-amd-nondeterministic') || + (contains(format(',{0},', inputs.target_stage), ',stage-b-test-small-1-gpu-amd-nondeterministic,')) || ( !inputs.target_stage && (!failure() && !cancelled()) && @@ -388,7 +388,7 @@ jobs: if: | always() && ( - (inputs.target_stage == 'stage-b-test-small-1-gpu-amd-mi35x') || + (contains(format(',{0},', inputs.target_stage), ',stage-b-test-small-1-gpu-amd-mi35x,')) || ( !inputs.target_stage && (!failure() && !cancelled()) && @@ -427,7 +427,7 @@ jobs: if: | always() && ( - (inputs.target_stage 
== 'stage-b-test-large-1-gpu-amd') || + (contains(format(',{0},', inputs.target_stage), ',stage-b-test-large-1-gpu-amd,')) || ( !inputs.target_stage && (!failure() && !cancelled()) && @@ -467,7 +467,7 @@ jobs: if: | always() && ( - (inputs.target_stage == 'stage-b-test-large-2-gpu-amd') || + (contains(format(',{0},', inputs.target_stage), ',stage-b-test-large-2-gpu-amd,')) || ( !inputs.target_stage && (!failure() && !cancelled()) && @@ -504,7 +504,15 @@ jobs: multimodal-gen-test-1-gpu-amd: needs: [check-changes] - if: needs.check-changes.outputs.multimodal_gen == 'true' + if: | + always() && + ( + (contains(format(',{0},', inputs.target_stage), ',multimodal-gen-test-1-gpu-amd,')) || + ( + !inputs.target_stage && + needs.check-changes.outputs.multimodal_gen == 'true' + ) + ) strategy: fail-fast: false max-parallel: 1 # Run one at a time to avoid eviction from resource exhaustion during AITER kernel JIT @@ -624,7 +632,15 @@ jobs: multimodal-gen-test-2-gpu-amd: needs: [check-changes] - if: needs.check-changes.outputs.multimodal_gen == 'true' + if: | + always() && + ( + (contains(format(',{0},', inputs.target_stage), ',multimodal-gen-test-2-gpu-amd,')) || + ( + !inputs.target_stage && + needs.check-changes.outputs.multimodal_gen == 'true' + ) + ) strategy: fail-fast: false max-parallel: 1 # Run one at a time to avoid eviction from resource exhaustion during AITER kernel JIT @@ -746,7 +762,7 @@ jobs: if: | always() && ( - (inputs.target_stage == 'stage-c-test-large-8-gpu-amd') || + (contains(format(',{0},', inputs.target_stage), ',stage-c-test-large-8-gpu-amd,')) || ( !inputs.target_stage && (!failure() && !cancelled()) && @@ -794,7 +810,7 @@ jobs: if: | always() && ( - (inputs.target_stage == 'stage-c-test-large-8-gpu-amd-mi35x') || + (contains(format(',{0},', inputs.target_stage), ',stage-c-test-large-8-gpu-amd-mi35x,')) || ( !inputs.target_stage && (!failure() && !cancelled()) && @@ -835,7 +851,7 @@ jobs: if: | always() && ( - (inputs.target_stage == 
'stage-b-test-large-8-gpu-disaggregation-amd') || + (contains(format(',{0},', inputs.target_stage), ',stage-b-test-large-8-gpu-disaggregation-amd,')) || ( !inputs.target_stage && (!failure() && !cancelled()) && diff --git a/scripts/ci/amd/amd_ci_start_container_disagg.sh b/scripts/ci/amd/amd_ci_start_container_disagg.sh index ecf24f652e9f..70de85dff91e 100755 --- a/scripts/ci/amd/amd_ci_start_container_disagg.sh +++ b/scripts/ci/amd/amd_ci_start_container_disagg.sh @@ -32,8 +32,14 @@ while [[ $# -gt 0 ]]; do case $1 in --mi30x-base-tag) MI30X_BASE_TAG="$2"; shift 2;; --mi35x-base-tag) MI35X_BASE_TAG="$2"; shift 2;; + --rocm-version) + ROCM_VERSION="$2" + MI30X_BASE_TAG="${SGLANG_VERSION}-${ROCM_VERSION}-mi30x" + MI35X_BASE_TAG="${SGLANG_VERSION}-${ROCM_VERSION}-mi35x" + echo "Using ROCm version override: ${ROCM_VERSION}" + shift 2;; -h|--help) - echo "Usage: $0 [--mi30x-base-tag TAG] [--mi35x-base-tag TAG]" + echo "Usage: $0 [--mi30x-base-tag TAG] [--mi35x-base-tag TAG] [--rocm-version VERSION]" exit 0 ;; *) echo "Unknown option $1"; exit 1;; @@ -134,12 +140,27 @@ find_latest_image() { fi echo "Error: no ${gpu_arch} image found in the last 7 days for base ${base_tag}" >&2 - echo "Using hard-coded fallback…" >&2 - if [[ "${gpu_arch}" == "mi35x" ]]; then - echo "rocm/sgl-dev:v0.5.5-rocm700-mi35x-20251110" - else - echo "rocm/sgl-dev:v0.5.5-rocm700-mi30x-20251110" - fi + echo "Using hard-coded fallback for ${ROCM_VERSION}…" >&2 + case "${ROCM_VERSION}" in + rocm720) + if [[ "${gpu_arch}" == "mi35x" ]]; then + echo "rocm/sgl-dev:v0.5.8.post1-rocm720-mi35x-20260211-preview" + else + echo "rocm/sgl-dev:v0.5.8.post1-rocm720-mi30x-20260211-preview" + fi + ;; + rocm700) + if [[ "${gpu_arch}" == "mi35x" ]]; then + echo "rocm/sgl-dev:v0.5.8.post1-rocm700-mi35x-20260211" + else + echo "rocm/sgl-dev:v0.5.8.post1-rocm700-mi30x-20260211" + fi + ;; + *) + echo "Error: no hard-coded fallback available for ${ROCM_VERSION}" >&2 + return 1 + ;; + esac } # Pull and run the latest image 
diff --git a/test/registered/amd/accuracy/mi35x/test_deepseek_r1_mxfp4_ar_fusion_eval_mi35x.py b/test/registered/amd/accuracy/mi35x/test_deepseek_r1_mxfp4_ar_fusion_eval_mi35x.py new file mode 100644 index 000000000000..1636d27cf27e --- /dev/null +++ b/test/registered/amd/accuracy/mi35x/test_deepseek_r1_mxfp4_ar_fusion_eval_mi35x.py @@ -0,0 +1,280 @@ +"""MI35x DeepSeek-R1-MXFP4 GSM8K Completion Evaluation Test with AIter AllReduce Fusion (8-GPU) + +Tests DeepSeek-R1-MXFP4 quantized model with --enable-aiter-allreduce-fusion +using few-shot completion benchmark on MI35x. + +Registry: nightly-amd-8-gpu-mi35x-deepseek-r1-mxfp4-ar-fusion suite +""" + +import ast +import os + +# Set HF cache for MI35x +os.environ.setdefault("HF_HOME", "/data2/models/huggingface") +os.environ.setdefault("HF_HUB_CACHE", "/data2/models/huggingface/hub") + +import re +import time +import unittest +from dataclasses import dataclass +from typing import List, Optional, Tuple + +import numpy as np + +from sglang.srt.utils import kill_process_tree +from sglang.test.ci.ci_register import register_amd_ci +from sglang.test.test_utils import ( + DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + DEFAULT_URL_FOR_TEST, + is_in_ci, + popen_launch_server, + write_github_step_summary, +) +from sglang.utils import download_and_cache_file, read_jsonl + +# Register for AMD CI - MI35x DeepSeek-R1-MXFP4 AllReduce Fusion accuracy test (~60 min) +register_amd_ci( + est_time=3600, + suite="nightly-amd-8-gpu-mi35x-deepseek-r1-mxfp4-ar-fusion", + nightly=True, +) + +INVALID = -9999999 + +# Model path configuration for MI35x DeepSeek-R1-MXFP4 +# Priority: 1) env var, 2) local path, 3) HuggingFace model ID +DEEPSEEK_R1_MXFP4_LOCAL_PATH = "/data2/models/amd-DeepSeek-R1-MXFP4-Preview" +DEEPSEEK_R1_MXFP4_HF_MODEL_ID = "amd/DeepSeek-R1-MXFP4-Preview" + + +def get_model_path() -> str: + """Get effective model path: env var > local path > HF model ID.""" + env_path = os.environ.get("DEEPSEEK_R1_MXFP4_MODEL_PATH") + if env_path: + 
return env_path + if os.path.exists(DEEPSEEK_R1_MXFP4_LOCAL_PATH): + return DEEPSEEK_R1_MXFP4_LOCAL_PATH + return DEEPSEEK_R1_MXFP4_HF_MODEL_ID + + +@dataclass +class ModelConfig: + """Configuration for a model to test.""" + + model_path: str + tp_size: int = 8 + accuracy_threshold: float = 0.50 + other_args: Optional[List[str]] = None + env_vars: Optional[dict] = None + timeout: Optional[int] = None + variant: Optional[str] = None + + def __post_init__(self): + if self.other_args is None: + self.other_args = [] + if self.env_vars is None: + self.env_vars = {} + + def get_display_name(self) -> str: + if self.variant: + return f"{self.model_path} ({self.variant})" + return self.model_path + + +def get_mxfp4_models() -> List[ModelConfig]: + """Get DeepSeek-R1-MXFP4 model configurations for MI35x with AllReduce Fusion.""" + model_path = get_model_path() + return [ + ModelConfig( + model_path=model_path, + tp_size=8, + accuracy_threshold=0.93, + timeout=3600, + variant="ar-fusion", + other_args=[ + "--attention-backend", + "aiter", + "--chunked-prefill-size", + "131072", + "--disable-radix-cache", + "--mem-fraction-static", + "0.85", + "--trust-remote-code", + "--enable-aiter-allreduce-fusion", + ], + env_vars={"SGLANG_USE_AITER": "1"}, + ), + ] + + +def get_one_example(lines, i, include_answer): + """Format a single GSM8K example.""" + ret = "Question: " + lines[i]["question"] + "\nAnswer:" + if include_answer: + ret += " " + lines[i]["answer"] + return ret + + +def get_few_shot_examples(lines, k): + """Get k few-shot examples for prompting.""" + ret = "" + for i in range(k): + ret += get_one_example(lines, i, True) + "\n\n" + return ret + + +def get_answer_value(answer_str): + """Extract numerical answer from response.""" + answer_str = answer_str.replace(",", "") + numbers = re.findall(r"\d+", answer_str) + if len(numbers) < 1: + return INVALID + try: + return ast.literal_eval(numbers[-1]) + except SyntaxError: + return INVALID + + +def run_gsm8k_benchmark( + 
base_url: str, + num_questions: int = 200, + num_shots: int = 5, + parallel: int = 64, +) -> Tuple[float, float, float]: + """Run GSM8K few-shot completion benchmark.""" + import sglang as sgl + from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint + + url = "https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/test.jsonl" + data_path = download_and_cache_file(url) + lines = list(read_jsonl(data_path)) + + few_shot_examples = get_few_shot_examples(lines, num_shots) + + questions = [] + labels = [] + for i in range(len(lines[:num_questions])): + questions.append(get_one_example(lines, i, False)) + labels.append(get_answer_value(lines[i]["answer"])) + assert all(l != INVALID for l in labels) + arguments = [{"question": q} for q in questions] + + @sgl.function + def few_shot_gsm8k(s, question): + s += few_shot_examples + question + s += sgl.gen( + "answer", max_tokens=512, stop=["Question", "Assistant:", "<|separator|>"] + ) + + backend = RuntimeEndpoint(base_url) + sgl.set_default_backend(backend) + + tic = time.perf_counter() + states = few_shot_gsm8k.run_batch( + arguments, temperature=0, num_threads=parallel, progress_bar=True + ) + latency = time.perf_counter() - tic + + preds = [get_answer_value(states[i]["answer"]) for i in range(len(states))] + acc = np.mean(np.array(preds) == np.array(labels)) + invalid = np.mean(np.array(preds) == INVALID) + + return float(acc), float(invalid), float(latency) + + +class TestDeepSeekR1MXFP4ArFusionEvalMI35x(unittest.TestCase): + """DeepSeek-R1-MXFP4 GSM8K Evaluation with AllReduce Fusion for AMD MI35x.""" + + @classmethod + def setUpClass(cls): + cls.models = get_mxfp4_models() + cls.base_url = DEFAULT_URL_FOR_TEST + cls.num_questions = int(os.environ.get("GSM8K_NUM_QUESTIONS", "200")) + + def test_deepseek_r1_mxfp4_ar_fusion_accuracy(self): + """Test DeepSeek-R1-MXFP4 models with AllReduce Fusion on GSM8K.""" + # Check if model exists + model_path = get_model_path() + 
is_local_path = model_path.startswith("/") + if is_local_path and not os.path.exists(model_path): + print(f"\n⏭️ SKIPPING: Local model not found at {model_path}") + self.skipTest(f"Local model not found at {model_path}") + return + + if is_local_path: + print(f"📁 Using local model: {model_path}") + else: + print(f"📥 Using HuggingFace model: {model_path}") + + all_results = [] + summary = "### DeepSeek-R1-MXFP4 AllReduce Fusion Models (MI35x)\n\n" + summary += "| Model | Variant | TP | Accuracy | Threshold | Status |\n" + summary += "| ----- | ------- | -- | -------- | --------- | ------ |\n" + + for config in self.models: + display_name = config.get_display_name() + with self.subTest(model=display_name): + print(f"\n{'='*60}") + print(f"Testing: {display_name}") + print(f"{'='*60}") + + env = os.environ.copy() + for key, value in config.env_vars.items(): + env[key] = value + + other_args = list(config.other_args) + other_args.extend(["--tp", str(config.tp_size)]) + timeout = config.timeout or DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH + + try: + process = popen_launch_server( + model=config.model_path, + base_url=self.base_url, + timeout=timeout, + other_args=other_args, + env=env, + ) + + try: + acc, invalid, latency = run_gsm8k_benchmark( + self.base_url, num_questions=self.num_questions + ) + passed = acc >= config.accuracy_threshold + status = "✅ PASS" if passed else "❌ FAIL" + print( + f" accuracy={acc:.3f} threshold={config.accuracy_threshold} {status}" + ) + + all_results.append( + { + "model": display_name, + "accuracy": acc, + "passed": passed, + } + ) + summary += f"| {config.model_path} | {config.variant or 'N/A'} | {config.tp_size} | {acc:.3f} | {config.accuracy_threshold} | {status} |\n" + + finally: + kill_process_tree(process.pid) + + except Exception as e: + summary += f"| {config.model_path} | {config.variant or 'N/A'} | {config.tp_size} | N/A | {config.accuracy_threshold} | ❌ ERROR |\n" + all_results.append( + { + "model": display_name, + "accuracy": None, 
+ "passed": False, + "error": str(e), + } + ) + + if is_in_ci(): + write_github_step_summary(summary) + + failed = [r for r in all_results if not r["passed"]] + if failed: + raise AssertionError(f"Failed models: {[r['model'] for r in failed]}") + + +if __name__ == "__main__": + unittest.main() diff --git a/test/registered/amd/accuracy/mi35x/test_deepseek_r1_mxfp4_kv_fp8_eval_mi35x.py b/test/registered/amd/accuracy/mi35x/test_deepseek_r1_mxfp4_kv_fp8_eval_mi35x.py new file mode 100644 index 000000000000..cb54e77528fa --- /dev/null +++ b/test/registered/amd/accuracy/mi35x/test_deepseek_r1_mxfp4_kv_fp8_eval_mi35x.py @@ -0,0 +1,281 @@ +"""MI35x DeepSeek-R1-MXFP4 GSM8K Completion Evaluation Test with KV Cache FP8 (8-GPU) + +Tests DeepSeek-R1-MXFP4 quantized model with --kv-cache-dtype fp8_e4m3 +using few-shot completion benchmark on MI35x. + +Registry: nightly-amd-8-gpu-mi35x-deepseek-r1-mxfp4-kv-fp8 suite +""" + +import ast +import os + +# Set HF cache for MI35x +os.environ.setdefault("HF_HOME", "/data2/models/huggingface") +os.environ.setdefault("HF_HUB_CACHE", "/data2/models/huggingface/hub") + +import re +import time +import unittest +from dataclasses import dataclass +from typing import List, Optional, Tuple + +import numpy as np + +from sglang.srt.utils import kill_process_tree +from sglang.test.ci.ci_register import register_amd_ci +from sglang.test.test_utils import ( + DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + DEFAULT_URL_FOR_TEST, + is_in_ci, + popen_launch_server, + write_github_step_summary, +) +from sglang.utils import download_and_cache_file, read_jsonl + +# Register for AMD CI - MI35x DeepSeek-R1-MXFP4 KV FP8 accuracy test (~60 min) +register_amd_ci( + est_time=3600, + suite="nightly-amd-8-gpu-mi35x-deepseek-r1-mxfp4-kv-fp8", + nightly=True, +) + +INVALID = -9999999 + +# Model path configuration for MI35x DeepSeek-R1-MXFP4 +# Priority: 1) env var, 2) local path, 3) HuggingFace model ID +DEEPSEEK_R1_MXFP4_LOCAL_PATH = 
"/data2/models/amd-DeepSeek-R1-MXFP4-Preview" +DEEPSEEK_R1_MXFP4_HF_MODEL_ID = "amd/DeepSeek-R1-MXFP4-Preview" + + +def get_model_path() -> str: + """Get effective model path: env var > local path > HF model ID.""" + env_path = os.environ.get("DEEPSEEK_R1_MXFP4_MODEL_PATH") + if env_path: + return env_path + if os.path.exists(DEEPSEEK_R1_MXFP4_LOCAL_PATH): + return DEEPSEEK_R1_MXFP4_LOCAL_PATH + return DEEPSEEK_R1_MXFP4_HF_MODEL_ID + + +@dataclass +class ModelConfig: + """Configuration for a model to test.""" + + model_path: str + tp_size: int = 8 + accuracy_threshold: float = 0.50 + other_args: Optional[List[str]] = None + env_vars: Optional[dict] = None + timeout: Optional[int] = None + variant: Optional[str] = None + + def __post_init__(self): + if self.other_args is None: + self.other_args = [] + if self.env_vars is None: + self.env_vars = {} + + def get_display_name(self) -> str: + if self.variant: + return f"{self.model_path} ({self.variant})" + return self.model_path + + +def get_mxfp4_models() -> List[ModelConfig]: + """Get DeepSeek-R1-MXFP4 model configurations for MI35x with KV cache FP8.""" + model_path = get_model_path() + return [ + ModelConfig( + model_path=model_path, + tp_size=8, + accuracy_threshold=0.93, + timeout=3600, + variant="kv-fp8", + other_args=[ + "--attention-backend", + "aiter", + "--chunked-prefill-size", + "131072", + "--disable-radix-cache", + "--mem-fraction-static", + "0.85", + "--trust-remote-code", + "--kv-cache-dtype", + "fp8_e4m3", + ], + env_vars={"SGLANG_USE_AITER": "1"}, + ), + ] + + +def get_one_example(lines, i, include_answer): + """Format a single GSM8K example.""" + ret = "Question: " + lines[i]["question"] + "\nAnswer:" + if include_answer: + ret += " " + lines[i]["answer"] + return ret + + +def get_few_shot_examples(lines, k): + """Get k few-shot examples for prompting.""" + ret = "" + for i in range(k): + ret += get_one_example(lines, i, True) + "\n\n" + return ret + + +def get_answer_value(answer_str): + """Extract 
numerical answer from response.""" + answer_str = answer_str.replace(",", "") + numbers = re.findall(r"\d+", answer_str) + if len(numbers) < 1: + return INVALID + try: + return ast.literal_eval(numbers[-1]) + except SyntaxError: + return INVALID + + +def run_gsm8k_benchmark( + base_url: str, + num_questions: int = 200, + num_shots: int = 5, + parallel: int = 64, +) -> Tuple[float, float, float]: + """Run GSM8K few-shot completion benchmark.""" + import sglang as sgl + from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint + + url = "https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/test.jsonl" + data_path = download_and_cache_file(url) + lines = list(read_jsonl(data_path)) + + few_shot_examples = get_few_shot_examples(lines, num_shots) + + questions = [] + labels = [] + for i in range(len(lines[:num_questions])): + questions.append(get_one_example(lines, i, False)) + labels.append(get_answer_value(lines[i]["answer"])) + assert all(l != INVALID for l in labels) + arguments = [{"question": q} for q in questions] + + @sgl.function + def few_shot_gsm8k(s, question): + s += few_shot_examples + question + s += sgl.gen( + "answer", max_tokens=512, stop=["Question", "Assistant:", "<|separator|>"] + ) + + backend = RuntimeEndpoint(base_url) + sgl.set_default_backend(backend) + + tic = time.perf_counter() + states = few_shot_gsm8k.run_batch( + arguments, temperature=0, num_threads=parallel, progress_bar=True + ) + latency = time.perf_counter() - tic + + preds = [get_answer_value(states[i]["answer"]) for i in range(len(states))] + acc = np.mean(np.array(preds) == np.array(labels)) + invalid = np.mean(np.array(preds) == INVALID) + + return float(acc), float(invalid), float(latency) + + +class TestDeepSeekR1MXFP4KvFp8EvalMI35x(unittest.TestCase): + """DeepSeek-R1-MXFP4 GSM8K Evaluation with KV Cache FP8 for AMD MI35x.""" + + @classmethod + def setUpClass(cls): + cls.models = get_mxfp4_models() + cls.base_url = 
DEFAULT_URL_FOR_TEST + cls.num_questions = int(os.environ.get("GSM8K_NUM_QUESTIONS", "200")) + + def test_deepseek_r1_mxfp4_kv_fp8_accuracy(self): + """Test DeepSeek-R1-MXFP4 models with KV cache FP8 on GSM8K.""" + # Check if model exists + model_path = get_model_path() + is_local_path = model_path.startswith("/") + if is_local_path and not os.path.exists(model_path): + print(f"\n⏭️ SKIPPING: Local model not found at {model_path}") + self.skipTest(f"Local model not found at {model_path}") + return + + if is_local_path: + print(f"📁 Using local model: {model_path}") + else: + print(f"📥 Using HuggingFace model: {model_path}") + + all_results = [] + summary = "### DeepSeek-R1-MXFP4 KV FP8 Models (MI35x)\n\n" + summary += "| Model | Variant | TP | Accuracy | Threshold | Status |\n" + summary += "| ----- | ------- | -- | -------- | --------- | ------ |\n" + + for config in self.models: + display_name = config.get_display_name() + with self.subTest(model=display_name): + print(f"\n{'='*60}") + print(f"Testing: {display_name}") + print(f"{'='*60}") + + env = os.environ.copy() + for key, value in config.env_vars.items(): + env[key] = value + + other_args = list(config.other_args) + other_args.extend(["--tp", str(config.tp_size)]) + timeout = config.timeout or DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH + + try: + process = popen_launch_server( + model=config.model_path, + base_url=self.base_url, + timeout=timeout, + other_args=other_args, + env=env, + ) + + try: + acc, invalid, latency = run_gsm8k_benchmark( + self.base_url, num_questions=self.num_questions + ) + passed = acc >= config.accuracy_threshold + status = "✅ PASS" if passed else "❌ FAIL" + print( + f" accuracy={acc:.3f} threshold={config.accuracy_threshold} {status}" + ) + + all_results.append( + { + "model": display_name, + "accuracy": acc, + "passed": passed, + } + ) + summary += f"| {config.model_path} | {config.variant or 'N/A'} | {config.tp_size} | {acc:.3f} | {config.accuracy_threshold} | {status} |\n" + + finally: + 
kill_process_tree(process.pid) + + except Exception as e: + summary += f"| {config.model_path} | {config.variant or 'N/A'} | {config.tp_size} | N/A | {config.accuracy_threshold} | ❌ ERROR |\n" + all_results.append( + { + "model": display_name, + "accuracy": None, + "passed": False, + "error": str(e), + } + ) + + if is_in_ci(): + write_github_step_summary(summary) + + failed = [r for r in all_results if not r["passed"]] + if failed: + raise AssertionError(f"Failed models: {[r['model'] for r in failed]}") + + +if __name__ == "__main__": + unittest.main() diff --git a/test/registered/amd/perf/mi35x/test_deepseek_r1_mxfp4_ar_fusion_perf_mi35x.py b/test/registered/amd/perf/mi35x/test_deepseek_r1_mxfp4_ar_fusion_perf_mi35x.py new file mode 100644 index 000000000000..a4104cad5ed2 --- /dev/null +++ b/test/registered/amd/perf/mi35x/test_deepseek_r1_mxfp4_ar_fusion_perf_mi35x.py @@ -0,0 +1,177 @@ +"""MI35x Nightly performance benchmark for DeepSeek-R1-MXFP4 model with AIter AllReduce Fusion. + +This test benchmarks the DeepSeek-R1-MXFP4 quantized model on MI35x with 8 GPUs +using --enable-aiter-allreduce-fusion. + +The model path can be configured via DEEPSEEK_R1_MXFP4_MODEL_PATH environment variable. 
+ +Registry: nightly-perf-8-gpu-mi35x-deepseek-r1-mxfp4-ar-fusion suite + +Example usage: + DEEPSEEK_R1_MXFP4_MODEL_PATH=/data2/models/amd-DeepSeek-R1-MXFP4-Preview python -m pytest test_deepseek_r1_mxfp4_ar_fusion_perf_mi35x.py -v +""" + +import os + +# Set HF cache to /data2/models/ for MI35x so HF models download there +os.environ.setdefault("HF_HOME", "/data2/models/huggingface") +os.environ.setdefault("HF_HUB_CACHE", "/data2/models/huggingface/hub") +import unittest +from typing import List + +from sglang.test.ci.ci_register import register_amd_ci +from sglang.test.nightly_bench_utils import BenchmarkResult +from sglang.test.nightly_utils import NightlyBenchmarkRunner +from sglang.test.test_utils import DEFAULT_URL_FOR_TEST, _parse_int_list_env + +# Register for AMD CI - DeepSeek-R1-MXFP4 AllReduce Fusion benchmark on MI35x (~300 min) +register_amd_ci( + est_time=18000, + suite="nightly-perf-8-gpu-mi35x-deepseek-r1-mxfp4-ar-fusion", + nightly=True, +) + + +def generate_simple_markdown_report(results: List[BenchmarkResult]) -> str: + """Generate a simplified markdown report without traces and cost columns. + + Skips the first result if it's a warmup run (duplicate batch_size). 
+ """ + model_header = results[0].model_path + if results[0].run_name and results[0].run_name != "default": + model_header += f" ({results[0].run_name})" + + gpu_config = os.getenv("GPU_CONFIG", "MI35x") + if gpu_config: + model_header += f" [{gpu_config}]" + + summary = f"### {model_header}\n" + summary += "| batch size | input len | latency (s) | input throughput (tok/s) | output throughput (tok/s) | ITL (ms) |\n" + summary += "| ---------- | --------- | ----------- | ------------------------ | ------------------------- | -------- |\n" + + # Skip first result if it's a warmup (same batch_size as second result) + report_results = ( + results[1:] + if len(results) > 1 and results[0].batch_size == results[1].batch_size + else results + ) + + for result in report_results: + itl = 1 / (result.output_throughput / result.batch_size) * 1000 + summary += f"| {result.batch_size} | {result.input_len} | {result.latency:.2f} | {result.input_throughput:.2f} | {result.output_throughput:.2f} | {itl:.2f} |\n" + + return summary + + +# Model path configuration for MI35x DeepSeek-R1-MXFP4 +# Priority: 1) env var, 2) local path, 3) HuggingFace model ID +DEEPSEEK_R1_MXFP4_LOCAL_PATH = "/data2/models/amd-DeepSeek-R1-MXFP4-Preview" +DEEPSEEK_R1_MXFP4_HF_MODEL_ID = "amd/DeepSeek-R1-MXFP4-Preview" +PROFILE_DIR = "performance_profiles_deepseek_r1_mxfp4_ar_fusion_mi35x" + + +def get_model_path() -> str: + """Get effective model path: env var > local path > HF model ID.""" + # Check env var first + env_path = os.environ.get("DEEPSEEK_R1_MXFP4_MODEL_PATH") + if env_path: + return env_path + # Check local path + if os.path.exists(DEEPSEEK_R1_MXFP4_LOCAL_PATH): + return DEEPSEEK_R1_MXFP4_LOCAL_PATH + # Fall back to HF model ID + return DEEPSEEK_R1_MXFP4_HF_MODEL_ID + + +class TestDeepseekR1MXFP4ArFusionPerfMI35x(unittest.TestCase): + """MI35x Nightly performance benchmark for DeepSeek-R1-MXFP4 with AllReduce Fusion. 
+ + Tests the DeepSeek-R1-MXFP4 quantized model on TP=8 with --enable-aiter-allreduce-fusion. + Uses local path if available, otherwise downloads from HuggingFace. + """ + + @classmethod + def setUpClass(cls): + cls.model = get_model_path() + print(f"Using model path: {cls.model}") + cls.base_url = DEFAULT_URL_FOR_TEST + cls.batch_sizes = [1, 8, 16, 64] + cls.input_lens = tuple(_parse_int_list_env("NIGHTLY_INPUT_LENS", "4096")) + cls.output_lens = tuple(_parse_int_list_env("NIGHTLY_OUTPUT_LENS", "512")) + + cls.variants = [ + { + "name": "ar-fusion", + "other_args": [ + "--trust-remote-code", + "--tp", + "8", + "--chunked-prefill-size", + "131072", + "--disable-radix-cache", + "--mem-fraction-static", + "0.85", + "--enable-aiter-allreduce-fusion", + ], + }, + ] + + cls.runner = NightlyBenchmarkRunner(PROFILE_DIR, cls.__name__, cls.base_url) + cls.runner.setup_profile_directory() + cls.runner.full_report = f"## {cls.__name__}\n" + + def test_bench_one_batch(self): + """Run benchmark across all configured variants.""" + failed_variants = [] + + is_local_path = self.model.startswith("/") + if is_local_path and not os.path.exists(self.model): + print(f"\n⏭️ SKIPPING: Local model not found at {self.model}") + self.runner.full_report += ( + f"\n⏭️ Test skipped: Local model not found at {self.model}\n" + ) + self.runner.write_final_report() + return + + if is_local_path: + print(f"📁 Using local model: {self.model}") + else: + print( + f"📥 Using HuggingFace model: {self.model} (will download if not cached)" + ) + + try: + for variant_config in self.variants: + with self.subTest(variant=variant_config["name"]): + result_tuple = self.runner.run_benchmark_for_model( + model_path=self.model, + batch_sizes=self.batch_sizes, + input_lens=self.input_lens, + output_lens=self.output_lens, + other_args=variant_config["other_args"], + variant=variant_config["name"], + extra_bench_args=["--trust-remote-code"], + enable_profile=False, + ) + results = result_tuple[0] + success = 
result_tuple[1] + + if not success: + failed_variants.append(variant_config["name"]) + + if results: + self.runner.full_report += ( + generate_simple_markdown_report(results) + "\n" + ) + finally: + self.runner.write_final_report() + + if failed_variants: + raise AssertionError( + f"Benchmark failed for {self.model} with the following variants: " + f"{', '.join(failed_variants)}" + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/test/registered/amd/perf/mi35x/test_deepseek_r1_mxfp4_kv_fp8_perf_mi35x.py b/test/registered/amd/perf/mi35x/test_deepseek_r1_mxfp4_kv_fp8_perf_mi35x.py new file mode 100644 index 000000000000..fe77478a2de9 --- /dev/null +++ b/test/registered/amd/perf/mi35x/test_deepseek_r1_mxfp4_kv_fp8_perf_mi35x.py @@ -0,0 +1,178 @@ +"""MI35x Nightly performance benchmark for DeepSeek-R1-MXFP4 model with KV Cache FP8. + +This test benchmarks the DeepSeek-R1-MXFP4 quantized model on MI35x with 8 GPUs +using --kv-cache-dtype fp8_e4m3. + +The model path can be configured via DEEPSEEK_R1_MXFP4_MODEL_PATH environment variable. 
+ +Registry: nightly-perf-8-gpu-mi35x-deepseek-r1-mxfp4-kv-fp8 suite + +Example usage: + DEEPSEEK_R1_MXFP4_MODEL_PATH=/data2/models/amd-DeepSeek-R1-MXFP4-Preview python -m pytest test_deepseek_r1_mxfp4_kv_fp8_perf_mi35x.py -v +""" + +import os + +# Set HF cache to /data2/models/ for MI35x so HF models download there +os.environ.setdefault("HF_HOME", "/data2/models/huggingface") +os.environ.setdefault("HF_HUB_CACHE", "/data2/models/huggingface/hub") +import unittest +from typing import List + +from sglang.test.ci.ci_register import register_amd_ci +from sglang.test.nightly_bench_utils import BenchmarkResult +from sglang.test.nightly_utils import NightlyBenchmarkRunner +from sglang.test.test_utils import DEFAULT_URL_FOR_TEST, _parse_int_list_env + +# Register for AMD CI - DeepSeek-R1-MXFP4 KV FP8 benchmark on MI35x (~300 min) +register_amd_ci( + est_time=18000, + suite="nightly-perf-8-gpu-mi35x-deepseek-r1-mxfp4-kv-fp8", + nightly=True, +) + + +def generate_simple_markdown_report(results: List[BenchmarkResult]) -> str: + """Generate a simplified markdown report without traces and cost columns. + + Skips the first result if it's a warmup run (duplicate batch_size). 
+ """ + model_header = results[0].model_path + if results[0].run_name and results[0].run_name != "default": + model_header += f" ({results[0].run_name})" + + gpu_config = os.getenv("GPU_CONFIG", "MI35x") + if gpu_config: + model_header += f" [{gpu_config}]" + + summary = f"### {model_header}\n" + summary += "| batch size | input len | latency (s) | input throughput (tok/s) | output throughput (tok/s) | ITL (ms) |\n" + summary += "| ---------- | --------- | ----------- | ------------------------ | ------------------------- | -------- |\n" + + # Skip first result if it's a warmup (same batch_size as second result) + report_results = ( + results[1:] + if len(results) > 1 and results[0].batch_size == results[1].batch_size + else results + ) + + for result in report_results: + itl = 1 / (result.output_throughput / result.batch_size) * 1000 + summary += f"| {result.batch_size} | {result.input_len} | {result.latency:.2f} | {result.input_throughput:.2f} | {result.output_throughput:.2f} | {itl:.2f} |\n" + + return summary + + +# Model path configuration for MI35x DeepSeek-R1-MXFP4 +# Priority: 1) env var, 2) local path, 3) HuggingFace model ID +DEEPSEEK_R1_MXFP4_LOCAL_PATH = "/data2/models/amd-DeepSeek-R1-MXFP4-Preview" +DEEPSEEK_R1_MXFP4_HF_MODEL_ID = "amd/DeepSeek-R1-MXFP4-Preview" +PROFILE_DIR = "performance_profiles_deepseek_r1_mxfp4_kv_fp8_mi35x" + + +def get_model_path() -> str: + """Get effective model path: env var > local path > HF model ID.""" + # Check env var first + env_path = os.environ.get("DEEPSEEK_R1_MXFP4_MODEL_PATH") + if env_path: + return env_path + # Check local path + if os.path.exists(DEEPSEEK_R1_MXFP4_LOCAL_PATH): + return DEEPSEEK_R1_MXFP4_LOCAL_PATH + # Fall back to HF model ID + return DEEPSEEK_R1_MXFP4_HF_MODEL_ID + + +class TestDeepseekR1MXFP4KvFp8PerfMI35x(unittest.TestCase): + """MI35x Nightly performance benchmark for DeepSeek-R1-MXFP4 with KV Cache FP8. + + Tests the DeepSeek-R1-MXFP4 quantized model on TP=8 with --kv-cache-dtype fp8_e4m3. 
+ Uses local path if available, otherwise downloads from HuggingFace. + """ + + @classmethod + def setUpClass(cls): + cls.model = get_model_path() + print(f"Using model path: {cls.model}") + cls.base_url = DEFAULT_URL_FOR_TEST + cls.batch_sizes = [1, 8, 16, 64] + cls.input_lens = tuple(_parse_int_list_env("NIGHTLY_INPUT_LENS", "4096")) + cls.output_lens = tuple(_parse_int_list_env("NIGHTLY_OUTPUT_LENS", "512")) + + cls.variants = [ + { + "name": "kv-fp8", + "other_args": [ + "--trust-remote-code", + "--tp", + "8", + "--chunked-prefill-size", + "131072", + "--disable-radix-cache", + "--mem-fraction-static", + "0.85", + "--kv-cache-dtype", + "fp8_e4m3", + ], + }, + ] + + cls.runner = NightlyBenchmarkRunner(PROFILE_DIR, cls.__name__, cls.base_url) + cls.runner.setup_profile_directory() + cls.runner.full_report = f"## {cls.__name__}\n" + + def test_bench_one_batch(self): + """Run benchmark across all configured variants.""" + failed_variants = [] + + is_local_path = self.model.startswith("/") + if is_local_path and not os.path.exists(self.model): + print(f"\n⏭️ SKIPPING: Local model not found at {self.model}") + self.runner.full_report += ( + f"\n⏭️ Test skipped: Local model not found at {self.model}\n" + ) + self.runner.write_final_report() + return + + if is_local_path: + print(f"📁 Using local model: {self.model}") + else: + print( + f"📥 Using HuggingFace model: {self.model} (will download if not cached)" + ) + + try: + for variant_config in self.variants: + with self.subTest(variant=variant_config["name"]): + result_tuple = self.runner.run_benchmark_for_model( + model_path=self.model, + batch_sizes=self.batch_sizes, + input_lens=self.input_lens, + output_lens=self.output_lens, + other_args=variant_config["other_args"], + variant=variant_config["name"], + extra_bench_args=["--trust-remote-code"], + enable_profile=False, + ) + results = result_tuple[0] + success = result_tuple[1] + + if not success: + failed_variants.append(variant_config["name"]) + + if results: + 
self.runner.full_report += ( + generate_simple_markdown_report(results) + "\n" + ) + finally: + self.runner.write_final_report() + + if failed_variants: + raise AssertionError( + f"Benchmark failed for {self.model} with the following variants: " + f"{', '.join(failed_variants)}" + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/test/registered/amd/test_deepseek_v3_basic_kv_fp8.py b/test/registered/amd/test_deepseek_v3_basic_kv_fp8.py new file mode 100644 index 000000000000..601c07cee183 --- /dev/null +++ b/test/registered/amd/test_deepseek_v3_basic_kv_fp8.py @@ -0,0 +1,86 @@ +import unittest +from types import SimpleNamespace + +from sglang.srt.utils import kill_process_tree +from sglang.test.ci.ci_register import register_amd_ci +from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k +from sglang.test.send_one import BenchArgs, send_one_prompt +from sglang.test.test_utils import ( + DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + DEFAULT_URL_FOR_TEST, + CustomTestCase, + is_in_amd_ci, + is_in_ci, + popen_launch_server, + write_github_step_summary, +) + +register_amd_ci( + est_time=1200, suite="nightly-amd-8-gpu-deepseek-v3-kv-fp8", nightly=True +) + +FULL_DEEPSEEK_V3_MODEL_PATH = "deepseek-ai/DeepSeek-V3-0324" + + +class TestDeepseekV3BasicKvFp8(CustomTestCase): + @classmethod + def setUpClass(cls): + cls.model = FULL_DEEPSEEK_V3_MODEL_PATH + cls.base_url = DEFAULT_URL_FOR_TEST + other_args = [ + "--trust-remote-code", + "--tp", + "8", + "--kv-cache-dtype", + "fp8_e4m3", + "--model-loader-extra-config", + '{"enable_multithread_load": true, "num_threads": 64}', + ] + cls.process = popen_launch_server( + cls.model, + cls.base_url, + timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH * 5, + other_args=other_args, + ) + + @classmethod + def tearDownClass(cls): + kill_process_tree(cls.process.pid) + + def test_a_gsm8k( + self, + ): # Append an "a" to make this test run first (alphabetically) to warm up the server + args = SimpleNamespace( + 
num_shots=8, + data_path=None, + num_questions=1400, + parallel=1400, + max_new_tokens=512, + host="http://127.0.0.1", + port=int(self.base_url.split(":")[-1]), + ) + metrics = run_eval_few_shot_gsm8k(args) + print(f"{metrics=}") + + if is_in_ci(): + write_github_step_summary( + f"### test_gsm8k (deepseek-v3 kv-fp8)\n" f'{metrics["accuracy"]=:.3f}\n' + ) + self.assertGreater(metrics["accuracy"], 0.93) + + def test_bs_1_speed(self): + args = BenchArgs(port=int(self.base_url.split(":")[-1]), max_new_tokens=2048) + acc_length, speed = send_one_prompt(args) + + print(f"{speed=:.2f}") + + if is_in_ci(): + write_github_step_summary( + f"### test_bs_1_speed (deepseek-v3 kv-fp8)\n" f"{speed=:.2f} token/s\n" + ) + if is_in_amd_ci(): + self.assertGreater(speed, 40) + + +if __name__ == "__main__": + unittest.main() diff --git a/test/registered/amd/test_deepseek_v3_mtp_kv_fp8.py b/test/registered/amd/test_deepseek_v3_mtp_kv_fp8.py new file mode 100644 index 000000000000..a62eadf7a587 --- /dev/null +++ b/test/registered/amd/test_deepseek_v3_mtp_kv_fp8.py @@ -0,0 +1,116 @@ +import unittest +from types import SimpleNamespace + +import requests + +from sglang.srt.utils import kill_process_tree +from sglang.test.ci.ci_register import register_amd_ci +from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k +from sglang.test.send_one import BenchArgs, send_one_prompt +from sglang.test.test_utils import ( + DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + DEFAULT_URL_FOR_TEST, + CustomTestCase, + is_in_amd_ci, + is_in_ci, + popen_launch_server, + write_github_step_summary, +) + +register_amd_ci( + est_time=1200, suite="nightly-amd-8-gpu-deepseek-v3-kv-fp8", nightly=True +) + +FULL_DEEPSEEK_V3_MODEL_PATH = "deepseek-ai/DeepSeek-V3-0324" + + +class TestDeepseekV3MTPKvFp8(CustomTestCase): + @classmethod + def setUpClass(cls): + cls.model = FULL_DEEPSEEK_V3_MODEL_PATH + cls.base_url = DEFAULT_URL_FOR_TEST + other_args = [ + "--tp", + "8", + "--trust-remote-code", + 
"--kv-cache-dtype", + "fp8_e4m3", + "--speculative-algorithm", + "EAGLE", + "--speculative-num-steps", + "3", + "--speculative-eagle-topk", + "1", + "--speculative-num-draft-tokens", + "4", + "--model-loader-extra-config", + '{"enable_multithread_load": true, "num_threads": 64}', + ] + if not is_in_amd_ci(): + other_args += ["--mem-frac", "0.7"] + cls.process = popen_launch_server( + cls.model, + cls.base_url, + timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH * 5, + other_args=other_args, + ) + + @classmethod + def tearDownClass(cls): + kill_process_tree(cls.process.pid) + + def test_a_gsm8k( + self, + ): # Append an "a" to make this test run first (alphabetically) to warm up the server + requests.get(self.base_url + "/flush_cache") + + args = SimpleNamespace( + num_shots=5, + data_path=None, + num_questions=200, + max_new_tokens=512, + parallel=128, + host="http://127.0.0.1", + port=int(self.base_url.split(":")[-1]), + ) + metrics = run_eval_few_shot_gsm8k(args) + print(f"{metrics=}") + + server_info = requests.get(self.base_url + "/get_server_info") + avg_spec_accept_length = server_info.json()["internal_states"][0][ + "avg_spec_accept_length" + ] + print(f"{avg_spec_accept_length=}") + + if is_in_ci(): + write_github_step_summary( + f"### test_gsm8k (deepseek-v3 mtp kv-fp8)\n" + f'{metrics["accuracy"]=:.3f}\n' + f"{avg_spec_accept_length=:.2f}\n" + ) + self.assertGreater(metrics["accuracy"], 0.93) + if is_in_amd_ci(): + self.assertGreater(avg_spec_accept_length, 2.8) + + def test_bs_1_speed(self): + args = BenchArgs(port=int(self.base_url.split(":")[-1]), max_new_tokens=2048) + acc_length, speed = send_one_prompt(args) + + print(f"{acc_length=:.2f} {speed=:.2f}") + + if is_in_ci(): + write_github_step_summary( + f"### test_bs_1_speed (deepseek-v3 mtp kv-fp8)\n" + f"{acc_length=:.2f}\n" + f"{speed=:.2f} token/s\n" + ) + if is_in_amd_ci(): + self.assertGreater(acc_length, 2.8) + else: + self.assertGreater(acc_length, 2.9) + if is_in_amd_ci(): + self.assertGreater(speed, 
90) + + +if __name__ == "__main__": + unittest.main() diff --git a/test/registered/ops/test_aiter_allreduce_fusion_amd.py b/test/registered/ops/test_aiter_allreduce_fusion_amd.py index 5813ab09348b..3fe3e9b19753 100644 --- a/test/registered/ops/test_aiter_allreduce_fusion_amd.py +++ b/test/registered/ops/test_aiter_allreduce_fusion_amd.py @@ -10,8 +10,7 @@ from sglang.test.ci.ci_register import register_amd_ci -# Dedicated AMD 8-GPU suite for AITER fused allreduce+rmsnorm validation. -register_amd_ci(est_time=240, suite="stage-c-test-aiter-fusion-8-gpu-amd") +register_amd_ci(est_time=240, suite="stage-c-test-large-8-gpu-amd") class TestAiterAllreduceFusionAmd(unittest.TestCase): diff --git a/test/run_suite.py b/test/run_suite.py index 2f45522aa9b0..d4092dd73d41 100644 --- a/test/run_suite.py +++ b/test/run_suite.py @@ -26,7 +26,7 @@ "stage-b-test-large-8-gpu-35x-disaggregation-amd", "stage-b-test-large-1-gpu-amd", "stage-b-test-large-2-gpu-amd", - "stage-c-test-aiter-fusion-8-gpu-amd", + "stage-c-test-large-8-gpu-amd", "stage-c-test-large-8-gpu-amd-mi35x", ], HWBackend.CUDA: [