diff --git a/.github/workflows/nightly-test-amd.yml b/.github/workflows/nightly-test-amd.yml index 3902971c2de8..e4250fbc10e0 100644 --- a/.github/workflows/nightly-test-amd.yml +++ b/.github/workflows/nightly-test-amd.yml @@ -22,12 +22,17 @@ on: - 'nightly-test-8-gpu-gpt-oss' - 'nightly-test-8-gpu-grok' - 'nightly-test-8-gpu-deepseek-r1' - - 'nightly-test-8-gpu-deepseek-v3-dp' - - 'nightly-test-8-gpu-deepseek-v3-tc' - - 'nightly-test-8-gpu-deepseek-v3-mtp' - 'nightly-perf-8-gpu-grok' - 'nightly-perf-8-gpu-deepseek-v3' - 'nightly-perf-8-gpu-deepseek-v31' + # MI35x jobs + - 'nightly-test-2-gpu-mi35x' + - 'nightly-test-2-gpu-vlm-mi35x' + - 'nightly-test-8-gpu-mi35x-gpt-oss' + - 'nightly-test-8-gpu-mi35x-grok' + - 'nightly-test-8-gpu-mi35x-deepseek-r1' + - 'nightly-perf-8-gpu-mi35x-grok' + - 'nightly-perf-8-gpu-mi35x-deepseek-r1-mxfp4' workflow_call: inputs: ref: @@ -68,7 +73,9 @@ jobs: - name: Nightly Test (2-GPU) run: | - bash scripts/ci/amd_ci_exec.sh -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" python3 run_suite.py --suite nightly-amd --timeout-per-file 7200 || TEST_EXIT_CODE=$? + bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test \ + -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ + python3 run_suite.py --hw amd --suite nightly-amd --nightly --timeout-per-file 7200 || TEST_EXIT_CODE=$? echo "$(> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} @@ -95,7 +102,9 @@ jobs: - name: Nightly Test (2-GPU VLM MMMU) timeout-minutes: 180 run: | - bash scripts/ci/amd_ci_exec.sh -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" python3 run_suite.py --suite nightly-amd-vlm --timeout-per-file 7200 || TEST_EXIT_CODE=$? + bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test \ + -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ + python3 run_suite.py --hw amd --suite nightly-amd-vlm --nightly --timeout-per-file 7200 || TEST_EXIT_CODE=$? 
echo "$(> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} @@ -121,7 +130,10 @@ jobs: - name: Nightly Test (8-GPU GPT-OSS) run: | - bash scripts/ci/amd_ci_exec.sh -e AMD_TEST_MODEL_GROUP=gpt-oss -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" python3 run_suite.py --suite nightly-amd-8-gpu --timeout-per-file 7200 || TEST_EXIT_CODE=$? + bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test \ + -e AMD_TEST_MODEL_GROUP=gpt-oss \ + -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ + python3 run_suite.py --hw amd --suite nightly-amd-8-gpu --nightly --timeout-per-file 7200 || TEST_EXIT_CODE=$? echo "$(> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} @@ -147,11 +159,14 @@ jobs: - name: Nightly Test (8-GPU GROK) run: | - bash scripts/ci/amd_ci_exec.sh -e AMD_TEST_MODEL_GROUP=grok -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" python3 run_suite.py --suite nightly-amd-8-gpu --timeout-per-file 7200 || TEST_EXIT_CODE=$? + bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test \ + -e AMD_TEST_MODEL_GROUP=grok \ + -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ + python3 run_suite.py --hw amd --suite nightly-amd-8-gpu --nightly --timeout-per-file 7200 || TEST_EXIT_CODE=$? 
echo "$(> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} - # 8-GPU tests (TP=8) - DeepSeek-R1 (reasoning model) + # 8-GPU tests (TP=8) - DeepSeek-R1 all variants (basic, MTP, DP, TC) nightly-test-8-gpu-deepseek-r1: if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-test-8-gpu-deepseek-r1') runs-on: linux-mi325-gpu-8 @@ -171,15 +186,18 @@ jobs: - name: Install dependencies run: bash scripts/ci/amd_ci_install_dependency.sh - - name: Nightly Test (8-GPU DeepSeek-R1) + - name: Nightly Test (8-GPU DeepSeek-R1 all variants) run: | - bash scripts/ci/amd_ci_exec.sh -e AMD_TEST_MODEL_GROUP=deepseek-r1 -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" python3 run_suite.py --suite nightly-amd-8-gpu --timeout-per-file 7200 || TEST_EXIT_CODE=$? + bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test \ + -e AMD_TEST_MODEL_GROUP=deepseek-r1-all \ + -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ + python3 run_suite.py --hw amd --suite nightly-amd-8-gpu --nightly --timeout-per-file 7200 || TEST_EXIT_CODE=$? 
echo "$(> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} - # 8-GPU tests (TP=8) - DeepSeek-V3 + DP Attention (requires ROCm 7.0+) - nightly-test-8-gpu-deepseek-v3-dp: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-test-8-gpu-deepseek-v3-dp') + # 8-GPU Performance Tests (TP=8) - Grok (Grok-1 + Grok-2) performance benchmarks + nightly-perf-8-gpu-grok: + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-perf-8-gpu-grok') runs-on: linux-mi325-gpu-8 steps: - name: Checkout code @@ -197,15 +215,16 @@ jobs: - name: Install dependencies run: bash scripts/ci/amd_ci_install_dependency.sh - - name: Nightly Test (8-GPU DeepSeek-V3 + DP Attention) + - name: Nightly Perf Test (8-GPU Grok-1 + Grok-2) + timeout-minutes: 60 run: | - bash scripts/ci/amd_ci_exec.sh -e AMD_TEST_MODEL_GROUP=deepseek-v3-dp -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" python3 run_suite.py --suite nightly-amd-8-gpu --timeout-per-file 7200 || TEST_EXIT_CODE=$? + bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test -e RCCL_MSCCL_ENABLE=0 -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" python3 registered/amd/test_grok_perf.py || TEST_EXIT_CODE=$? 
echo "$(> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} - # 8-GPU tests (TP=8) - DeepSeek-V3 + Torch Compile (requires ROCm 7.0+) - nightly-test-8-gpu-deepseek-v3-tc: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-test-8-gpu-deepseek-v3-tc') + # 8-GPU Performance Tests (TP=8) - DeepSeek-V3 performance benchmarks + nightly-perf-8-gpu-deepseek-v3: + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-perf-8-gpu-deepseek-v3') runs-on: linux-mi325-gpu-8 steps: - name: Checkout code @@ -223,15 +242,16 @@ jobs: - name: Install dependencies run: bash scripts/ci/amd_ci_install_dependency.sh - - name: Nightly Test (8-GPU DeepSeek-V3 + Torch Compile) + - name: Nightly Perf Test (8-GPU DeepSeek-V3) + timeout-minutes: 300 run: | - bash scripts/ci/amd_ci_exec.sh -e AMD_TEST_MODEL_GROUP=deepseek-v3-tc -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" python3 run_suite.py --suite nightly-amd-8-gpu --timeout-per-file 7200 || TEST_EXIT_CODE=$? + bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test -e SGLANG_USE_ROCM700A=1 -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" python3 registered/amd/test_deepseek_v3_perf.py || TEST_EXIT_CODE=$? 
echo "$(> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} - # 8-GPU tests (TP=8) - DeepSeek-V3 + MTP/EAGLE (requires ROCm 7.0+) - nightly-test-8-gpu-deepseek-v3-mtp: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-test-8-gpu-deepseek-v3-mtp') + # 8-GPU Performance Tests (TP=8) - DeepSeek-V3.1 performance benchmarks + nightly-perf-8-gpu-deepseek-v31: + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-perf-8-gpu-deepseek-v31') runs-on: linux-mi325-gpu-8 steps: - name: Checkout code @@ -249,16 +269,18 @@ jobs: - name: Install dependencies run: bash scripts/ci/amd_ci_install_dependency.sh - - name: Nightly Test (8-GPU DeepSeek-V3 + MTP) + - name: Nightly Perf Test (8-GPU DeepSeek-V3.1) + timeout-minutes: 300 run: | - bash scripts/ci/amd_ci_exec.sh -e AMD_TEST_MODEL_GROUP=deepseek-v3-mtp -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" python3 run_suite.py --suite nightly-amd-8-gpu --timeout-per-file 7200 || TEST_EXIT_CODE=$? + bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test -e SGLANG_USE_ROCM700A=1 -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" python3 registered/amd/test_deepseek_v31_perf.py || TEST_EXIT_CODE=$? 
echo "$(> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} - # 8-GPU Performance Tests (TP=8) - Grok (Grok-1 + Grok-2) performance benchmarks - nightly-perf-8-gpu-grok: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-perf-8-gpu-grok') - runs-on: linux-mi325-gpu-8 + # ============================================== MI35x Tests ============================================== + # MI35x 2-GPU tests (TP=2) - Reuses nightly-amd suite + nightly-test-2-gpu-mi35x: + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-test-2-gpu-mi35x') + runs-on: linux-mi35x-gpu-2 steps: - name: Checkout code uses: actions/checkout@v4 @@ -273,19 +295,23 @@ jobs: GITHUB_WORKSPACE: ${{ github.workspace }} - name: Install dependencies - run: bash scripts/ci/amd_ci_install_dependency.sh + run: | + bash scripts/ci/amd_ci_install_dependency.sh + # Install tabulate for run_suite.py (missing in MI35x container) + bash scripts/ci/amd_ci_exec.sh pip install tabulate - - name: Nightly Perf Test (8-GPU Grok-1 + Grok-2) - timeout-minutes: 60 + - name: Nightly Test (2-GPU) run: | - bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test -e RCCL_MSCCL_ENABLE=0 -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" python3 registered/amd/test_grok_perf.py || TEST_EXIT_CODE=$? + bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test \ + -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ + python3 run_suite.py --hw amd --suite nightly-amd --nightly --timeout-per-file 7200 || TEST_EXIT_CODE=$? 
echo "$(> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} - # 8-GPU Performance Tests (TP=8) - DeepSeek-V3 performance benchmarks - nightly-perf-8-gpu-deepseek-v3: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-perf-8-gpu-deepseek-v3') - runs-on: linux-mi325-gpu-8 + # MI35x 2-GPU VLM tests - Reuses nightly-amd-vlm suite + nightly-test-2-gpu-vlm-mi35x: + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-test-2-gpu-vlm-mi35x') + runs-on: linux-mi35x-gpu-2 steps: - name: Checkout code uses: actions/checkout@v4 @@ -300,19 +326,24 @@ jobs: GITHUB_WORKSPACE: ${{ github.workspace }} - name: Install dependencies - run: bash scripts/ci/amd_ci_install_dependency.sh + run: | + bash scripts/ci/amd_ci_install_dependency.sh + # Install tabulate for run_suite.py (missing in MI35x container) + bash scripts/ci/amd_ci_exec.sh pip install tabulate - - name: Nightly Perf Test (8-GPU DeepSeek-V3) - timeout-minutes: 300 + - name: Nightly Test (2-GPU VLM MMMU) + timeout-minutes: 180 run: | - bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test -e SGLANG_USE_ROCM700A=1 -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" python3 registered/amd/test_deepseek_v3_perf.py || TEST_EXIT_CODE=$? + bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test \ + -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ + python3 run_suite.py --hw amd --suite nightly-amd-vlm --nightly --timeout-per-file 7200 || TEST_EXIT_CODE=$? 
echo "$(> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} - # 8-GPU Performance Tests (TP=8) - DeepSeek-V3.1 performance benchmarks - nightly-perf-8-gpu-deepseek-v31: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-perf-8-gpu-deepseek-v31') - runs-on: linux-mi325-gpu-8 + # MI35x 8-GPU tests (TP=8) - GPT-OSS models (MI35x uses openai/* paths) + nightly-test-8-gpu-mi35x-gpt-oss: + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-test-8-gpu-mi35x-gpt-oss') + runs-on: linux-mi35x-gpu-8 steps: - name: Checkout code uses: actions/checkout@v4 @@ -327,29 +358,168 @@ jobs: GITHUB_WORKSPACE: ${{ github.workspace }} - name: Install dependencies - run: bash scripts/ci/amd_ci_install_dependency.sh + run: | + bash scripts/ci/amd_ci_install_dependency.sh + # Install tabulate for run_suite.py (missing in MI35x container) + bash scripts/ci/amd_ci_exec.sh pip install tabulate - - name: Nightly Perf Test (8-GPU DeepSeek-V3.1) + - name: Nightly Test MI35x (8-GPU GPT-OSS) + timeout-minutes: 180 + run: | + bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test \ + -e AMD_TEST_MODEL_GROUP=gpt-oss \ + -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ + python3 run_suite.py --hw amd --suite nightly-amd-8-gpu-mi35x --nightly --timeout-per-file 7200 || TEST_EXIT_CODE=$? 
+ echo "$(> $GITHUB_STEP_SUMMARY || true + exit ${TEST_EXIT_CODE:-0} + + # MI35x 8-GPU tests (TP=8) - GROK models + nightly-test-8-gpu-mi35x-grok: + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-test-8-gpu-mi35x-grok') + runs-on: linux-mi35x-gpu-8 + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ inputs.ref || github.ref }} + + - name: Setup docker + run: | + touch github_summary.md + bash scripts/ci/amd_ci_start_container.sh + env: + GITHUB_WORKSPACE: ${{ github.workspace }} + + - name: Install dependencies + run: | + bash scripts/ci/amd_ci_install_dependency.sh + # Install tabulate for run_suite.py (missing in MI35x container) + bash scripts/ci/amd_ci_exec.sh pip install tabulate + + - name: Nightly Test MI35x (8-GPU GROK) + run: | + bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test \ + -e AMD_TEST_MODEL_GROUP=grok \ + -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ + python3 run_suite.py --hw amd --suite nightly-amd-8-gpu --nightly --timeout-per-file 7200 || TEST_EXIT_CODE=$? 
+ echo "$(> $GITHUB_STEP_SUMMARY || true + exit ${TEST_EXIT_CODE:-0} + + # MI35x 8-GPU tests (TP=8) - DeepSeek-R1-0528 basic + MTP only + # Same model as MI300X for consistency; MXFP4 only used for perf tests + # Note: DP/TC variants disabled for MI35x due to initialization timeouts + nightly-test-8-gpu-mi35x-deepseek-r1: + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-test-8-gpu-mi35x-deepseek-r1') + runs-on: linux-mi35x-gpu-8 + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ inputs.ref || github.ref }} + + - name: Setup docker + run: | + touch github_summary.md + bash scripts/ci/amd_ci_start_container.sh + env: + GITHUB_WORKSPACE: ${{ github.workspace }} + + - name: Install dependencies + run: | + bash scripts/ci/amd_ci_install_dependency.sh + # Install tabulate for run_suite.py (missing in MI35x container) + bash scripts/ci/amd_ci_exec.sh pip install tabulate + + - name: Nightly Test MI35x (8-GPU DeepSeek-R1-0528 basic + MTP) + timeout-minutes: 180 + run: | + bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test \ + -e AMD_TEST_MODEL_GROUP=deepseek-r1 \ + -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ + python3 run_suite.py --hw amd --suite nightly-amd-8-gpu-mi35x --nightly --timeout-per-file 7200 || TEST_EXIT_CODE=$? 
+ echo "$(> $GITHUB_STEP_SUMMARY || true + exit ${TEST_EXIT_CODE:-0} + + # MI35x 8-GPU Performance Tests (TP=8) - Grok performance benchmarks + nightly-perf-8-gpu-mi35x-grok: + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-perf-8-gpu-mi35x-grok') + runs-on: linux-mi35x-gpu-8 + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ inputs.ref || github.ref }} + + - name: Setup docker + run: | + touch github_summary.md + bash scripts/ci/amd_ci_start_container.sh + env: + GITHUB_WORKSPACE: ${{ github.workspace }} + + - name: Install dependencies + run: | + bash scripts/ci/amd_ci_install_dependency.sh + # Install tabulate for run_suite.py (missing in MI35x container) + bash scripts/ci/amd_ci_exec.sh pip install tabulate + + - name: Nightly Perf Test MI35x (8-GPU Grok) + timeout-minutes: 60 + run: | + bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test -e RCCL_MSCCL_ENABLE=0 -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" python3 registered/amd/test_grok_perf.py || TEST_EXIT_CODE=$? 
+ echo "$(> $GITHUB_STEP_SUMMARY || true + exit ${TEST_EXIT_CODE:-0} + + # MI35x 8-GPU Performance Tests (TP=8) - DeepSeek-R1-MXFP4 performance benchmarks + nightly-perf-8-gpu-mi35x-deepseek-r1-mxfp4: + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-perf-8-gpu-mi35x-deepseek-r1-mxfp4') + runs-on: linux-mi35x-gpu-8 + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ inputs.ref || github.ref }} + + - name: Setup docker + run: | + touch github_summary.md + bash scripts/ci/amd_ci_start_container.sh + env: + GITHUB_WORKSPACE: ${{ github.workspace }} + + - name: Install dependencies + run: | + bash scripts/ci/amd_ci_install_dependency.sh + # Install tabulate for run_suite.py (missing in MI35x container) + bash scripts/ci/amd_ci_exec.sh pip install tabulate + + - name: Nightly Perf Test MI35x (8-GPU DeepSeek-R1-MXFP4) timeout-minutes: 300 run: | - bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test -e SGLANG_USE_ROCM700A=1 -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" python3 registered/amd/test_deepseek_v31_perf.py || TEST_EXIT_CODE=$? + bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" python3 registered/amd/test_deepseek_r1_mxfp4_perf.py || TEST_EXIT_CODE=$? 
echo "$(> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} check-all-jobs: if: always() && (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request' || github.event_name == 'workflow_dispatch') needs: + # MI325 jobs - nightly-test-2-gpu - nightly-test-2-gpu-vlm - nightly-test-8-gpu-gpt-oss - nightly-test-8-gpu-grok - - nightly-test-8-gpu-deepseek-v3-dp - - nightly-test-8-gpu-deepseek-v3-tc - - nightly-test-8-gpu-deepseek-v3-mtp - nightly-test-8-gpu-deepseek-r1 - nightly-perf-8-gpu-grok - nightly-perf-8-gpu-deepseek-v3 - nightly-perf-8-gpu-deepseek-v31 + # MI35x jobs + - nightly-test-2-gpu-mi35x + - nightly-test-2-gpu-vlm-mi35x + - nightly-test-8-gpu-mi35x-gpt-oss + - nightly-test-8-gpu-mi35x-grok + - nightly-test-8-gpu-mi35x-deepseek-r1 + - nightly-perf-8-gpu-mi35x-grok + - nightly-perf-8-gpu-mi35x-deepseek-r1-mxfp4 runs-on: ubuntu-latest steps: - name: Check if any job failed diff --git a/test/srt/nightly/test_gsm8k_completion_eval_amd.py b/test/registered/amd/nightly/test_gsm8k_completion_eval_amd.py similarity index 91% rename from test/srt/nightly/test_gsm8k_completion_eval_amd.py rename to test/registered/amd/nightly/test_gsm8k_completion_eval_amd.py index bf5a55f853f8..b44eb04efc5f 100644 --- a/test/srt/nightly/test_gsm8k_completion_eval_amd.py +++ b/test/registered/amd/nightly/test_gsm8k_completion_eval_amd.py @@ -1,5 +1,5 @@ """ -AMD GSM8K Completion Evaluation Test +AMD GSM8K Completion Evaluation Test (Migrated from test/srt/nightly/) This test uses the completion-based gsm8k benchmark (few-shot prompting) which works with base models that don't have chat templates. 
@@ -20,6 +20,8 @@ - "deepseek-v3-mtp": DeepSeek-V3 with MTP/EAGLE (nightly-amd-8-gpu-deepseek-v3-mtp) - "deepseek-r1": DeepSeek-R1 reasoning model (nightly-amd-8-gpu-deepseek-r1) - "all": All models + +Registry: nightly-amd-8-gpu suite (8-GPU tests) """ import ast @@ -44,6 +46,7 @@ print("[WARNING] huggingface_hub not available - model cache checking disabled") from sglang.srt.utils import kill_process_tree +from sglang.test.ci.ci_register import register_amd_ci from sglang.test.test_utils import ( DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, DEFAULT_URL_FOR_TEST, @@ -53,6 +56,9 @@ ) from sglang.utils import download_and_cache_file, read_jsonl +# Register for AMD CI - GSM8K completion tests (~120 min) +register_amd_ci(est_time=7200, suite="nightly-amd-8-gpu", nightly=True) + INVALID = -9999999 @@ -67,6 +73,9 @@ class BaseModelConfig: env_vars: Optional[dict] = None tokenizer_path: Optional[str] = None timeout: Optional[int] = None # Custom timeout for server launch (seconds) + variant: Optional[str] = ( + None # Test variant name (e.g., "basic", "MTP", "DP", "TC") + ) def __post_init__(self): if self.other_args is None: @@ -74,6 +83,12 @@ def __post_init__(self): if self.env_vars is None: self.env_vars = {} + def get_display_name(self) -> str: + """Return display name for logs/summary (model + variant if set).""" + if self.variant: + return f"{self.model_path} ({self.variant})" + return self.model_path + # ============================================================================= # MODEL GROUPS - Each group runs on a separate 8-GPU runner @@ -193,84 +208,80 @@ def __post_init__(self): ), ] -# Group 3: DeepSeek-V3 with DP Attention -# Runner: nightly-amd-8-gpu-deepseek-v3-dp -# Note: Uses DP attention (dp-size=8) for better performance, requires ROCm 7.0+ -AMD_DEEPSEEK_V3_DP_MODELS = [ - # DeepSeek-V3-0324 with DP attention +# Note: DeepSeek-V3 accuracy tests removed - V3 only used for perf tests +# See test_deepseek_v3_perf.py and test_deepseek_v31_perf.py for V3 perf 
tests + +# Group 3: DeepSeek-R1 (reasoning model) - Basic + MTP combined +# Runner: nightly-amd-8-gpu-deepseek-r1 +AMD_DEEPSEEK_R1_MODELS = [ + # DeepSeek-R1-0528 basic - reasoning model, ~80GB per GPU BaseModelConfig( - model_path="deepseek-ai/DeepSeek-V3-0324", + model_path="deepseek-ai/DeepSeek-R1-0528", tp_size=8, accuracy_threshold=0.93, timeout=3600, # 1 hour for large model + variant="basic", other_args=[ + "--attention-backend", + "aiter", "--chunked-prefill-size", "131072", - "--dp-size", - "8", - "--enable-dp-attention", + "--disable-radix-cache", "--mem-fraction-static", "0.85", "--trust-remote-code", ], env_vars={ - "SGLANG_USE_ROCM700A": "1", "SGLANG_USE_AITER": "1", }, ), -] - -# Group 3b: DeepSeek-V3 with Torch Compile -# Runner: nightly-amd-8-gpu-deepseek-v3-tc -# Note: Uses torch compile for performance optimization, requires ROCm 7.0+ -AMD_DEEPSEEK_V3_TC_MODELS = [ - # DeepSeek-V3-0324 with torch compile + # DeepSeek-R1-0528 with MTP (EAGLE speculative decoding) BaseModelConfig( - model_path="deepseek-ai/DeepSeek-V3-0324", + model_path="deepseek-ai/DeepSeek-R1-0528", tp_size=8, accuracy_threshold=0.93, - timeout=7200, # 2 hours for compilation + large model + timeout=3600, + variant="MTP", other_args=[ "--chunked-prefill-size", "131072", + "--speculative-algorithm", + "EAGLE", + "--speculative-num-steps", + "3", + "--speculative-eagle-topk", + "1", + "--speculative-num-draft-tokens", + "4", "--mem-fraction-static", - "0.70", # Reduced further for torch compile - "--cuda-graph-max-bs", - "8", # Reduced from 16 to reduce memory - "--enable-torch-compile", - "--disable-cuda-graph", # Disable cuda graph to avoid memory issues + "0.7", "--trust-remote-code", ], env_vars={ - "SGLANG_USE_ROCM700A": "1", "SGLANG_USE_AITER": "1", }, ), ] -# Group 3c: DeepSeek-V3 with MTP (EAGLE speculative decoding) -# Runner: nightly-amd-8-gpu-deepseek-v3-mtp -# Note: Uses MTP for improved throughput, requires ROCm 7.0+ -AMD_DEEPSEEK_V3_MTP_MODELS = [ - # DeepSeek-V3-0324 
with MTP (EAGLE speculative decoding) +# Group 5: DeepSeek-R1 with DP + TC combined +# Runner: nightly-amd-8-gpu-deepseek-r1-dp-tc +# Combines DP attention and Torch Compile tests for DeepSeek-R1 +AMD_DEEPSEEK_R1_DP_TC_MODELS = [ + # DeepSeek-R1-0528 with DP attention BaseModelConfig( - model_path="deepseek-ai/DeepSeek-V3-0324", + model_path="deepseek-ai/DeepSeek-R1-0528", tp_size=8, accuracy_threshold=0.93, - timeout=3600, # 1 hour for large model + timeout=3600, + variant="DP", other_args=[ "--chunked-prefill-size", "131072", - "--speculative-algorithm", - "EAGLE", - "--speculative-num-steps", - "3", - "--speculative-eagle-topk", - "1", - "--speculative-num-draft-tokens", - "4", + "--dp-size", + "8", + "--enable-dp-attention", "--mem-fraction-static", - "0.7", + "0.85", "--trust-remote-code", ], env_vars={ @@ -278,28 +289,26 @@ def __post_init__(self): "SGLANG_USE_AITER": "1", }, ), -] - -# Group 4: DeepSeek-R1 (reasoning model) -# Runner: nightly-amd-8-gpu-deepseek-r1 -AMD_DEEPSEEK_R1_MODELS = [ - # DeepSeek-R1-0528 - reasoning model, ~80GB per GPU + # DeepSeek-R1-0528 with torch compile BaseModelConfig( model_path="deepseek-ai/DeepSeek-R1-0528", tp_size=8, accuracy_threshold=0.93, - timeout=3600, # 1 hour for large model + timeout=7200, # 2 hours for compilation + variant="TC", other_args=[ - "--attention-backend", - "aiter", "--chunked-prefill-size", "131072", - "--disable-radix-cache", "--mem-fraction-static", - "0.85", + "0.70", + "--cuda-graph-max-bs", + "8", + "--enable-torch-compile", + "--disable-cuda-graph", "--trust-remote-code", ], env_vars={ + "SGLANG_USE_ROCM700A": "1", "SGLANG_USE_AITER": "1", }, ), @@ -312,27 +321,28 @@ def get_model_group() -> str: def get_models_for_group(group: str) -> List[BaseModelConfig]: - """Get the list of models for a given group.""" + """Get the list of models for a given group. + + Note: DeepSeek-V3 is only used for perf tests, not accuracy tests. + See test_deepseek_v3_perf.py and test_deepseek_v31_perf.py. 
+ """ if group == "gpt-oss": return AMD_GPT_OSS_MODELS elif group == "grok": return AMD_GROK_MODELS - elif group == "deepseek-v3-dp": - return AMD_DEEPSEEK_V3_DP_MODELS - elif group == "deepseek-v3-tc": - return AMD_DEEPSEEK_V3_TC_MODELS - elif group == "deepseek-v3-mtp": - return AMD_DEEPSEEK_V3_MTP_MODELS elif group == "deepseek-r1": return AMD_DEEPSEEK_R1_MODELS + elif group == "deepseek-r1-dp-tc": + return AMD_DEEPSEEK_R1_DP_TC_MODELS + elif group == "deepseek-r1-all": + # All DeepSeek-R1 variants: basic, MTP, DP, TC + return AMD_DEEPSEEK_R1_MODELS + AMD_DEEPSEEK_R1_DP_TC_MODELS elif group == "all": return ( AMD_GPT_OSS_MODELS + AMD_GROK_MODELS - + AMD_DEEPSEEK_V3_DP_MODELS - + AMD_DEEPSEEK_V3_TC_MODELS - + AMD_DEEPSEEK_V3_MTP_MODELS + AMD_DEEPSEEK_R1_MODELS + + AMD_DEEPSEEK_R1_DP_TC_MODELS ) else: print(f"[WARNING] Unknown model group '{group}', using 'gpt-oss'") @@ -671,9 +681,10 @@ def test_gsm8k_completion_all_models(self): ) for config in self.models: - with self.subTest(model=config.model_path): + display_name = config.get_display_name() + with self.subTest(model=display_name): print(f"\n{'='*60}") - print(f"Testing: {config.model_path} (TP={config.tp_size})") + print(f"Testing: {display_name} (TP={config.tp_size})") print(f"{'='*60}") error_message = None @@ -687,12 +698,12 @@ def test_gsm8k_completion_all_models(self): if not is_available: print(f"\n❌ MODEL NOT AVAILABLE: {status_msg}") - print(f"⏭️ SKIPPING: {config.model_path}") + print(f"⏭️ SKIPPING: {display_name}") status = f"⏭️ SKIP" skipped = True all_results.append( { - "model": config.model_path, + "model": display_name, "tp_size": config.tp_size, "accuracy": None, "threshold": config.accuracy_threshold, @@ -709,7 +720,7 @@ def test_gsm8k_completion_all_models(self): else: try: # Launch server with timing - print(f"\n🚀 Launching server for {config.model_path}...") + print(f"\n🚀 Launching server for {display_name}...") server_start = time.time() process = popen_launch_server_for_base_model( 
self.base_url, config @@ -747,7 +758,7 @@ def test_gsm8k_completion_all_models(self): total_time = time.time() - model_start - print(f"\n📈 Results for {config.model_path}:") + print(f"\n📈 Results for {display_name}:") print( f" Accuracy: {acc:.3f} (threshold: {config.accuracy_threshold})" ) @@ -768,7 +779,7 @@ def test_gsm8k_completion_all_models(self): all_results.append( { - "model": config.model_path, + "model": display_name, "tp_size": config.tp_size, "accuracy": acc, "threshold": config.accuracy_threshold, @@ -790,7 +801,7 @@ def test_gsm8k_completion_all_models(self): status = "❌ ERROR" all_results.append( { - "model": config.model_path, + "model": display_name, "tp_size": config.tp_size, "accuracy": None, "threshold": config.accuracy_threshold, @@ -806,7 +817,7 @@ def test_gsm8k_completion_all_models(self): ) finally: - print(f"\n🛑 Stopping server for {config.model_path}...") + print(f"\n🛑 Stopping server for {display_name}...") kill_process_tree(process.pid) except Exception as e: @@ -816,7 +827,7 @@ def test_gsm8k_completion_all_models(self): status = "❌ ERROR" all_results.append( { - "model": config.model_path, + "model": display_name, "tp_size": config.tp_size, "accuracy": None, "threshold": config.accuracy_threshold, @@ -831,14 +842,14 @@ def test_gsm8k_completion_all_models(self): } ) - # Add to summary with runtime + # Add to summary with runtime (use display name to show variant) acc_str = f"{acc:.3f}" if acc is not None else "N/A" startup_str = ( f"{startup_time:.0f}s" if startup_time is not None else "N/A" ) bench_str = f"{bench_time:.0f}s" if bench_time is not None else "N/A" total_str = f"{total_time:.0f}s" if total_time is not None else "N/A" - summary += f"| {config.model_path} | {config.tp_size} | {acc_str} | {config.accuracy_threshold} | {startup_str} | {bench_str} | {total_str} | {status} |\n" + summary += f"| {display_name} | {config.tp_size} | {acc_str} | {config.accuracy_threshold} | {startup_str} | {bench_str} | {total_str} | {status} 
|\n" # Calculate total test runtime total_test_time = time.time() - total_test_start diff --git a/test/registered/amd/nightly/test_gsm8k_completion_eval_mi35x.py b/test/registered/amd/nightly/test_gsm8k_completion_eval_mi35x.py new file mode 100644 index 000000000000..87a95c023dd7 --- /dev/null +++ b/test/registered/amd/nightly/test_gsm8k_completion_eval_mi35x.py @@ -0,0 +1,726 @@ +""" +MI35x GSM8K Completion Evaluation Test (8-GPU) + +This test uses the completion-based gsm8k benchmark (few-shot prompting) +for MI35x-specific models that differ from MI300X configurations. + +MI35x-specific models: +- GPT-OSS series: Uses openai/gpt-oss-* (not lmsys/gpt-oss-*-bf16) +- DeepSeek-R1-0528: Same model as MI300X (MXFP4 only used for perf tests) + +Model groups are selected via AMD_TEST_MODEL_GROUP environment variable: +- "gpt-oss" (default): GPT-OSS models with MI35x paths +- "deepseek-r1": DeepSeek-R1-0528 basic + MTP (same as MI300X) +- "deepseek-r1-dp-tc": DeepSeek-R1-0528 DP + TC (same as MI300X) +- "deepseek-r1-all": All DeepSeek-R1-0528 variants (basic, MTP, DP, TC) + +Registry: nightly-amd-8-gpu-mi35x suite (8-GPU tests on MI35x) +""" + +import ast +import os + +# Set HF cache to /data2/models/ for MI35x so HF models download there +os.environ.setdefault("HF_HOME", "/data2/models/huggingface") +os.environ.setdefault("HF_HUB_CACHE", "/data2/models/huggingface/hub") +import re +import subprocess +import time +import unittest +from dataclasses import dataclass +from typing import List, Optional, Tuple + +import numpy as np + +# HuggingFace Hub for model cache checking and download progress +try: + from huggingface_hub import HfFileSystem + from huggingface_hub.utils import GatedRepoError, RepositoryNotFoundError + + HF_HUB_AVAILABLE = True +except ImportError: + HF_HUB_AVAILABLE = False + print("[WARNING] huggingface_hub not available - model cache checking disabled") + +from sglang.srt.utils import kill_process_tree +from sglang.test.ci.ci_register import 
register_amd_ci +from sglang.test.test_utils import ( + DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + DEFAULT_URL_FOR_TEST, + is_in_ci, + popen_launch_server, + write_github_step_summary, +) +from sglang.utils import download_and_cache_file, read_jsonl + +# Register for AMD CI - MI35x 8-GPU GSM8K completion tests (~120 min) +register_amd_ci(est_time=7200, suite="nightly-amd-8-gpu-mi35x", nightly=True) + +INVALID = -9999999 + + +@dataclass +class BaseModelConfig: + """Configuration for a base model to test.""" + + model_path: str # HuggingFace model ID (e.g., "amd/DeepSeek-R1-MXFP4-Preview") + tp_size: int = 8 + accuracy_threshold: float = 0.50 + other_args: Optional[List[str]] = None + env_vars: Optional[dict] = None + tokenizer_path: Optional[str] = None + timeout: Optional[int] = None + local_path: Optional[str] = None # Preferred local path (checked first before HF) + variant: Optional[str] = ( + None # Test variant name (e.g., "basic", "MTP", "DP", "TC") + ) + + def __post_init__(self): + if self.other_args is None: + self.other_args = [] + if self.env_vars is None: + self.env_vars = {} + + def get_effective_model_path(self) -> str: + """Return local_path if it exists, otherwise model_path (HF ID).""" + if self.local_path and os.path.exists(self.local_path): + return self.local_path + return self.model_path + + def get_display_name(self) -> str: + """Return display name for logs/summary (model + variant if set).""" + if self.variant: + return f"{self.model_path} ({self.variant})" + return self.model_path + + +# ============================================================================= +# MI35x MODEL GROUPS - Different from MI300X configurations +# ============================================================================= + +# Group 1: GPT-OSS models (MI35x uses openai/* paths, not lmsys/*) +MI35X_GPT_OSS_MODELS = [ + # GPT-OSS-20B - MI35x specific path + BaseModelConfig( + model_path="openai/gpt-oss-20b", + tp_size=8, + accuracy_threshold=0.47, + other_args=[ + 
"--chunked-prefill-size", + "130172", + "--max-running-requests", + "128", + "--mem-fraction-static", + "0.85", + "--attention-backend", + "triton", + "--trust-remote-code", + ], + env_vars={"SGLANG_USE_AITER": "1"}, + ), + # GPT-OSS-120B - MI35x specific path + BaseModelConfig( + model_path="openai/gpt-oss-120b", + tp_size=8, + accuracy_threshold=0.79, + timeout=900, # 15 minutes for 120B model + other_args=[ + "--chunked-prefill-size", + "130172", + "--max-running-requests", + "128", + "--mem-fraction-static", + "0.85", + "--attention-backend", + "triton", + "--trust-remote-code", + ], + env_vars={"SGLANG_USE_AITER": "1"}, + ), +] + +# Group 2: DeepSeek-R1-0528 basic + MTP (same model as MI300X for consistency) +# Runner: nightly-test-8-gpu-mi35x-deepseek-r1 +# Note: MXFP4 variant only used for perf tests (test_deepseek_r1_mxfp4_perf.py) +MI35X_DEEPSEEK_R1_MODELS = [ + # DeepSeek-R1-0528 basic - reasoning model, ~80GB per GPU + BaseModelConfig( + model_path="deepseek-ai/DeepSeek-R1-0528", + tp_size=8, + accuracy_threshold=0.93, + timeout=3600, # 1 hour for large model + variant="basic", + other_args=[ + "--attention-backend", + "aiter", + "--chunked-prefill-size", + "131072", + "--disable-radix-cache", + "--mem-fraction-static", + "0.85", + "--trust-remote-code", + ], + env_vars={ + "SGLANG_USE_AITER": "1", + }, + ), + # DeepSeek-R1-0528 with MTP (EAGLE speculative decoding) + BaseModelConfig( + model_path="deepseek-ai/DeepSeek-R1-0528", + tp_size=8, + accuracy_threshold=0.93, + timeout=3600, + variant="MTP", + other_args=[ + "--chunked-prefill-size", + "131072", + "--speculative-algorithm", + "EAGLE", + "--speculative-num-steps", + "3", + "--speculative-eagle-topk", + "1", + "--speculative-num-draft-tokens", + "4", + "--mem-fraction-static", + "0.7", + "--trust-remote-code", + ], + env_vars={ + "SGLANG_USE_AITER": "1", + }, + ), +] + +# Group 3: DeepSeek-R1-0528 with DP + TC (requires ROCm 7.0+) +# Runner: nightly-test-8-gpu-mi35x-deepseek-r1-dp-tc 
+MI35X_DEEPSEEK_R1_DP_TC_MODELS = [ + # DeepSeek-R1-0528 with DP attention + BaseModelConfig( + model_path="deepseek-ai/DeepSeek-R1-0528", + tp_size=8, + accuracy_threshold=0.93, + timeout=3600, + variant="DP", + other_args=[ + "--chunked-prefill-size", + "131072", + "--dp-size", + "8", + "--enable-dp-attention", + "--mem-fraction-static", + "0.85", + "--trust-remote-code", + ], + env_vars={ + "SGLANG_USE_ROCM700A": "1", + "SGLANG_USE_AITER": "1", + }, + ), + # DeepSeek-R1-0528 with torch compile + BaseModelConfig( + model_path="deepseek-ai/DeepSeek-R1-0528", + tp_size=8, + accuracy_threshold=0.93, + timeout=7200, # 2 hours for compilation + variant="TC", + other_args=[ + "--chunked-prefill-size", + "131072", + "--mem-fraction-static", + "0.70", + "--cuda-graph-max-bs", + "8", + "--enable-torch-compile", + "--disable-cuda-graph", + "--trust-remote-code", + ], + env_vars={ + "SGLANG_USE_ROCM700A": "1", + "SGLANG_USE_AITER": "1", + }, + ), +] + + +def get_model_group() -> str: + """Get the model group to test from environment variable.""" + return os.environ.get("AMD_TEST_MODEL_GROUP", "gpt-oss") + + +def get_models_for_group(group: str) -> List[BaseModelConfig]: + """Get the list of models for a given group. + + Note: DeepSeek-R1-MXFP4 is only used for perf tests, not accuracy tests. + See test_deepseek_r1_mxfp4_perf.py for MXFP4 perf tests. 
+ """ + if group == "gpt-oss": + return MI35X_GPT_OSS_MODELS + elif group == "deepseek-r1": + return MI35X_DEEPSEEK_R1_MODELS + elif group == "deepseek-r1-dp-tc": + return MI35X_DEEPSEEK_R1_DP_TC_MODELS + elif group == "deepseek-r1-all": + # All DeepSeek-R1-0528 variants: basic, MTP, DP, TC + return MI35X_DEEPSEEK_R1_MODELS + MI35X_DEEPSEEK_R1_DP_TC_MODELS + elif group == "all": + return ( + MI35X_GPT_OSS_MODELS + + MI35X_DEEPSEEK_R1_MODELS + + MI35X_DEEPSEEK_R1_DP_TC_MODELS + ) + else: + print(f"[WARNING] Unknown model group '{group}', using 'gpt-oss'") + return MI35X_GPT_OSS_MODELS + + +# ============================================================================= +# MODEL CACHE AND DOWNLOAD UTILITIES +# ============================================================================= + + +def check_local_cache(model_path: str) -> Tuple[bool, str]: + """ + Check if model is cached locally. + + Returns: + Tuple of (is_cached, cache_path_or_message) + """ + # Check common HF cache locations for MI35x + cache_dirs = [ + os.path.expanduser("~/.cache/huggingface/hub"), + "/data2/models/huggingface/hub", + os.environ.get("HF_HUB_CACHE", ""), + ] + cache_dirs = [d for d in cache_dirs if d] # Remove empty + + # Convert model_path to cache directory format (org--model) + cache_name = f"models--{model_path.replace('/', '--')}" + + for cache_dir in cache_dirs: + cache_path = os.path.join(cache_dir, cache_name) + if os.path.exists(cache_path): + # Check if there are snapshots + snapshots_dir = os.path.join(cache_path, "snapshots") + if os.path.exists(snapshots_dir) and os.listdir(snapshots_dir): + return True, cache_path + + return False, f"Not found in: {', '.join(cache_dirs)}" + + +def check_hf_repo_access(model_path: str) -> Tuple[bool, str]: + """ + Check if HuggingFace repository is accessible. 
+ + Returns: + Tuple of (is_accessible, message) + """ + if not HF_HUB_AVAILABLE: + return True, "huggingface_hub not available, skipping access check" + + try: + fs = HfFileSystem() + # Try to list files in the repo + files = fs.ls(model_path, detail=False) + if files: + return True, f"Repository accessible ({len(files)} files)" + else: + return False, "Repository exists but is empty" + except GatedRepoError: + return False, "GATED REPO - requires authentication/approval" + except RepositoryNotFoundError: + return False, "REPO NOT FOUND on HuggingFace" + except Exception as e: + error_msg = str(e) + if "401" in error_msg or "unauthorized" in error_msg.lower(): + return False, f"AUTH ERROR - may need HF_TOKEN: {error_msg[:100]}" + elif "404" in error_msg: + return False, f"NOT FOUND: {error_msg[:100]}" + elif "timeout" in error_msg.lower() or "connection" in error_msg.lower(): + return False, f"NETWORK ERROR: {error_msg[:100]}" + else: + return False, f"ERROR: {error_msg[:100]}" + + +def log_model_status(config: "BaseModelConfig") -> Tuple[bool, str]: + """ + Log detailed model availability status. + + Checks in order: + 1. local_path (if specified) - preferred local path + 2. model_path as local path (if starts with /) + 3. 
model_path as HF model ID - check cache then HF access + + Returns: + Tuple of (is_available, status_message) + """ + model_path = config.model_path + local_path = config.local_path + + print(f"\n📦 Checking model: {model_path}") + if local_path: + print(f" (preferred local: {local_path})") + print("-" * 50) + + # Step 1: Check preferred local_path first (if specified) + if local_path: + if os.path.exists(local_path): + print(f" ✅ LOCAL PATH: Found at {local_path}") + return True, f"Local path exists at {local_path}" + else: + print(f" ⚠️ LOCAL PATH: Not found at {local_path}, trying HF fallback...") + + # Step 2: For absolute paths (starting with /), check if exists + if model_path.startswith("/"): + if os.path.exists(model_path): + print(f" ✅ LOCAL PATH: Found at {model_path}") + return True, f"Local path exists at {model_path}" + else: + print(f" ❌ LOCAL PATH: Not found at {model_path}") + return False, f"Local path not found at {model_path}" + + # Step 3: For HF model IDs, check local cache first + is_cached, cache_msg = check_local_cache(model_path) + if is_cached: + print(f" ✅ LOCAL CACHE: Found at {cache_msg}") + return True, f"Cached locally at {cache_msg}" + else: + print(f" ⚠️ LOCAL CACHE: {cache_msg}") + + # Step 4: Check HF repo access (will download if accessible) + is_accessible, access_msg = check_hf_repo_access(model_path) + if is_accessible: + print(f" ✅ HF ACCESS: {access_msg}") + print( + f" 📥 Model will be downloaded from HuggingFace to {os.environ.get('HF_HOME', '~/.cache/huggingface')}" + ) + return True, f"Will download from HF: {access_msg}" + else: + print(f" ❌ HF ACCESS: {access_msg}") + return False, access_msg + + +# ============================================================================= +# BENCHMARK UTILITIES +# ============================================================================= + + +def get_one_example(lines, i, include_answer): + """Format a single GSM8K example.""" + ret = "Question: " + lines[i]["question"] + 
"\nAnswer:" + if include_answer: + ret += " " + lines[i]["answer"] + return ret + + +def get_few_shot_examples(lines, k): + """Get k few-shot examples for prompting.""" + ret = "" + for i in range(k): + ret += get_one_example(lines, i, True) + "\n\n" + return ret + + +def get_answer_value(answer_str): + """Extract numerical answer from response.""" + answer_str = answer_str.replace(",", "") + numbers = re.findall(r"\d+", answer_str) + if len(numbers) < 1: + return INVALID + try: + return ast.literal_eval(numbers[-1]) + except SyntaxError: + return INVALID + + +def run_gsm8k_benchmark( + base_url: str, + num_questions: int = 200, + num_shots: int = 5, + parallel: int = 64, +) -> Tuple[float, float, float]: + """Run GSM8K few-shot completion benchmark.""" + import sglang as sgl + from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint + + url = "https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/test.jsonl" + data_path = download_and_cache_file(url) + lines = list(read_jsonl(data_path)) + + few_shot_examples = get_few_shot_examples(lines, num_shots) + + questions = [] + labels = [] + for i in range(len(lines[:num_questions])): + questions.append(get_one_example(lines, i, False)) + labels.append(get_answer_value(lines[i]["answer"])) + assert all(l != INVALID for l in labels) + arguments = [{"question": q} for q in questions] + + @sgl.function + def few_shot_gsm8k(s, question): + s += few_shot_examples + question + s += sgl.gen( + "answer", max_tokens=512, stop=["Question", "Assistant:", "<|separator|>"] + ) + + backend = RuntimeEndpoint(base_url) + sgl.set_default_backend(backend) + + tic = time.perf_counter() + states = few_shot_gsm8k.run_batch( + arguments, + temperature=0, + num_threads=parallel, + progress_bar=True, + ) + latency = time.perf_counter() - tic + + preds = [] + for i in range(len(states)): + preds.append(get_answer_value(states[i]["answer"])) + + acc = np.mean(np.array(preds) == np.array(labels)) + 
invalid = np.mean(np.array(preds) == INVALID) + + return float(acc), float(invalid), float(latency) + + +def popen_launch_server_for_base_model( + base_url: str, + config: BaseModelConfig, +) -> "subprocess.Popen": + """Launch server for a base model with appropriate configuration.""" + env = os.environ.copy() + for key, value in config.env_vars.items(): + env[key] = value + print(f"Setting env: {key}={value}") + + other_args = list(config.other_args) + other_args.extend(["--tp", str(config.tp_size)]) + other_args.extend(["--log-level-http", "warning"]) + + if config.tokenizer_path: + other_args.extend(["--tokenizer-path", config.tokenizer_path]) + + timeout = config.timeout if config.timeout else DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH + + # Use effective model path (local if exists, else HF model ID) + effective_model_path = config.get_effective_model_path() + print(f"Using model path: {effective_model_path}") + + process = popen_launch_server( + model=effective_model_path, + base_url=base_url, + timeout=timeout, + other_args=other_args, + env=env, + ) + return process + + +class TestMI35xGsm8kCompletionEval(unittest.TestCase): + """MI35x GSM8K Completion Evaluation Test (8-GPU) + + Tests MI35x-specific base models using few-shot completion benchmark. 
+ """ + + @classmethod + def setUpClass(cls): + cls.model_group = get_model_group() + cls.models = get_models_for_group(cls.model_group) + cls.base_url = DEFAULT_URL_FOR_TEST + cls.num_questions = int(os.environ.get("GSM8K_NUM_QUESTIONS", "200")) + + print(f"\n{'='*60}") + print(f"MI35x GSM8K Completion Evaluation Test (8-GPU)") + print(f"{'='*60}") + print(f"Model group: {cls.model_group}") + print(f"Models to test: {len(cls.models)}") + for m in cls.models: + print(f" - {m.model_path}") + print(f"Questions per model: {cls.num_questions}") + print(f"{'='*60}\n") + + def test_gsm8k_completion_all_models(self): + """Test all configured MI35x models with GSM8K completion benchmark.""" + all_results = [] + total_test_start = time.time() + + summary = f"### MI35x Model Group: {self.model_group}\n\n" + summary += ( + "| Model | TP | Accuracy | Threshold | Startup | Bench | Total | Status |\n" + ) + summary += ( + "| ----- | -- | -------- | --------- | ------- | ----- | ----- | ------ |\n" + ) + + for config in self.models: + display_name = config.get_display_name() + with self.subTest(model=display_name): + print(f"\n{'='*60}") + print(f"Testing: {display_name} (TP={config.tp_size})") + print(f"{'='*60}") + + error_message = None + acc, invalid, latency = None, None, None + startup_time, bench_time, total_time = None, None, None + model_start = time.time() + + # Check model availability with detailed logging + is_available, status_msg = log_model_status(config) + + if not is_available: + print(f"\n❌ MODEL NOT AVAILABLE: {status_msg}") + print(f"⏭️ SKIPPING: {display_name}") + status = "⏭️ SKIP" + all_results.append( + { + "model": display_name, + "tp_size": config.tp_size, + "accuracy": None, + "threshold": config.accuracy_threshold, + "passed": True, + "skipped": True, + "error": status_msg, + } + ) + else: + try: + print(f"\n🚀 Launching server for {display_name}...") + server_start = time.time() + process = popen_launch_server_for_base_model( + self.base_url, config + 
) + startup_time = time.time() - server_start + print(f"⏱️ Server startup: {startup_time:.1f}s") + + try: + print( + f"📊 Running GSM8K benchmark ({self.num_questions} questions)..." + ) + bench_start = time.time() + for attempt in range(3): + try: + acc, invalid, latency = run_gsm8k_benchmark( + self.base_url, + num_questions=self.num_questions, + num_shots=5, + parallel=64, + ) + print( + f" Attempt {attempt + 1}: accuracy={acc:.3f}" + ) + if acc >= config.accuracy_threshold: + break + except Exception as e: + print(f" Attempt {attempt + 1} failed: {e}") + if attempt == 2: + raise + bench_time = time.time() - bench_start + total_time = time.time() - model_start + + passed = acc >= config.accuracy_threshold + status = "✅ PASS" if passed else "❌ FAIL" + + print( + f"\n📈 Results: accuracy={acc:.3f} (threshold: {config.accuracy_threshold})" + ) + print(f"⏱️ Total: {total_time:.1f}s") + + all_results.append( + { + "model": display_name, + "tp_size": config.tp_size, + "accuracy": acc, + "threshold": config.accuracy_threshold, + "startup_time": startup_time, + "bench_time": bench_time, + "total_time": total_time, + "passed": passed, + "skipped": False, + "error": None, + } + ) + + except Exception as e: + error_message = str(e) + total_time = time.time() - model_start + print(f"\n❌ Error: {error_message}") + status = "❌ ERROR" + all_results.append( + { + "model": display_name, + "tp_size": config.tp_size, + "accuracy": None, + "threshold": config.accuracy_threshold, + "passed": False, + "skipped": False, + "error": error_message, + } + ) + + finally: + print(f"\n🛑 Stopping server...") + kill_process_tree(process.pid) + + except Exception as e: + error_message = str(e) + total_time = time.time() - model_start + print(f"\n❌ Error launching server: {error_message}") + status = "❌ ERROR" + all_results.append( + { + "model": display_name, + "tp_size": config.tp_size, + "accuracy": None, + "threshold": config.accuracy_threshold, + "passed": False, + "skipped": False, + 
"error": error_message, + } + ) + + # Add to summary (use display name to show variant) + acc_str = f"{acc:.3f}" if acc is not None else "N/A" + startup_str = ( + f"{startup_time:.0f}s" if startup_time is not None else "N/A" + ) + bench_str = f"{bench_time:.0f}s" if bench_time is not None else "N/A" + total_str = f"{total_time:.0f}s" if total_time is not None else "N/A" + summary += f"| {display_name} | {config.tp_size} | {acc_str} | {config.accuracy_threshold} | {startup_str} | {bench_str} | {total_str} | {status} |\n" + + # Final summary + total_test_time = time.time() - total_test_start + failed_models = [ + r for r in all_results if not r["passed"] and not r.get("skipped", False) + ] + skipped_models = [r for r in all_results if r.get("skipped", False)] + passed_models = [ + r for r in all_results if r["passed"] and not r.get("skipped", False) + ] + + print(f"\n{'='*60}") + print(f"SUMMARY - MI35x Model Group: {self.model_group}") + print(f"{'='*60}") + print(summary) + print( + f"\n📊 Passed: {len(passed_models)} | Failed: {len(failed_models)} | Skipped: {len(skipped_models)}" + ) + print(f"⏱️ Total: {total_test_time:.1f}s ({total_test_time/60:.1f} min)") + + if is_in_ci(): + write_github_step_summary(summary) + + if failed_models: + failure_msg = "\n".join( + [ + f"- {r['model']}: {r.get('error', 'below threshold')}" + for r in failed_models + ] + ) + raise AssertionError(f"The following models failed:\n{failure_msg}") + + +if __name__ == "__main__": + unittest.main() diff --git a/test/srt/nightly/test_gsm8k_eval_amd.py b/test/registered/amd/nightly/test_gsm8k_eval_amd.py similarity index 96% rename from test/srt/nightly/test_gsm8k_eval_amd.py rename to test/registered/amd/nightly/test_gsm8k_eval_amd.py index 5a94a7780b35..5918c6e6e1f6 100644 --- a/test/srt/nightly/test_gsm8k_eval_amd.py +++ b/test/registered/amd/nightly/test_gsm8k_eval_amd.py @@ -1,3 +1,12 @@ +""" +AMD GSM8K Evaluation Test (Migrated from test/srt/nightly/) + +This test evaluates 
instruction-tuned models on the mgsm_en benchmark using chat completions. +Models are tested with various TP configurations on AMD GPUs. + +Registry: nightly-amd suite (2-GPU tests) +""" + import json import os import time @@ -6,6 +15,7 @@ from types import SimpleNamespace from sglang.srt.utils import kill_process_tree +from sglang.test.ci.ci_register import register_amd_ci from sglang.test.run_eval import run_eval from sglang.test.test_utils import ( DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1, @@ -21,6 +31,9 @@ write_results_to_json, ) +# Register for AMD CI - GSM8K evaluation tests (~60 min) +register_amd_ci(est_time=3600, suite="nightly-amd", nightly=True) + MODEL_SCORE_THRESHOLDS = { # Llama 3.1 series "meta-llama/Llama-3.1-8B-Instruct": 0.82, diff --git a/test/srt/nightly/test_vlms_mmmu_eval_amd.py b/test/registered/amd/nightly/test_vlms_mmmu_eval_amd.py similarity index 97% rename from test/srt/nightly/test_vlms_mmmu_eval_amd.py rename to test/registered/amd/nightly/test_vlms_mmmu_eval_amd.py index 1df3b15bc437..d9438e6ca602 100644 --- a/test/srt/nightly/test_vlms_mmmu_eval_amd.py +++ b/test/registered/amd/nightly/test_vlms_mmmu_eval_amd.py @@ -1,5 +1,5 @@ """ -AMD VLM MMMU Evaluation Test +AMD VLM MMMU Evaluation Test (Migrated from test/srt/nightly/) This test evaluates Vision-Language Models (VLMs) on the MMMU benchmark on AMD GPUs. Models are selected based on compatibility with AMD/ROCm platform. @@ -11,6 +11,8 @@ - deepseek-vl2-small Note: Some VLMs from the Nvidia test are excluded due to AMD compatibility issues. 
+ +Registry: nightly-amd-vlm suite (2-GPU VLM tests) """ import os @@ -20,6 +22,7 @@ from types import SimpleNamespace from sglang.srt.utils import kill_process_tree +from sglang.test.ci.ci_register import register_amd_ci from sglang.test.run_eval import run_eval from sglang.test.test_utils import ( DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, @@ -30,6 +33,9 @@ write_results_to_json, ) +# Register for AMD CI - VLM MMMU evaluation tests (~120 min) +register_amd_ci(est_time=7200, suite="nightly-amd-vlm", nightly=True) + # AMD-verified VLM models with conservative thresholds on 100 MMMU samples # Format: (model_path, tp_size, accuracy_threshold, extra_args) AMD_VLM_MODELS = [ diff --git a/test/registered/amd/test_deepseek_r1_mxfp4_perf.py b/test/registered/amd/test_deepseek_r1_mxfp4_perf.py new file mode 100644 index 000000000000..e1a0de107d92 --- /dev/null +++ b/test/registered/amd/test_deepseek_r1_mxfp4_perf.py @@ -0,0 +1,166 @@ +"""Nightly performance benchmark for DeepSeek-R1-MXFP4 model (MI35x). + +This test benchmarks the DeepSeek-R1-MXFP4 quantized model on MI35x with 8 GPUs. + +The model path can be configured via DEEPSEEK_R1_MXFP4_MODEL_PATH environment variable. 
+ +Example usage: + DEEPSEEK_R1_MXFP4_MODEL_PATH=/data2/models/amd-DeepSeek-R1-MXFP4-Preview python -m pytest test_deepseek_r1_mxfp4_perf.py -v +""" + +import os + +# Set HF cache to /data2/models/ for MI35x so HF models download there +os.environ.setdefault("HF_HOME", "/data2/models/huggingface") +os.environ.setdefault("HF_HUB_CACHE", "/data2/models/huggingface/hub") +import unittest +from typing import List + +from sglang.test.ci.ci_register import register_amd_ci +from sglang.test.nightly_bench_utils import BenchmarkResult +from sglang.test.nightly_utils import NightlyBenchmarkRunner +from sglang.test.test_utils import DEFAULT_URL_FOR_TEST, _parse_int_list_env + +# Register for AMD CI - DeepSeek-R1-MXFP4 benchmark (~300 min) +register_amd_ci( + est_time=18000, suite="nightly-perf-8-gpu-deepseek-r1-mxfp4", nightly=True +) + + +def generate_simple_markdown_report(results: List[BenchmarkResult]) -> str: + """Generate a simplified markdown report without traces and cost columns.""" + model_header = results[0].model_path + if results[0].run_name and results[0].run_name != "default": + model_header += f" ({results[0].run_name})" + + gpu_config = os.getenv("GPU_CONFIG", "") + if gpu_config: + model_header += f" [{gpu_config}]" + + summary = f"### {model_header}\n" + summary += "| batch size | input len | latency (s) | input throughput (tok/s) | output throughput (tok/s) | ITL (ms) |\n" + summary += "| ---------- | --------- | ----------- | ------------------------ | ------------------------- | -------- |\n" + + for result in results: + itl = 1 / (result.output_throughput / result.batch_size) * 1000 + summary += f"| {result.batch_size} | {result.input_len} | {result.latency:.2f} | {result.input_throughput:.2f} | {result.output_throughput:.2f} | {itl:.2f} |\n" + + return summary + + +# Model path configuration for MI35x DeepSeek-R1-MXFP4 +# Priority: 1) env var, 2) local path, 3) HuggingFace model ID +DEEPSEEK_R1_MXFP4_LOCAL_PATH = 
"/data2/models/amd-DeepSeek-R1-MXFP4-Preview"
+DEEPSEEK_R1_MXFP4_HF_MODEL_ID = "amd/DeepSeek-R1-MXFP4-Preview"
+PROFILE_DIR = "performance_profiles_deepseek_r1_mxfp4"
+
+
+def get_model_path() -> str:
+    """Get effective model path: env var > local path > HF model ID."""
+    # Check env var first
+    env_path = os.environ.get("DEEPSEEK_R1_MXFP4_MODEL_PATH")
+    if env_path:
+        return env_path
+    # Check local path
+    if os.path.exists(DEEPSEEK_R1_MXFP4_LOCAL_PATH):
+        return DEEPSEEK_R1_MXFP4_LOCAL_PATH
+    # Fall back to HF model ID
+    return DEEPSEEK_R1_MXFP4_HF_MODEL_ID
+
+
+class TestNightlyDeepseekR1MXFP4Performance(unittest.TestCase):
+    """Nightly performance benchmark for DeepSeek-R1-MXFP4 model (MI35x).
+
+    Tests the DeepSeek-R1-MXFP4 quantized model with TP=8 (basic variant only).
+    Uses local path if available, otherwise downloads from HuggingFace.
+    """
+
+    @classmethod
+    def setUpClass(cls):
+        cls.model = get_model_path()
+        print(f"Using model path: {cls.model}")
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.batch_sizes = [1, 1, 8, 16, 64]
+        cls.input_lens = tuple(_parse_int_list_env("NIGHTLY_INPUT_LENS", "4096"))
+        cls.output_lens = tuple(_parse_int_list_env("NIGHTLY_OUTPUT_LENS", "512"))
+
+        # Define variant configurations for DeepSeek-R1-MXFP4 on MI35x
+        # Only run basic variant for perf (DP/TC/MTP covered in accuracy tests)
+        cls.variants = [
+            {
+                "name": "basic",
+                "other_args": [
+                    "--trust-remote-code",
+                    "--tp",
+                    "8",
+                    "--chunked-prefill-size",
+                    "131072",
+                    "--disable-radix-cache",
+                    "--mem-fraction-static",
+                    "0.85",
+                ],
+            },
+        ]
+
+        cls.runner = NightlyBenchmarkRunner(PROFILE_DIR, cls.__name__, cls.base_url)
+        cls.runner.setup_profile_directory()
+        # Override full_report to remove traces help text
+        cls.runner.full_report = f"## {cls.__name__}\n"
+
+    def test_bench_one_batch(self):
+        """Run benchmark across all configured variants."""
+        failed_variants = []
+
+        # For local paths, check if exists. HF model IDs will download automatically.
+ is_local_path = self.model.startswith("/") + if is_local_path and not os.path.exists(self.model): + print(f"\n⏭️ SKIPPING: Local model not found at {self.model}") + self.runner.full_report += ( + f"\n⏭️ Test skipped: Local model not found at {self.model}\n" + ) + self.runner.write_final_report() + return + + # Log model source + if is_local_path: + print(f"📁 Using local model: {self.model}") + else: + print( + f"📥 Using HuggingFace model: {self.model} (will download if not cached)" + ) + + try: + for variant_config in self.variants: + with self.subTest(variant=variant_config["name"]): + result_tuple = self.runner.run_benchmark_for_model( + model_path=self.model, + batch_sizes=self.batch_sizes, + input_lens=self.input_lens, + output_lens=self.output_lens, + other_args=variant_config["other_args"], + variant=variant_config["name"], + extra_bench_args=["--trust-remote-code"], + ) + results = result_tuple[0] + success = result_tuple[1] + + if not success: + failed_variants.append(variant_config["name"]) + + # Use simplified report format without traces + if results: + self.runner.full_report += ( + generate_simple_markdown_report(results) + "\n" + ) + finally: + self.runner.write_final_report() + + if failed_variants: + raise AssertionError( + f"Benchmark failed for {self.model} with the following variants: " + f"{', '.join(failed_variants)}" + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/test/run_suite.py b/test/run_suite.py index e2944781cb84..1a687b0f864e 100644 --- a/test/run_suite.py +++ b/test/run_suite.py @@ -56,7 +56,13 @@ "nightly-perf-text-2-gpu", "nightly-perf-vlm-2-gpu", ], - HWBackend.AMD: ["nightly-amd", "nightly-amd-8-gpu"], + HWBackend.AMD: [ + "nightly-amd", + "nightly-amd-8-gpu", + "nightly-amd-vlm", + # MI35x 8-GPU suite (different model configs) + "nightly-amd-8-gpu-mi35x", + ], HWBackend.CPU: [], HWBackend.NPU: [ "nightly-1-npu-a3", diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py index 958c939b95d1..edd33cd18c85 
100644 --- a/test/srt/run_suite.py +++ b/test/srt/run_suite.py @@ -115,17 +115,9 @@ "per-commit-8-gpu-amd-mi35x": [ TestFile("test_deepseek_r1_mxfp4_8gpu.py", 3600), ], - "nightly-amd": [ - TestFile("nightly/test_gsm8k_eval_amd.py"), - ], - # AMD VLM tests using MMMU benchmark (2-GPU runner) - "nightly-amd-vlm": [ - TestFile("nightly/test_vlms_mmmu_eval_amd.py"), - ], - # AMD 8-GPU tests for base models using gsm8k completion benchmark - "nightly-amd-8-gpu": [ - TestFile("nightly/test_gsm8k_completion_eval_amd.py"), - ], + # NOTE: AMD nightly suites (nightly-amd, nightly-amd-vlm, nightly-amd-8-gpu) + # have been migrated to test/registered/amd/nightly/ and are now managed + # by test/run_suite.py using the registry system. } # Add Intel Xeon tests