diff --git a/.github/workflows/nightly-test-amd-rocm720.yml b/.github/workflows/nightly-test-amd-rocm720.yml
new file mode 100644
index 000000000000..1d0a55d6d599
--- /dev/null
+++ b/.github/workflows/nightly-test-amd-rocm720.yml
@@ -0,0 +1,868 @@
+name: Nightly Test (AMD ROCm 7.2)
+
+on:
+  schedule:
+    - cron: '0 2 * * *'
+  push:
+    branches:
+      - main
+    paths:
+      - "python/sglang/version.py"
+  workflow_dispatch:
+    inputs:
+      job_filter:
+        description: 'Select which job to run (leave empty or "all" to run all jobs)'
+        required: false
+        type: choice
+        default: 'all'
+        options:
+          - 'all'
+          # MI30x ROCm 7.2 Unit Tests
+          - 'nightly-test-1-gpu-unit-rocm720'
+          # MI30x ROCm 7.2 Accuracy Tests (GSM8K / MMMU)
+          - 'nightly-accuracy-2-gpu-rocm720'
+          - 'nightly-accuracy-2-gpu-vlm-rocm720'
+          - 'nightly-perf-2-gpu-text-rocm720'
+          - 'nightly-perf-2-gpu-vlm-rocm720'
+          - 'nightly-accuracy-8-gpu-rocm720'
+          # MI30x ROCm 7.2 Accuracy + Performance Tests (combined)
+          - 'nightly-8-gpu-grok1-int4-rocm720'
+          - 'nightly-8-gpu-grok2-rocm720'
+          - 'nightly-8-gpu-deepseek-v31-rocm720'
+          - 'nightly-8-gpu-deepseek-v32-rocm720'
+          - 'nightly-8-gpu-deepseek-v32-mtp-rocm720'
+          - 'nightly-8-gpu-kimi-k2-rocm720'
+          # MI35x ROCm 7.2 jobs
+          - 'nightly-test-1-gpu-mi35x-rocm720'
+          - 'nightly-accuracy-8-gpu-mi35x-rocm720'
+          - 'nightly-8-gpu-mi35x-grok1-int4-rocm720'
+          - 'nightly-8-gpu-mi35x-grok2-rocm720'
+          - 'nightly-8-gpu-mi35x-deepseek-r1-mxfp4-rocm720'
+          - 'nightly-accuracy-8-gpu-mi35x-deepseek-v32-rocm720'
+          - 'nightly-accuracy-8-gpu-mi35x-deepseek-v32-mtp-rocm720'
+          - 'nightly-perf-8-gpu-mi35x-deepseek-v32-basic-rocm720'
+          - 'nightly-perf-8-gpu-mi35x-deepseek-v32-mtp-rocm720'
+  workflow_call:
+    inputs:
+      ref:
+        description: 'Git ref (branch, tag, or SHA) to test. If not provided, uses the default branch.'
+        required: false
+        type: string
+        default: ''
+      job_filter:
+        description: 'Select which job to run (leave empty or "all" to run all jobs)'
+        required: false
+        type: string
+        default: 'all'
+
+concurrency:
+  group: nightly-test-amd-rocm720-${{ inputs.ref || github.ref }}
+  cancel-in-progress: ${{ github.event_name != 'workflow_call' }}
+
+jobs:
+  # ============================================== MI30x ROCm 7.2 Unit Tests ==============================================
+  # 1-GPU Unit Tests - LoRA, debug utils, scheduler, etc. (MI30x ROCm 7.2)
+  nightly-test-1-gpu-unit-rocm720:
+    if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-test-1-gpu-unit-rocm720')
+    runs-on: linux-mi325-gpu-1
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.ref || github.ref }}
+
+      - name: Setup docker (ROCm 7.2)
+        run: |
+          touch github_summary.md
+          bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+
+      - name: Install dependencies
+        run: bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-aiter-build
+
+      - name: Nightly Unit Test ROCm 7.2 (1-GPU)
+        timeout-minutes: 60
+        run: |
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-amd-1-gpu --nightly --timeout-per-file 600 --continue-on-error || TEST_EXIT_CODE=$?
+          # NOTE(review): restored from tag-stripped text — `$(<file)` is the bash
+          # read-file command substitution; it appends the summary to the step summary.
+          echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+  # ============================================== MI30x ROCm 7.2 Accuracy Tests ==============================================
+  # 2-GPU Accuracy Tests - GSM8K eval (MI30x ROCm 7.2)
+  nightly-accuracy-2-gpu-rocm720:
+    if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-accuracy-2-gpu-rocm720')
+    runs-on: linux-mi325-gpu-2
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.ref || github.ref }}
+
+      - name: Setup docker (ROCm 7.2)
+        run: |
+          touch github_summary.md
+          bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+
+      - name: Install dependencies
+        run: bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-aiter-build
+
+      - name: Nightly Test ROCm 7.2 (2-GPU)
+        run: |
+          > github_summary.md # Clear summary file
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-amd --nightly --timeout-per-file 7200 --continue-on-error || TEST_EXIT_CODE=$?
+          echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+  # 2-GPU VLM Accuracy Tests - Vision-Language Models MMMU evaluation (ROCm 7.2)
+  nightly-accuracy-2-gpu-vlm-rocm720:
+    if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-accuracy-2-gpu-vlm-rocm720')
+    runs-on: linux-mi325-gpu-2
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.ref || github.ref }}
+
+      - name: Setup docker (ROCm 7.2)
+        run: |
+          touch github_summary.md
+          bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+
+      - name: Install dependencies
+        run: bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-aiter-build
+
+      - name: Nightly Accuracy Test ROCm 7.2 (2-GPU VLM MMMU)
+        timeout-minutes: 180
+        run: |
+          > github_summary.md # Clear summary file
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-amd-accuracy-2-gpu-vlm --nightly --timeout-per-file 7200 --continue-on-error || TEST_EXIT_CODE=$?
+          echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+  # 2-GPU Text Models Performance Tests (ROCm 7.2)
+  nightly-perf-2-gpu-text-rocm720:
+    if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-perf-2-gpu-text-rocm720')
+    runs-on: linux-mi325-gpu-2
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.ref || github.ref }}
+
+      - name: Setup docker (ROCm 7.2)
+        run: |
+          touch github_summary.md
+          bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+
+      - name: Install dependencies
+        run: bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-aiter-build
+
+      - name: Performance Test ROCm 7.2 (2-GPU Text Models)
+        timeout-minutes: 120
+        run: |
+          > github_summary.md # Clear summary file
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e SGLANG_USE_AITER=1 \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-amd-perf-text-2-gpu --nightly --timeout-per-file 3600 --continue-on-error || TEST_EXIT_CODE=$?
+          echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+  # 2-GPU VLM Performance Tests (ROCm 7.2)
+  nightly-perf-2-gpu-vlm-rocm720:
+    if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-perf-2-gpu-vlm-rocm720')
+    runs-on: linux-mi325-gpu-2
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.ref || github.ref }}
+
+      - name: Setup docker (ROCm 7.2)
+        run: |
+          touch github_summary.md
+          bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+
+      - name: Install dependencies
+        run: bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-aiter-build
+
+      - name: Performance Test ROCm 7.2 (2-GPU VLM Models)
+        timeout-minutes: 180
+        run: |
+          > github_summary.md # Clear summary file
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e SGLANG_USE_AITER=1 \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-amd-perf-vlm-2-gpu --nightly --timeout-per-file 7200 --continue-on-error || TEST_EXIT_CODE=$?
+          echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+  # 8-GPU Accuracy Tests - GPT-OSS, Grok1-FP8 (ROCm 7.2)
+  nightly-accuracy-8-gpu-rocm720:
+    if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-accuracy-8-gpu-rocm720')
+    runs-on: linux-mi325-gpu-8
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.ref || github.ref }}
+
+      - name: Setup docker (ROCm 7.2)
+        run: |
+          touch github_summary.md
+          bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+
+      - name: Install dependencies
+        run: bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-aiter-build --skip-test-time-deps
+
+      - name: Accuracy Test ROCm 7.2 (8-GPU GPT-OSS)
+        timeout-minutes: 180
+        run: |
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-gpt-oss --nightly --timeout-per-file 7200 --continue-on-error || TEST_EXIT_CODE=$?
+          echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+      - name: Accuracy Test ROCm 7.2 (8-GPU Grok1-FP8)
+        timeout-minutes: 60
+        run: |
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e RCCL_MSCCL_ENABLE=0 \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-grok1-fp8 --nightly --timeout-per-file 3600 --continue-on-error || TEST_EXIT_CODE=$?
+          echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+  # ============================================== MI30x ROCm 7.2 Combined Accuracy + Performance Tests ==============================================
+  # 8-GPU Grok1-INT4 (Accuracy + Performance) ROCm 7.2
+  nightly-8-gpu-grok1-int4-rocm720:
+    if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-grok1-int4-rocm720')
+    runs-on: linux-mi325-gpu-8
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.ref || github.ref }}
+
+      - name: Setup docker (ROCm 7.2)
+        run: |
+          touch github_summary.md
+          bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+
+      - name: Install dependencies
+        run: bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-aiter-build --skip-test-time-deps
+
+      - name: Accuracy Test ROCm 7.2 (8-GPU Grok1-INT4)
+        timeout-minutes: 60
+        run: |
+          > github_summary.md # Clear summary file
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e RCCL_MSCCL_ENABLE=0 \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-grok1-int4 --nightly --timeout-per-file 3600 --continue-on-error || TEST_EXIT_CODE=$?
+          echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+      - name: Performance Test ROCm 7.2 (8-GPU Grok1-INT4)
+        timeout-minutes: 60
+        continue-on-error: true
+        run: |
+          > github_summary.md # Clear summary file
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e RCCL_MSCCL_ENABLE=0 \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-grok1-int4 --nightly --timeout-per-file 3600 --continue-on-error || TEST_EXIT_CODE=$?
+          echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+  # 8-GPU Grok2 (Accuracy + Performance) ROCm 7.2
+  nightly-8-gpu-grok2-rocm720:
+    if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-grok2-rocm720')
+    runs-on: linux-mi325-gpu-8
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.ref || github.ref }}
+
+      - name: Setup docker (ROCm 7.2)
+        run: |
+          touch github_summary.md
+          bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+
+      - name: Install dependencies
+        run: bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-aiter-build --skip-test-time-deps
+
+      - name: Accuracy Test ROCm 7.2 (8-GPU Grok2)
+        timeout-minutes: 60
+        run: |
+          > github_summary.md # Clear summary file
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e RCCL_MSCCL_ENABLE=0 \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-grok2 --nightly --timeout-per-file 3600 --continue-on-error || TEST_EXIT_CODE=$?
+          echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+      - name: Performance Test ROCm 7.2 (8-GPU Grok2)
+        timeout-minutes: 60
+        continue-on-error: true
+        run: |
+          > github_summary.md # Clear summary file
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e RCCL_MSCCL_ENABLE=0 \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-grok2 --nightly --timeout-per-file 3600 --continue-on-error || TEST_EXIT_CODE=$?
+          echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+  # 8-GPU DeepSeek-V3.1 (Accuracy + Performance) ROCm 7.2
+  nightly-8-gpu-deepseek-v31-rocm720:
+    if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-deepseek-v31-rocm720')
+    runs-on: linux-mi325-gpu-8
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.ref || github.ref }}
+
+      - name: Setup docker (ROCm 7.2)
+        run: |
+          touch github_summary.md
+          bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+
+      - name: Install dependencies
+        run: bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-aiter-build --skip-test-time-deps
+
+      - name: Accuracy Test ROCm 7.2 (8-GPU DeepSeek-V3.1)
+        timeout-minutes: 120
+        run: |
+          > github_summary.md # Clear summary file
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e SGLANG_USE_AITER=1 \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-deepseek-v31 --nightly --timeout-per-file 3600 --continue-on-error || TEST_EXIT_CODE=$?
+          echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+      - name: Performance Test ROCm 7.2 (8-GPU DeepSeek-V3.1)
+        timeout-minutes: 300
+        continue-on-error: true
+        run: |
+          > github_summary.md # Clear summary file
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e SGLANG_USE_ROCM700A=1 \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-deepseek-v31 --nightly --timeout-per-file 18000 --continue-on-error || TEST_EXIT_CODE=$?
+          echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+  # 8-GPU DeepSeek-V3.2 (Basic Accuracy + Perf) ROCm 7.2
+  nightly-8-gpu-deepseek-v32-rocm720:
+    if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-deepseek-v32-rocm720')
+    runs-on: linux-mi325-gpu-8
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.ref || github.ref }}
+
+      - name: Setup docker (ROCm 7.2)
+        run: |
+          touch github_summary.md
+          bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+
+      - name: Install dependencies
+        run: bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-aiter-build --skip-test-time-deps
+
+      - name: Accuracy Test ROCm 7.2 (8-GPU DeepSeek-V3.2 Basic)
+        timeout-minutes: 120
+        run: |
+          > github_summary.md # Clear summary file
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-deepseek-v32 --nightly --timeout-per-file 3600 --continue-on-error || TEST_EXIT_CODE=$?
+          echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+      - name: Performance Test ROCm 7.2 (8-GPU DeepSeek-V3.2 Basic)
+        timeout-minutes: 150
+        continue-on-error: true
+        run: |
+          > github_summary.md # Clear summary file
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-deepseek-v32-basic --nightly --timeout-per-file 5400 --continue-on-error || TEST_EXIT_CODE=$?
+          echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+  # 8-GPU DeepSeek-V3.2 MTP (MTP Accuracy + Perf) ROCm 7.2
+  nightly-8-gpu-deepseek-v32-mtp-rocm720:
+    if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-deepseek-v32-mtp-rocm720')
+    runs-on: linux-mi325-gpu-8
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.ref || github.ref }}
+
+      - name: Setup docker (ROCm 7.2)
+        run: |
+          touch github_summary.md
+          bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+
+      - name: Install dependencies
+        run: bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-aiter-build --skip-test-time-deps
+
+      - name: Accuracy Test ROCm 7.2 (8-GPU DeepSeek-V3.2 MTP)
+        timeout-minutes: 120
+        run: |
+          > github_summary.md # Clear summary file
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-deepseek-v32-mtp --nightly --timeout-per-file 3600 --continue-on-error || TEST_EXIT_CODE=$?
+          echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+      - name: Performance Test ROCm 7.2 (8-GPU DeepSeek-V3.2 MTP)
+        timeout-minutes: 180
+        continue-on-error: true
+        run: |
+          > github_summary.md # Clear summary file
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-deepseek-v32-mtp --nightly --timeout-per-file 7200 --continue-on-error || TEST_EXIT_CODE=$?
+          echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+  # 8-GPU Kimi-K2 (Accuracy + Speed) ROCm 7.2
+  nightly-8-gpu-kimi-k2-rocm720:
+    if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-kimi-k2-rocm720')
+    runs-on: linux-mi325-gpu-8
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.ref || github.ref }}
+
+      - name: Setup docker (ROCm 7.2)
+        run: |
+          touch github_summary.md
+          bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+
+      - name: Install dependencies
+        run: bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-aiter-build --skip-test-time-deps
+
+      - name: Accuracy Test ROCm 7.2 (8-GPU Kimi-K2)
+        timeout-minutes: 120
+        run: |
+          > github_summary.md # Clear summary file
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-kimi-k2 --nightly --timeout-per-file 3600 --continue-on-error || TEST_EXIT_CODE=$?
+          echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+  # ============================================== MI35x ROCm 7.2 Tests ==============================================
+  # MI35x 1-GPU ROCm 7.2 tests
+  nightly-test-1-gpu-mi35x-rocm720:
+    if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-test-1-gpu-mi35x-rocm720')
+    runs-on: linux-mi35x-gpu-1
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.ref || github.ref }}
+
+      - name: Setup docker (ROCm 7.2)
+        run: |
+          touch github_summary.md
+          bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+
+      - name: Install dependencies
+        run: |
+          bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-aiter-build
+
+      - name: Nightly Test MI35x ROCm 7.2 (1-GPU)
+        timeout-minutes: 60
+        run: |
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-amd-1-gpu-mi35x --nightly --timeout-per-file 600 --continue-on-error || TEST_EXIT_CODE=$?
+          echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+  # MI35x 8-GPU Accuracy Tests - GPT-OSS (ROCm 7.2)
+  nightly-accuracy-8-gpu-mi35x-rocm720:
+    if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-accuracy-8-gpu-mi35x-rocm720')
+    runs-on: linux-mi35x-gpu-8
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.ref || github.ref }}
+
+      - name: Setup docker (ROCm 7.2)
+        run: |
+          touch github_summary.md
+          bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+
+      - name: Install dependencies
+        run: |
+          bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-aiter-build --skip-test-time-deps
+          # Install tabulate for run_suite.py (missing in MI35x container)
+          bash scripts/ci/amd/amd_ci_exec.sh pip install tabulate
+
+      - name: Accuracy Test MI35x ROCm 7.2 (8-GPU GPT-OSS)
+        timeout-minutes: 180
+        run: |
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-amd-8-gpu-mi35x --nightly --timeout-per-file 7200 --continue-on-error || TEST_EXIT_CODE=$?
+          echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+  # MI35x 8-GPU Grok1-INT4 (Accuracy + Performance) ROCm 7.2
+  nightly-8-gpu-mi35x-grok1-int4-rocm720:
+    if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-mi35x-grok1-int4-rocm720')
+    runs-on: linux-mi35x-gpu-8
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.ref || github.ref }}
+
+      - name: Setup docker (ROCm 7.2)
+        run: |
+          touch github_summary.md
+          bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+
+      - name: Install dependencies
+        run: |
+          bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-aiter-build --skip-test-time-deps
+          # Install tabulate for run_suite.py (missing in MI35x container)
+          bash scripts/ci/amd/amd_ci_exec.sh pip install tabulate
+
+      - name: Accuracy Test MI35x ROCm 7.2 (8-GPU Grok1-INT4)
+        timeout-minutes: 60
+        run: |
+          > github_summary.md # Clear summary file
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e RCCL_MSCCL_ENABLE=0 \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-mi35x-grok1-int4 --nightly --timeout-per-file 3600 --continue-on-error || TEST_EXIT_CODE=$?
+          echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+      - name: Performance Test MI35x ROCm 7.2 (8-GPU Grok1-INT4)
+        timeout-minutes: 60
+        continue-on-error: true
+        run: |
+          > github_summary.md # Clear summary file
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e RCCL_MSCCL_ENABLE=0 \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-mi35x-grok1-int4 --nightly --timeout-per-file 3600 --continue-on-error || TEST_EXIT_CODE=$?
+          echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+  # MI35x 8-GPU Grok2 (Accuracy + Performance) ROCm 7.2
+  nightly-8-gpu-mi35x-grok2-rocm720:
+    if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-mi35x-grok2-rocm720')
+    runs-on: linux-mi35x-gpu-8
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.ref || github.ref }}
+
+      - name: Setup docker (ROCm 7.2)
+        run: |
+          touch github_summary.md
+          bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+
+      - name: Install dependencies
+        run: |
+          bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-aiter-build --skip-test-time-deps
+          # Install tabulate for run_suite.py (missing in MI35x container)
+          bash scripts/ci/amd/amd_ci_exec.sh pip install tabulate
+
+      - name: Accuracy Test MI35x ROCm 7.2 (8-GPU Grok2)
+        timeout-minutes: 60
+        run: |
+          > github_summary.md # Clear summary file
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e RCCL_MSCCL_ENABLE=0 \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-mi35x-grok2 --nightly --timeout-per-file 3600 --continue-on-error || TEST_EXIT_CODE=$?
+          echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+      - name: Performance Test MI35x ROCm 7.2 (8-GPU Grok2)
+        timeout-minutes: 60
+        continue-on-error: true
+        run: |
+          > github_summary.md # Clear summary file
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e RCCL_MSCCL_ENABLE=0 \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-mi35x-grok2 --nightly --timeout-per-file 3600 --continue-on-error || TEST_EXIT_CODE=$?
+          echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+  # MI35x 8-GPU DeepSeek-R1-MXFP4 (Accuracy + Performance) ROCm 7.2
+  nightly-8-gpu-mi35x-deepseek-r1-mxfp4-rocm720:
+    if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-mi35x-deepseek-r1-mxfp4-rocm720')
+    runs-on: linux-mi35x-gpu-8
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.ref || github.ref }}
+
+      - name: Setup docker (ROCm 7.2)
+        run: |
+          touch github_summary.md
+          bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+
+      - name: Install dependencies
+        run: |
+          bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-aiter-build --skip-test-time-deps
+          # Install tabulate for run_suite.py (missing in MI35x container)
+          bash scripts/ci/amd/amd_ci_exec.sh pip install tabulate
+
+      - name: Accuracy Test MI35x ROCm 7.2 (8-GPU DeepSeek-R1-MXFP4)
+        timeout-minutes: 180
+        run: |
+          > github_summary.md # Clear summary file
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-amd-8-gpu-mi35x-deepseek-r1-mxfp4 --nightly --timeout-per-file 7200 --continue-on-error || TEST_EXIT_CODE=$?
+          echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+      - name: Performance Test MI35x ROCm 7.2 (8-GPU DeepSeek-R1-MXFP4)
+        timeout-minutes: 300
+        continue-on-error: true
+        run: |
+          > github_summary.md # Clear summary file
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 registered/amd/perf/mi35x/test_deepseek_r1_mxfp4_perf_mi35x.py || TEST_EXIT_CODE=$?
+          echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+  # MI35x 8-GPU DeepSeek-V3.2 Accuracy Test (ROCm 7.2)
+  nightly-accuracy-8-gpu-mi35x-deepseek-v32-rocm720:
+    if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-accuracy-8-gpu-mi35x-deepseek-v32-rocm720')
+    runs-on: linux-mi35x-gpu-8
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.ref || github.ref }}
+
+      - name: Setup docker (ROCm 7.2)
+        run: |
+          touch github_summary.md
+          bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+
+      - name: Install dependencies
+        run: |
+          bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-aiter-build --skip-test-time-deps
+          # Install tabulate for run_suite.py (missing in MI35x container)
+          bash scripts/ci/amd/amd_ci_exec.sh pip install tabulate
+
+      - name: Accuracy Test MI35x ROCm 7.2 (8-GPU DeepSeek-V3.2)
+        timeout-minutes: 120
+        run: |
+          > github_summary.md # Clear summary file
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-amd-8-gpu-mi35x-deepseek-v32 --nightly --timeout-per-file 3600 --continue-on-error || TEST_EXIT_CODE=$?
+          echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+  # MI35x 8-GPU DeepSeek-V3.2 TP+MTP Accuracy Test (ROCm 7.2)
+  nightly-accuracy-8-gpu-mi35x-deepseek-v32-mtp-rocm720:
+    if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-accuracy-8-gpu-mi35x-deepseek-v32-mtp-rocm720')
+    runs-on: linux-mi35x-gpu-8
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.ref || github.ref }}
+
+      - name: Setup docker (ROCm 7.2)
+        run: |
+          touch github_summary.md
+          bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+
+      - name: Install dependencies
+        run: |
+          bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-aiter-build --skip-test-time-deps
+          # Install tabulate for run_suite.py (missing in MI35x container)
+          bash scripts/ci/amd/amd_ci_exec.sh pip install tabulate
+
+      - name: Accuracy Test MI35x ROCm 7.2 (8-GPU DeepSeek-V3.2 TP+MTP)
+        timeout-minutes: 120
+        run: |
+          > github_summary.md # Clear summary file
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-mi35x-deepseek-v32-mtp --nightly --timeout-per-file 3600 --continue-on-error || TEST_EXIT_CODE=$?
+          echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+  # MI35x 8-GPU DeepSeek-V3.2 Performance Test (Basic) ROCm 7.2
+  nightly-perf-8-gpu-mi35x-deepseek-v32-basic-rocm720:
+    if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-perf-8-gpu-mi35x-deepseek-v32-basic-rocm720')
+    runs-on: linux-mi35x-gpu-8
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.ref || github.ref }}
+
+      - name: Setup docker (ROCm 7.2)
+        run: |
+          touch github_summary.md
+          bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+
+      - name: Install dependencies
+        run: |
+          bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-aiter-build --skip-test-time-deps
+          # Install tabulate for run_suite.py (missing in MI35x container)
+          bash scripts/ci/amd/amd_ci_exec.sh pip install tabulate
+
+      - name: Performance Test MI35x ROCm 7.2 (8-GPU DeepSeek-V3.2 Basic)
+        timeout-minutes: 150
+        run: |
+          > github_summary.md # Clear summary file
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-mi35x-deepseek-v32-basic --nightly --timeout-per-file 5400 --continue-on-error || TEST_EXIT_CODE=$?
+ echo "$(> $GITHUB_STEP_SUMMARY || true + exit ${TEST_EXIT_CODE:-0} + + # MI35x 8-GPU DeepSeek-V3.2 Performance Test (MTP) ROCm 7.2 + nightly-perf-8-gpu-mi35x-deepseek-v32-mtp-rocm720: + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-perf-8-gpu-mi35x-deepseek-v32-mtp-rocm720') + runs-on: linux-mi35x-gpu-8 + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ inputs.ref || github.ref }} + + - name: Setup docker (ROCm 7.2) + run: | + touch github_summary.md + bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720 + env: + GITHUB_WORKSPACE: ${{ github.workspace }} + + - name: Install dependencies + run: | + bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-aiter-build --skip-test-time-deps + # Install tabulate for run_suite.py (missing in MI35x container) + bash scripts/ci/amd/amd_ci_exec.sh pip install tabulate + + - name: Performance Test MI35x ROCm 7.2 (8-GPU DeepSeek-V3.2 MTP) + timeout-minutes: 180 + run: | + > github_summary.md # Clear summary file + bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ + -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ + python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-mi35x-deepseek-v32-mtp --nightly --timeout-per-file 7200 --continue-on-error || TEST_EXIT_CODE=$? 
+ echo "$(> $GITHUB_STEP_SUMMARY || true + exit ${TEST_EXIT_CODE:-0} + + check-all-jobs: + if: always() && (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request' || github.event_name == 'workflow_dispatch') + needs: + # MI30x ROCm 7.2 Unit Tests + - nightly-test-1-gpu-unit-rocm720 + # MI30x ROCm 7.2 Accuracy Tests + - nightly-accuracy-2-gpu-rocm720 + - nightly-accuracy-2-gpu-vlm-rocm720 + # MI30x ROCm 7.2 Performance Tests + - nightly-perf-2-gpu-text-rocm720 + - nightly-perf-2-gpu-vlm-rocm720 + - nightly-accuracy-8-gpu-rocm720 + # MI30x ROCm 7.2 Combined Accuracy + Performance Tests + - nightly-8-gpu-grok1-int4-rocm720 + - nightly-8-gpu-grok2-rocm720 + - nightly-8-gpu-deepseek-v31-rocm720 + - nightly-8-gpu-deepseek-v32-rocm720 + - nightly-8-gpu-deepseek-v32-mtp-rocm720 + - nightly-8-gpu-kimi-k2-rocm720 + # MI35x ROCm 7.2 jobs + - nightly-test-1-gpu-mi35x-rocm720 + - nightly-accuracy-8-gpu-mi35x-rocm720 + - nightly-8-gpu-mi35x-grok1-int4-rocm720 + - nightly-8-gpu-mi35x-grok2-rocm720 + - nightly-8-gpu-mi35x-deepseek-r1-mxfp4-rocm720 + - nightly-accuracy-8-gpu-mi35x-deepseek-v32-rocm720 + - nightly-accuracy-8-gpu-mi35x-deepseek-v32-mtp-rocm720 + - nightly-perf-8-gpu-mi35x-deepseek-v32-basic-rocm720 + - nightly-perf-8-gpu-mi35x-deepseek-v32-mtp-rocm720 + runs-on: ubuntu-latest + steps: + - name: Check if any job failed + run: | + if [[ "${{ contains(needs.*.result, 'failure') }}" == "true" ]]; then + echo "One or more ROCm 7.2 nightly test jobs failed" + exit 1 + fi + if [[ "${{ contains(needs.*.result, 'cancelled') }}" == "true" ]]; then + echo "One or more ROCm 7.2 nightly test jobs were cancelled" + exit 1 + fi + echo "All ROCm 7.2 nightly test jobs passed" diff --git a/.github/workflows/pr-test-amd-rocm720.yml b/.github/workflows/pr-test-amd-rocm720.yml new file mode 100644 index 000000000000..d47168a187aa --- /dev/null +++ b/.github/workflows/pr-test-amd-rocm720.yml @@ -0,0 +1,793 @@ +name: PR Test ROCm 7.2 (AMD) +# Dynamic run-name for 
/rerun-stage commands to enable URL lookup +# Format: "[stage-name] sha" for fork PRs, "[stage-name]" for non-fork, default for normal runs +run-name: ${{ inputs.target_stage && (inputs.pr_head_sha && format('[{0}] {1}', inputs.target_stage, inputs.pr_head_sha) || format('[{0}]', inputs.target_stage)) || '' }} + +on: + # run rocm 720 pr tests once a day at 2am UTC to avoid overwhelming the CI system + schedule: + - cron: '0 2 * * *' + # push: + # branches: [ main ] + # paths: + # - "python/**" + # - "scripts/ci/**" + # - "test/**" + # - "sgl-kernel/**" + # - ".github/workflows/pr-test-amd-rocm720.yml" + # - "docker/rocm720.Dockerfile" + # pull_request: + # branches: [ main ] + # paths: + # - "python/**" + # - "scripts/ci/**" + # - "test/**" + # - "sgl-kernel/**" + # - ".github/workflows/pr-test-amd-rocm720.yml" + # - "docker/rocm720.Dockerfile" + workflow_dispatch: + inputs: + target_stage: + description: "Specific stage to run (optional, for quick testing)" + required: false + type: string + default: "" + pr_head_sha: + description: "PR head SHA to checkout (for /rerun-stage on fork PRs)" + required: false + type: string + default: "" + workflow_call: + inputs: + ref: + description: 'Git ref (branch, tag, or SHA) to test. If not provided, uses the default branch.' 
+ required: false + type: string + default: '' + run_all_tests: + description: "Run all tests (for releasing or testing purpose)" + required: false + type: boolean + default: false + +concurrency: + # Include pr_head_sha in group for /rerun-stage dispatches to avoid collisions with main branch runs + group: pr-test-amd-rocm720-${{ inputs.pr_head_sha || inputs.ref || github.ref }} + cancel-in-progress: ${{ github.event_name != 'workflow_call' }} + +jobs: + call-gate: + uses: ./.github/workflows/pr-gate.yml + secrets: inherit + check-changes: + needs: [call-gate] + runs-on: ubuntu-latest + outputs: + main_package: ${{ steps.filter.outputs.main_package || steps.run-mode.outputs.run_all_tests }} + sgl_kernel: ${{ steps.filter.outputs.sgl_kernel || steps.run-mode.outputs.run_all_tests }} + multimodal_gen: ${{ steps.filter.outputs.multimodal_gen || steps.run-mode.outputs.run_all_tests }} + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }} + + - name: Determine run mode + id: run-mode + run: | + # Run all tests for workflow_call (when ref input is provided) + # Note: github.event_name is inherited from caller, so we detect workflow_call by checking inputs.ref + if [[ "${{ inputs.run_all_tests }}" == "true" ]]; then + echo "run_all_tests=true" >> $GITHUB_OUTPUT + echo "Run mode: ALL TESTS (run_all_tests=${{ inputs.run_all_tests }})" + else + echo "run_all_tests=false" >> $GITHUB_OUTPUT + echo "Run mode: FILTERED (triggered by ${{ github.event_name }})" + fi + + - name: Detect file changes + id: filter + uses: dorny/paths-filter@v3 + if: steps.run-mode.outputs.run_all_tests != 'true' + with: + filters: | + main_package: + - "python/sglang/!(multimodal_gen)/**" + - "python/pyproject_rocm.toml" + - "python/pyproject_other.toml" + - "scripts/ci/amd/*" + - "scripts/ci/utils/*" + - "test/**" + - ".github/workflows/pr-test-amd-rocm720.yml" + sgl_kernel: + - "sgl-kernel/**" + - 
".github/workflows/pr-test-amd-rocm720.yml" + multimodal_gen: + - "python/sglang/multimodal_gen/**" + - "python/sglang/cli/**" + - "python/pyproject_rocm.toml" + - "python/pyproject_other.toml" + + # =============================================== sgl-kernel ==================================================== + sgl-kernel-unit-test-amd: + needs: [check-changes] + if: | + always() && + ( + (inputs.target_stage == 'sgl-kernel-unit-test-amd') || + ( + !inputs.target_stage && + needs.check-changes.outputs.sgl_kernel == 'true' + ) + ) + strategy: + fail-fast: false + matrix: + runner: [linux-mi325-gpu-1] + runs-on: ${{matrix.runner}} + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }} + + - name: Ensure VRAM is clear + run: bash scripts/ensure_vram_clear.sh rocm + + - name: Start CI container + run: bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720 + env: + GITHUB_WORKSPACE: ${{ github.workspace }} + + - name: Install dependencies + run: | + bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-aiter-build + + - name: Run test + timeout-minutes: 14 + run: | + docker exec -w /sglang-checkout/sgl-kernel/tests ci_sglang python3 -m pytest test_moe_align.py + docker exec -w /sglang-checkout/sgl-kernel/tests ci_sglang python3 -m pytest test_moe_topk_softmax.py + docker exec -w /sglang-checkout/sgl-kernel/tests/speculative ci_sglang python3 -m pytest test_eagle_utils.py + docker exec -w /sglang-checkout/sgl-kernel/tests ci_sglang python3 -m pytest test_apply_token_bitmask_inplace.py + docker exec -w /sglang-checkout/sgl-kernel/tests ci_sglang python3 -m pytest test_activation.py + docker exec -w /sglang-checkout/sgl-kernel/tests ci_sglang python3 -m pytest test_topk.py + docker exec -w /sglang-checkout/sgl-kernel/tests ci_sglang python3 -m pytest test_kvcacheio.py + docker exec -w /sglang-checkout/sgl-kernel/tests ci_sglang python3 -m pytest test_moe_topk_sigmoid.py + docker 
exec -w /sglang-checkout/sgl-kernel/tests ci_sglang python3 -m pytest test_torch_defaults_reset.py + + sgl-kernel-unit-test-2-gpu-amd: + needs: [check-changes] + if: | + always() && + ( + (inputs.target_stage == 'sgl-kernel-unit-test-2-gpu-amd') || + ( + !inputs.target_stage && + needs.check-changes.outputs.sgl_kernel == 'true' + ) + ) + strategy: + fail-fast: false + matrix: + runner: [linux-mi325-gpu-2] + runs-on: ${{matrix.runner}} + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }} + + - name: Ensure VRAM is clear + run: bash scripts/ensure_vram_clear.sh rocm + + - name: Start CI container + run: bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720 + env: + GITHUB_WORKSPACE: ${{ github.workspace }} + + - name: Install dependencies + run: | + bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-aiter-build + + - name: Run test + timeout-minutes: 20 + run: | + docker exec -w /sglang-checkout/sgl-kernel/tests ci_sglang python3 -m pytest test_amd_deterministic_custom_allreduce.py + docker exec -w /sglang-checkout/sgl-kernel/tests ci_sglang python3 -m pytest test_amd_nccl_allreduce_determinism.py + + # =============================================== primary ==================================================== + + stage-a-test-1-amd: + needs: [check-changes] + if: | + always() && + ( + (inputs.target_stage == 'stage-a-test-1-amd') || + ( + !inputs.target_stage && + (!failure() && !cancelled()) && + ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true')) + ) + ) + strategy: + fail-fast: false + matrix: + runner: [linux-mi325-gpu-1] + runs-on: ${{matrix.runner}} + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }} + + - name: Ensure VRAM is clear + run: bash scripts/ensure_vram_clear.sh rocm + + - name: Start CI container + run: bash 
scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720 + env: + GITHUB_WORKSPACE: ${{ github.workspace }} + + - name: Install dependencies + run: | + bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-aiter-build + + - name: Run test + timeout-minutes: 10 + run: | + bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-a-test-1-amd --continue-on-error + + stage-b-test-small-1-gpu-amd: + needs: [check-changes] + if: | + always() && + ( + (inputs.target_stage == 'stage-b-test-small-1-gpu-amd') || + ( + !inputs.target_stage && + (!failure() && !cancelled()) && + ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true')) + ) + ) + strategy: + fail-fast: false + matrix: + runner: [linux-mi325-gpu-1] + part: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12] + runs-on: ${{matrix.runner}} + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }} + + - name: Ensure VRAM is clear + run: bash scripts/ensure_vram_clear.sh rocm + + - name: Start CI container + run: bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720 + env: + GITHUB_WORKSPACE: ${{ github.workspace }} + + - name: Install dependencies + run: bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-aiter-build + + - name: Run test + timeout-minutes: 30 + run: | + bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-small-1-gpu-amd --auto-partition-id ${{ matrix.part }} --auto-partition-size 13 --timeout-per-file 1800 --continue-on-error + + stage-b-test-small-1-gpu-amd-mi35x: + needs: [check-changes] + if: | + always() && + ( + (inputs.target_stage == 'stage-b-test-small-1-gpu-amd-mi35x') || + ( + !inputs.target_stage && + (!failure() && !cancelled()) && + ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true')) 
+ ) + ) + strategy: + fail-fast: false + matrix: + runner: [linux-mi35x-gpu-1] + runs-on: ${{matrix.runner}} + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }} + + - name: Ensure VRAM is clear + run: bash scripts/ensure_vram_clear.sh rocm + + - name: Start CI container + run: bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720 + env: + GITHUB_WORKSPACE: ${{ github.workspace }} + + - name: Install dependencies + run: bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-aiter-build + + - name: Run test + timeout-minutes: 30 + run: | + bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-small-1-gpu-amd-mi35x --continue-on-error + + stage-b-test-large-1-gpu-amd: + needs: [check-changes] + if: | + always() && + ( + (inputs.target_stage == 'stage-b-test-large-1-gpu-amd') || + ( + !inputs.target_stage && + (!failure() && !cancelled()) && + ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true')) + ) + ) + strategy: + fail-fast: false + matrix: + runner: [linux-mi325-gpu-1] + part: [0, 1] + runs-on: ${{matrix.runner}} + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }} + + - name: Ensure VRAM is clear + run: bash scripts/ensure_vram_clear.sh rocm + + - name: Start CI container + run: bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720 + env: + GITHUB_WORKSPACE: ${{ github.workspace }} + + - name: Install dependencies + run: bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-aiter-build + + - name: Run test + timeout-minutes: 30 + run: | + bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-large-1-gpu-amd --auto-partition-id ${{ matrix.part }} --auto-partition-size 2 --timeout-per-file 1800 
--continue-on-error + + stage-b-test-large-2-gpu-amd: + needs: [check-changes] + if: | + always() && + ( + (inputs.target_stage == 'stage-b-test-large-2-gpu-amd') || + ( + !inputs.target_stage && + (!failure() && !cancelled()) && + ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true')) + ) + ) + strategy: + fail-fast: false + matrix: + runner: [linux-mi325-gpu-2] + part: [0, 1] + runs-on: ${{matrix.runner}} + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }} + + - name: Ensure VRAM is clear + run: bash scripts/ensure_vram_clear.sh rocm + + - name: Start CI container + run: bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720 + env: + GITHUB_WORKSPACE: ${{ github.workspace }} + + - name: Install dependencies + run: bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-aiter-build + + - name: Run test + timeout-minutes: 30 + run: | + bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-large-2-gpu-amd --auto-partition-id ${{ matrix.part }} --auto-partition-size 2 --timeout-per-file 1800 --continue-on-error + + multimodal-gen-test-1-gpu-amd: + needs: [check-changes] + if: | + always() && + ( + (inputs.target_stage == 'multimodal-gen-test-1-gpu-amd') || + ( + !inputs.target_stage && + (!failure() && !cancelled()) && + ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true')) + ) + ) + strategy: + fail-fast: false + max-parallel: 1 # Run one at a time to avoid eviction from resource exhaustion during AITER kernel JIT + matrix: + runner: [linux-mi325-gpu-1] + part: [0, 1] # 2 partitions: 11 tests ÷ 2 = ~5-6 tests each + runs-on: ${{matrix.runner}} + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }} + + - name: Ensure VRAM is clear + run: 
bash scripts/ensure_vram_clear.sh rocm + + - name: Download artifacts + if: needs.check-changes.outputs.sgl_kernel == 'true' + uses: actions/download-artifact@v4 + with: + path: sgl-kernel/dist/ + merge-multiple: true + pattern: wheel-python3.10-cuda12.9 + + - name: Start CI container + run: bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720 + env: + GITHUB_WORKSPACE: ${{ github.workspace }} + + - name: Install dependencies + run: | + bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-aiter-build diffusion + docker exec ci_sglang pip install amdsmi + + - name: Setup kernel caches + run: | + # Use the persistent /sgl-data directory (mounted from /home/runner/sgl-data) + # This directory persists across container restarts on the self-hosted runner + docker exec ci_sglang mkdir -p /sgl-data/aiter-kernels /sgl-data/miopen-cache /sgl-data/hf-cache/hub + + # Clear pre-built AITER kernels from Docker image to avoid segfaults + # The image may have stale/incompatible kernels at /sgl-workspace/aiter/aiter/jit/ + echo "Clearing pre-built AITER kernels from Docker image..." 
+ docker exec ci_sglang rm -rf /sgl-workspace/aiter/aiter/jit/*.so 2>/dev/null || true + docker exec ci_sglang rm -rf /sgl-data/aiter-kernels/*.so 2>/dev/null || true + echo "AITER kernels cleared - will be rebuilt on first use" + + # Create persistent cache marker if /sgl-data is a real mount (not ephemeral) + # This tells the test cleanup code to NOT delete downloaded models + if docker exec ci_sglang test -d /sgl-data && docker exec ci_sglang mountpoint -q /sgl-data 2>/dev/null; then + docker exec ci_sglang touch /sgl-data/hf-cache/.persistent_cache + echo "Created .persistent_cache marker - HF cache will persist" + else + echo "WARNING: /sgl-data is not a mount point - models will be cleaned up after each test" + fi + + # Check MIOpen cache (VAE convolution kernels) + miopen_files=$(docker exec ci_sglang find /sgl-data/miopen-cache -name "*.udb" 2>/dev/null | wc -l || echo "0") + echo "Found ${miopen_files} MIOpen cache files" + + - name: Diagnose HF cache and system resources + run: | + echo "=== System Memory Status ===" + free -h + echo "" + echo "=== Disk Space ===" + df -h /home/runner/sgl-data 2>/dev/null || df -h + echo "" + echo "=== HF Cache Directory Structure ===" + docker exec ci_sglang ls -la /sgl-data/hf-cache/ 2>/dev/null || echo "HF cache dir not found" + docker exec ci_sglang ls -la /sgl-data/hf-cache/hub/ 2>/dev/null || echo "HF hub cache not found" + echo "" + echo "=== Checking for cached diffusion models (1-GPU tests) ===" + # Models used in 1-GPU tests: Wan2.1-T2V-1.3B, HunyuanVideo, Qwen-Image, FLUX.1, FLUX.2 + for model in "Wan-AI--Wan2.1-T2V-1.3B-Diffusers" "tencent--HunyuanVideo" "Qwen--Qwen-Image" "black-forest-labs--FLUX.1-dev" "black-forest-labs--FLUX.2-dev"; do + cache_path="/sgl-data/hf-cache/hub/models--${model}" + if docker exec ci_sglang test -d "$cache_path"; then + size=$(docker exec ci_sglang du -sh "$cache_path" 2>/dev/null | cut -f1) + echo "✓ CACHED: $model ($size)" + else + echo "✗ NOT CACHED: $model" + fi + done + echo 
"" + echo "=== GPU Memory Status ===" + docker exec ci_sglang rocm-smi --showmeminfo vram 2>/dev/null || echo "rocm-smi not available" + + - name: Run diffusion server tests (1-GPU) + timeout-minutes: 60 + run: | + # AMD CI: All 1-GPU tests except FLUX.2 (FLUX.1 covers same code path) + # Tests: T2V, T2I, I2V, LoRA + # + # HF download env vars: + # - HF_HUB_ENABLE_HF_TRANSFER=1: Use faster hf_transfer for downloads (if available) + # - HF_HUB_DISABLE_SYMLINKS_WARNING=1: Suppress symlink warnings + docker exec \ + -e SGLANG_E2E_TOLERANCE=0.3 \ + -e SGLANG_STAGE_TIME_TOLERANCE=0.2 \ + -e SGLANG_NON_DENOISE_STAGE_TIME_TOLERANCE=0.6 \ + -e SGLANG_DENOISE_STEP_TOLERANCE=0.6 \ + -e SGLANG_DENOISE_AGG_TOLERANCE=0.3 \ + -e SGLANG_TEST_NUM_INFERENCE_STEPS=5 \ + -e AITER_JIT_DIR=/sgl-data/aiter-kernels \ + -e MIOPEN_USER_DB_PATH=/sgl-data/miopen-cache \ + -e HF_HUB_ENABLE_HF_TRANSFER=1 \ + -e HF_HUB_DISABLE_SYMLINKS_WARNING=1 \ + -w /sglang-checkout/python \ + ci_sglang python3 sglang/multimodal_gen/test/run_suite.py \ + --suite 1-gpu \ + --partition-id ${{ matrix.part }} \ + --total-partitions 2 \ + -k "not flux_2" + + # Post-test diagnostics + echo "=== Post-test System Memory Status ===" + free -h + + multimodal-gen-test-2-gpu-amd: + needs: [check-changes] + if: | + always() && + ( + (inputs.target_stage == 'multimodal-gen-test-2-gpu-amd') || + ( + !inputs.target_stage && + (!failure() && !cancelled()) && + ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true')) + ) + ) + strategy: + fail-fast: false + max-parallel: 1 # Run one at a time to avoid eviction from resource exhaustion during AITER kernel JIT + matrix: + runner: [linux-mi325-gpu-2] + part: [0, 1] # 2 partitions: 9 tests ÷ 2 = ~4-5 tests each + runs-on: ${{matrix.runner}} + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }} + + - name: Ensure VRAM is clear + run: bash 
scripts/ensure_vram_clear.sh rocm + + - name: Download artifacts + if: needs.check-changes.outputs.sgl_kernel == 'true' + uses: actions/download-artifact@v4 + with: + path: sgl-kernel/dist/ + merge-multiple: true + pattern: wheel-python3.10-cuda12.9 + + - name: Start CI container + run: bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720 + env: + GITHUB_WORKSPACE: ${{ github.workspace }} + + - name: Install dependencies + run: | + bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-aiter-build diffusion + docker exec ci_sglang pip install amdsmi + + - name: Setup kernel caches + run: | + # Use the persistent /sgl-data directory (mounted from /home/runner/sgl-data) + docker exec ci_sglang mkdir -p /sgl-data/aiter-kernels /sgl-data/miopen-cache /sgl-data/hf-cache/hub + + # Clear pre-built AITER kernels from Docker image to avoid segfaults + # The image may have stale/incompatible kernels at /sgl-workspace/aiter/aiter/jit/ + echo "Clearing pre-built AITER kernels from Docker image..." 
+ docker exec ci_sglang rm -rf /sgl-workspace/aiter/aiter/jit/*.so 2>/dev/null || true + docker exec ci_sglang rm -rf /sgl-data/aiter-kernels/*.so 2>/dev/null || true + echo "AITER kernels cleared - will be rebuilt on first use" + + # Create persistent cache marker if /sgl-data is a real mount (not ephemeral) + # This tells the test cleanup code to NOT delete downloaded models + if docker exec ci_sglang test -d /sgl-data && docker exec ci_sglang mountpoint -q /sgl-data 2>/dev/null; then + docker exec ci_sglang touch /sgl-data/hf-cache/.persistent_cache + echo "Created .persistent_cache marker - HF cache will persist" + else + echo "WARNING: /sgl-data is not a mount point - models will be cleaned up after each test" + fi + + # Check MIOpen cache (VAE convolution kernels) + miopen_files=$(docker exec ci_sglang find /sgl-data/miopen-cache -name "*.udb" 2>/dev/null | wc -l || echo "0") + echo "Found ${miopen_files} MIOpen cache files" + + - name: Diagnose HF cache and system resources + run: | + echo "=== System Memory Status ===" + free -h + echo "" + echo "=== Disk Space ===" + df -h /home/runner/sgl-data 2>/dev/null || df -h + echo "" + echo "=== HF Cache Directory Structure ===" + docker exec ci_sglang ls -la /sgl-data/hf-cache/ 2>/dev/null || echo "HF cache dir not found" + docker exec ci_sglang ls -la /sgl-data/hf-cache/hub/ 2>/dev/null || echo "HF hub cache not found" + echo "" + echo "=== Checking for cached diffusion models (2-GPU tests) ===" + # Models used in 2-GPU tests: Wan2.2-T2V-A14B, Wan2.1-T2V-14B, Qwen-Image, FLUX.1 + for model in "Wan-AI--Wan2.2-T2V-A14B-Diffusers" "Wan-AI--Wan2.1-T2V-14B-Diffusers" "Qwen--Qwen-Image" "black-forest-labs--FLUX.1-dev"; do + cache_path="/sgl-data/hf-cache/hub/models--${model}" + if docker exec ci_sglang test -d "$cache_path"; then + size=$(docker exec ci_sglang du -sh "$cache_path" 2>/dev/null | cut -f1) + echo "✓ CACHED: $model ($size)" + else + echo "✗ NOT CACHED: $model" + fi + done + echo "" + echo "=== GPU Memory 
Status ===" + docker exec ci_sglang rocm-smi --showmeminfo vram 2>/dev/null || echo "rocm-smi not available" + + - name: Run diffusion server tests (2-GPU) + timeout-minutes: 80 + run: | + # AMD CI: All 2-GPU tests including LoRA + # Tests: T2V, T2I, I2V, LoRA + # + # HF download env vars: + # - HF_HUB_ENABLE_HF_TRANSFER=1: Use faster hf_transfer for downloads (if available) + # - HF_HUB_DISABLE_SYMLINKS_WARNING=1: Suppress symlink warnings + docker exec \ + -e SGLANG_E2E_TOLERANCE=0.3 \ + -e SGLANG_STAGE_TIME_TOLERANCE=0.2 \ + -e SGLANG_NON_DENOISE_STAGE_TIME_TOLERANCE=0.6 \ + -e SGLANG_DENOISE_STEP_TOLERANCE=0.6 \ + -e SGLANG_DENOISE_AGG_TOLERANCE=0.3 \ + -e SGLANG_TEST_NUM_INFERENCE_STEPS=5 \ + -e AITER_JIT_DIR=/sgl-data/aiter-kernels \ + -e MIOPEN_USER_DB_PATH=/sgl-data/miopen-cache \ + -e HF_HUB_ENABLE_HF_TRANSFER=1 \ + -e HF_HUB_DISABLE_SYMLINKS_WARNING=1 \ + -w /sglang-checkout/python \ + ci_sglang python3 sglang/multimodal_gen/test/run_suite.py \ + --suite 2-gpu \ + --partition-id ${{ matrix.part }} \ + --total-partitions 2 + + # Post-test diagnostics + echo "=== Post-test System Memory Status ===" + free -h + + + stage-c-test-large-8-gpu-amd: + needs: [check-changes] + if: | + always() && + ( + (inputs.target_stage == 'stage-c-test-large-8-gpu-amd') || + ( + !inputs.target_stage && + (!failure() && !cancelled()) && + ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true')) + ) + ) + env: + RUNNER_LABELS: linux-mi325-gpu-8 + strategy: + fail-fast: false + matrix: + runner: [linux-mi325-gpu-8] + part: [0, 1, 2] + runs-on: ${{matrix.runner}} + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }} + + - name: Ensure VRAM is clear + run: bash scripts/ensure_vram_clear.sh rocm + + - name: Start CI container + run: bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720 + env: + GITHUB_WORKSPACE: ${{ github.workspace }} + + - 
name: Install dependencies + run: bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-aiter-build + + - name: Test RCCL multi-GPU communication + timeout-minutes: 5 + run: | + echo "Testing RCCL multi-GPU communication with debug info..." + docker exec ci_sglang bash -c "cd /sglang-checkout && NCCL_DEBUG=INFO RCCL_DEBUG=INFO torchrun --nproc_per_node=8 scripts/ci/amd/test_rccl_multi_gpu.py" + + - name: Run test + timeout-minutes: 60 + run: | + bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-c-test-large-8-gpu-amd --auto-partition-id ${{ matrix.part }} --auto-partition-size 3 --timeout-per-file 3600 --continue-on-error + + stage-c-test-large-8-gpu-amd-mi35x: + needs: [check-changes] + if: | + always() && + ( + (inputs.target_stage == 'stage-c-test-large-8-gpu-amd-mi35x') || + ( + !inputs.target_stage && + (!failure() && !cancelled()) && + ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true')) + ) + ) + strategy: + fail-fast: false + matrix: + runner: [linux-mi35x-gpu-8] + part: [0] + runs-on: ${{matrix.runner}} + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }} + + - name: Ensure VRAM is clear + run: bash scripts/ensure_vram_clear.sh rocm + + - name: Start CI container + run: bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720 + env: + GITHUB_WORKSPACE: ${{ github.workspace }} + + - name: Install dependencies + run: bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-aiter-build + + - name: Run test + timeout-minutes: 60 + run: | + bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-c-test-large-8-gpu-amd-mi35x --auto-partition-id ${{ matrix.part }} --auto-partition-size 1 --timeout-per-file 3600 --continue-on-error + + pr-test-amd-finish: + needs: + [ + call-gate, + check-changes, + + 
sgl-kernel-unit-test-amd, + sgl-kernel-unit-test-2-gpu-amd, + multimodal-gen-test-1-gpu-amd, + multimodal-gen-test-2-gpu-amd, + + stage-a-test-1-amd, + stage-b-test-small-1-gpu-amd, + stage-b-test-small-1-gpu-amd-mi35x, + stage-b-test-large-1-gpu-amd, + stage-b-test-large-2-gpu-amd, + stage-c-test-large-8-gpu-amd, + stage-c-test-large-8-gpu-amd-mi35x, + ] + if: always() + runs-on: ubuntu-latest + steps: + - name: Check all dependent job statuses + run: | + # Convert the 'needs' context to a JSON string + json_needs='${{ toJson(needs) }}' + + # Get a list of all job names from the JSON keys + job_names=$(echo "$json_needs" | jq -r 'keys_unsorted[]') + + for job in $job_names; do + # For each job, extract its result + result=$(echo "$json_needs" | jq -r --arg j "$job" '.[$j].result') + + # Print the job name and its result + echo "$job: $result" + + # Check for failure or cancellation and exit if found + if [[ "$result" == "failure" || "$result" == "cancelled" ]]; then + echo "The above jobs failed." 
+ exit 1 + fi + done + + # If the loop completes, all jobs were successful + echo "All jobs completed successfully" + exit 0 diff --git a/.github/workflows/pr-test-amd.yml b/.github/workflows/pr-test-amd.yml index 381cf7fecb30..26044c3a8786 100644 --- a/.github/workflows/pr-test-amd.yml +++ b/.github/workflows/pr-test-amd.yml @@ -396,7 +396,16 @@ jobs: multimodal-gen-test-1-gpu-amd: needs: [check-changes] - if: needs.check-changes.outputs.multimodal_gen == 'true' + if: | + always() && + ( + (inputs.target_stage == 'multimodal-gen-test-1-gpu-amd') || + ( + !inputs.target_stage && + (!failure() && !cancelled()) && + ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true')) + ) + ) strategy: fail-fast: false max-parallel: 1 # Run one at a time to avoid eviction from resource exhaustion during AITER kernel JIT @@ -516,7 +525,16 @@ jobs: multimodal-gen-test-2-gpu-amd: needs: [check-changes] - if: needs.check-changes.outputs.multimodal_gen == 'true' + if: | + always() && + ( + (inputs.target_stage == 'multimodal-gen-test-2-gpu-amd') || + ( + !inputs.target_stage && + (!failure() && !cancelled()) && + ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true')) + ) + ) strategy: fail-fast: false max-parallel: 1 # Run one at a time to avoid eviction from resource exhaustion during AITER kernel JIT diff --git a/.github/workflows/release-docker-amd-rocm720-nightly-preview.yml b/.github/workflows/release-docker-amd-rocm720-nightly-preview.yml new file mode 100644 index 000000000000..60aee17d163d --- /dev/null +++ b/.github/workflows/release-docker-amd-rocm720-nightly-preview.yml @@ -0,0 +1,82 @@ +name: Release Docker Images ROCm 7.2.0 Nightly Preview (AMD) +on: + workflow_dispatch: + schedule: + - cron: '0 13 * * *' + +concurrency: + # A PR number if a pull request and otherwise the commit hash. 
This cancels + # queued and in-progress runs for the same PR (presubmit) or commit + # (postsubmit). The workflow name is prepended to avoid conflicts between + # different workflows. + group: ${{ github.workflow }}-${{ github.event.number || github.sha }} + cancel-in-progress: True + +jobs: + publish: + if: github.repository == 'sgl-project/sglang' + runs-on: amd-docker-scale + environment: 'prod' + strategy: + fail-fast: false + matrix: + gpu_arch: ['gfx942-rocm720', 'gfx950-rocm720'] + build_type: ['all'] + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + fetch-depth: 0 # Required for git describe to find tags + + - name: "Set Date" + run: | + echo "DATE=$(date +%Y%m%d)" >> $GITHUB_ENV + + - name: Get version from latest tag + id: version + run: | + # Get the latest version tag sorted by version number (e.g., v0.5.7 -> 0.5.7) + VERSION=$(git tag -l 'v[0-9]*' --sort=-v:refname | head -1 | sed 's/^v//') + + if [ -z "$VERSION" ]; then + echo "::error::Could not determine version from git tags" + exit 1 + fi + + # Get short commit hash of current HEAD + COMMIT_HASH=$(git rev-parse --short HEAD) + + # Compose pretend version for setuptools_scm: e.g., 0.5.8.post1.dev20260211+g1a2b3c4 + PRETEND_VERSION="${VERSION}.dev${{ env.DATE }}+g${COMMIT_HASH}" + + echo "version=${VERSION}" >> $GITHUB_OUTPUT + echo "pretend_version=${PRETEND_VERSION}" >> $GITHUB_OUTPUT + echo "Detected version: ${VERSION}" + echo "Pretend version for pip: ${PRETEND_VERSION}" + + - name: Login to Docker Hub + uses: docker/login-action@v2 + with: + username: ${{ secrets.DOCKERHUB_AMD_USERNAME }} + password: ${{ secrets.DOCKERHUB_AMD_TOKEN }} + + - name: Build and Push + run: | + version=${{ steps.version.outputs.version }} + pretend_version=${{ steps.version.outputs.pretend_version }} + echo "Version: ${version}" + echo "Pretend version: ${pretend_version}" + + if [ "${{ matrix.gpu_arch }}" = "gfx942-rocm720" ]; then + rocm_tag="rocm720-mi30x" + elif [ "${{ matrix.gpu_arch 
}}" = "gfx950-rocm720" ]; then + rocm_tag="rocm720-mi35x" + else + echo "Unsupported gfx arch" + exit 1 + fi + + tag=v${version}-${rocm_tag} + + docker build . -f docker/rocm720.Dockerfile --build-arg BUILD_TYPE=${{ matrix.build_type }} --build-arg GPU_ARCH=${{ matrix.gpu_arch }} --build-arg ENABLE_MORI=1 --build-arg NIC_BACKEND=ainic --build-arg SETUPTOOLS_SCM_PRETEND_VERSION=${pretend_version} -t rocm/sgl-dev:${tag}-${{ env.DATE }}-preview --no-cache + docker push rocm/sgl-dev:${tag}-${{ env.DATE }}-preview diff --git a/docker/rocm720.Dockerfile b/docker/rocm720.Dockerfile new file mode 100644 index 000000000000..68aa18629723 --- /dev/null +++ b/docker/rocm720.Dockerfile @@ -0,0 +1,502 @@ +# Usage (to build SGLang ROCm docker image): +# docker build --build-arg SGL_BRANCH=v0.5.8.post1 --build-arg GPU_ARCH=gfx942 -t v0.5.8.post1-rocm700-mi30x -f rocm.Dockerfile . +# docker build --build-arg SGL_BRANCH=v0.5.8.post1 --build-arg GPU_ARCH=gfx942-rocm720 -t v0.5.8.post1-rocm720-mi30x-preview -f rocm720.Dockerfile . +# docker build --build-arg SGL_BRANCH=v0.5.8.post1 --build-arg GPU_ARCH=gfx950 -t v0.5.8.post1-rocm700-mi35x -f rocm.Dockerfile . +# docker build --build-arg SGL_BRANCH=v0.5.8.post1 --build-arg GPU_ARCH=gfx950-rocm720 -t v0.5.8.post1-rocm720-mi35x-preview -f rocm720.Dockerfile . + +# Usage (to build SGLang ROCm + Mori docker image): +# docker build --build-arg SGL_BRANCH=v0.5.8.post1 --build-arg GPU_ARCH=gfx942 --build-arg ENABLE_MORI=1 --build-arg NIC_BACKEND=ainic -t v0.5.8.post1-rocm700-mi30x -f rocm.Dockerfile . +# docker build --build-arg SGL_BRANCH=v0.5.8.post1 --build-arg GPU_ARCH=gfx950 --build-arg ENABLE_MORI=1 --build-arg NIC_BACKEND=ainic -t v0.5.8.post1-rocm700-mi35x -f rocm.Dockerfile . 
+ +# Default base images +ARG BASE_IMAGE_942="rocm/sgl-dev:rocm7-vllm-20250904" +ARG BASE_IMAGE_942_ROCM720="rocm/pytorch:rocm7.2_ubuntu22.04_py3.10_pytorch_release_2.9.1" +ARG BASE_IMAGE_950="rocm/sgl-dev:rocm7-vllm-20250904" +ARG BASE_IMAGE_950_ROCM720="rocm/pytorch:rocm7.2_ubuntu22.04_py3.10_pytorch_release_2.9.1" + +# This is necessary for scope purpose +ARG GPU_ARCH=gfx950 + +# =============================== +# Base image 942 with rocm700 and args +FROM $BASE_IMAGE_942 AS gfx942 +ENV BUILD_VLLM="0" +ENV BUILD_TRITON="0" +ENV BUILD_LLVM="0" +ENV BUILD_AITER_ALL="1" +ENV BUILD_MOONCAKE="1" +ENV AITER_COMMIT="v0.1.10.post2" + +# =============================== +# Base image 942 with rocm720 and args +FROM $BASE_IMAGE_942_ROCM720 AS gfx942-rocm720 +ENV BUILD_VLLM="0" +ENV BUILD_TRITON="1" +ENV BUILD_LLVM="0" +ENV BUILD_AITER_ALL="1" +ENV BUILD_MOONCAKE="1" +ENV AITER_COMMIT="v0.1.10.post2" + +# =============================== +# Base image 950 and args +FROM $BASE_IMAGE_950 AS gfx950 +ENV BUILD_VLLM="0" +ENV BUILD_TRITON="0" +ENV BUILD_LLVM="0" +ENV BUILD_AITER_ALL="0" +ENV BUILD_MOONCAKE="1" +ENV AITER_COMMIT="v0.1.10.post2" + +# =============================== +# Base image 950 with rocm720 and args +FROM $BASE_IMAGE_950_ROCM720 AS gfx950-rocm720 +ENV BUILD_VLLM="0" +ENV BUILD_TRITON="1" +ENV BUILD_LLVM="0" +ENV BUILD_AITER_ALL="1" +ENV BUILD_MOONCAKE="1" +ENV AITER_COMMIT="v0.1.10.post2" + +# =============================== +# Chosen arch and args +FROM ${GPU_ARCH} + +# This is necessary for scope purpose, again +ARG GPU_ARCH=gfx950 +ENV GPU_ARCH_LIST=${GPU_ARCH%-*} + +ARG SGL_REPO="https://github.com/sgl-project/sglang.git" +ARG SGL_BRANCH="main" + +# Version override for setuptools_scm (used in nightly builds) +ARG SETUPTOOLS_SCM_PRETEND_VERSION="" + +ARG TRITON_REPO="https://github.com/triton-lang/triton.git" +ARG TRITON_COMMIT="42270451990532c67e69d753fbd026f28fcc4840" + +ARG AITER_REPO="https://github.com/ROCm/aiter.git" + +ARG 
LLVM_REPO="https://github.com/jrbyrnes/llvm-project.git" +ARG LLVM_BRANCH="MainOpSelV2" +ARG LLVM_COMMIT="6520ace8227ffe2728148d5f3b9872a870b0a560" + +ARG MOONCAKE_REPO="https://github.com/kvcache-ai/Mooncake.git" +ARG MOONCAKE_COMMIT="b6a841dc78c707ec655a563453277d969fb8f38d" + +ARG TILELANG_REPO="https://github.com/tile-ai/tilelang.git" +ARG TILELANG_COMMIT="ebf4a7cb8881432165ae8760e99d209d905c704a" + +ARG FHT_REPO="https://github.com/jeffdaily/fast-hadamard-transform.git" +ARG FHT_BRANCH="rocm" +ARG FHT_COMMIT="46efb7d776d38638fc39f3c803eaee3dd7016bd1" + +ARG ENABLE_MORI=0 +ARG NIC_BACKEND=none + +ARG MORI_REPO="https://github.com/ROCm/mori.git" +ARG MORI_COMMIT="b0dce4beebeb1f26c784eee17d5fd9785ee9447f" + +# AMD AINIC apt repo settings +ARG AINIC_VERSION=1.117.5 +ARG UBUNTU_CODENAME=jammy +USER root + +# Install some basic utilities +RUN python -m pip install --upgrade pip && pip install setuptools_scm +RUN apt-get purge -y sccache; python -m pip uninstall -y sccache; rm -f "$(which sccache)" + +WORKDIR /sgl-workspace + +# ----------------------- +# llvm +RUN if [ "$BUILD_LLVM" = "1" ]; then \ + ENV HIP_CLANG_PATH="/sgl-workspace/llvm-project/build/bin/" \ + git clone --single-branch ${LLVM_REPO} -b ${LLVM_BRANCH} \ + && cd llvm-project \ + && git checkout ${LLVM_COMMIT} \ + && mkdir build \ + && cd build \ + && cmake -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=1 -DLLVM_TARGETS_TO_BUILD="AMDGPU;X86" -DLLVM_ENABLE_PROJECTS="clang;lld;" -DLLVM_ENABLE_RUNTIMES="compiler-rt" ../llvm \ + && make -j$(nproc); \ + fi + +# ----------------------- +# AITER +# Unset setuptools_scm override so AITER gets its own version (AITER_COMMIT), not SGLang's +# (SETUPTOOLS_SCM_PRETEND_VERSION is set later for SGLang nightly builds and would otherwise +# leak into AITER's version when AITER uses setuptools_scm) +ENV SETUPTOOLS_SCM_PRETEND_VERSION= +RUN pip uninstall -y aiter \ + && pip install psutil pybind11 # Required by AITER setup.py +RUN git clone ${AITER_REPO} \ + && cd 
aiter \ + && git checkout ${AITER_COMMIT} \ + && git submodule update --init --recursive + +# Hot patches for AITER in v0.1.10.post1 +# This is for ROCm 7.2 only, because of the image rebase from vllm +# to rocm/pytorch. +RUN set -eux; \ + case "${GPU_ARCH}" in \ + *rocm720*) \ + echo "ROCm 7.2 flavor detected from GPU_ARCH=${GPU_ARCH}"; \ + cd aiter \ + && sed -i '459 s/if.*:/if False:/' aiter/ops/triton/attention/pa_mqa_logits.py; \ + ;; \ + *) \ + echo "Not rocm720 (GPU_ARCH=${GPU_ARCH}), skip patch"; \ + ;; \ + esac + +RUN cd aiter \ + && echo "[AITER] GPU_ARCH=${GPU_ARCH}" \ + && if [ "$BUILD_AITER_ALL" = "1" ] && [ "$BUILD_LLVM" = "1" ]; then \ + sh -c "HIP_CLANG_PATH=/sgl-workspace/llvm-project/build/bin/ PREBUILD_KERNELS=1 GPU_ARCHS=$GPU_ARCH_LIST python setup.py develop"; \ + elif [ "$BUILD_AITER_ALL" = "1" ]; then \ + sh -c "PREBUILD_KERNELS=1 GPU_ARCHS=$GPU_ARCH_LIST python setup.py develop"; \ + else \ + sh -c "GPU_ARCHS=$GPU_ARCH_LIST python setup.py develop"; \ + fi + +# ----------------------- +# Build vLLM +ARG VLLM_REPO="https://github.com/ROCm/vllm.git" +ARG VLLM_BRANCH="9f6b92db47c3444b7a7d67451ba0c3a2d6af4c2c" +RUN if [ "$BUILD_VLLM" = "1" ]; then \ + git clone ${VLLM_REPO} \ + && cd vllm \ + && git checkout ${VLLM_BRANCH} \ + && python -m pip install -r requirements/rocm.txt \ + && python setup.py clean --all \ + && python setup.py develop; \ + fi + +# ----------------------- +# Build Mooncake +ENV PATH=$PATH:/usr/local/go/bin + +RUN if [ "$BUILD_MOONCAKE" = "1" ]; then \ + apt update && apt install -y zip unzip wget && \ + apt install -y gcc make libtool autoconf librdmacm-dev rdmacm-utils infiniband-diags ibverbs-utils perftest ethtool libibverbs-dev rdma-core && \ + apt install -y openssh-server openmpi-bin openmpi-common libopenmpi-dev && \ + git clone ${MOONCAKE_REPO} && \ + cd Mooncake && \ + git checkout ${MOONCAKE_COMMIT} && \ + git submodule update --init --recursive && \ + bash dependencies.sh -y && \ + rm -rf /usr/local/go && \ + 
wget https://go.dev/dl/go1.22.2.linux-amd64.tar.gz && \ + tar -C /usr/local -xzf go1.22.2.linux-amd64.tar.gz && \ + rm go1.22.2.linux-amd64.tar.gz && \ + mkdir -p build && \ + cd build && \ + cmake .. -DUSE_HIP=ON -DUSE_ETCD=ON && \ + make -j "$(nproc)" && make install; \ + fi + +# ----------------------- +# Build SGLang +ARG BUILD_TYPE=all + +# Set version for setuptools_scm if provided (for nightly builds). Only pass in the SGLang +# pip install RUN so it does not affect AITER, sgl-model-gateway, TileLang, FHT, MORI, etc. +ARG SETUPTOOLS_SCM_PRETEND_VERSION + +RUN pip install IPython \ + && pip install orjson \ + && pip install python-multipart \ + && pip install torchao==0.9.0 \ + && pip install pybind11 + +RUN pip uninstall -y sgl_kernel sglang +RUN git clone ${SGL_REPO} \ + && cd sglang \ + && echo "Using ${SGL_BRANCH} branch." \ + && git checkout ${SGL_BRANCH} \ + && cd sgl-kernel \ + && rm -f pyproject.toml \ + && mv pyproject_rocm.toml pyproject.toml \ + && AMDGPU_TARGET=$GPU_ARCH_LIST python setup_rocm.py install \ + && cd .. \ + && rm -rf python/pyproject.toml && mv python/pyproject_other.toml python/pyproject.toml \ + && if [ "$BUILD_TYPE" = "srt" ]; then \ + export SETUPTOOLS_SCM_PRETEND_VERSION="${SETUPTOOLS_SCM_PRETEND_VERSION}" && python -m pip --no-cache-dir install -e "python[srt_hip,diffusion_hip]"; \ + else \ + export SETUPTOOLS_SCM_PRETEND_VERSION="${SETUPTOOLS_SCM_PRETEND_VERSION}" && python -m pip --no-cache-dir install -e "python[all_hip]"; \ + fi + +RUN python -m pip cache purge + +# Copy config files to support MI300X in virtualized environments (MI300X_VF). Symlinks will not be created in image build. 
+RUN find /sgl-workspace/sglang/python/sglang/srt/layers/quantization/configs/ \ + /sgl-workspace/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/ \ + -type f -name '*MI300X*' | xargs -I {} sh -c 'vf_config=$(echo "$1" | sed "s/MI300X/MI300X_VF/"); cp "$1" "$vf_config"' -- {} + +# Install Rust toolchain for sgl-model-gateway +ENV PATH="/root/.cargo/bin:${PATH}" +RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y \ + && rustc --version && cargo --version +ENV CARGO_BUILD_JOBS=4 + +# Build and install sgl-model-gateway +RUN python3 -m pip install --no-cache-dir setuptools-rust \ + && cd /sgl-workspace/sglang/sgl-model-gateway/bindings/python \ + && /bin/bash -lc 'ulimit -n 8192 && cargo build --release' \ + && python3 -m pip install --no-cache-dir . \ + && rm -rf /root/.cache + +# ----------------------- +# TileLang +ENV DEBIAN_FRONTEND=noninteractive +ENV LIBGL_ALWAYS_INDIRECT=1 +RUN echo "LC_ALL=en_US.UTF-8" >> /etc/environment + +RUN /bin/bash -lc 'set -euo pipefail; \ + echo "[TileLang] Building TileLang for ${GPU_ARCH}"; \ + # System dependencies (NO llvm-dev to avoid llvm-config-16 shadowing) + apt-get update && apt-get install -y --no-install-recommends \ + build-essential git wget curl ca-certificates gnupg \ + libgtest-dev libgmock-dev \ + libprotobuf-dev protobuf-compiler libgflags-dev libsqlite3-dev \ + python3 python3-dev python3-setuptools python3-pip python3-apt \ + gcc libtinfo-dev zlib1g-dev libedit-dev libxml2-dev \ + cmake ninja-build pkg-config libstdc++6 software-properties-common \ + && rm -rf /var/lib/apt/lists/*; \ + \ + # Prefer the container venv + VENV_PY="/opt/venv/bin/python"; \ + VENV_PIP="/opt/venv/bin/pip"; \ + if [ ! -x "$VENV_PY" ]; then VENV_PY="python3"; fi; \ + if [ ! 
-x "$VENV_PIP" ]; then VENV_PIP="pip3"; fi; \ + \ + # Build GoogleTest static libs (Ubuntu package ships sources only) + cmake -S /usr/src/googletest -B /tmp/build-gtest -DBUILD_GTEST=ON -DBUILD_GMOCK=ON -DCMAKE_BUILD_TYPE=Release && \ + cmake --build /tmp/build-gtest -j"$(nproc)" && \ + cp -v /tmp/build-gtest/lib/*.a /usr/lib/x86_64-linux-gnu/ && \ + rm -rf /tmp/build-gtest; \ + \ + # Keep setuptools < 80 (compat with base image) + "$VENV_PIP" install --upgrade "setuptools>=77.0.3,<80" wheel cmake ninja scikit-build-core && \ + "$VENV_PIP" cache purge || true; \ + \ + # Locate ROCm llvm-config; fallback to installing LLVM 18 if missing + LLVM_CONFIG_PATH=""; \ + for p in /opt/rocm/llvm/bin/llvm-config /opt/rocm/llvm-*/bin/llvm-config /opt/rocm-*/llvm*/bin/llvm-config; do \ + if [ -x "$p" ]; then LLVM_CONFIG_PATH="$p"; break; fi; \ + done; \ + if [ -z "$LLVM_CONFIG_PATH" ]; then \ + echo "[TileLang] ROCm llvm-config not found; installing LLVM 18..."; \ + curl -fsSL https://apt.llvm.org/llvm-snapshot.gpg.key | gpg --dearmor -o /etc/apt/keyrings/llvm.gpg; \ + echo "deb [signed-by=/etc/apt/keyrings/llvm.gpg] http://apt.llvm.org/jammy/ llvm-toolchain-jammy-18 main" > /etc/apt/sources.list.d/llvm.list; \ + apt-get update; \ + apt-get install -y --no-install-recommends llvm-18; \ + rm -rf /var/lib/apt/lists/*; \ + LLVM_CONFIG_PATH="$(command -v llvm-config-18)"; \ + if [ -z "$LLVM_CONFIG_PATH" ]; then echo "ERROR: llvm-config-18 not found after install"; exit 1; fi; \ + fi; \ + echo "[TileLang] Using LLVM_CONFIG at: $LLVM_CONFIG_PATH"; \ + export PATH="$(dirname "$LLVM_CONFIG_PATH"):/usr/local/bin:${PATH}"; \ + export LLVM_CONFIG="$LLVM_CONFIG_PATH"; \ + \ + # Optional shim for tools that expect llvm-config-16 + mkdir -p /usr/local/bin && \ + printf "#!/usr/bin/env bash\nexec \"%s\" \"\$@\"\n" "$LLVM_CONFIG_PATH" > /usr/local/bin/llvm-config-16 && \ + chmod +x /usr/local/bin/llvm-config-16; \ + \ + # TVM Python bits need Cython + z3 before configure. 
+ # Pin z3-solver==4.15.4.0: 4.15.4.0 has a manylinux wheel; 4.15.5.0 has no wheel and builds from source (fails: C++20 needs GCC 14+, image has GCC 11). + "$VENV_PIP" install --no-cache-dir "cython>=0.29.36,<3.0" "apache-tvm-ffi>=0.1.6" "z3-solver==4.15.4.0"; \ + \ + # Clone + pin TileLang (bundled TVM), then build + git clone --recursive "${TILELANG_REPO}" /opt/tilelang && \ + cd /opt/tilelang && \ + git fetch --depth=1 origin "${TILELANG_COMMIT}" || true && \ + git checkout -f "${TILELANG_COMMIT}" && \ + git submodule update --init --recursive && \ + export CMAKE_ARGS="-DUSE_CUDA=OFF -DUSE_ROCM=ON -DROCM_PATH=/opt/rocm -DLLVM_CONFIG=${LLVM_CONFIG} -DSKBUILD_SABI_VERSION= ${CMAKE_ARGS:-}" && \ + "$VENV_PIP" install -e . -v --no-build-isolation --no-deps; \ + if [ -f pyproject.toml ]; then sed -i "/^[[:space:]]*\"torch/d" pyproject.toml || true; fi; \ + "$VENV_PIP" cache purge || true; \ + "$VENV_PY" -c "import tilelang; print(tilelang.__version__)"' + +# ----------------------- +# Hadamard-transform (HIP build) +RUN /bin/bash -lc 'set -euo pipefail; \ + git clone --branch "${FHT_BRANCH}" "${FHT_REPO}" fast-hadamard-transform; \ + cd fast-hadamard-transform; \ + git checkout -f "${FHT_COMMIT}"; \ + python setup.py install' + +# ----------------------- +# Python tools +RUN python3 -m pip install --no-cache-dir \ + py-spy \ + pre-commit \ + tabulate + +# ----------------------- +# MORI (optional) +ENV PYTORCH_ROCM_ARCH=gfx942;gfx950 +RUN /bin/bash -lc 'set -euo pipefail; \ + if [ "${ENABLE_MORI}" != "1" ]; then \ + echo "[MORI] Skipping (ENABLE_MORI=${ENABLE_MORI})"; \ + exit 0; \ + fi; \ + echo "[MORI] Enabling MORI (NIC_BACKEND=${NIC_BACKEND})"; \ + \ + # Base deps for MORI build + apt-get update && apt-get install -y --no-install-recommends \ + build-essential \ + g++ \ + jq \ + libopenmpi-dev \ + libpci-dev \ + initramfs-tools \ + && rm -rf /var/lib/apt/lists/*; \ + \ + # NIC backend deps + case "${NIC_BACKEND}" in \ + # default: mlx5 + none) \ + export 
USE_IONIC="OFF"; \ + export USE_BNXT="OFF"; \ + ;; \ + # AMD NIC + ainic) \ + export USE_IONIC="ON"; \ + export USE_BNXT="OFF"; \ + apt-get update && apt-get install -y --no-install-recommends ca-certificates curl gnupg apt-transport-https && \ + rm -rf /var/lib/apt/lists/* && mkdir -p /etc/apt/keyrings; \ + curl -fsSL https://repo.radeon.com/rocm/rocm.gpg.key | gpg --dearmor > /etc/apt/keyrings/amdainic.gpg; \ + echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/amdainic.gpg] https://repo.radeon.com/amdainic/pensando/ubuntu/${AINIC_VERSION} ${UBUNTU_CODENAME} main" \ + > /etc/apt/sources.list.d/amdainic.list; \ + apt-get update && apt-get install -y --no-install-recommends \ + libionic-dev \ + ionic-common \ + ; \ + rm -rf /var/lib/apt/lists/*; \ + ;; \ + # TODO: Add Broadcom bnxt packages/repos here later. + # bnxt) \ + # export USE_IONIC="OFF"; \ + # export USE_BNXT="ON"; \ + # echo "[MORI] NIC_BACKEND=bnxt: USE_BNXT=ON. Add Broadcom bnxt packages/repos here later."; \ + # ;; \ + *) \ + echo "ERROR: unknown NIC_BACKEND=${NIC_BACKEND}. Use one of: none, ainic"; \ + exit 2; \ + ;; \ + esac; \ + \ + # Build/install MORI + export MORI_GPU_ARCHS="${GPU_ARCH_LIST}"; \ + echo "[MORI] MORI_GPU_ARCHS=${MORI_GPU_ARCHS} USE_IONIC=${USE_IONIC} USE_BNXT=${USE_BNXT}"; \ + rm -rf /sgl-workspace/mori; \ + git clone "${MORI_REPO}" /sgl-workspace/mori; \ + cd /sgl-workspace/mori; \ + git checkout "${MORI_COMMIT}"; \ + git submodule update --init --recursive; \ + python3 setup.py develop; \ + python3 -c "import os, torch; print(os.path.join(os.path.dirname(torch.__file__), \"lib\"))" > /etc/ld.so.conf.d/torch.conf; \ + ldconfig; \ + echo "export PYTHONPATH=/sgl-workspace/mori:\${PYTHONPATH}" >> /etc/bash.bashrc; \ + echo "[MORI] Done."' + +# ----------------------- +# Hot patch: torch-ROCm +# The artifact hardcoded the supported triton version to be 3.5.1. +# Rewrite the restriction directly. 
+ARG TORCH_ROCM_FILE="torch-2.9.1+rocm7.2.0.lw.git7e1940d4-cp310-cp310-linux_x86_64.whl"
+RUN mkdir /tmp/whl && cd /tmp/whl \
+    && export TORCH_ROCM_FILE="${TORCH_ROCM_FILE}" \
+    && python - <<'PY'
+import zipfile, csv, os, re
+from pathlib import Path
+
+fname = os.environ["TORCH_ROCM_FILE"]
+in_whl = Path("/") / fname
+out_whl = Path("/tmp") / fname
+work = Path("/tmp/whl")
+
+# 1) Extract
+with zipfile.ZipFile(in_whl, "r") as z:
+    z.extractall(work)
+
+# 2) Locate dist-info and patch METADATA (edit this logic to match your exact line)
+dist_info = next(work.glob("*.dist-info"))
+meta = dist_info / "METADATA"
+txt = meta.read_text(encoding="utf-8")
+
+# Relax the pinned triton requirement. The replacement keeps the full
+# "Requires-Dist: " prefix so the metadata line stays well-formed.
+pat = r'^Requires-Dist:\s*triton==3\.5\.1[^\s]*;'
+txt2, n = re.subn(pat, 'Requires-Dist: triton>=3.5.1;', txt, flags=re.MULTILINE)
+if txt2 == txt:
+    raise SystemExit("Did not find expected Requires-Dist line to replace in METADATA")
+meta.write_text(txt2, encoding="utf-8")
+
+# 3) Hacky step: blank hash/size columns in RECORD
+record = dist_info / "RECORD"
+rows = []
+with record.open(newline="", encoding="utf-8") as f:
+    for r in csv.reader(f):
+        if not r:
+            continue
+        # keep filename, blank out hash and size
+        rows.append([r[0], "", ""])
+with record.open("w", newline="", encoding="utf-8") as f:
+    csv.writer(f).writerows(rows)
+
+# 4) Re-zip as a wheel
+with zipfile.ZipFile(out_whl, "w", compression=zipfile.ZIP_DEFLATED) as z:
+    for p in work.rglob("*"):
+        if p.is_file():
+            z.write(p, p.relative_to(work).as_posix())
+
+print("Wrote", out_whl)
+PY
+
+RUN python3 -m pip install --force --no-deps /tmp/${TORCH_ROCM_FILE} \
+    && rm -fr /tmp/whl /tmp/${TORCH_ROCM_FILE}
+
+# -----------------------
+# Hot patch: Triton
+# For ROCm 7.2, this custom build breaks pip dependency management,
+# so future `pip install` will break the ROCm stack.
+# A workaround for this is to reinstall the default triton +# wheel with the `rocm/pytorch` image in the root directory. +RUN if [ "$BUILD_TRITON" = "1" ]; then \ + pip uninstall -y triton \ + && apt install -y cmake \ + && git clone ${TRITON_REPO} triton-custom \ + && cd triton-custom \ + && git checkout ${TRITON_COMMIT} \ + && pip install -r python/requirements.txt \ + && pip install -e .; \ + fi + +# ----------------------- +# Performance environment variable. + +# Skip CuDNN compatibility check - not applicable for ROCm (uses MIOpen instead) +ENV SGLANG_DISABLE_CUDNN_CHECK=1 +ENV HIP_FORCE_DEV_KERNARG=1 +ENV HSA_NO_SCRATCH_RECLAIM=1 +ENV SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN=1 +ENV SGLANG_INT4_WEIGHT=0 +ENV SGLANG_MOE_PADDING=1 +ENV SGLANG_ROCM_DISABLE_LINEARQUANT=0 +ENV SGLANG_ROCM_FUSED_DECODE_MLA=1 +ENV SGLANG_SET_CPU_AFFINITY=1 +ENV SGLANG_USE_AITER=1 +ENV SGLANG_USE_ROCM700A=1 + +ENV NCCL_MIN_NCHANNELS=112 +ENV VLLM_FP8_PADDING=1 +ENV VLLM_FP8_ACT_PADDING=1 +ENV VLLM_FP8_WEIGHT_PADDING=1 +ENV VLLM_FP8_REDUCE_CONV=1 +ENV TORCHINDUCTOR_MAX_AUTOTUNE=1 +ENV TORCHINDUCTOR_MAX_AUTOTUNE_POINTWISE=1 + +CMD ["/bin/bash"] diff --git a/python/sglang/srt/layers/layernorm.py b/python/sglang/srt/layers/layernorm.py index 3e4fb829762c..de2459a52e15 100644 --- a/python/sglang/srt/layers/layernorm.py +++ b/python/sglang/srt/layers/layernorm.py @@ -64,11 +64,20 @@ gemma_rmsnorm, rmsnorm, ) +_has_vllm_rms_norm = False if _use_aiter: from aiter import rmsnorm2d_fwd as rms_norm from aiter import rmsnorm2d_fwd_with_add as fused_add_rms_norm + + _has_vllm_rms_norm = True # aiter provides the rms_norm functions elif _is_hip: - from vllm._custom_ops import fused_add_rms_norm, rms_norm + try: + from vllm._custom_ops import fused_add_rms_norm, rms_norm + + _has_vllm_rms_norm = True + except ImportError: + # Fallback: vllm not available, will use forward_native + _has_vllm_rms_norm = False logger = logging.getLogger(__name__) @@ -181,6 +190,10 @@ def forward_hip( residual: 
Optional[torch.Tensor] = None, post_residual_addition: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: + # Fallback to native implementation if vllm is not available + if not _has_vllm_rms_norm: + return self.forward_native(x, residual, post_residual_addition) + if not x.is_contiguous(): # NOTE: Remove this if aiter kernel supports discontinuous input x = x.contiguous() diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py b/python/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py index 17168d414d08..ebdbb42c64f5 100644 --- a/python/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +++ b/python/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py @@ -57,11 +57,22 @@ from aiter import moe_sum except ImportError: raise ImportError("aiter is required when SGLANG_USE_AITER is set to True") - else: - from vllm import _custom_ops as vllm_ops + # Note: vllm_ops is not needed for HIP when _use_aiter=False + # because the code uses moe_sum_reduce_triton as fallback (line 619) elif _is_xpu: from sgl_kernel import moe_sum_reduce, silu_and_mul +# Try to import vllm_ops for non-CUDA/HIP/XPU platforms +_has_vllm_ops = False +if not _is_cuda and not _is_hip and not _is_xpu: + try: + from vllm import _custom_ops as vllm_ops + + _has_vllm_ops = True + except ImportError: + # Fallback: vllm not available, will use native PyTorch implementations + _has_vllm_ops = False + padding_size = 128 if bool(int(os.getenv("SGLANG_MOE_PADDING", "0"))) else 0 @@ -513,9 +524,15 @@ def fused_experts_impl( activation, ) else: - vllm_ops.silu_and_mul( - intermediate_cache2, intermediate_cache1.view(-1, N) - ) + if _has_vllm_ops: + vllm_ops.silu_and_mul( + intermediate_cache2, intermediate_cache1.view(-1, N) + ) + else: + # Fallback: native PyTorch silu_and_mul + x = intermediate_cache1.view(-1, N) + d = x.shape[-1] // 2 + intermediate_cache2.copy_(F.silu(x[..., :d]) * x[..., d:]) elif activation == "gelu" and is_gated: assert 
gemm1_alpha is None, "gemm1_alpha is not supported for gelu" assert gemm1_limit is None, "gemm1_limit is not supported for gelu" @@ -533,9 +550,15 @@ def fused_experts_impl( activation, ) else: - vllm_ops.gelu_and_mul( - intermediate_cache2, intermediate_cache1.view(-1, N) - ) + if _has_vllm_ops: + vllm_ops.gelu_and_mul( + intermediate_cache2, intermediate_cache1.view(-1, N) + ) + else: + # Fallback: native PyTorch gelu_and_mul + x = intermediate_cache1.view(-1, N) + d = x.shape[-1] // 2 + intermediate_cache2.copy_(F.gelu(x[..., :d]) * x[..., d:]) # Activation function without multiplication elif activation == "silu" and not is_gated: intermediate_cache2 = F.silu(intermediate_cache1.view(-1, N)) @@ -634,10 +657,18 @@ def fused_experts_impl( routed_scaling_factor, ) else: - vllm_ops.moe_sum( - intermediate_cache3.view(*intermediate_cache3.shape), - out_hidden_states[begin_chunk_idx:end_chunk_idx], - ) + if _has_vllm_ops: + vllm_ops.moe_sum( + intermediate_cache3.view(*intermediate_cache3.shape), + out_hidden_states[begin_chunk_idx:end_chunk_idx], + ) + else: + # Fallback: use triton moe_sum_reduce when vllm is not available + moe_sum_reduce_triton( + intermediate_cache3.view(*intermediate_cache3.shape), + out_hidden_states[begin_chunk_idx:end_chunk_idx], + routed_scaling_factor, + ) return out_hidden_states diff --git a/python/sglang/srt/layers/moe/moe_runner/triton.py b/python/sglang/srt/layers/moe/moe_runner/triton.py index 583b23adacd0..be40253b3ef8 100644 --- a/python/sglang/srt/layers/moe/moe_runner/triton.py +++ b/python/sglang/srt/layers/moe/moe_runner/triton.py @@ -41,6 +41,7 @@ from sgl_kernel import gelu_and_mul, silu_and_mul if _is_hip: + _has_vllm = False if _use_aiter: try: from aiter import moe_sum @@ -49,7 +50,13 @@ "aiter is required when SGLANG_USE_AITER is set to True" ) else: - from vllm import _custom_ops as vllm_ops # moe_sum + try: + from vllm import _custom_ops as vllm_ops # moe_sum + + _has_vllm = True + except ImportError: + # Fallback: vllm 
not available, will use triton moe_sum + _has_vllm = False elif _is_cpu and _is_cpu_amx_available: pass elif _is_xpu: @@ -314,11 +321,18 @@ def run( intermediate_cache3.view(*intermediate_cache3.shape), out_hidden_states, ) - else: + elif _has_vllm: vllm_ops.moe_sum( intermediate_cache3.view(*intermediate_cache3.shape), out_hidden_states, ) + else: + # Fallback: use triton moe_sum when vllm is not available + moe_sum_reduce_triton( + intermediate_cache3.view(*intermediate_cache3.shape), + out_hidden_states, + routed_scaling_factor, + ) elif _is_xpu: moe_sum_reduce( intermediate_cache3.view(*intermediate_cache3.shape), diff --git a/python/sglang/srt/layers/quantization/fp8_kernel.py b/python/sglang/srt/layers/quantization/fp8_kernel.py index 2022c3e8b9e3..7558b950ad58 100644 --- a/python/sglang/srt/layers/quantization/fp8_kernel.py +++ b/python/sglang/srt/layers/quantization/fp8_kernel.py @@ -64,6 +64,7 @@ enable_sgl_per_token_group_quant_8bit = False if _is_hip: + _has_vllm = False if _use_aiter: try: from aiter import ( # v0.1.3 @@ -76,8 +77,11 @@ else: try: import vllm._C # noqa: F401 + + _has_vllm = True except ImportError: - raise ImportError("vllm is required when SGLANG_USE_AITER is set to False") + # Fallback: vllm not available, will use native PyTorch implementation + _has_vllm = False logger = logging.getLogger(__name__) @@ -1537,6 +1541,37 @@ def per_token_group_quant_mla_deep_gemm_masked_fp8( """ if _is_hip: + def _native_dynamic_per_token_quant_fp8(output, input, scale): + """Native PyTorch fallback for dynamic per-token FP8 quantization when vLLM is unavailable.""" + M, N = input.shape + eps = 1e-12 + # Compute per-token scale + absmax = input.abs().max(dim=1, keepdim=True).values + absmax = torch.clamp(absmax, min=eps) + scale_val = absmax / fp8_max + scale.copy_(scale_val) + # Quantize + output_data = torch.clamp(input / scale_val, fp8_min, fp8_max).to(fp8_dtype) + output.copy_(output_data) + + def _native_dynamic_per_tensor_quant_fp8(output, input, 
scale): + """Native PyTorch fallback for dynamic per-tensor FP8 quantization when vLLM is unavailable.""" + eps = 1e-12 + absmax = input.abs().max() + absmax = torch.clamp(absmax, min=eps) + scale_val = absmax / fp8_max + # Use copy_ instead of fill_ with .item() to avoid CPU-GPU sync + scale.view(-1).copy_(scale_val.view(-1)) + # Quantize + output_data = torch.clamp(input / scale_val, fp8_min, fp8_max).to(fp8_dtype) + output.copy_(output_data) + + def _native_static_quant_fp8(output, input, scale): + """Native PyTorch fallback for static FP8 quantization when vLLM is unavailable.""" + # Use tensor directly instead of .item() to avoid CPU-GPU sync + output_data = torch.clamp(input / scale, fp8_min, fp8_max).to(fp8_dtype) + output.copy_(output_data) + def scaled_fp8_quant( input: torch.Tensor, scale: Optional[torch.Tensor] = None, @@ -1557,16 +1592,20 @@ def scaled_fp8_quant( ) if _use_aiter: dynamic_per_token_scaled_quant(output, input, scale) - else: + elif _has_vllm: torch.ops._C.dynamic_per_token_scaled_fp8_quant( output, input.contiguous(), scale, None ) + else: + _native_dynamic_per_token_quant_fp8(output, input, scale) else: scale = torch.zeros(1, device=input.device, dtype=torch.float32) if _use_aiter: dynamic_per_tensor_quant(output, input, scale) - else: + elif _has_vllm: torch.ops._C.dynamic_scaled_fp8_quant(output, input, scale) + else: + _native_dynamic_per_tensor_quant_fp8(output, input, scale) else: # Static scaling assert ( @@ -1574,8 +1613,10 @@ def scaled_fp8_quant( ), f"Expected scalar scale, got numel={scale.numel()}" if _use_aiter: static_per_tensor_quant(output, input, scale) - else: + elif _has_vllm: torch.ops._C.static_scaled_fp8_quant(output, input, scale) + else: + _native_static_quant_fp8(output, input, scale) return output, scale diff --git a/python/sglang/srt/layers/quantization/unquant.py b/python/sglang/srt/layers/quantization/unquant.py index 898d0c4b051b..c77e7d21f3da 100644 --- a/python/sglang/srt/layers/quantization/unquant.py +++ 
b/python/sglang/srt/layers/quantization/unquant.py @@ -224,7 +224,10 @@ def create_weights( set_weight_attrs(w2_weight_bias, extra_weight_attrs) def process_weights_after_loading(self, layer: torch.nn.Module) -> None: - if _use_aiter: + # Skip aiter weight shuffle when using non-auto MoE backend (e.g., triton, triton_kernels) + # because aiter CK kernels don't support all GEMM dimensions + _should_use_aiter_moe = _use_aiter and get_moe_runner_backend().is_auto() + if _should_use_aiter_moe: layer.w13_weight = torch.nn.Parameter( shuffle_weight(layer.w13_weight.data, (16, 16)), requires_grad=False, @@ -383,7 +386,10 @@ def forward_cuda( )[0] return StandardCombineInput(hidden_states=output) else: - if _use_aiter: + # Skip aiter fused_moe when using non-auto MoE backend (e.g., triton, triton_kernels) + # because aiter CK kernels don't support all GEMM dimensions + _should_use_aiter_moe = _use_aiter and get_moe_runner_backend().is_auto() + if _should_use_aiter_moe: assert not moe_runner_config.no_combine, "unsupported" topk_weights, topk_ids, _ = topk_output if moe_runner_config.apply_router_weight_on_input: diff --git a/python/sglang/srt/models/deepseek_janus_pro.py b/python/sglang/srt/models/deepseek_janus_pro.py index 2167c482478e..d8298e61f7aa 100644 --- a/python/sglang/srt/models/deepseek_janus_pro.py +++ b/python/sglang/srt/models/deepseek_janus_pro.py @@ -1955,7 +1955,7 @@ def __init__( self.language_model = LlamaForCausalLM( language_config, quant_config=quant_config ) - self.logits_processor = LogitsProcessor(config) + self.logits_processor = LogitsProcessor(language_config) def get_image_feature(self, items: List[MultimodalDataItem]) -> torch.Tensor: pixel_values = torch.concat([item.feature for item in items], dim=0) diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py index 9e9a2c6263c1..8bfe759b894e 100644 --- a/python/sglang/srt/server_args.py +++ b/python/sglang/srt/server_args.py @@ -1374,6 +1374,13 @@ def 
_handle_model_specific_adjustments(self): logger.warning( "Detected ROCm and MXFP4 quantization format for GPT-OSS model, enabling aiter MXFP4 MOE kernel." ) + elif is_hip() and get_bool_env_var("SGLANG_USE_AITER"): + # For GPT-OSS bf16 on ROCm with aiter, use triton backend + # because aiter CK kernel doesn't support all GEMM dimensions + self.moe_runner_backend = "triton" + logger.warning( + "Detected ROCm with SGLANG_USE_AITER for GPT-OSS bf16 model, using triton MOE kernel." + ) elif self.ep_size == 1 and is_triton_kernels_available(): self.moe_runner_backend = "triton_kernel" logger.warning( diff --git a/python/sglang/test/gpt_oss_common.py b/python/sglang/test/gpt_oss_common.py index 68402b5e0f7d..3f9c6bc974a8 100644 --- a/python/sglang/test/gpt_oss_common.py +++ b/python/sglang/test/gpt_oss_common.py @@ -41,7 +41,8 @@ def run_test( if model_variant == "20b": other_args += ["--cuda-graph-max-bs", "600"] - if _is_hip: + # Respect SGLANG_USE_AITER if already set, otherwise default to "0" for HIP + if _is_hip and "SGLANG_USE_AITER" not in os.environ: os.environ["SGLANG_USE_AITER"] = "0" self._run_test_raw( model=model, diff --git a/python/sglang/test/nightly_utils.py b/python/sglang/test/nightly_utils.py index e264c7c21efe..d45de1b69951 100644 --- a/python/sglang/test/nightly_utils.py +++ b/python/sglang/test/nightly_utils.py @@ -228,6 +228,7 @@ def run_benchmark_for_model( variant: str = "", extra_bench_args: Optional[List[str]] = None, enable_profile: bool = True, + timeout: Optional[int] = None, ) -> Tuple[List[BenchmarkResult], bool, Optional[float]]: """Run a complete benchmark for a single model with server management. 
@@ -247,6 +248,7 @@ def run_benchmark_for_model( variant: Optional variant suffix (e.g., "basic", "mtp") extra_bench_args: Extra arguments for the benchmark command enable_profile: Whether to enable profiling (default True for NVIDIA) + timeout: Optional timeout for server launch (defaults to DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH) Returns: Tuple of (list of BenchmarkResult objects, success_bool, avg_spec_accept_length or None) @@ -260,7 +262,9 @@ def run_benchmark_for_model( model=model_path, base_url=self.base_url, other_args=other_args or [], - timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + timeout=( + timeout if timeout is not None else DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH + ), ) try: diff --git a/scripts/ci/amd/amd_ci_install_dependency.sh b/scripts/ci/amd/amd_ci_install_dependency.sh index b06c9638f5ec..0aa3db11412b 100755 --- a/scripts/ci/amd/amd_ci_install_dependency.sh +++ b/scripts/ci/amd/amd_ci_install_dependency.sh @@ -2,6 +2,27 @@ set -euo pipefail HOSTNAME_VALUE=$(hostname) GPU_ARCH="mi30x" # default +SKIP_TT_DEPS="" +SKIP_SGLANG_BUILD="" +SKIP_AITER_BUILD="" + +while [[ $# -gt 0 ]]; do + case $1 in + --skip-aiter-build) SKIP_AITER_BUILD="1"; shift;; + --skip-sglang-build) SKIP_SGLANG_BUILD="1"; shift;; + --skip-test-time-deps) SKIP_TT_DEPS="1"; shift;; + -h|--help) + echo "Usage: $0 [OPTIONS] [OPTIONAL_DEPS]" + echo "Options:" + echo " --skip-sglang-build Don't build checkout sglang, use what was shipped with the image" + echo " --skip-aiter-build Don't build aiter, use what was shipped with the image" + echo " --skip-test-time-deps Don't build miscellaneous dependencies" + exit 0 + ;; + *) break ;; + esac +done + OPTIONAL_DEPS="${1:-}" # Build python extras @@ -23,15 +44,6 @@ fi # Fix permissions on pip cache, ignore errors from concurrent access or missing temp files docker exec ci_sglang chown -R root:root /sgl-data/pip-cache 2>/dev/null || true docker exec ci_sglang pip install --cache-dir=/sgl-data/pip-cache --upgrade pip -docker exec ci_sglang pip 
uninstall sgl-kernel -y || true -docker exec ci_sglang pip uninstall sglang -y || true -# Clear Python cache to ensure latest code is used -docker exec ci_sglang find /opt/venv -name "*.pyc" -delete || true -docker exec ci_sglang find /opt/venv -name "__pycache__" -type d -exec rm -rf {} + || true -# Also clear cache in sglang-checkout -docker exec ci_sglang find /sglang-checkout -name "*.pyc" -delete || true -docker exec ci_sglang find /sglang-checkout -name "__pycache__" -type d -exec rm -rf {} + || true -docker exec -w /sglang-checkout/sgl-kernel ci_sglang bash -c "rm -f pyproject.toml && mv pyproject_rocm.toml pyproject.toml && python3 setup_rocm.py install" # Helper function to install with retries and fallback PyPI mirror install_with_retry() { @@ -93,75 +105,82 @@ git_clone_with_retry() { return 1 } +# Install checkout sglang +if [ -n "$SKIP_SGLANG_BUILD" ]; then + echo "Didn't build checkout SGLang" +else + docker exec ci_sglang pip uninstall sgl-kernel -y || true + docker exec ci_sglang pip uninstall sglang -y || true + # Clear Python cache to ensure latest code is used + docker exec ci_sglang find /opt/venv -name "*.pyc" -delete || true + docker exec ci_sglang find /opt/venv -name "__pycache__" -type d -exec rm -rf {} + || true + # Also clear cache in sglang-checkout + docker exec ci_sglang find /sglang-checkout -name "*.pyc" -delete || true + docker exec ci_sglang find /sglang-checkout -name "__pycache__" -type d -exec rm -rf {} + || true + docker exec -w /sglang-checkout/sgl-kernel ci_sglang bash -c "rm -f pyproject.toml && mv pyproject_rocm.toml pyproject.toml && python3 setup_rocm.py install" + + docker exec ci_sglang bash -c 'rm -rf python/pyproject.toml && mv python/pyproject_other.toml python/pyproject.toml' + install_with_retry docker exec ci_sglang pip install --cache-dir=/sgl-data/pip-cache -e "python[${EXTRAS}]" +fi - -case "${GPU_ARCH}" in - mi35x) - echo "Runner uses ${GPU_ARCH}; will fetch mi35x image." 
- docker exec ci_sglang rm -rf python/pyproject.toml && mv python/pyproject_other.toml python/pyproject.toml - # Follow the same dependency installation flow as mi30x/mi300/mi325. - install_with_retry docker exec ci_sglang pip install --cache-dir=/sgl-data/pip-cache -e "python[${EXTRAS}]" - # For lmms_evals evaluating MMMU - docker exec -w / ci_sglang git clone --branch v0.4.1 --depth 1 https://github.com/EvolvingLMMs-Lab/lmms-eval.git - install_with_retry docker exec -w /lmms-eval ci_sglang pip install --cache-dir=/sgl-data/pip-cache -e . - ;; - mi30x|mi300|mi325) - echo "Runner uses ${GPU_ARCH}; will fetch mi30x image." - docker exec ci_sglang rm -rf python/pyproject.toml && mv python/pyproject_other.toml python/pyproject.toml - install_with_retry docker exec ci_sglang pip install --cache-dir=/sgl-data/pip-cache -e "python[${EXTRAS}]" - # For lmms_evals evaluating MMMU - docker exec -w / ci_sglang git clone --branch v0.4.1 --depth 1 https://github.com/EvolvingLMMs-Lab/lmms-eval.git - install_with_retry docker exec -w /lmms-eval ci_sglang pip install --cache-dir=/sgl-data/pip-cache -e . - ;; - *) - echo "Runner architecture '${GPU_ARCH}' unrecognised;" >&2 - ;; -esac - -#docker exec -w / ci_sglang git clone https://github.com/merrymercy/human-eval.git -git_clone_with_retry https://github.com/merrymercy/human-eval.git human-eval -docker cp human-eval ci_sglang:/ -# Ensure setuptools is installed (human-eval's setup.py imports pkg_resources) -docker exec ci_sglang pip install --cache-dir=/sgl-data/pip-cache setuptools -install_with_retry docker exec -w /human-eval ci_sglang pip install --cache-dir=/sgl-data/pip-cache --no-build-isolation -e . 
- -docker exec -w / ci_sglang mkdir -p /dummy-grok -# Create dummy grok config inline (bypasses Azure blob storage which may have auth issues) -mkdir -p dummy-grok -cat > dummy-grok/config.json << 'EOF' -{ - "architectures": [ - "Grok1ModelForCausalLM" - ], - "embedding_multiplier_scale": 78.38367176906169, - "output_multiplier_scale": 0.5773502691896257, - "vocab_size": 131072, - "hidden_size": 6144, - "intermediate_size": 32768, - "max_position_embeddings": 8192, - "num_experts_per_tok": 2, - "num_local_experts": 8, - "num_attention_heads": 48, - "num_hidden_layers": 64, - "num_key_value_heads": 8, - "head_dim": 128, - "rms_norm_eps": 1e-05, - "rope_theta": 10000.0, - "model_type": "mixtral", - "torch_dtype": "bfloat16" -} +if [[ -n "${SKIP_TT_DEPS}" ]]; then + echo "Didn't build lmms_eval, human-eval, and others" +else + # For lmms_evals evaluating MMMU + docker exec -w / ci_sglang git clone --branch v0.4.1 --depth 1 https://github.com/EvolvingLMMs-Lab/lmms-eval.git + install_with_retry docker exec -w /lmms-eval ci_sglang pip install --cache-dir=/sgl-data/pip-cache -e . + + git_clone_with_retry https://github.com/akao-amd/human-eval.git human-eval + docker cp human-eval ci_sglang:/ + install_with_retry docker exec -w /human-eval ci_sglang pip install --cache-dir=/sgl-data/pip-cache -e . 
+ + docker exec -w / ci_sglang mkdir -p /dummy-grok + # Create dummy grok config inline (bypasses Azure blob storage which may have auth issues) + mkdir -p dummy-grok + cat > dummy-grok/config.json << 'EOF' + { + "architectures": [ + "Grok1ModelForCausalLM" + ], + "embedding_multiplier_scale": 78.38367176906169, + "output_multiplier_scale": 0.5773502691896257, + "vocab_size": 131072, + "hidden_size": 6144, + "intermediate_size": 32768, + "max_position_embeddings": 8192, + "num_experts_per_tok": 2, + "num_local_experts": 8, + "num_attention_heads": 48, + "num_hidden_layers": 64, + "num_key_value_heads": 8, + "head_dim": 128, + "rms_norm_eps": 1e-05, + "rope_theta": 10000.0, + "model_type": "mixtral", + "torch_dtype": "bfloat16" + } EOF -docker cp ./dummy-grok ci_sglang:/ + # docker exec -w / ci_sglang mkdir -p /dummy-grok + # mkdir -p dummy-grok && wget https://sharkpublic.blob.core.windows.net/sharkpublic/sglang/dummy_grok.json -O dummy-grok/config.json + # docker cp ./dummy-grok ci_sglang:/ -docker exec ci_sglang pip install --cache-dir=/sgl-data/pip-cache huggingface_hub[hf_xet] -docker exec ci_sglang pip install --cache-dir=/sgl-data/pip-cache pytest + docker exec ci_sglang pip install --cache-dir=/sgl-data/pip-cache huggingface_hub[hf_xet] + docker exec ci_sglang pip install --cache-dir=/sgl-data/pip-cache pytest -# Install tvm-ffi for JIT kernel support (QK-norm, etc.) -echo "Installing tvm-ffi for JIT kernel support..." -docker exec ci_sglang pip install --cache-dir=/sgl-data/pip-cache git+https://github.com/apache/tvm-ffi.git || echo "tvm-ffi installation failed, JIT kernels will use fallback" + # Install tvm-ffi for JIT kernel support (QK-norm, etc.) + echo "Installing tvm-ffi for JIT kernel support..." 
+ docker exec ci_sglang pip install --cache-dir=/sgl-data/pip-cache git+https://github.com/apache/tvm-ffi.git || echo "tvm-ffi installation failed, JIT kernels will use fallback" -# Install cache-dit for qwen_image_t2i_cache_dit_enabled test (added in PR 16204) -docker exec ci_sglang pip install --cache-dir=/sgl-data/pip-cache cache-dit || echo "cache-dit installation failed" + # Install cache-dit for qwen_image_t2i_cache_dit_enabled test (added in PR 16204) + docker exec ci_sglang pip install --cache-dir=/sgl-data/pip-cache cache-dit || echo "cache-dit installation failed" + + # Install accelerate for distributed training and inference support + docker exec ci_sglang pip install --cache-dir=/sgl-data/pip-cache accelerate || echo "accelerate installation failed" +fi + +if [[ -n "${SKIP_AITER_BUILD}" ]]; then + exit 0 +fi # Detect AITER version ############################################# @@ -215,16 +234,17 @@ echo "[CI-AITER-CHECK] AITER version inside CI image: ${IMAGE_AITER_VERSION}" ############################################# NEED_REBUILD="false" -if [[ "${IMAGE_AITER_VERSION}" == "none" ]]; then - echo "[CI-AITER-CHECK] No AITER found in image" - NEED_REBUILD="true" -elif [[ "${IMAGE_AITER_VERSION}" != "${REPO_AITER_COMMIT}" ]]; then - echo "[CI-AITER-CHECK] Version mismatch:" - echo " Image: ${IMAGE_AITER_VERSION}" - echo " Repo : ${REPO_AITER_COMMIT}" +if [[ "${IMAGE_AITER_VERSION}" == "vnone" || "${IMAGE_AITER_VERSION}" == "v" ]]; then + echo "[CI-AITER-CHECK] No AITER found in image → rebuild needed" NEED_REBUILD="true" +elif [[ "${IMAGE_AITER_VERSION}" == "${REPO_AITER_COMMIT}" ]]; then + echo "[CI-AITER-CHECK] AITER version matches" +elif [[ "${IMAGE_AITER_VERSION}" =~ (dev|\+g[0-9a-f]+) ]]; then + # Dev/patched version (contains 'dev' or git hash) → preserve it + echo "[CI-AITER-CHECK] Dev/patched version detected: ${IMAGE_AITER_VERSION} → skipping rebuild" else - echo "[CI-AITER-CHECK] AITER version matches → using image's version." 
+ echo "[CI-AITER-CHECK] Version mismatch: image=${IMAGE_AITER_VERSION}, repo=${REPO_AITER_COMMIT}" + NEED_REBUILD="true" fi @@ -270,12 +290,12 @@ fi echo "[CI-AITER-CHECK] === AITER VERSION CHECK END ===" -# Clear pre-built AITER kernels from Docker image to avoid segfaults -# The Docker image may contain pre-compiled kernels incompatible with the current environment -echo "Clearing pre-built AITER kernels from Docker image..." -docker exec ci_sglang find /sgl-workspace/aiter/aiter/jit -name "*.so" -delete 2>/dev/null || true -docker exec ci_sglang ls -la /sgl-workspace/aiter/aiter/jit/ 2>/dev/null || echo "jit dir empty or not found" +# # Clear pre-built AITER kernels from Docker image to avoid segfaults +# # The Docker image may contain pre-compiled kernels incompatible with the current environment +# echo "Clearing pre-built AITER kernels from Docker image..." +# docker exec ci_sglang find /sgl-workspace/aiter/aiter/jit -name "*.so" -delete 2>/dev/null || true +# docker exec ci_sglang ls -la /sgl-workspace/aiter/aiter/jit/ 2>/dev/null || echo "jit dir empty or not found" -# Pre-build AITER kernels to avoid timeout during tests -echo "Warming up AITER JIT kernels..." -docker exec -e SGLANG_USE_AITER=1 ci_sglang python3 /sglang-checkout/scripts/ci/amd/amd_ci_warmup_aiter.py || echo "AITER warmup completed (some kernels may not be available)" +# # Pre-build AITER kernels to avoid timeout during tests +# echo "Warming up AITER JIT kernels..." 
+# docker exec -e SGLANG_USE_AITER=1 ci_sglang python3 /sglang-checkout/scripts/ci/amd/amd_ci_warmup_aiter.py || echo "AITER warmup completed (some kernels may not be available)" diff --git a/scripts/ci/amd/amd_ci_start_container.sh b/scripts/ci/amd/amd_ci_start_container.sh index ad6cc198bf89..a7a750ff7e99 100755 --- a/scripts/ci/amd/amd_ci_start_container.sh +++ b/scripts/ci/amd/amd_ci_start_container.sh @@ -27,13 +27,32 @@ DEFAULT_MI35X_BASE_TAG="${SGLANG_VERSION}-${ROCM_VERSION}-mi35x" # Parse command line arguments MI30X_BASE_TAG="${DEFAULT_MI30X_BASE_TAG}" MI35X_BASE_TAG="${DEFAULT_MI35X_BASE_TAG}" +CUSTOM_IMAGE="" +BUILD_FROM_DOCKERFILE="" +GPU_ARCH_BUILD="" while [[ $# -gt 0 ]]; do case $1 in --mi30x-base-tag) MI30X_BASE_TAG="$2"; shift 2;; --mi35x-base-tag) MI35X_BASE_TAG="$2"; shift 2;; + --custom-image) CUSTOM_IMAGE="$2"; shift 2;; + --build-from-dockerfile) BUILD_FROM_DOCKERFILE="1"; shift;; + --gpu-arch) GPU_ARCH_BUILD="$2"; shift 2;; + --rocm-version) + ROCM_VERSION="$2" + MI30X_BASE_TAG="${SGLANG_VERSION}-${ROCM_VERSION}-mi30x" + MI35X_BASE_TAG="${SGLANG_VERSION}-${ROCM_VERSION}-mi35x" + echo "Using ROCm version override: ${ROCM_VERSION}" + shift 2;; -h|--help) - echo "Usage: $0 [--mi30x-base-tag TAG] [--mi35x-base-tag TAG]" + echo "Usage: $0 [OPTIONS]" + echo "Options:" + echo " --mi30x-base-tag TAG Override MI30x base image tag" + echo " --mi35x-base-tag TAG Override MI35x base image tag" + echo " --custom-image IMAGE Use a specific Docker image directly" + echo " --build-from-dockerfile Build image from docker/rocm.Dockerfile" + echo " --gpu-arch ARCH GPU architecture for Dockerfile build (e.g., gfx950-rocm720)" + echo " --rocm-version VERSION Override ROCm version for image lookup (e.g., rocm720)" exit 0 ;; *) echo "Unknown option $1"; exit 1;; @@ -54,7 +73,7 @@ else echo "Warning: could not parse GPU architecture from '${HOSTNAME_VALUE}', defaulting to ${GPU_ARCH}" fi -# Normalise / collapse architectures we don’t yet build specifically for +# 
Normalise / collapse architectures we don't yet build specifically for case "${GPU_ARCH}" in mi35x) echo "Runner uses ${GPU_ARCH}; will fetch mi35x image." @@ -134,18 +153,73 @@ find_latest_image() { fi echo "Error: no ${gpu_arch} image found in the last 7 days for base ${base_tag}" >&2 - echo "Using hard-coded fallback…" >&2 - if [[ "${gpu_arch}" == "mi35x" ]]; then - echo "rocm/sgl-dev:v0.5.5-rocm700-mi35x-20251110" + echo "Using hard-coded fallback for ${ROCM_VERSION}…" >&2 + case "${ROCM_VERSION}" in + rocm720) + if [[ "${gpu_arch}" == "mi35x" ]]; then + echo "rocm/sgl-dev:v0.5.8.post1-rocm720-mi35x-20260211-preview" + else + echo "rocm/sgl-dev:v0.5.8.post1-rocm720-mi30x-20260211-preview" + fi + ;; + rocm700) + if [[ "${gpu_arch}" == "mi35x" ]]; then + echo "rocm/sgl-dev:v0.5.8.post1-rocm700-mi35x-20260211" + else + echo "rocm/sgl-dev:v0.5.8.post1-rocm700-mi30x-20260211" + fi + ;; + *) + echo "Error: no hard-coded fallback available for ${ROCM_VERSION}" >&2 + return 1 + ;; + esac +} + +# Determine which image to use +if [[ -n "${CUSTOM_IMAGE}" ]]; then + # Use explicitly provided custom image + IMAGE="${CUSTOM_IMAGE}" + echo "Using custom image: ${IMAGE}" + docker pull "${IMAGE}" +elif [[ -n "${BUILD_FROM_DOCKERFILE}" ]]; then + # Build image from Dockerfile + if [[ -z "${GPU_ARCH_BUILD}" ]]; then + echo "Error: --gpu-arch is required when using --build-from-dockerfile" >&2 + exit 1 + fi + + DOCKERFILE_DIR="${GITHUB_WORKSPACE:-$PWD}/docker" + + # Use rocm720.Dockerfile for ROCm 7.2 builds, otherwise use rocm.Dockerfile + if [[ "${GPU_ARCH_BUILD}" == *"rocm720"* ]]; then + DOCKERFILE="${DOCKERFILE_DIR}/rocm720.Dockerfile" else - echo "rocm/sgl-dev:v0.5.5-rocm700-mi30x-20251110" + DOCKERFILE="${DOCKERFILE_DIR}/rocm.Dockerfile" fi -} -# Pull and run the latest image -IMAGE=$(find_latest_image "${GPU_ARCH}") -echo "Pulling Docker image: ${IMAGE}" -docker pull "${IMAGE}" + if [[ ! 
-f "${DOCKERFILE}" ]]; then + echo "Error: Dockerfile not found at ${DOCKERFILE}" >&2 + exit 1 + fi + + IMAGE="sglang-ci:${GPU_ARCH_BUILD}-$(date +%Y%m%d)" + echo "Building Docker image from ${DOCKERFILE} with GPU_ARCH=${GPU_ARCH_BUILD}..." + + # Pass full GPU_ARCH (e.g., gfx950-rocm720) - Dockerfile handles stripping suffix + docker build \ + --build-arg GPU_ARCH="${GPU_ARCH_BUILD}" \ + --build-arg SGL_BRANCH="main" \ + -t "${IMAGE}" \ + -f "${DOCKERFILE}" \ + "${DOCKERFILE_DIR}" + echo "Successfully built image: ${IMAGE}" +else + # Find the latest pre-built image + IMAGE=$(find_latest_image "${GPU_ARCH}") + echo "Pulling Docker image: ${IMAGE}" + docker pull "${IMAGE}" +fi CACHE_HOST=/home/runner/sgl-data if [[ -d "$CACHE_HOST" ]]; then @@ -156,6 +230,7 @@ fi echo "Launching container: ci_sglang" docker run -dt --user root --device=/dev/kfd ${DEVICE_FLAG} \ + --ulimit nofile=65536:65536 \ -v "${GITHUB_WORKSPACE:-$PWD}:/sglang-checkout" \ $CACHE_VOLUME \ --group-add video \ diff --git a/test/registered/amd/accuracy/mi30x/test_gpt_oss_eval_amd.py b/test/registered/amd/accuracy/mi30x/test_gpt_oss_eval_amd.py index 069841256e19..0ae795547ad6 100644 --- a/test/registered/amd/accuracy/mi30x/test_gpt_oss_eval_amd.py +++ b/test/registered/amd/accuracy/mi30x/test_gpt_oss_eval_amd.py @@ -68,7 +68,7 @@ def __post_init__(self): "triton", "--trust-remote-code", ], - env_vars={"SGLANG_USE_AITER": "0"}, + env_vars={"SGLANG_USE_AITER": "1"}, ), ModelConfig( model_path="lmsys/gpt-oss-120b-bf16", @@ -86,7 +86,7 @@ def __post_init__(self): "triton", "--trust-remote-code", ], - env_vars={"SGLANG_USE_AITER": "0"}, + env_vars={"SGLANG_USE_AITER": "1"}, ), ] diff --git a/test/registered/amd/accuracy/mi30x/test_gsm8k_eval_amd.py b/test/registered/amd/accuracy/mi30x/test_gsm8k_eval_amd.py index deb689526046..d29406c70e95 100644 --- a/test/registered/amd/accuracy/mi30x/test_gsm8k_eval_amd.py +++ b/test/registered/amd/accuracy/mi30x/test_gsm8k_eval_amd.py @@ -41,7 +41,7 @@ # Llama 3.2 
series (smaller models) "meta-llama/Llama-3.2-3B-Instruct": 0.55, # Mistral series - "mistralai/Mistral-7B-Instruct-v0.3": 0.58, + "mistralai/Mistral-7B-Instruct-v0.3": 0.55, "mistralai/Mixtral-8x7B-Instruct-v0.1": 0.61, # DeepSeek series "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct": 0.85, @@ -108,10 +108,10 @@ def remove_failing_models(model_str): "neuralmagic/Qwen2-57B-A14B-Instruct-FP8", } TRITON_MOE_MODELS = { - "neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8", + # "neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8", "neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8", - "mistralai/Mixtral-8x7B-Instruct-v0.1", - "mistralai/Mistral-7B-Instruct-v0.3", + # "mistralai/Mixtral-8x7B-Instruct-v0.1", + # "mistralai/Mistral-7B-Instruct-v0.3", } # AMD-specific models that need special launch config (matching in-house CI sanity_check.py) # AMD_SPECIAL_CONFIG_MODELS = { diff --git a/test/registered/amd/accuracy/mi30x/test_vlms_mmmu_eval_amd.py b/test/registered/amd/accuracy/mi30x/test_vlms_mmmu_eval_amd.py index ec1e49476bb9..e896c6c26bd4 100644 --- a/test/registered/amd/accuracy/mi30x/test_vlms_mmmu_eval_amd.py +++ b/test/registered/amd/accuracy/mi30x/test_vlms_mmmu_eval_amd.py @@ -120,9 +120,9 @@ # Models that need special handling on AMD (MoE models) TRITON_ATTENTION_MODELS = { - "deepseek-ai/deepseek-vl2-small", - "Qwen/Qwen3-VL-30B-A3B-Instruct", - "moonshotai/Kimi-VL-A3B-Instruct", + # "deepseek-ai/deepseek-vl2-small", + # "Qwen/Qwen3-VL-30B-A3B-Instruct", + # "moonshotai/Kimi-VL-A3B-Instruct", } # Models known to fail on AMD - exclude from testing diff --git a/test/registered/amd/accuracy/mi35x/test_deepseek_v32_eval_mi35x.py b/test/registered/amd/accuracy/mi35x/test_deepseek_v32_eval_mi35x.py index 70f851d9d326..0b5a4a71eb52 100644 --- a/test/registered/amd/accuracy/mi35x/test_deepseek_v32_eval_mi35x.py +++ b/test/registered/amd/accuracy/mi35x/test_deepseek_v32_eval_mi35x.py @@ -32,9 +32,9 @@ ) from sglang.utils import download_and_cache_file, read_jsonl -# Register for AMD CI 
- MI35x DeepSeek-V3.2 accuracy test (~60 min for basic only) +# Register for AMD CI - MI35x DeepSeek-V3.2 accuracy test (~90 min for basic only) register_amd_ci( - est_time=3600, + est_time=5400, suite="nightly-amd-8-gpu-mi35x-deepseek-v32", nightly=True, ) @@ -74,7 +74,7 @@ def get_display_name(self) -> str: model_path="deepseek-ai/DeepSeek-V3.2", tp_size=8, accuracy_threshold=0.93, - timeout=3600, + timeout=5400, variant="basic", other_args=[ "--trust-remote-code", diff --git a/test/registered/amd/accuracy/mi35x/test_deepseek_v32_mtp_eval_mi35x.py b/test/registered/amd/accuracy/mi35x/test_deepseek_v32_mtp_eval_mi35x.py index 9dd254f84b0e..09a012043416 100644 --- a/test/registered/amd/accuracy/mi35x/test_deepseek_v32_mtp_eval_mi35x.py +++ b/test/registered/amd/accuracy/mi35x/test_deepseek_v32_mtp_eval_mi35x.py @@ -22,7 +22,6 @@ from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k from sglang.test.send_one import BenchArgs, send_one_prompt from sglang.test.test_utils import ( - DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, DEFAULT_URL_FOR_TEST, CustomTestCase, is_in_ci, @@ -32,7 +31,7 @@ # Register for AMD CI - MI35x DeepSeek-V3.2 TP+MTP accuracy test register_amd_ci( - est_time=3600, + est_time=5400, suite="nightly-amd-accuracy-8-gpu-mi35x-deepseek-v32-mtp", nightly=True, ) @@ -55,10 +54,15 @@ class TestDeepseekV32TPMTP(CustomTestCase): def setUpClass(cls): cls.model = DEEPSEEK_V32_MODEL_PATH cls.base_url = DEFAULT_URL_FOR_TEST + # Use same args as perf test (which passes successfully) other_args = [ "--trust-remote-code", "--tp", "8", + "--nsa-prefill-backend", + "tilelang", + "--nsa-decode-backend", + "tilelang", "--speculative-algorithm", "EAGLE", "--speculative-num-steps", @@ -67,19 +71,17 @@ def setUpClass(cls): "1", "--speculative-num-draft-tokens", "4", - "--mem-frac", + "--mem-fraction-static", "0.7", "--model-loader-extra-config", '{"enable_multithread_load": true}', - "--nsa-prefill-backend", - "tilelang", - "--nsa-decode-backend", - 
"tilelang", + "--watchdog-timeout", + "1200", ] cls.process = popen_launch_server( cls.model, cls.base_url, - timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + timeout=5400, other_args=other_args, ) @@ -97,8 +99,8 @@ def test_a_gsm8k(self): args = SimpleNamespace( num_shots=20, data_path=None, - num_questions=1400, - parallel=1400, + num_questions=200, + parallel=64, max_new_tokens=512, host="http://127.0.0.1", port=int(self.base_url.split(":")[-1]), diff --git a/test/registered/amd/accuracy/mi35x/test_gpt_oss_eval_mi35x.py b/test/registered/amd/accuracy/mi35x/test_gpt_oss_eval_mi35x.py index 548af0304a7b..4c2f8861ef3a 100644 --- a/test/registered/amd/accuracy/mi35x/test_gpt_oss_eval_mi35x.py +++ b/test/registered/amd/accuracy/mi35x/test_gpt_oss_eval_mi35x.py @@ -75,9 +75,7 @@ def __post_init__(self): "triton", "--trust-remote-code", ], - env_vars={ - "SGLANG_USE_AITER": "0" - }, # Disabled due to SWA eviction bug with aiter (#17220) + env_vars={"SGLANG_USE_AITER": "1"}, ), ModelConfig( model_path="openai/gpt-oss-120b", @@ -95,9 +93,7 @@ def __post_init__(self): "triton", "--trust-remote-code", ], - env_vars={ - "SGLANG_USE_AITER": "0" - }, # Disabled due to SWA eviction bug with aiter (#17220) + env_vars={"SGLANG_USE_AITER": "1"}, ), ] diff --git a/test/registered/amd/perf/mi35x/test_deepseek_v32_basic_perf_mi35x.py b/test/registered/amd/perf/mi35x/test_deepseek_v32_basic_perf_mi35x.py index 54abe22f390e..740500e9f5eb 100644 --- a/test/registered/amd/perf/mi35x/test_deepseek_v32_basic_perf_mi35x.py +++ b/test/registered/amd/perf/mi35x/test_deepseek_v32_basic_perf_mi35x.py @@ -115,6 +115,7 @@ def test_bench_one_batch(self): variant=self.variant_config["name"], extra_bench_args=["--trust-remote-code"], enable_profile=False, # Disable profiling for AMD tests + timeout=5400, # Extended timeout for large model loading ) results = result_tuple[0] success = result_tuple[1] diff --git a/test/registered/layers/mamba/test_mamba_ssm_ssd.py 
b/test/registered/layers/mamba/test_mamba_ssm_ssd.py index 43a4f1f47e5e..f6191d0bf277 100644 --- a/test/registered/layers/mamba/test_mamba_ssm_ssd.py +++ b/test/registered/layers/mamba/test_mamba_ssm_ssd.py @@ -5,6 +5,7 @@ # Adapted from https://github.com/vllm-project/vllm/blob/633f943e30a4444d890d26b81850f7217736f840/tests/kernels/mamba/test_mamba_ssm_ssd.py +import os import pytest import torch @@ -13,8 +14,12 @@ from sglang.srt.layers.attention.mamba.mamba2_metadata import Mamba2Metadata from sglang.srt.layers.attention.mamba.ops import mamba_chunk_scan_combined +from sglang.srt.utils.common import is_hip from sglang.utils import is_in_ci +if is_hip(): + os.environ["AMDGCN_USE_BUFFER_OPS"] = "0" + # Added by the IBM Team, 2024 # Adapted from https://github.com/state-spaces/mamba/blob/v2.2.4/mamba_ssm/modules/ssd_minimal.py diff --git a/test/registered/rl/test_update_weights_from_distributed.py b/test/registered/rl/test_update_weights_from_distributed.py index 0f5c126ba3ed..42e3a28aea05 100644 --- a/test/registered/rl/test_update_weights_from_distributed.py +++ b/test/registered/rl/test_update_weights_from_distributed.py @@ -37,6 +37,7 @@ DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, DEFAULT_URL_FOR_TEST, CustomTestCase, + is_in_amd_ci, is_in_ci, popen_launch_server, ) @@ -64,6 +65,60 @@ def verify_params_not_close(params1, params2, error_msg): assert not np.allclose(np.array(params1), np.array(params2)), error_msg +def _warmup_broadcast( + hf_base_model, + state_dict_key_to_shape, + tie_word_embeddings, + load_format, + group, +): + """Run one broadcast round to warm up RCCL before timing.""" + broadcast_parameters = list(state_dict_key_to_shape.keys()) + if tie_word_embeddings: + broadcast_parameters.remove("lm_head.weight") + + if load_format == "flattened_bucket": + named_tensors = [ + (name, hf_base_model.get_parameter(name)) for name in broadcast_parameters + ] + bucket = FlattenedTensorBucket(named_tensors=named_tensors) + flattened_tensor = 
bucket.get_flattened_tensor() + torch.distributed.broadcast(flattened_tensor, src=0, group=group) + else: + for name in broadcast_parameters: + torch.distributed.broadcast( + hf_base_model.get_parameter(name), + src=0, + group=group, + ) + + +def _warmup_update( + backend, engine, url, names, dtypes, shapes, load_format, pause_generation_mode +): + """Run one update round to warm up RCCL before timing.""" + if backend == "Engine": + engine.update_weights_from_distributed( + names, + dtypes=dtypes, + shapes=shapes, + group_name="test_parameter_update_group", + load_format=load_format, + ) + else: + requests.post( + f"{url}/update_weights_from_distributed", + json={ + "names": names, + "dtypes": dtypes, + "shapes": shapes, + "group_name": "test_parameter_update_group", + "load_format": load_format, + "flush_cache": not (pause_generation_mode == "in_place"), + }, + ) + + def init_process( rank, world_size, @@ -180,6 +235,18 @@ def init_process_hf( ) torch.cuda.synchronize() barrier.wait() + + # Warmup: trigger RCCL initialization so it's excluded from timing + if is_in_amd_ci(): + _warmup_broadcast( + hf_base_model, + state_dict_key_to_shape, + tie_word_embeddings, + load_format, + group, + ) + torch.cuda.synchronize() + time_begin_broadcast = time.perf_counter() # The last parameter is lm_head.weight, which is tied @@ -354,6 +421,21 @@ def run_decode(max_new_tokens=32): ) torch.cuda.synchronize() barrier.wait() + + # Warmup: trigger RCCL initialization so it's excluded from timing + if is_in_amd_ci(): + _warmup_update( + backend, + engine if backend == "Engine" else None, + url if backend != "Engine" else None, + names, + dtypes, + shapes, + load_format, + pause_generation_mode, + ) + torch.cuda.synchronize() + time_begin_update = time.perf_counter() if backend == "Engine": engine.update_weights_from_distributed( diff --git a/test/run_suite.py b/test/run_suite.py index 313eed48e196..9a88992342f8 100644 --- a/test/run_suite.py +++ b/test/run_suite.py @@ -120,8 +120,10 
@@ def auto_partition(files: List[CIRegistry], rank, size): if not files or size <= 0: return [] - # Sort files by estimated_time in descending order (LPT heuristic) - sorted_files = sorted(files, key=lambda f: f.est_time, reverse=True) + # Sort files by estimated_time in descending order (LPT heuristic). + # Use filename as tie-breaker to ensure deterministic partitioning + # regardless of glob ordering. + sorted_files = sorted(files, key=lambda f: (-f.est_time, f.filename)) partitions = [[] for _ in range(size)] partition_sums = [0.0] * size