diff --git a/.github/workflows/nightly-test-amd-rocm720.yml b/.github/workflows/nightly-test-amd-rocm720.yml
new file mode 100644
index 000000000000..5e9fe8d1ce8d
--- /dev/null
+++ b/.github/workflows/nightly-test-amd-rocm720.yml
@@ -0,0 +1,789 @@
+name: Nightly Test (AMD ROCm 7.2)
+
+on: []  # empty trigger list; the intended pull_request / workflow_dispatch triggers are commented out below
+#  pull_request:
+#    branches:
+#      - main
+#    paths:
+#      - "docker/rocm720.Dockerfile"
+#      - "scripts/ci/amd/amd_ci_start_container.sh"
+#      - ".github/workflows/nightly-test-amd-rocm720.yml"
+#  workflow_dispatch:
+#    inputs:
+#      job_filter:
+#        description: 'Select which job to run (leave empty or "all" to run all jobs)'
+#        required: false
+#        type: choice
+#        default: 'all'
+#        options:
+#          - 'all'
+#          # MI30x ROCm 7.2 Tests
+#          - 'nightly-test-1-gpu-unit-rocm720'
+#          # MI30x Accuracy and Performance Tests (GSM8K / MMMU / perf)
+#          - 'nightly-accuracy-2-gpu-rocm720'
+#          - 'nightly-accuracy-2-gpu-vlm-rocm720'
+#          - 'nightly-perf-2-gpu-text-rocm720'
+#          - 'nightly-perf-2-gpu-vlm-rocm720'
+#          - 'nightly-accuracy-8-gpu-rocm720'
+#          # MI30x Accuracy + Performance Tests (combined)
+#          - 'nightly-8-gpu-grok1-int4-rocm720'
+#          - 'nightly-8-gpu-grok2-rocm720'
+#          - 'nightly-8-gpu-deepseek-v31-rocm720'
+#          - 'nightly-8-gpu-deepseek-v32-rocm720'
+#          - 'nightly-8-gpu-deepseek-v32-mtp-rocm720'
+#          - 'nightly-8-gpu-kimi-k2-rocm720'
+#          # MI35x jobs
+#          - 'nightly-test-1-gpu-mi35x-rocm720'
+#          - 'nightly-accuracy-8-gpu-mi35x-rocm720'
+#          - 'nightly-8-gpu-mi35x-grok1-int4-rocm720'
+#          - 'nightly-8-gpu-mi35x-grok2-rocm720'
+#          - 'nightly-8-gpu-mi35x-deepseek-r1-mxfp4-rocm720'
+#          - 'nightly-accuracy-8-gpu-mi35x-deepseek-v32-rocm720'
+#          - 'nightly-accuracy-8-gpu-mi35x-deepseek-v32-mtp-rocm720'
+#          - 'nightly-perf-8-gpu-mi35x-deepseek-v32-basic-rocm720'
+#          - 'nightly-perf-8-gpu-mi35x-deepseek-v32-mtp-rocm720'
+#      ref:
+#        description: 'Git ref (branch, tag, or SHA) to test'
+#        required: false
+#        type: string
+#        default: ''
+
+concurrency:
+  cancel-in-progress: true  # a newer run in the same group cancels the one in flight
+  group: 'nightly-test-amd-rocm720-${{ inputs.ref || github.ref }}'
+
+jobs:
+  # ============================================== MI30x ROCm 7.2 Unit Tests ==============================================
+  # MI30x 1-GPU unit suite (nightly-amd-1-gpu) on ROCm 7.2 - LoRA, debug utils, scheduler, etc.
+  nightly-test-1-gpu-unit-rocm720:
+    runs-on: linux-mi325-gpu-1
+    if: inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-test-1-gpu-unit-rocm720'
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.ref || github.ref }}  # requested ref, else the triggering ref
+
+      - name: Setup docker (ROCm 7.2)
+        timeout-minutes: 120
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+        run: |
+          touch github_summary.md
+          bash scripts/ci/amd/amd_ci_start_container.sh --custom-image rocm/sgl-dev:v0.5.8-rocm720-mi30x-20260129-preview --gpu-arch gfx942-rocm720
+
+      - name: Nightly Unit Test ROCm 7.2 (1-GPU)
+        timeout-minutes: 60  # step cap; each test file is separately limited to 600 s by run_suite.py
+        run: |
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-amd-1-gpu --nightly --timeout-per-file 600 --continue-on-error || TEST_EXIT_CODE=$?
+          echo "$(<github_summary.md )" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+  # ============================================== MI30x ROCm 7.2 Accuracy Tests ==============================================
+  # MI30x 2-GPU accuracy suite (nightly-amd, GSM8K eval) on ROCm 7.2
+  nightly-accuracy-2-gpu-rocm720:
+    runs-on: linux-mi325-gpu-2
+    if: inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-accuracy-2-gpu-rocm720'
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.ref || github.ref }}  # requested ref, else the triggering ref
+
+      - name: Setup docker (ROCm 7.2)
+        timeout-minutes: 120
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+        run: |
+          touch github_summary.md
+          bash scripts/ci/amd/amd_ci_start_container.sh --custom-image rocm/sgl-dev:v0.5.8-rocm720-mi30x-20260129-preview --gpu-arch gfx942-rocm720
+
+      - name: Nightly Test ROCm 7.2 (2-GPU)  # NOTE(review): no timeout-minutes here, unlike the other test steps - confirm intended
+        run: |
+          > github_summary.md  # Clear summary file
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-amd --nightly --timeout-per-file 7200 || TEST_EXIT_CODE=$?
+          echo "$(<github_summary.md )" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+  # MI30x 2-GPU vision-language accuracy suite (MMMU evaluation) on ROCm 7.2
+  nightly-accuracy-2-gpu-vlm-rocm720:
+    runs-on: linux-mi325-gpu-2
+    if: inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-accuracy-2-gpu-vlm-rocm720'
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.ref || github.ref }}  # requested ref, else the triggering ref
+
+      - name: Setup docker (ROCm 7.2)
+        timeout-minutes: 120
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+        run: |
+          touch github_summary.md
+          bash scripts/ci/amd/amd_ci_start_container.sh --custom-image rocm/sgl-dev:v0.5.8-rocm720-mi30x-20260129-preview --gpu-arch gfx942-rocm720
+
+      - name: Nightly Accuracy Test ROCm 7.2 (2-GPU VLM MMMU)
+        timeout-minutes: 180  # step cap; each test file is separately limited to 7200 s by run_suite.py
+        run: |
+          > github_summary.md  # Clear summary file
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-amd-accuracy-2-gpu-vlm --nightly --timeout-per-file 7200 || TEST_EXIT_CODE=$?
+          echo "$(<github_summary.md )" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+  # MI30x 2-GPU text-model performance suite on ROCm 7.2 (run with SGLANG_USE_AITER=1)
+  nightly-perf-2-gpu-text-rocm720:
+    runs-on: linux-mi325-gpu-2
+    if: inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-perf-2-gpu-text-rocm720'
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.ref || github.ref }}  # requested ref, else the triggering ref
+
+      - name: Setup docker (ROCm 7.2)
+        timeout-minutes: 120
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+        run: |
+          touch github_summary.md
+          bash scripts/ci/amd/amd_ci_start_container.sh --custom-image rocm/sgl-dev:v0.5.8-rocm720-mi30x-20260129-preview --gpu-arch gfx942-rocm720
+
+      - name: Performance Test ROCm 7.2 (2-GPU Text Models)
+        timeout-minutes: 120  # step cap; each test file is separately limited to 3600 s by run_suite.py
+        run: |
+          > github_summary.md  # Clear summary file
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e SGLANG_USE_AITER=1 \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-amd-perf-text-2-gpu --nightly --timeout-per-file 3600 || TEST_EXIT_CODE=$?
+          echo "$(<github_summary.md )" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+  # MI30x 2-GPU vision-language performance suite on ROCm 7.2 (run with SGLANG_USE_AITER=1)
+  nightly-perf-2-gpu-vlm-rocm720:
+    runs-on: linux-mi325-gpu-2
+    if: inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-perf-2-gpu-vlm-rocm720'
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.ref || github.ref }}  # requested ref, else the triggering ref
+
+      - name: Setup docker (ROCm 7.2)
+        timeout-minutes: 120
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+        run: |
+          touch github_summary.md
+          bash scripts/ci/amd/amd_ci_start_container.sh --custom-image rocm/sgl-dev:v0.5.8-rocm720-mi30x-20260129-preview --gpu-arch gfx942-rocm720
+
+      - name: Performance Test ROCm 7.2 (2-GPU VLM Models)
+        timeout-minutes: 180  # step cap; each test file is separately limited to 7200 s by run_suite.py
+        run: |
+          > github_summary.md  # Clear summary file
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e SGLANG_USE_AITER=1 \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-amd-perf-vlm-2-gpu --nightly --timeout-per-file 7200 || TEST_EXIT_CODE=$?
+          echo "$(<github_summary.md )" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+  # MI30x 8-GPU accuracy on ROCm 7.2: GPT-OSS suite, then Grok1-FP8 suite, as two sequential steps
+  nightly-accuracy-8-gpu-rocm720:
+    runs-on: linux-mi325-gpu-8
+    if: inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-accuracy-8-gpu-rocm720'
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.ref || github.ref }}  # requested ref, else the triggering ref
+
+      - name: Setup docker (ROCm 7.2)
+        timeout-minutes: 120
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+        run: |
+          touch github_summary.md
+          bash scripts/ci/amd/amd_ci_start_container.sh --custom-image rocm/sgl-dev:v0.5.8-rocm720-mi30x-20260129-preview --gpu-arch gfx942-rocm720
+
+      - name: Accuracy Test ROCm 7.2 (8-GPU GPT-OSS)
+        timeout-minutes: 180
+        run: |
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-gpt-oss --nightly --timeout-per-file 7200 || TEST_EXIT_CODE=$?
+          echo "$(<github_summary.md )" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+      - name: Accuracy Test ROCm 7.2 (8-GPU Grok1-FP8)  # NOTE(review): github_summary.md is not truncated between the two steps, unlike sibling jobs - confirm the first summary is not echoed twice
+        timeout-minutes: 60
+        run: |
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e RCCL_MSCCL_ENABLE=0 \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-grok1-fp8 --nightly --timeout-per-file 3600 || TEST_EXIT_CODE=$?
+          echo "$(<github_summary.md )" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+  # ============================================== MI30x ROCm 7.2 Combined Accuracy + Performance Tests ==============================================
+  # MI30x 8-GPU Grok1-INT4 on ROCm 7.2: accuracy suite, then a non-blocking performance suite
+  nightly-8-gpu-grok1-int4-rocm720:
+    runs-on: linux-mi325-gpu-8
+    if: inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-grok1-int4-rocm720'
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.ref || github.ref }}  # requested ref, else the triggering ref
+
+      - name: Setup docker (ROCm 7.2)
+        timeout-minutes: 120
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+        run: |
+          touch github_summary.md
+          bash scripts/ci/amd/amd_ci_start_container.sh --custom-image rocm/sgl-dev:v0.5.8-rocm720-mi30x-20260129-preview --gpu-arch gfx942-rocm720
+
+      - name: Accuracy Test ROCm 7.2 (8-GPU Grok1-INT4)
+        timeout-minutes: 60
+        run: |
+          > github_summary.md  # Clear summary file
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e RCCL_MSCCL_ENABLE=0 \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-grok1-int4 --nightly --timeout-per-file 3600 || TEST_EXIT_CODE=$?
+          echo "$(<github_summary.md )" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+      - name: Performance Test ROCm 7.2 (8-GPU Grok1-INT4)
+        continue-on-error: true  # a failing perf run does not fail the job
+        timeout-minutes: 60
+        run: |
+          > github_summary.md  # Clear summary file
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e RCCL_MSCCL_ENABLE=0 \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-grok1-int4 --nightly --timeout-per-file 3600 || TEST_EXIT_CODE=$?
+          echo "$(<github_summary.md )" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+  # MI30x 8-GPU Grok2 on ROCm 7.2: accuracy suite, then a non-blocking performance suite
+  nightly-8-gpu-grok2-rocm720:
+    runs-on: linux-mi325-gpu-8
+    if: inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-grok2-rocm720'
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.ref || github.ref }}  # requested ref, else the triggering ref
+
+      - name: Setup docker (ROCm 7.2)
+        timeout-minutes: 120
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+        run: |
+          touch github_summary.md
+          bash scripts/ci/amd/amd_ci_start_container.sh --custom-image rocm/sgl-dev:v0.5.8-rocm720-mi30x-20260129-preview --gpu-arch gfx942-rocm720
+
+      - name: Accuracy Test ROCm 7.2 (8-GPU Grok2)
+        timeout-minutes: 60
+        run: |
+          > github_summary.md  # Clear summary file
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e RCCL_MSCCL_ENABLE=0 \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-grok2 --nightly --timeout-per-file 3600 || TEST_EXIT_CODE=$?
+          echo "$(<github_summary.md )" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+      - name: Performance Test ROCm 7.2 (8-GPU Grok2)
+        continue-on-error: true  # a failing perf run does not fail the job
+        timeout-minutes: 60
+        run: |
+          > github_summary.md  # Clear summary file
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e RCCL_MSCCL_ENABLE=0 \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-grok2 --nightly --timeout-per-file 3600 || TEST_EXIT_CODE=$?
+          echo "$(<github_summary.md )" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+  # MI30x 8-GPU DeepSeek-V3.1 on ROCm 7.2: accuracy suite, then a non-blocking performance suite
+  nightly-8-gpu-deepseek-v31-rocm720:
+    runs-on: linux-mi325-gpu-8
+    if: inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-deepseek-v31-rocm720'
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.ref || github.ref }}  # requested ref, else the triggering ref
+
+      - name: Setup docker (ROCm 7.2)
+        timeout-minutes: 120
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+        run: |
+          touch github_summary.md
+          bash scripts/ci/amd/amd_ci_start_container.sh --custom-image rocm/sgl-dev:v0.5.8-rocm720-mi30x-20260129-preview --gpu-arch gfx942-rocm720
+
+      - name: Accuracy Test ROCm 7.2 (8-GPU DeepSeek-V3.1)
+        timeout-minutes: 120
+        run: |
+          > github_summary.md  # Clear summary file
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e SGLANG_USE_AITER=1 \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-deepseek-v31 --nightly --timeout-per-file 3600 || TEST_EXIT_CODE=$?
+          echo "$(<github_summary.md )" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+      - name: Performance Test ROCm 7.2 (8-GPU DeepSeek-V3.1)  # NOTE(review): passes SGLANG_USE_ROCM700A=1 in a ROCm 7.2 job - confirm intended
+        continue-on-error: true  # a failing perf run does not fail the job
+        timeout-minutes: 300
+        run: |
+          > github_summary.md  # Clear summary file
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e SGLANG_USE_ROCM700A=1 \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-deepseek-v31 --nightly --timeout-per-file 18000 || TEST_EXIT_CODE=$?
+          echo "$(<github_summary.md )" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+  # MI30x 8-GPU DeepSeek-V3.2 basic on ROCm 7.2: accuracy suite, then a non-blocking performance suite
+  nightly-8-gpu-deepseek-v32-rocm720:
+    if: inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-deepseek-v32-rocm720'  # filter value must equal this job's own id
+    runs-on: linux-mi325-gpu-8
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.ref || github.ref }}  # requested ref, else the triggering ref
+
+      - name: Setup docker (ROCm 7.2)
+        timeout-minutes: 120
+        run: |
+          touch github_summary.md
+          bash scripts/ci/amd/amd_ci_start_container.sh --custom-image rocm/sgl-dev:v0.5.8-rocm720-mi30x-20260129-preview --gpu-arch gfx942-rocm720
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+
+      - name: Accuracy Test ROCm 7.2 (8-GPU DeepSeek-V3.2 Basic)
+        timeout-minutes: 120
+        run: |
+          > github_summary.md  # Clear summary file
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-deepseek-v32 --nightly --timeout-per-file 3600 || TEST_EXIT_CODE=$?
+          echo "$(<github_summary.md )" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+      - name: Performance Test ROCm 7.2 (8-GPU DeepSeek-V3.2 Basic)
+        timeout-minutes: 150
+        continue-on-error: true  # Perf test failure doesn't fail the job if accuracy passed
+        run: |
+          > github_summary.md  # Clear summary file
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-deepseek-v32-basic --nightly --timeout-per-file 5400 || TEST_EXIT_CODE=$?
+          echo "$(<github_summary.md )" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+  # MI30x 8-GPU DeepSeek-V3.2 MTP on ROCm 7.2: MTP accuracy suite, then a non-blocking performance suite
+  nightly-8-gpu-deepseek-v32-mtp-rocm720:
+    if: inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-deepseek-v32-mtp-rocm720'  # filter value must equal this job's own id
+    runs-on: linux-mi325-gpu-8
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.ref || github.ref }}  # requested ref, else the triggering ref
+
+      - name: Setup docker ROCm 7.2  # NOTE(review): no timeout-minutes on this step; sibling jobs cap setup at 120 - confirm
+        run: |
+          touch github_summary.md
+          bash scripts/ci/amd/amd_ci_start_container.sh --custom-image rocm/sgl-dev:v0.5.8-rocm720-mi30x-20260129-preview --gpu-arch gfx942-rocm720
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+
+      - name: Accuracy Test ROCm 7.2 (8-GPU DeepSeek-V3.2 MTP)
+        timeout-minutes: 120
+        run: |
+          > github_summary.md  # Clear summary file
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-deepseek-v32-mtp --nightly --timeout-per-file 3600 || TEST_EXIT_CODE=$?
+          echo "$(<github_summary.md )" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+      - name: Performance Test ROCm 7.2 (8-GPU DeepSeek-V3.2 MTP)
+        timeout-minutes: 180
+        continue-on-error: true  # Perf test failure doesn't fail the job if accuracy passed
+        run: |
+          > github_summary.md  # Clear summary file
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-deepseek-v32-mtp --nightly --timeout-per-file 7200 || TEST_EXIT_CODE=$?
+          echo "$(<github_summary.md )" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+  # MI30x 8-GPU Kimi-K2 (Accuracy + Speed) on ROCm 7.2 - a single step running the kimi-k2 accuracy suite
+  nightly-8-gpu-kimi-k2-rocm720:
+    if: inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-kimi-k2-rocm720'  # filter value must equal this job's own id
+    runs-on: linux-mi325-gpu-8
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.ref || github.ref }}  # requested ref, else the triggering ref
+
+      - name: Setup docker ROCm 7.2  # NOTE(review): no timeout-minutes on this step; sibling jobs cap setup at 120 - confirm
+        run: |
+          touch github_summary.md
+          bash scripts/ci/amd/amd_ci_start_container.sh --custom-image rocm/sgl-dev:v0.5.8-rocm720-mi30x-20260129-preview --gpu-arch gfx942-rocm720
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+
+      - name: Accuracy Test ROCm 7.2 (8-GPU Kimi-K2)
+        timeout-minutes: 120
+        run: |
+          > github_summary.md  # Clear summary file
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-kimi-k2 --nightly --timeout-per-file 3600 || TEST_EXIT_CODE=$?
+          echo "$(<github_summary.md )" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+  # ============================================== MI35x ROCm 7.2 Tests ==============================================
+  # MI35x 1-GPU suite on ROCm 7.2 - passes the rocm720 mi35x preview image and gfx950-rocm720 to the start script
+  nightly-test-1-gpu-mi35x-rocm720:
+    runs-on: linux-mi35x-gpu-1
+    if: inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-test-1-gpu-mi35x-rocm720'
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.ref || github.ref }}  # requested ref, else the triggering ref
+
+      - name: Setup docker (ROCm 7.2)
+        timeout-minutes: 120
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+        run: |
+          touch github_summary.md
+          bash scripts/ci/amd/amd_ci_start_container.sh --custom-image rocm/sgl-dev:v0.5.8-rocm720-mi35x-20260129-preview --gpu-arch gfx950-rocm720
+
+      - name: Test MI35x ROCm 7.2 (1-GPU)
+        timeout-minutes: 60  # step cap; each test file is separately limited to 600 s by run_suite.py
+        run: |
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-amd-1-gpu-mi35x --nightly --timeout-per-file 600 --continue-on-error || TEST_EXIT_CODE=$?
+          echo "$(<github_summary.md )" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+  # MI35x 8-GPU accuracy-only job on ROCm 7.2 (GPT-OSS; suite nightly-amd-8-gpu-mi35x)
+  nightly-accuracy-8-gpu-mi35x-rocm720:
+    runs-on: linux-mi35x-gpu-8
+    if: inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-accuracy-8-gpu-mi35x-rocm720'
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.ref || github.ref }}  # requested ref, else the triggering ref
+
+      - name: Setup docker (ROCm 7.2)
+        timeout-minutes: 120
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+        run: |
+          touch github_summary.md
+          bash scripts/ci/amd/amd_ci_start_container.sh --custom-image rocm/sgl-dev:v0.5.8-rocm720-mi35x-20260129-preview --gpu-arch gfx950-rocm720
+
+      - name: Accuracy Test MI35x ROCm 7.2 (8-GPU GPT-OSS)
+        timeout-minutes: 180  # step cap; each test file is separately limited to 7200 s by run_suite.py
+        run: |
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-amd-8-gpu-mi35x --nightly --timeout-per-file 7200 || TEST_EXIT_CODE=$?
+          echo "$(<github_summary.md )" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+  # MI35x 8-GPU Grok1-INT4 on ROCm 7.2: accuracy suite, then a non-blocking performance suite
+  nightly-8-gpu-mi35x-grok1-int4-rocm720:
+    runs-on: linux-mi35x-gpu-8
+    if: inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-mi35x-grok1-int4-rocm720'
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.ref || github.ref }}  # requested ref, else the triggering ref
+
+      - name: Setup docker (ROCm 7.2)
+        timeout-minutes: 120
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+        run: |
+          touch github_summary.md
+          bash scripts/ci/amd/amd_ci_start_container.sh --custom-image rocm/sgl-dev:v0.5.8-rocm720-mi35x-20260129-preview --gpu-arch gfx950-rocm720
+
+      - name: Accuracy Test MI35x ROCm 7.2 (8-GPU Grok1-INT4)
+        timeout-minutes: 60
+        run: |
+          > github_summary.md  # Clear summary file
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e RCCL_MSCCL_ENABLE=0 \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-mi35x-grok1-int4 --nightly --timeout-per-file 3600 || TEST_EXIT_CODE=$?
+          echo "$(<github_summary.md )" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+      - name: Performance Test MI35x ROCm 7.2 (8-GPU Grok1-INT4)
+        continue-on-error: true  # a failing perf run does not fail the job
+        timeout-minutes: 60
+        run: |
+          > github_summary.md  # Clear summary file
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e RCCL_MSCCL_ENABLE=0 \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-mi35x-grok1-int4 --nightly --timeout-per-file 3600 || TEST_EXIT_CODE=$?
+          echo "$(<github_summary.md )" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+  # MI35x 8-GPU Grok2 on ROCm 7.2: accuracy suite, then a non-blocking performance suite
+  nightly-8-gpu-mi35x-grok2-rocm720:
+    runs-on: linux-mi35x-gpu-8
+    if: inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-mi35x-grok2-rocm720'
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.ref || github.ref }}  # requested ref, else the triggering ref
+
+      - name: Setup docker (ROCm 7.2)
+        timeout-minutes: 120
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+        run: |
+          touch github_summary.md
+          bash scripts/ci/amd/amd_ci_start_container.sh --custom-image rocm/sgl-dev:v0.5.8-rocm720-mi35x-20260129-preview --gpu-arch gfx950-rocm720
+
+      - name: Accuracy Test MI35x ROCm 7.2 (8-GPU Grok2)
+        timeout-minutes: 60
+        run: |
+          > github_summary.md  # Clear summary file
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e RCCL_MSCCL_ENABLE=0 \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-mi35x-grok2 --nightly --timeout-per-file 3600 || TEST_EXIT_CODE=$?
+          echo "$(<github_summary.md )" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+      - name: Performance Test MI35x ROCm 7.2 (8-GPU Grok2)
+        continue-on-error: true  # a failing perf run does not fail the job
+        timeout-minutes: 60
+        run: |
+          > github_summary.md  # Clear summary file
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e RCCL_MSCCL_ENABLE=0 \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-mi35x-grok2 --nightly --timeout-per-file 3600 || TEST_EXIT_CODE=$?
+          echo "$(<github_summary.md )" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+  # MI35x 8-GPU DeepSeek-R1-MXFP4 on ROCm 7.2: accuracy suite, then a non-blocking perf script run directly
+  nightly-8-gpu-mi35x-deepseek-r1-mxfp4-rocm720:
+    runs-on: linux-mi35x-gpu-8
+    if: inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-mi35x-deepseek-r1-mxfp4-rocm720'
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.ref || github.ref }}  # requested ref, else the triggering ref
+
+      - name: Setup docker (ROCm 7.2)
+        timeout-minutes: 120
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+        run: |
+          touch github_summary.md
+          bash scripts/ci/amd/amd_ci_start_container.sh --custom-image rocm/sgl-dev:v0.5.8-rocm720-mi35x-20260129-preview --gpu-arch gfx950-rocm720
+
+      - name: Accuracy Test MI35x ROCm 7.2 (8-GPU DeepSeek-R1-MXFP4)
+        timeout-minutes: 180
+        run: |
+          > github_summary.md  # Clear summary file
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-amd-8-gpu-mi35x-deepseek-r1-mxfp4 --nightly --timeout-per-file 7200 || TEST_EXIT_CODE=$?
+          echo "$(<github_summary.md )" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+      - name: Performance Test MI35x ROCm 7.2 (8-GPU DeepSeek-R1-MXFP4)  # invokes the test file itself rather than run_suite.py
+        continue-on-error: true  # a failing perf run does not fail the job
+        timeout-minutes: 300
+        run: |
+          > github_summary.md  # Clear summary file
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 registered/amd/perf/mi35x/test_deepseek_r1_mxfp4_perf_mi35x.py || TEST_EXIT_CODE=$?
+          echo "$(<github_summary.md )" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+  # MI35x 8-GPU DeepSeek-V3.2 accuracy job on ROCm 7.2 (suite nightly-amd-8-gpu-mi35x-deepseek-v32)
+  nightly-accuracy-8-gpu-mi35x-deepseek-v32-rocm720:
+    runs-on: linux-mi35x-gpu-8
+    if: inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-accuracy-8-gpu-mi35x-deepseek-v32-rocm720'
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.ref || github.ref }}  # requested ref, else the triggering ref
+
+      - name: Setup docker (ROCm 7.2)
+        timeout-minutes: 120
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+        run: |
+          touch github_summary.md
+          bash scripts/ci/amd/amd_ci_start_container.sh --custom-image rocm/sgl-dev:v0.5.8-rocm720-mi35x-20260129-preview --gpu-arch gfx950-rocm720
+
+      - name: Accuracy Test MI35x ROCm 7.2 (8-GPU DeepSeek-V3.2)
+        timeout-minutes: 120  # step cap; each test file is separately limited to 3600 s by run_suite.py
+        run: |
+          > github_summary.md  # Clear summary file
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-amd-8-gpu-mi35x-deepseek-v32 --nightly --timeout-per-file 3600 || TEST_EXIT_CODE=$?
+          echo "$(<github_summary.md )" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+  # MI35x 8-GPU DeepSeek-V3.2 TP+MTP accuracy job on ROCm 7.2 (mi35x image, gfx950-rocm720)
+  nightly-accuracy-8-gpu-mi35x-deepseek-v32-mtp-rocm720:
+    if: inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-accuracy-8-gpu-mi35x-deepseek-v32-mtp-rocm720'  # filter value must equal this job's own id
+    runs-on: linux-mi35x-gpu-8
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.ref || github.ref }}  # requested ref, else the triggering ref
+
+      - name: Setup docker (ROCm 7.2)  # NOTE(review): no timeout-minutes on this step; sibling jobs cap setup at 120 - confirm
+        run: |
+          touch github_summary.md
+          bash scripts/ci/amd/amd_ci_start_container.sh --custom-image rocm/sgl-dev:v0.5.8-rocm720-mi35x-20260129-preview --gpu-arch gfx950-rocm720
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+
+      - name: Accuracy Test ROCm 7.2 MI35x (8-GPU DeepSeek-V3.2 TP+MTP)
+        timeout-minutes: 120
+        run: |
+          > github_summary.md  # Clear summary file
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-mi35x-deepseek-v32-mtp --nightly --timeout-per-file 3600 || TEST_EXIT_CODE=$?
+          echo "$(<github_summary.md )" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+  # Basic performance run for DeepSeek-V3.2 on an 8-GPU MI35x runner, using the ROCm 7.2 gfx950 image.
+  # Selected when job_filter is empty, 'all', or equal to this job's name.
+  nightly-perf-8-gpu-mi35x-deepseek-v32-basic-rocm720:
+    runs-on: linux-mi35x-gpu-8
+    if: inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-perf-8-gpu-mi35x-deepseek-v32-basic-rocm720'
+    steps:
+      - uses: actions/checkout@v4
+        name: Checkout code
+        with:
+          ref: ${{ inputs.ref || github.ref }}
+
+      - name: Setup docker (ROCm 7.2)
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+        timeout-minutes: 120
+        run: |
+          touch github_summary.md
+          bash scripts/ci/amd/amd_ci_start_container.sh --custom-image rocm/sgl-dev:v0.5.8-rocm720-mi35x-20260129-preview --gpu-arch gfx950-rocm720
+
+      - name: Performance Test MI35x ROCm 7.2 (8-GPU DeepSeek-V3.2 Basic)
+        timeout-minutes: 150
+        run: |
+          # Truncate the summary file before the suite writes to it.
+          : > github_summary.md
+          # A failing suite only records its exit code, so the summary is still published below.
+          bash scripts/ci/amd/amd_ci_exec.sh \
+            -w /sglang-checkout/test \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-mi35x-deepseek-v32-basic --nightly --timeout-per-file 5400 \
+            || TEST_EXIT_CODE=$?
+          echo "$(<github_summary.md )" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+  # MTP performance run for DeepSeek-V3.2 on an 8-GPU MI35x runner, using the ROCm 7.2 gfx950 image.
+  # Selected when job_filter is empty, 'all', or equal to this job's name.
+  nightly-perf-8-gpu-mi35x-deepseek-v32-mtp-rocm720:
+    runs-on: linux-mi35x-gpu-8
+    if: inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-perf-8-gpu-mi35x-deepseek-v32-mtp-rocm720'
+    steps:
+      - uses: actions/checkout@v4
+        name: Checkout code
+        with:
+          ref: ${{ inputs.ref || github.ref }}
+
+      - name: Setup docker (ROCm 7.2)
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+        timeout-minutes: 120
+        run: |
+          touch github_summary.md
+          bash scripts/ci/amd/amd_ci_start_container.sh --custom-image rocm/sgl-dev:v0.5.8-rocm720-mi35x-20260129-preview --gpu-arch gfx950-rocm720
+
+      - name: Performance Test MI35x ROCm 7.2 (8-GPU DeepSeek-V3.2 MTP)
+        timeout-minutes: 150
+        run: |
+          # Truncate the summary file before the suite writes to it.
+          : > github_summary.md
+          # A failing suite only records its exit code, so the summary is still published below.
+          bash scripts/ci/amd/amd_ci_exec.sh \
+            -w /sglang-checkout/test \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-mi35x-deepseek-v32-mtp --nightly --timeout-per-file 5400 \
+            || TEST_EXIT_CODE=$?
+          echo "$(<github_summary.md )" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+  # Aggregation gate: runs regardless of upstream outcome and fails when any
+  # listed job reported 'failure' or 'cancelled'.
+  check-all-jobs:
+    runs-on: ubuntu-latest
+    if: always()
+    needs:
+      # MI30x jobs
+      - nightly-test-1-gpu-unit-rocm720
+      - nightly-accuracy-2-gpu-rocm720
+      - nightly-accuracy-2-gpu-vlm-rocm720
+      - nightly-perf-2-gpu-text-rocm720
+      - nightly-perf-2-gpu-vlm-rocm720
+      - nightly-accuracy-8-gpu-rocm720
+      - nightly-8-gpu-grok1-int4-rocm720
+      - nightly-8-gpu-grok2-rocm720
+      - nightly-8-gpu-deepseek-v31-rocm720
+      - nightly-8-gpu-deepseek-v32-rocm720
+      - nightly-8-gpu-deepseek-v32-mtp-rocm720
+      - nightly-8-gpu-kimi-k2-rocm720
+      # MI35x jobs
+      - nightly-test-1-gpu-mi35x-rocm720
+      - nightly-accuracy-8-gpu-mi35x-rocm720
+      - nightly-8-gpu-mi35x-grok1-int4-rocm720
+      - nightly-8-gpu-mi35x-grok2-rocm720
+      - nightly-8-gpu-mi35x-deepseek-r1-mxfp4-rocm720
+      - nightly-accuracy-8-gpu-mi35x-deepseek-v32-rocm720
+      - nightly-accuracy-8-gpu-mi35x-deepseek-v32-mtp-rocm720
+      - nightly-perf-8-gpu-mi35x-deepseek-v32-basic-rocm720
+      - nightly-perf-8-gpu-mi35x-deepseek-v32-mtp-rocm720
+    steps:
+      - name: Check if any job failed
+        run: |
+          # Both expressions are rendered to 'true'/'false' before the script runs.
+          any_failed="${{ contains(needs.*.result, 'failure') }}"
+          any_cancelled="${{ contains(needs.*.result, 'cancelled') }}"
+          if [[ "$any_failed" == "true" ]]; then
+            echo "One or more ROCm 7.2 test jobs failed"
+            exit 1
+          elif [[ "$any_cancelled" == "true" ]]; then
+            echo "One or more ROCm 7.2 test jobs were cancelled"
+            exit 1
+          fi
+          echo "All ROCm 7.2 test jobs passed"
diff --git a/.github/workflows/pr-test-amd-rocm720.yml b/.github/workflows/pr-test-amd-rocm720.yml
new file mode 100644
index 000000000000..5ca23d7dc861
--- /dev/null
+++ b/.github/workflows/pr-test-amd-rocm720.yml
@@ -0,0 +1,944 @@
+name: PR Test ROCm 7.2 (AMD)
+# Dynamic run-name for /rerun-stage commands to enable URL lookup
+# Format: "[stage-name] sha" for fork PRs, "[stage-name]" for non-fork, default for normal runs
+run-name: ${{ inputs.target_stage && (inputs.pr_head_sha && format('[{0}] {1}', inputs.target_stage, inputs.pr_head_sha) || format('[{0}]', inputs.target_stage)) || '' }}
+
+# No events are listed here; every trigger and input definition below is commented out.
+# NOTE(review): the commented-out path filters name pr-test-amd.yml and docker/rocm.Dockerfile
+# rather than ROCm 7.2-specific files - confirm the intended paths before re-enabling them.
+on: []
+#  push:
+#    branches: [ main ]
+#    paths:
+#      - "python/**"
+#      - "scripts/ci/**"
+#      - "test/**"
+#      - "sgl-kernel/**"
+#      - ".github/workflows/pr-test-amd.yml"
+#      - "docker/rocm.Dockerfile"
+#  pull_request:
+#    branches: [ main ]
+#    paths:
+#      - "python/**"
+#      - "scripts/ci/**"
+#      - "test/**"
+#      - "sgl-kernel/**"
+#      - ".github/workflows/pr-test-amd.yml"
+#      - "docker/rocm.Dockerfile"
+#  workflow_dispatch:
+#    inputs:
+#      target_stage:
+#        description: "Specific stage to run (optional, for quick testing)"
+#        required: false
+#        type: string
+#        default: ""
+#      pr_head_sha:
+#        description: "PR head SHA to checkout (for /rerun-stage on fork PRs)"
+#        required: false
+#        type: string
+#        default: ""
+#  workflow_call:
+#    inputs:
+#      ref:
+#        description: 'Git ref (branch, tag, or SHA) to test. If not provided, uses the default branch.'
+#        required: false
+#        type: string
+#        default: ''
+#      run_all_tests:
+#        description: "Run all tests (for releasing or testing purpose)"
+#        required: false
+#        type: boolean
+#        default: false
+
+concurrency:
+  # Concurrency groups are shared across the whole repository, so the group carries a
+  # ROCm 7.2-specific prefix. With a bare "pr-test-amd-" prefix, runs of any other workflow
+  # using that group (presumably pr-test-amd.yml - verify) and this one could cancel each other.
+  # pr_head_sha is included so /rerun-stage dispatches do not collide with main branch runs.
+  group: pr-test-amd-rocm720-${{ inputs.pr_head_sha || inputs.ref || github.ref }}
+  cancel-in-progress: ${{ github.event_name != 'workflow_call' }}
+
+jobs:
+  # Gate job: delegates to the reusable pr-gate workflow and passes the caller's secrets through.
+  call-gate:
+    secrets: inherit
+    uses: ./.github/workflows/pr-gate.yml
+  # Decides which test groups run: everything when run_all_tests is true,
+  # otherwise only the groups whose path filters match the change set.
+  check-changes:
+    needs: [call-gate]
+    runs-on: ubuntu-latest
+    outputs:
+      # The filter step is skipped when run_all_tests is 'true', so each output
+      # falls back to the run-mode value in that case.
+      main_package: ${{ steps.filter.outputs.main_package || steps.run-mode.outputs.run_all_tests }}
+      sgl_kernel: ${{ steps.filter.outputs.sgl_kernel || steps.run-mode.outputs.run_all_tests }}
+      multimodal_gen: ${{ steps.filter.outputs.multimodal_gen || steps.run-mode.outputs.run_all_tests }}
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}
+
+      - name: Determine run mode
+        id: run-mode
+        run: |
+          # Run all tests only when the run_all_tests input is true; every other
+          # invocation falls through to the path filters in the next step.
+          if [[ "${{ inputs.run_all_tests }}" == "true" ]]; then
+            echo "run_all_tests=true" >> $GITHUB_OUTPUT
+            echo "Run mode: ALL TESTS (run_all_tests=${{ inputs.run_all_tests }})"
+          else
+            echo "run_all_tests=false" >> $GITHUB_OUTPUT
+            echo "Run mode: FILTERED (triggered by ${{ github.event_name }})"
+          fi
+
+      - name: Detect file changes
+        id: filter
+        uses: dorny/paths-filter@v3
+        if: steps.run-mode.outputs.run_all_tests != 'true'
+        with:
+          # This workflow's own file is listed next to pr-test-amd.yml so that edits
+          # to the ROCm 7.2 workflow also select its test groups.
+          filters: |
+            main_package:
+              - "python/sglang/!(multimodal_gen)/**"
+              - "python/pyproject_rocm.toml"
+              - "python/pyproject_other.toml"
+              - "scripts/ci/amd/*"
+              - "scripts/ci/utils/*"
+              - "test/**"
+              - ".github/workflows/pr-test-amd.yml"
+              - ".github/workflows/pr-test-amd-rocm720.yml"
+            sgl_kernel:
+              - "sgl-kernel/**"
+              - ".github/workflows/pr-test-amd.yml"
+              - ".github/workflows/pr-test-amd-rocm720.yml"
+            multimodal_gen:
+              - "python/sglang/multimodal_gen/**"
+              - "python/sglang/cli/**"
+              - "python/pyproject_rocm.toml"
+              - "python/pyproject_other.toml"
+
+  # =============================================== sgl-kernel ====================================================
+  # Single-GPU sgl-kernel unit tests, run file by file inside the ci_sglang container.
+  sgl-kernel-unit-test-amd:
+    needs:
+      - check-changes
+    # Runs when explicitly targeted, or - with no target stage - when sgl-kernel files changed.
+    if: |
+      always() && (
+        inputs.target_stage == 'sgl-kernel-unit-test-amd' ||
+        (!inputs.target_stage && needs.check-changes.outputs.sgl_kernel == 'true')
+      )
+    strategy:
+      fail-fast: false
+      matrix:
+        runner:
+          - linux-mi325-gpu-1
+    runs-on: ${{ matrix.runner }}
+    steps:
+      - uses: actions/checkout@v4
+        name: Checkout code
+        with:
+          ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}
+
+      - name: Ensure VRAM is clear
+        run: bash scripts/ensure_vram_clear.sh rocm
+
+      - name: Start CI container
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+        run: bash scripts/ci/amd/amd_ci_start_container.sh --custom-image rocm/sgl-dev:v0.5.8-rocm720-mi30x-20260129-preview --gpu-arch gfx942-rocm720
+
+      - name: Run test
+        timeout-minutes: 14
+        run: |
+          # Run one pytest file in the container: $1 is the directory below sgl-kernel, $2 the test file.
+          kernel_pytest() {
+            docker exec -w "/sglang-checkout/sgl-kernel/$1" ci_sglang python3 -m pytest "$2"
+          }
+          kernel_pytest tests test_moe_align.py
+          kernel_pytest tests test_moe_topk_softmax.py
+          kernel_pytest tests/speculative test_eagle_utils.py
+          kernel_pytest tests test_apply_token_bitmask_inplace.py
+          kernel_pytest tests test_activation.py
+          kernel_pytest tests test_topk.py
+          kernel_pytest tests test_kvcacheio.py
+          kernel_pytest tests/sgl_diffusion test_timestep_embedding.py
+          kernel_pytest tests test_moe_topk_sigmoid.py
+          kernel_pytest tests test_torch_defaults_reset.py
+
+  # Two-GPU sgl-kernel unit tests (allreduce determinism), run inside the ci_sglang container.
+  sgl-kernel-unit-test-2-gpu-amd:
+    needs:
+      - check-changes
+    # Runs when explicitly targeted, or - with no target stage - when sgl-kernel files changed.
+    if: |
+      always() && (
+        inputs.target_stage == 'sgl-kernel-unit-test-2-gpu-amd' ||
+        (!inputs.target_stage && needs.check-changes.outputs.sgl_kernel == 'true')
+      )
+    strategy:
+      fail-fast: false
+      matrix:
+        runner:
+          - linux-mi325-gpu-2
+    runs-on: ${{ matrix.runner }}
+    steps:
+      - uses: actions/checkout@v4
+        name: Checkout code
+        with:
+          ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}
+
+      - name: Ensure VRAM is clear
+        run: bash scripts/ensure_vram_clear.sh rocm
+
+      - name: Start CI container
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+        run: bash scripts/ci/amd/amd_ci_start_container.sh --custom-image rocm/sgl-dev:v0.5.8-rocm720-mi30x-20260129-preview --gpu-arch gfx942-rocm720
+
+      - name: Run test
+        timeout-minutes: 20
+        run: |
+          # Run one pytest file from sgl-kernel/tests in the container; $1 is the test file.
+          kernel_pytest() {
+            docker exec -w /sglang-checkout/sgl-kernel/tests ci_sglang python3 -m pytest "$1"
+          }
+          kernel_pytest test_amd_deterministic_custom_allreduce.py
+          kernel_pytest test_amd_nccl_allreduce_determinism.py
+
+  # =============================================== primary ====================================================
+
+  # Stage A: first suite of the primary pipeline, on a single MI325 GPU.
+  stage-a-test-1-amd:
+    needs:
+      - check-changes
+    # Runs when explicitly targeted, or - with no target stage and no upstream
+    # failure/cancellation - when main package or sgl-kernel files changed.
+    if: |
+      always() && (
+        inputs.target_stage == 'stage-a-test-1-amd' ||
+        (
+          !inputs.target_stage && !failure() && !cancelled() &&
+          (needs.check-changes.outputs.main_package == 'true' || needs.check-changes.outputs.sgl_kernel == 'true')
+        )
+      )
+    strategy:
+      fail-fast: false
+      matrix:
+        runner:
+          - linux-mi325-gpu-1
+    runs-on: ${{ matrix.runner }}
+    steps:
+      - uses: actions/checkout@v4
+        name: Checkout code
+        with:
+          ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}
+
+      - name: Ensure VRAM is clear
+        run: bash scripts/ensure_vram_clear.sh rocm
+
+      - name: Start CI container
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+        run: bash scripts/ci/amd/amd_ci_start_container.sh --custom-image rocm/sgl-dev:v0.5.8-rocm720-mi30x-20260129-preview --gpu-arch gfx942-rocm720
+
+      - name: Run test
+        timeout-minutes: 10
+        run: |
+          bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" \
+            python3 run_suite.py --hw amd --suite stage-a-test-1-amd
+
+  # Stage B (small, 1 GPU): the suite is split into 13 partitions, one matrix entry each.
+  stage-b-test-small-1-gpu-amd:
+    needs:
+      - check-changes
+      - stage-a-test-1-amd
+    # Runs when explicitly targeted, or - with no target stage and no upstream
+    # failure/cancellation - when main package or sgl-kernel files changed.
+    if: |
+      always() && (
+        inputs.target_stage == 'stage-b-test-small-1-gpu-amd' ||
+        (
+          !inputs.target_stage && !failure() && !cancelled() &&
+          (needs.check-changes.outputs.main_package == 'true' || needs.check-changes.outputs.sgl_kernel == 'true')
+        )
+      )
+    strategy:
+      fail-fast: false
+      matrix:
+        runner:
+          - linux-mi325-gpu-1
+        part: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
+    runs-on: ${{ matrix.runner }}
+    steps:
+      - uses: actions/checkout@v4
+        name: Checkout code
+        with:
+          ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}
+
+      - name: Ensure VRAM is clear
+        run: bash scripts/ensure_vram_clear.sh rocm
+
+      - name: Start CI container
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+        run: bash scripts/ci/amd/amd_ci_start_container.sh --custom-image rocm/sgl-dev:v0.5.8-rocm720-mi30x-20260129-preview --gpu-arch gfx942-rocm720
+
+      - name: Install dependencies
+        run: |
+          # Reinstall the triton wheel found at the container root.
+          docker exec -w / ci_sglang pip install --force-reinstall /triton-3.5.1+rocm7.2.0.gita272dfa8-cp310-cp310-linux_x86_64.whl
+          # A failed clone is tolerated (|| true); the editable install that follows is not.
+          docker exec -w / ci_sglang git clone --branch v0.4.1 --depth 1 https://github.com/EvolvingLMMs-Lab/lmms-eval.git || true
+          docker exec -w /lmms-eval ci_sglang pip install --cache-dir=/sgl-data/pip-cache -e .
+          docker exec -w / ci_sglang git clone --depth 1 https://github.com/merrymercy/human-eval.git || true
+          docker exec -w /human-eval ci_sglang pip install --cache-dir=/sgl-data/pip-cache -e .
+
+      - name: Run test
+        timeout-minutes: 30
+        run: |
+          bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" \
+            python3 run_suite.py --hw amd --suite stage-b-test-small-1-gpu-amd \
+            --auto-partition-id ${{ matrix.part }} --auto-partition-size 13 --timeout-per-file 1800
+
+  # Stage B (small, 1 GPU) on an MI35x runner, using the gfx950 ROCm 7.2 image.
+  stage-b-test-small-1-gpu-amd-mi35x:
+    needs:
+      - check-changes
+      - stage-a-test-1-amd
+    # Runs when explicitly targeted, or - with no target stage and no upstream
+    # failure/cancellation - when main package or sgl-kernel files changed.
+    if: |
+      always() && (
+        inputs.target_stage == 'stage-b-test-small-1-gpu-amd-mi35x' ||
+        (
+          !inputs.target_stage && !failure() && !cancelled() &&
+          (needs.check-changes.outputs.main_package == 'true' || needs.check-changes.outputs.sgl_kernel == 'true')
+        )
+      )
+    strategy:
+      fail-fast: false
+      matrix:
+        runner:
+          - linux-mi35x-gpu-1
+    runs-on: ${{ matrix.runner }}
+    steps:
+      - uses: actions/checkout@v4
+        name: Checkout code
+        with:
+          ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}
+
+      - name: Ensure VRAM is clear
+        run: bash scripts/ensure_vram_clear.sh rocm
+
+      - name: Start CI container
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+        run: bash scripts/ci/amd/amd_ci_start_container.sh --custom-image rocm/sgl-dev:v0.5.8-rocm720-mi35x-20260129-preview --gpu-arch gfx950-rocm720
+
+      - name: Install dependencies
+        run: |
+          # Reinstall the triton wheel found at the container root.
+          docker exec -w / ci_sglang pip install --force-reinstall /triton-3.5.1+rocm7.2.0.gita272dfa8-cp310-cp310-linux_x86_64.whl
+          # A failed clone is tolerated (|| true); the editable install that follows is not.
+          docker exec -w / ci_sglang git clone --branch v0.4.1 --depth 1 https://github.com/EvolvingLMMs-Lab/lmms-eval.git || true
+          docker exec -w /lmms-eval ci_sglang pip install --cache-dir=/sgl-data/pip-cache -e .
+          docker exec -w / ci_sglang git clone --depth 1 https://github.com/merrymercy/human-eval.git || true
+          docker exec -w /human-eval ci_sglang pip install --cache-dir=/sgl-data/pip-cache -e .
+
+      - name: Run test
+        timeout-minutes: 30
+        run: |
+          bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" \
+            python3 run_suite.py --hw amd --suite stage-b-test-small-1-gpu-amd-mi35x
+
+  # Stage B (large, 2 GPUs) on an MI325 runner.
+  stage-b-test-large-2-gpu-amd:
+    needs:
+      - check-changes
+      - stage-a-test-1-amd
+    # Runs when explicitly targeted, or - with no target stage and no upstream
+    # failure/cancellation - when main package or sgl-kernel files changed.
+    if: |
+      always() && (
+        inputs.target_stage == 'stage-b-test-large-2-gpu-amd' ||
+        (
+          !inputs.target_stage && !failure() && !cancelled() &&
+          (needs.check-changes.outputs.main_package == 'true' || needs.check-changes.outputs.sgl_kernel == 'true')
+        )
+      )
+    strategy:
+      fail-fast: false
+      matrix:
+        runner:
+          - linux-mi325-gpu-2
+    runs-on: ${{ matrix.runner }}
+    steps:
+      - uses: actions/checkout@v4
+        name: Checkout code
+        with:
+          ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}
+
+      - name: Ensure VRAM is clear
+        run: bash scripts/ensure_vram_clear.sh rocm
+
+      - name: Start CI container
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+        run: bash scripts/ci/amd/amd_ci_start_container.sh --custom-image rocm/sgl-dev:v0.5.8-rocm720-mi30x-20260129-preview --gpu-arch gfx942-rocm720
+
+      - name: Install dependencies
+        run: |
+          # Reinstall the triton wheel found at the container root.
+          docker exec -w / ci_sglang pip install --force-reinstall /triton-3.5.1+rocm7.2.0.gita272dfa8-cp310-cp310-linux_x86_64.whl
+          # A failed clone is tolerated (|| true); the editable install that follows is not.
+          docker exec -w / ci_sglang git clone --branch v0.4.1 --depth 1 https://github.com/EvolvingLMMs-Lab/lmms-eval.git || true
+          docker exec -w /lmms-eval ci_sglang pip install --cache-dir=/sgl-data/pip-cache -e .
+          docker exec -w / ci_sglang git clone --depth 1 https://github.com/merrymercy/human-eval.git || true
+          docker exec -w /human-eval ci_sglang pip install --cache-dir=/sgl-data/pip-cache -e .
+
+      - name: Run test
+        timeout-minutes: 30
+        run: |
+          bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" \
+            python3 run_suite.py --hw amd --suite stage-b-test-large-2-gpu-amd
+
+  # 1-GPU diffusion (multimodal_gen) server tests, split into two partitions that run
+  # one at a time. Runs only when check-changes reports multimodal_gen == 'true'.
+  multimodal-gen-test-1-gpu-amd:
+    needs: [check-changes]
+    if: needs.check-changes.outputs.multimodal_gen == 'true'
+    strategy:
+      fail-fast: false
+      max-parallel: 1  # Run one at a time to avoid eviction from resource exhaustion during AITER kernel JIT
+      matrix:
+        runner: [linux-mi325-gpu-1]
+        part: [0, 1]  # 2 partitions: 11 tests ÷ 2 = ~5-6 tests each
+    runs-on: ${{matrix.runner}}
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}
+
+      - name: Ensure VRAM is clear
+        run: bash scripts/ensure_vram_clear.sh rocm
+
+      # NOTE(review): the pattern names a CUDA 12.9 wheel artifact; confirm such an
+      # artifact is actually produced for runs of this ROCm workflow.
+      - name: Download artifacts
+        if: needs.check-changes.outputs.sgl_kernel == 'true'
+        uses: actions/download-artifact@v4
+        with:
+          path: sgl-kernel/dist/
+          merge-multiple: true
+          pattern: wheel-python3.10-cuda12.9
+
+      - name: Start CI container
+        run: bash scripts/ci/amd/amd_ci_start_container.sh --custom-image rocm/sgl-dev:v0.5.8-rocm720-mi30x-20260129-preview --gpu-arch gfx942-rocm720
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+
+      - name: Setup kernel caches
+        run: |
+          # Use the persistent /sgl-data directory (mounted from /home/runner/sgl-data)
+          # This directory persists across container restarts on the self-hosted runner
+          docker exec ci_sglang mkdir -p /sgl-data/aiter-kernels /sgl-data/miopen-cache /sgl-data/hf-cache/hub
+
+          # Clear pre-built AITER kernels from Docker image to avoid segfaults
+          # The image may have stale/incompatible kernels at /sgl-workspace/aiter/aiter/jit/
+          # The rm runs through `bash -c` so the *.so glob is expanded inside the container;
+          # passed directly to `docker exec`, the pattern would be expanded (or left literal)
+          # by the runner's shell and nothing inside the container would be removed.
+          echo "Clearing pre-built AITER kernels from Docker image..."
+          docker exec ci_sglang bash -c 'rm -rf /sgl-workspace/aiter/aiter/jit/*.so' 2>/dev/null || true
+          docker exec ci_sglang bash -c 'rm -rf /sgl-data/aiter-kernels/*.so' 2>/dev/null || true
+          echo "AITER kernels cleared - will be rebuilt on first use"
+
+          # Create persistent cache marker if /sgl-data is a real mount (not ephemeral)
+          # This tells the test cleanup code to NOT delete downloaded models
+          if docker exec ci_sglang test -d /sgl-data && docker exec ci_sglang mountpoint -q /sgl-data 2>/dev/null; then
+            docker exec ci_sglang touch /sgl-data/hf-cache/.persistent_cache
+            echo "Created .persistent_cache marker - HF cache will persist"
+          else
+            echo "WARNING: /sgl-data is not a mount point - models will be cleaned up after each test"
+          fi
+
+          # Check MIOpen cache (VAE convolution kernels)
+          miopen_files=$(docker exec ci_sglang find /sgl-data/miopen-cache -name "*.udb" 2>/dev/null | wc -l || echo "0")
+          echo "Found ${miopen_files} MIOpen cache files"
+
+      # Informational only: prints memory, disk, cache and GPU state before the tests.
+      - name: Diagnose HF cache and system resources
+        run: |
+          echo "=== System Memory Status ==="
+          free -h
+          echo ""
+          echo "=== Disk Space ==="
+          df -h /home/runner/sgl-data 2>/dev/null || df -h
+          echo ""
+          echo "=== HF Cache Directory Structure ==="
+          docker exec ci_sglang ls -la /sgl-data/hf-cache/ 2>/dev/null || echo "HF cache dir not found"
+          docker exec ci_sglang ls -la /sgl-data/hf-cache/hub/ 2>/dev/null || echo "HF hub cache not found"
+          echo ""
+          echo "=== Checking for cached diffusion models (1-GPU tests) ==="
+          # Models used in 1-GPU tests: Wan2.1-T2V-1.3B, HunyuanVideo, Qwen-Image, FLUX.1, FLUX.2
+          for model in "Wan-AI--Wan2.1-T2V-1.3B-Diffusers" "tencent--HunyuanVideo" "Qwen--Qwen-Image" "black-forest-labs--FLUX.1-dev" "black-forest-labs--FLUX.2-dev"; do
+            cache_path="/sgl-data/hf-cache/hub/models--${model}"
+            if docker exec ci_sglang test -d "$cache_path"; then
+              size=$(docker exec ci_sglang du -sh "$cache_path" 2>/dev/null | cut -f1)
+              echo "✓ CACHED: $model ($size)"
+            else
+              echo "✗ NOT CACHED: $model"
+            fi
+          done
+          echo ""
+          echo "=== GPU Memory Status ==="
+          docker exec ci_sglang rocm-smi --showmeminfo vram 2>/dev/null || echo "rocm-smi not available"
+
+      - name: Run diffusion server tests (1-GPU)
+        timeout-minutes: 45
+        run: |
+          # AMD CI: All 1-GPU tests except FLUX.2 (FLUX.1 covers same code path)
+          # Tests: T2V, T2I, I2V, LoRA
+          #
+          # HF download env vars:
+          # - HF_HUB_ENABLE_HF_TRANSFER=1: Use faster hf_transfer for downloads (if available)
+          # - HF_HUB_DISABLE_SYMLINKS_WARNING=1: Suppress symlink warnings
+          docker exec \
+            -e SGLANG_E2E_TOLERANCE=0.3 \
+            -e SGLANG_STAGE_TIME_TOLERANCE=0.2 \
+            -e SGLANG_NON_DENOISE_STAGE_TIME_TOLERANCE=0.6 \
+            -e SGLANG_DENOISE_STEP_TOLERANCE=0.6 \
+            -e SGLANG_DENOISE_AGG_TOLERANCE=0.3 \
+            -e SGLANG_TEST_NUM_INFERENCE_STEPS=5 \
+            -e AITER_JIT_DIR=/sgl-data/aiter-kernels \
+            -e MIOPEN_USER_DB_PATH=/sgl-data/miopen-cache \
+            -e HF_HUB_ENABLE_HF_TRANSFER=1 \
+            -e HF_HUB_DISABLE_SYMLINKS_WARNING=1 \
+            -w /sglang-checkout/python \
+            ci_sglang python3 sglang/multimodal_gen/test/run_suite.py \
+              --suite 1-gpu \
+              --partition-id ${{ matrix.part }} \
+              --total-partitions 2 \
+              -k "not flux_2"
+
+          # Post-test diagnostics
+          echo "=== Post-test System Memory Status ==="
+          free -h
+
+  # 2-GPU diffusion (multimodal_gen) server tests, split into two partitions that run
+  # one at a time. Runs only when check-changes reports multimodal_gen == 'true'.
+  multimodal-gen-test-2-gpu-amd:
+    needs: [check-changes]
+    if: needs.check-changes.outputs.multimodal_gen == 'true'
+    strategy:
+      fail-fast: false
+      max-parallel: 1  # Run one at a time to avoid eviction from resource exhaustion during AITER kernel JIT
+      matrix:
+        runner: [linux-mi325-gpu-2]
+        part: [0, 1]  # 2 partitions: 9 tests ÷ 2 = ~4-5 tests each
+    runs-on: ${{matrix.runner}}
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}
+
+      - name: Ensure VRAM is clear
+        run: bash scripts/ensure_vram_clear.sh rocm
+
+      # NOTE(review): the pattern names a CUDA 12.9 wheel artifact; confirm such an
+      # artifact is actually produced for runs of this ROCm workflow.
+      - name: Download artifacts
+        if: needs.check-changes.outputs.sgl_kernel == 'true'
+        uses: actions/download-artifact@v4
+        with:
+          path: sgl-kernel/dist/
+          merge-multiple: true
+          pattern: wheel-python3.10-cuda12.9
+
+      - name: Start CI container
+        run: bash scripts/ci/amd/amd_ci_start_container.sh --custom-image rocm/sgl-dev:v0.5.8-rocm720-mi30x-20260129-preview --gpu-arch gfx942-rocm720
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+
+      - name: Setup kernel caches
+        run: |
+          # Use the persistent /sgl-data directory (mounted from /home/runner/sgl-data)
+          docker exec ci_sglang mkdir -p /sgl-data/aiter-kernels /sgl-data/miopen-cache /sgl-data/hf-cache/hub
+
+          # Clear pre-built AITER kernels from Docker image to avoid segfaults
+          # The image may have stale/incompatible kernels at /sgl-workspace/aiter/aiter/jit/
+          # The rm runs through `bash -c` so the *.so glob is expanded inside the container;
+          # passed directly to `docker exec`, the pattern would be expanded (or left literal)
+          # by the runner's shell and nothing inside the container would be removed.
+          echo "Clearing pre-built AITER kernels from Docker image..."
+          docker exec ci_sglang bash -c 'rm -rf /sgl-workspace/aiter/aiter/jit/*.so' 2>/dev/null || true
+          docker exec ci_sglang bash -c 'rm -rf /sgl-data/aiter-kernels/*.so' 2>/dev/null || true
+          echo "AITER kernels cleared - will be rebuilt on first use"
+
+          # Create persistent cache marker if /sgl-data is a real mount (not ephemeral)
+          # This tells the test cleanup code to NOT delete downloaded models
+          if docker exec ci_sglang test -d /sgl-data && docker exec ci_sglang mountpoint -q /sgl-data 2>/dev/null; then
+            docker exec ci_sglang touch /sgl-data/hf-cache/.persistent_cache
+            echo "Created .persistent_cache marker - HF cache will persist"
+          else
+            echo "WARNING: /sgl-data is not a mount point - models will be cleaned up after each test"
+          fi
+
+          # Check MIOpen cache (VAE convolution kernels)
+          miopen_files=$(docker exec ci_sglang find /sgl-data/miopen-cache -name "*.udb" 2>/dev/null | wc -l || echo "0")
+          echo "Found ${miopen_files} MIOpen cache files"
+
+      # Informational only: prints memory, disk, cache and GPU state before the tests.
+      - name: Diagnose HF cache and system resources
+        run: |
+          echo "=== System Memory Status ==="
+          free -h
+          echo ""
+          echo "=== Disk Space ==="
+          df -h /home/runner/sgl-data 2>/dev/null || df -h
+          echo ""
+          echo "=== HF Cache Directory Structure ==="
+          docker exec ci_sglang ls -la /sgl-data/hf-cache/ 2>/dev/null || echo "HF cache dir not found"
+          docker exec ci_sglang ls -la /sgl-data/hf-cache/hub/ 2>/dev/null || echo "HF hub cache not found"
+          echo ""
+          echo "=== Checking for cached diffusion models (2-GPU tests) ==="
+          # Models used in 2-GPU tests: Wan2.2-T2V-A14B, Wan2.1-T2V-14B, Qwen-Image, FLUX.1
+          for model in "Wan-AI--Wan2.2-T2V-A14B-Diffusers" "Wan-AI--Wan2.1-T2V-14B-Diffusers" "Qwen--Qwen-Image" "black-forest-labs--FLUX.1-dev"; do
+            cache_path="/sgl-data/hf-cache/hub/models--${model}"
+            if docker exec ci_sglang test -d "$cache_path"; then
+              size=$(docker exec ci_sglang du -sh "$cache_path" 2>/dev/null | cut -f1)
+              echo "✓ CACHED: $model ($size)"
+            else
+              echo "✗ NOT CACHED: $model"
+            fi
+          done
+          echo ""
+          echo "=== GPU Memory Status ==="
+          docker exec ci_sglang rocm-smi --showmeminfo vram 2>/dev/null || echo "rocm-smi not available"
+
+      - name: Run diffusion server tests (2-GPU)
+        timeout-minutes: 80
+        run: |
+          # AMD CI: All 2-GPU tests including LoRA
+          # Tests: T2V, T2I, I2V, LoRA
+          #
+          # HF download env vars:
+          # - HF_HUB_ENABLE_HF_TRANSFER=1: Use faster hf_transfer for downloads (if available)
+          # - HF_HUB_DISABLE_SYMLINKS_WARNING=1: Suppress symlink warnings
+          docker exec \
+            -e SGLANG_E2E_TOLERANCE=0.3 \
+            -e SGLANG_STAGE_TIME_TOLERANCE=0.2 \
+            -e SGLANG_NON_DENOISE_STAGE_TIME_TOLERANCE=0.6 \
+            -e SGLANG_DENOISE_STEP_TOLERANCE=0.6 \
+            -e SGLANG_DENOISE_AGG_TOLERANCE=0.3 \
+            -e SGLANG_TEST_NUM_INFERENCE_STEPS=5 \
+            -e AITER_JIT_DIR=/sgl-data/aiter-kernels \
+            -e MIOPEN_USER_DB_PATH=/sgl-data/miopen-cache \
+            -e HF_HUB_ENABLE_HF_TRANSFER=1 \
+            -e HF_HUB_DISABLE_SYMLINKS_WARNING=1 \
+            -w /sglang-checkout/python \
+            ci_sglang python3 sglang/multimodal_gen/test/run_suite.py \
+              --suite 2-gpu \
+              --partition-id ${{ matrix.part }} \
+              --total-partitions 2
+
+          # Post-test diagnostics
+          echo "=== Post-test System Memory Status ==="
+          free -h
+
+
+  # Stage C (large, 8 GPUs) on MI325: an RCCL communication check, then the suite in two partitions.
+  stage-c-test-large-8-gpu-amd:
+    needs:
+      - check-changes
+      - call-gate
+      - stage-b-test-small-1-gpu-amd
+      - stage-b-test-large-2-gpu-amd
+    # Runs when explicitly targeted, or - with no target stage and no upstream
+    # failure/cancellation - when main package or sgl-kernel files changed.
+    if: |
+      always() && (
+        inputs.target_stage == 'stage-c-test-large-8-gpu-amd' ||
+        (
+          !inputs.target_stage && !failure() && !cancelled() &&
+          (needs.check-changes.outputs.main_package == 'true' || needs.check-changes.outputs.sgl_kernel == 'true')
+        )
+      )
+    strategy:
+      fail-fast: false
+      matrix:
+        runner:
+          - linux-mi325-gpu-8
+        part:
+          - 0
+          - 1
+    runs-on: ${{ matrix.runner }}
+    env:
+      RUNNER_LABELS: linux-mi325-gpu-8
+    steps:
+      - uses: actions/checkout@v4
+        name: Checkout code
+        with:
+          ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}
+
+      - name: Ensure VRAM is clear
+        run: bash scripts/ensure_vram_clear.sh rocm
+
+      - name: Start CI container
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+        run: bash scripts/ci/amd/amd_ci_start_container.sh --custom-image rocm/sgl-dev:v0.5.8-rocm720-mi30x-20260129-preview --gpu-arch gfx942-rocm720
+
+      - name: Test RCCL multi-GPU communication
+        timeout-minutes: 5
+        run: |
+          echo "Testing RCCL multi-GPU communication with debug info..."
+          # Launches the check script across 8 processes with NCCL/RCCL debug logging enabled.
+          docker exec ci_sglang bash -c "cd /sglang-checkout && NCCL_DEBUG=INFO RCCL_DEBUG=INFO torchrun --nproc_per_node=8 scripts/ci/amd/test_rccl_multi_gpu.py"
+
+      - name: Run test
+        timeout-minutes: 60
+        run: |
+          bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" \
+            python3 run_suite.py --hw amd --suite stage-c-test-large-8-gpu-amd \
+            --auto-partition-id ${{ matrix.part }} --auto-partition-size 2 --timeout-per-file 3600
+  # 8-GPU MI35x stage-C suite in 3 partitions (gfx950 image). Runs when inputs.target_stage names this job, or (no target) when nothing upstream failed/cancelled and check-changes reports main_package or sgl_kernel as 'true'.
+  stage-c-test-large-8-gpu-amd-mi35x:
+    needs: [check-changes, call-gate, stage-b-test-small-1-gpu-amd, stage-b-test-large-2-gpu-amd]
+    if: |
+      always() &&
+      (
+        (inputs.target_stage == 'stage-c-test-large-8-gpu-amd-mi35x') ||
+        (
+          !inputs.target_stage &&
+          (!failure() && !cancelled()) &&
+          ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
+        )
+      )
+    strategy:
+      fail-fast: false
+      matrix:
+        runner: [linux-mi35x-gpu-8]
+        part: [0, 1, 2]
+    runs-on: ${{matrix.runner}}
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}
+
+      - name: Ensure VRAM is clear
+        run: bash scripts/ensure_vram_clear.sh rocm
+
+      - name: Start CI container
+        run: bash scripts/ci/amd/amd_ci_start_container.sh --custom-image rocm/sgl-dev:v0.5.8-rocm720-mi35x-20260129-preview --gpu-arch gfx950-rocm720
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+      # matrix.part picks 1 of 3 auto-partitions. NOTE(review): no RCCL pre-check step here, unlike the MI325 8-GPU job - confirm intentional.
+      - name: Run test
+        timeout-minutes: 60
+        run: |
+          bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-c-test-large-8-gpu-amd-mi35x --auto-partition-id ${{ matrix.part }} --auto-partition-size 3 --timeout-per-file 3600
+  # 1-GPU MI325 "small" performance suite, unpartitioned, ordered after stage-a-test-1-amd. Runs when inputs.target_stage names this job, or (no target) when nothing upstream failed/cancelled and main_package or sgl_kernel is 'true'.
+  stage-b-test-small-1-gpu-performance-amd:
+    needs: [check-changes, call-gate, stage-a-test-1-amd]
+    if: |
+      always() &&
+      (
+        (inputs.target_stage == 'stage-b-test-small-1-gpu-performance-amd') ||
+        (
+          !inputs.target_stage &&
+          (!failure() && !cancelled()) &&
+          ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
+        )
+      )
+    strategy:
+      fail-fast: false
+      matrix:
+        runner: [linux-mi325-gpu-1]
+    runs-on: ${{matrix.runner}}
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}
+
+      - name: Ensure VRAM is clear
+        run: bash scripts/ensure_vram_clear.sh rocm
+
+      - name: Start CI container
+        run: bash scripts/ci/amd/amd_ci_start_container.sh --custom-image rocm/sgl-dev:v0.5.8-rocm720-mi30x-20260129-preview --gpu-arch gfx942-rocm720
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+      # Reinstall the Triton wheel stored at / in the container, then install lmms-eval (v0.4.1) and human-eval in editable mode; clone errors are ignored (|| true).
+      - name: Install dependencies
+        run: |
+          docker exec -w / ci_sglang pip install --force-reinstall /triton-3.5.1+rocm7.2.0.gita272dfa8-cp310-cp310-linux_x86_64.whl
+          docker exec -w / ci_sglang git clone --branch v0.4.1 --depth 1 https://github.com/EvolvingLMMs-Lab/lmms-eval.git || true
+          docker exec -w /lmms-eval ci_sglang pip install --cache-dir=/sgl-data/pip-cache -e .
+          docker exec -w / ci_sglang git clone --depth 1 https://github.com/merrymercy/human-eval.git || true
+          docker exec -w /human-eval ci_sglang pip install --cache-dir=/sgl-data/pip-cache -e .
+
+      - name: Run test
+        timeout-minutes: 30
+        run: |
+          bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-small-1-gpu-performance-amd --timeout-per-file 1200
+  # 1-GPU MI325 "large" performance suite in 2 partitions, ordered after stage-a-test-1-amd. Runs when inputs.target_stage names this job, or (no target) when nothing upstream failed/cancelled and main_package or sgl_kernel is 'true'.
+  stage-b-test-large-1-gpu-performance-amd:
+    needs: [check-changes, call-gate, stage-a-test-1-amd]
+    if: |
+      always() &&
+      (
+        (inputs.target_stage == 'stage-b-test-large-1-gpu-performance-amd') ||
+        (
+          !inputs.target_stage &&
+          (!failure() && !cancelled()) &&
+          ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
+        )
+      )
+    strategy:
+      fail-fast: false
+      matrix:
+        runner: [linux-mi325-gpu-1]
+        part: [0, 1]
+    runs-on: ${{matrix.runner}}
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}
+
+      - name: Ensure VRAM is clear
+        run: bash scripts/ensure_vram_clear.sh rocm
+
+      - name: Start CI container
+        run: bash scripts/ci/amd/amd_ci_start_container.sh --custom-image rocm/sgl-dev:v0.5.8-rocm720-mi30x-20260129-preview --gpu-arch gfx942-rocm720
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+      # Reinstall the Triton wheel stored at / in the container, then install lmms-eval (v0.4.1) and human-eval in editable mode; clone errors are ignored (|| true).
+      - name: Install dependencies
+        run: |
+          docker exec -w / ci_sglang pip install --force-reinstall /triton-3.5.1+rocm7.2.0.gita272dfa8-cp310-cp310-linux_x86_64.whl
+          docker exec -w / ci_sglang git clone --branch v0.4.1 --depth 1 https://github.com/EvolvingLMMs-Lab/lmms-eval.git || true
+          docker exec -w /lmms-eval ci_sglang pip install --cache-dir=/sgl-data/pip-cache -e .
+          docker exec -w / ci_sglang git clone --depth 1 https://github.com/merrymercy/human-eval.git || true
+          docker exec -w /human-eval ci_sglang pip install --cache-dir=/sgl-data/pip-cache -e .
+      # matrix.part picks which of the 2 auto-partitions this runner executes.
+      - name: Run test
+        timeout-minutes: 30
+        run: |
+          bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-large-1-gpu-performance-amd --auto-partition-id ${{ matrix.part }} --auto-partition-size 2 --timeout-per-file 1200
+  # 2-GPU MI325 "large" performance suite, unpartitioned, ordered after stage-a-test-1-amd. Runs when inputs.target_stage names this job, or (no target) when nothing upstream failed/cancelled and main_package or sgl_kernel is 'true'.
+  stage-b-test-large-2-gpu-performance-amd:
+    needs: [check-changes, call-gate, stage-a-test-1-amd]
+    if: |
+      always() &&
+      (
+        (inputs.target_stage == 'stage-b-test-large-2-gpu-performance-amd') ||
+        (
+          !inputs.target_stage &&
+          (!failure() && !cancelled()) &&
+          ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
+        )
+      )
+    strategy:
+      fail-fast: false
+      matrix:
+        runner: [linux-mi325-gpu-2]
+    runs-on: ${{matrix.runner}}
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}
+
+      - name: Ensure VRAM is clear
+        run: bash scripts/ensure_vram_clear.sh rocm
+
+      - name: Start CI container
+        run: bash scripts/ci/amd/amd_ci_start_container.sh --custom-image rocm/sgl-dev:v0.5.8-rocm720-mi30x-20260129-preview --gpu-arch gfx942-rocm720
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+      # Reinstall the Triton wheel stored at / in the container, then install lmms-eval (v0.4.1) and human-eval in editable mode; clone errors are ignored (|| true).
+      - name: Install dependencies
+        run: |
+          docker exec -w / ci_sglang pip install --force-reinstall /triton-3.5.1+rocm7.2.0.gita272dfa8-cp310-cp310-linux_x86_64.whl
+          docker exec -w / ci_sglang git clone --branch v0.4.1 --depth 1 https://github.com/EvolvingLMMs-Lab/lmms-eval.git || true
+          docker exec -w /lmms-eval ci_sglang pip install --cache-dir=/sgl-data/pip-cache -e .
+          docker exec -w / ci_sglang git clone --depth 1 https://github.com/merrymercy/human-eval.git || true
+          docker exec -w /human-eval ci_sglang pip install --cache-dir=/sgl-data/pip-cache -e .
+
+      - name: Run test
+        timeout-minutes: 30
+        run: |
+          bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-large-2-gpu-performance-amd --timeout-per-file 1200
+  # 1-GPU MI325 "small" accuracy suite, ordered after stage-a-test-1-amd; same target_stage / change-based gating as the performance jobs. NOTE(review): needs omits call-gate, unlike those jobs - confirm.
+  stage-b-test-small-1-gpu-accuracy-amd:
+    needs: [check-changes, stage-a-test-1-amd]
+    if: |
+      always() &&
+      (
+        (inputs.target_stage == 'stage-b-test-small-1-gpu-accuracy-amd') ||
+        (
+          !inputs.target_stage &&
+          (!failure() && !cancelled()) &&
+          ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
+        )
+      )
+    strategy:
+      fail-fast: false
+      matrix:
+        runner: [linux-mi325-gpu-1]
+    runs-on: ${{matrix.runner}}
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}
+
+      - name: Ensure VRAM is clear
+        run: bash scripts/ensure_vram_clear.sh rocm
+
+      - name: Start CI container
+        run: bash scripts/ci/amd/amd_ci_start_container.sh --custom-image rocm/sgl-dev:v0.5.8-rocm720-mi30x-20260129-preview --gpu-arch gfx942-rocm720
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+      # Reinstall the Triton wheel stored at / in the container, then install lmms-eval (v0.4.1) and human-eval in editable mode; clone errors are ignored (|| true).
+      - name: Install dependencies
+        run: |
+          docker exec -w / ci_sglang pip install --force-reinstall /triton-3.5.1+rocm7.2.0.gita272dfa8-cp310-cp310-linux_x86_64.whl
+          docker exec -w / ci_sglang git clone --branch v0.4.1 --depth 1 https://github.com/EvolvingLMMs-Lab/lmms-eval.git || true
+          docker exec -w /lmms-eval ci_sglang pip install --cache-dir=/sgl-data/pip-cache -e .
+          docker exec -w / ci_sglang git clone --depth 1 https://github.com/merrymercy/human-eval.git || true
+          docker exec -w /human-eval ci_sglang pip install --cache-dir=/sgl-data/pip-cache -e .
+      # SGLANG_USE_AITER=0 is handed to amd_ci_exec.sh via -e for this suite. NOTE(review): step timeout (30 min) equals --timeout-per-file (1800 s) - confirm.
+      - name: Run test
+        timeout-minutes: 30
+        run: |
+          bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" -e SGLANG_USE_AITER=0 python3 run_suite.py --hw amd --suite stage-b-test-small-1-gpu-accuracy-amd --timeout-per-file 1800
+  # 2-GPU MI325 "large" accuracy suite, ordered after stage-a-test-1-amd; same target_stage / change-based gating as the performance jobs. NOTE(review): needs omits call-gate, unlike those jobs - confirm.
+  stage-b-test-large-2-gpu-accuracy-amd:
+    needs: [check-changes, stage-a-test-1-amd]
+    if: |
+      always() &&
+      (
+        (inputs.target_stage == 'stage-b-test-large-2-gpu-accuracy-amd') ||
+        (
+          !inputs.target_stage &&
+          (!failure() && !cancelled()) &&
+          ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
+        )
+      )
+    strategy:
+      fail-fast: false
+      matrix:
+        runner: [linux-mi325-gpu-2]
+    runs-on: ${{matrix.runner}}
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}
+
+      - name: Ensure VRAM is clear
+        run: bash scripts/ensure_vram_clear.sh rocm
+
+      - name: Start CI container
+        run: bash scripts/ci/amd/amd_ci_start_container.sh --custom-image rocm/sgl-dev:v0.5.8-rocm720-mi30x-20260129-preview --gpu-arch gfx942-rocm720
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+      # Reinstall the Triton wheel stored at / in the container, then install lmms-eval (v0.4.1) and human-eval in editable mode; clone errors are ignored (|| true).
+      - name: Install dependencies
+        run: |
+          docker exec -w / ci_sglang pip install --force-reinstall /triton-3.5.1+rocm7.2.0.gita272dfa8-cp310-cp310-linux_x86_64.whl
+          docker exec -w / ci_sglang git clone --branch v0.4.1 --depth 1 https://github.com/EvolvingLMMs-Lab/lmms-eval.git || true
+          docker exec -w /lmms-eval ci_sglang pip install --cache-dir=/sgl-data/pip-cache -e .
+          docker exec -w / ci_sglang git clone --depth 1 https://github.com/merrymercy/human-eval.git || true
+          docker exec -w /human-eval ci_sglang pip install --cache-dir=/sgl-data/pip-cache -e .
+      # Suite-specific -e overrides: SGLANG_USE_AITER_AR=0, SGLANG_USE_AITER=0, HF_HUB_ENABLE_HF_TRANSFER=0. NOTE(review): step timeout (30 min) equals --timeout-per-file (1800 s) - confirm.
+      - name: Run test
+        timeout-minutes: 30
+        run: |
+          bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" -e SGLANG_USE_AITER_AR=0 -e SGLANG_USE_AITER=0 -e HF_HUB_ENABLE_HF_TRANSFER=0 python3 run_suite.py --hw amd --suite stage-b-test-large-2-gpu-accuracy-amd --timeout-per-file 1800
+  # Aggregation gate: always runs, and fails if any job listed in needs ended as "failure" or "cancelled" (other results, e.g. skipped, pass).
+  pr-test-amd-finish:
+    needs:
+      [
+        call-gate,
+        check-changes,
+
+        sgl-kernel-unit-test-amd,
+        sgl-kernel-unit-test-2-gpu-amd,
+        multimodal-gen-test-1-gpu-amd,
+        multimodal-gen-test-2-gpu-amd,
+
+        stage-a-test-1-amd,
+        stage-b-test-small-1-gpu-amd,
+        stage-b-test-small-1-gpu-amd-mi35x,
+        stage-b-test-large-2-gpu-amd,
+        stage-b-test-small-1-gpu-performance-amd,
+        stage-b-test-large-1-gpu-performance-amd,
+        stage-b-test-large-2-gpu-performance-amd,
+        stage-b-test-small-1-gpu-accuracy-amd,
+        stage-b-test-large-2-gpu-accuracy-amd,
+        stage-c-test-large-8-gpu-amd,
+        stage-c-test-large-8-gpu-amd-mi35x,
+      ]
+    if: always()
+    runs-on: ubuntu-latest
+    steps:
+      - name: Check all dependent job statuses
+        env:
+          # Hand the needs context to the script as data; pasting it into the script text breaks on quotes inside job outputs.
+          NEEDS_JSON: ${{ toJson(needs) }}
+        run: |
+          # Get a list of all job names from the JSON keys
+          job_names=$(echo "$NEEDS_JSON" | jq -r 'keys_unsorted[]')
+
+          for job in $job_names; do
+            # For each job, extract its result
+            result=$(echo "$NEEDS_JSON" | jq -r --arg j "$job" '.[$j].result')
+
+            # Print the job name and its result
+            echo "$job: $result"
+
+            # Check for failure or cancellation and exit if found
+            if [[ "$result" == "failure" || "$result" == "cancelled" ]]; then
+              echo "The above jobs failed."
+              exit 1
+            fi
+          done
+
+          # If the loop completes, all jobs were successful
+          echo "All jobs completed successfully"
+          exit 0
diff --git a/.github/workflows/pr-test-amd.yml b/.github/workflows/pr-test-amd.yml
index a2e8e9988bfd..d701c6fe97e0 100644
--- a/.github/workflows/pr-test-amd.yml
+++ b/.github/workflows/pr-test-amd.yml
@@ -14,7 +14,7 @@ on:
       - ".github/workflows/pr-test-amd.yml"
       - "docker/rocm.Dockerfile"
   pull_request:
-    branches: [ main ]
+    branches: [ dont-trigger-this-one-anyway ]
     paths:
       - "python/**"
       - "scripts/ci/**"
diff --git a/.github/workflows/release-docker-amd-rocm720-preview.yml b/.github/workflows/release-docker-amd-rocm720-preview.yml
new file mode 100644
index 000000000000..bcf01907807a
--- /dev/null
+++ b/.github/workflows/release-docker-amd-rocm720-preview.yml
@@ -0,0 +1,83 @@
+name: Release Docker Images ROCm 7.2.0 Preview (AMD)
+on:
+  pull_request:
+    branches:
+      - main
+    paths:
+      - "docker/rocm720.Dockerfile"
+      - ".github/workflows/release-docker-amd-rocm720-preview.yml"
+  push:
+    tags:
+      - 'v[0-9]+.*'
+  workflow_dispatch:
+    inputs:
+      version:
+        description: 'Version to build (without v prefix, e.g., 0.5.7)'
+        required: true
+
+concurrency:
+  # A PR number if a pull request and otherwise the commit hash. This cancels
+  # queued and in-progress runs for the same PR (presubmit) or commit
+  # (postsubmit). The workflow name is prepended to avoid conflicts between
+  # different workflows.
+  group: ${{ github.workflow }}-${{ github.event.number || github.sha }}
+  cancel-in-progress: true
+
+jobs:
+  publish:
+    if: github.repository == 'sgl-project/sglang'
+    runs-on: amd-docker-scale
+    environment: 'prod'
+    strategy:
+      fail-fast: false
+      matrix:
+        gpu_arch: ['gfx942-rocm720', 'gfx950-rocm720']
+        build_type: ['all']
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0  # Full history so every version tag is available to `git tag -l` below
+
+      - name: "Set Date"
+        run: |
+          echo "DATE=$(date +%Y%m%d)" >> $GITHUB_ENV
+
+      - name: Get version (dispatch input or latest tag)
+        id: version
+        env:
+          INPUT_VERSION: ${{ github.event.inputs.version }}
+        run: |
+          # Use the workflow_dispatch version when given; otherwise the newest v* tag by version order (v0.5.7 -> 0.5.7)
+          VERSION="${INPUT_VERSION:-$(git tag -l 'v[0-9]*' --sort=-v:refname | head -1 | sed 's/^v//')}"
+          if [ -z "$VERSION" ]; then
+            echo "::error::Could not determine version from git tags"
+            exit 1
+          fi
+          echo "version=${VERSION}" >> $GITHUB_OUTPUT
+          echo "Detected version: ${VERSION}"
+
+      - name: Login to Docker Hub
+        uses: docker/login-action@v2
+        with:
+          username: ${{ secrets.DOCKERHUB_AMD_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_AMD_TOKEN }}
+      # NOTE(review): no event filter on this step, so pull_request runs build and push too - confirm that is intended.
+      - name: Build and Push
+        run: |
+          version=${{ steps.version.outputs.version }}
+          echo "Version: ${version}"
+
+          if [ "${{ matrix.gpu_arch }}" = "gfx942-rocm720" ]; then
+            rocm_tag="rocm720-mi30x"
+          elif [ "${{ matrix.gpu_arch }}" = "gfx950-rocm720" ]; then
+            rocm_tag="rocm720-mi35x"
+          else
+            echo "Unsupported gfx arch"
+            exit 1
+          fi
+
+          tag=v${version}-${rocm_tag}
+          # Image is pushed as rocm/sgl-dev:v<version>-<rocm_tag>-<YYYYMMDD>-preview
+          docker build . -f docker/rocm720.Dockerfile --build-arg BUILD_TYPE=${{ matrix.build_type }} --build-arg GPU_ARCH=${{ matrix.gpu_arch }} -t rocm/sgl-dev:${tag}-${{ env.DATE }}-preview --no-cache
+          docker push rocm/sgl-dev:${tag}-${{ env.DATE }}-preview
diff --git a/docker/aiter.patch b/docker/aiter.patch
new file mode 100644
index 000000000000..43187689435b
--- /dev/null
+++ b/docker/aiter.patch
@@ -0,0 +1,48 @@
+diff --git a/csrc/py_itfs_cu/asm_mla.cu b/csrc/py_itfs_cu/asm_mla.cu
+index 995364105..0adab889e 100644
+--- a/csrc/py_itfs_cu/asm_mla.cu
++++ b/csrc/py_itfs_cu/asm_mla.cu
+@@ -283,14 +283,14 @@ void mla_decode_stage1_asm_fwd(
+                 else if(max_seqlen_q <= 4)
+                 {
+                     // assert(false);
+-                    //sub_Q = 128;
+-                    //static AiterAsmKernel impl_fp8(
+-                    //    "_ZN5aiter36mla_a8w8_qh16_qseqlen4_gqaratio16_psE",
+-                    //    "/mla/mla_a8w8_qh16_qseqlen4_gqaratio16_ps.co");
+-                    sub_Q = 64;
++                    sub_Q = 128;
+                     static AiterAsmKernel impl_fp8(
+-                        "_ZN5aiter36mla_a8w8_qh64_qseqlen4_gqaratio16_psE",
+-                        "/mla/mla_a8w8_qh64_qseqlen4_gqaratio16_ps.co");
++                        "_ZN5aiter36mla_a8w8_qh16_qseqlen4_gqaratio16_psE",
++                        "/mla/mla_a8w8_qh16_qseqlen4_gqaratio16_ps.co");
++                    //sub_Q = 64;
++                    //static AiterAsmKernel impl_fp8(
++                    //    "_ZN5aiter36mla_a8w8_qh64_qseqlen4_gqaratio16_psE",
++                    //    "/mla/mla_a8w8_qh64_qseqlen4_gqaratio16_ps.co");
+                     impl_ptr = &impl_fp8;
+                 }
+                 else
+@@ -319,14 +319,14 @@ void mla_decode_stage1_asm_fwd(
+                 else if(max_seqlen_q <= 4)
+                 {
+                     // assert(false);
+-                    //sub_Q = 128;
+-                    //static AiterAsmKernel impl_fp8(
+-                    //    "_ZN5aiter33mla_a8w8_qh16_qseqlen4_gqaratio16E",
+-                    //    "/mla/mla_a8w8_qh16_qseqlen4_gqaratio16.co");
+-                    sub_Q = 64;
++                    sub_Q = 128;
+                     static AiterAsmKernel impl_fp8(
+-                        "_ZN5aiter33mla_a8w8_qh64_qseqlen4_gqaratio16E",
+-                        "/mla/mla_a8w8_qh64_qseqlen4_gqaratio16.co");
++                        "_ZN5aiter33mla_a8w8_qh16_qseqlen4_gqaratio16E",
++                        "/mla/mla_a8w8_qh16_qseqlen4_gqaratio16.co");
++                    //sub_Q = 64;
++                    //static AiterAsmKernel impl_fp8(
++                    //    "_ZN5aiter33mla_a8w8_qh64_qseqlen4_gqaratio16E",
++                    //    "/mla/mla_a8w8_qh64_qseqlen4_gqaratio16.co");
+                     impl_ptr = &impl_fp8;
+                 }
+                 else
diff --git a/docker/rocm720.Dockerfile b/docker/rocm720.Dockerfile
new file mode 100644
index 000000000000..b7eee9424215
--- /dev/null
+++ b/docker/rocm720.Dockerfile
@@ -0,0 +1,351 @@
+# ROCm 7.2 Dockerfile for SGLang (copied from akao-amd's rocm.Dockerfile for testing)
+# Usage:
+#   docker build --build-arg SGL_BRANCH=9409c4359 --build-arg GPU_ARCH=gfx942-rocm720 -t sglang:rocm720-mi30x -f docker/rocm720.Dockerfile .
+#   docker build --build-arg SGL_BRANCH=9409c4359 --build-arg GPU_ARCH=gfx950-rocm720 -t sglang:rocm720-mi35x -f docker/rocm720.Dockerfile .
+
+# Default base images, one per named build stage below (each overridable with --build-arg)
+ARG BASE_IMAGE_942="rocm/sgl-dev:vllm20250114"
+ARG BASE_IMAGE_942_ROCM700="rocm/sgl-dev:rocm7-vllm-20250904"
+ARG BASE_IMAGE_942_ROCM720="rocm/pytorch:rocm7.2_ubuntu22.04_py3.10_pytorch_release_2.9.1"
+ARG BASE_IMAGE_950="rocm/sgl-dev:rocm7-vllm-20250904"
+ARG BASE_IMAGE_950_ROCM720="rocm/pytorch:rocm7.2_ubuntu22.04_py3.10_pytorch_release_2.9.1"
+
+# Declared before the first FROM so that `FROM ${GPU_ARCH}` further down can select a stage by name
+ARG GPU_ARCH=gfx950
+
+# ===============================
+# Base image 942 with rocm630 and args
+FROM $BASE_IMAGE_942 AS gfx942
+ENV BUILD_VLLM="0"
+ENV BUILD_TRITON="1"
+ENV BUILD_LLVM="0"
+ENV BUILD_AITER_ALL="1"
+ENV BUILD_MOONCAKE="1"
+ENV AITER_COMMIT="v0.1.4"
+
+# ===============================
+# Base image 942 with rocm700 and args
+FROM $BASE_IMAGE_942_ROCM700 AS gfx942-rocm700
+ENV BUILD_VLLM="0"
+ENV BUILD_TRITON="0"
+ENV BUILD_LLVM="0"
+ENV BUILD_AITER_ALL="1"
+ENV BUILD_MOONCAKE="1"
+ENV AITER_COMMIT="v0.1.9.post1"
+
+# ===============================
+# Base image 942 with rocm720 and args
+FROM $BASE_IMAGE_942_ROCM720 AS gfx942-rocm720
+ENV BUILD_VLLM="0"
+ENV BUILD_TRITON="1"
+ENV BUILD_LLVM="0"
+ENV BUILD_AITER_ALL="1"
+ENV BUILD_MOONCAKE="1"
+ENV AITER_COMMIT="v0.1.9.post1"
+
+# ===============================
+# Base image 950 and args
+FROM $BASE_IMAGE_950 AS gfx950
+ENV BUILD_VLLM="0"
+ENV BUILD_TRITON="0"
+ENV BUILD_LLVM="0"
+ENV BUILD_AITER_ALL="0"
+ENV BUILD_MOONCAKE="1"
+ENV AITER_COMMIT="v0.1.9.post1"
+
+# ===============================
+# Base image 950 with rocm720 and args
+FROM $BASE_IMAGE_950_ROCM720 AS gfx950-rocm720
+ENV BUILD_VLLM="0"
+ENV BUILD_TRITON="1"
+ENV BUILD_LLVM="0"
+ENV BUILD_AITER_ALL="0"
+ENV BUILD_MOONCAKE="1"
+ENV AITER_COMMIT="v0.1.9.post1"
+
+# ===============================
+# Final stage: continues from the stage named by GPU_ARCH, inheriting its BUILD_* and AITER_COMMIT ENV values
+FROM ${GPU_ARCH}
+
+# Re-declare GPU_ARCH inside this stage (pre-FROM ARGs are not visible here); GPU_ARCH_LIST drops the trailing "-..." suffix, e.g. gfx942-rocm720 -> gfx942
+ARG GPU_ARCH=gfx950
+ENV GPU_ARCH_LIST=${GPU_ARCH%-*}
+
+ARG SGL_REPO="https://github.com/sgl-project/sglang.git"
+ARG SGL_DEFAULT="main"
+ARG SGL_BRANCH=${SGL_DEFAULT}
+# Triton source pin; used only when the selected stage sets BUILD_TRITON=1
+ARG TRITON_REPO="https://github.com/triton-lang/triton.git"
+ARG TRITON_COMMIT="42270451990532c67e69d753fbd026f28fcc4840"
+# AITER source; its revision comes from the stage-level AITER_COMMIT ENV, not from an ARG
+ARG AITER_REPO="https://github.com/ROCm/aiter.git"
+# Custom LLVM pin; used only when the selected stage sets BUILD_LLVM=1
+ARG LLVM_REPO="https://github.com/jrbyrnes/llvm-project.git"
+ARG LLVM_BRANCH="MainOpSelV2"
+ARG LLVM_COMMIT="6520ace8227ffe2728148d5f3b9872a870b0a560"
+# Mooncake pin; used only when the selected stage sets BUILD_MOONCAKE=1
+ARG MOONCAKE_REPO="https://github.com/kvcache-ai/Mooncake.git"
+ARG MOONCAKE_COMMIT="b6a841dc78c707ec655a563453277d969fb8f38d"
+# TileLang pin; the TileLang step below builds it only for gfx950 targets
+ARG TILELANG_REPO="https://github.com/HaiShaw/tilelang.git"
+ARG TILELANG_BRANCH="dsv32-mi35x"
+ARG TILELANG_COMMIT="ae938cf885743f165a19656d1122ad42bb0e30b8"
+# fast-hadamard-transform pin (ROCm branch)
+ARG FHT_REPO="https://github.com/jeffdaily/fast-hadamard-transform.git"
+ARG FHT_BRANCH="rocm"
+ARG FHT_COMMIT="46efb7d776d38638fc39f3c803eaee3dd7016bd1"
+USER root
+
+# Upgrade pip, add setuptools_scm, then remove any sccache (apt package, pip package, and the binary found on PATH)
+RUN python -m pip install --upgrade pip && pip install setuptools_scm
+RUN apt-get purge -y sccache; python -m pip uninstall -y sccache; rm -f "$(which sccache)"
+
+WORKDIR /sgl-workspace
+
+# -----------------------
+# llvm: optional custom build, only when BUILD_LLVM=1. HIP_CLANG_PATH is exported for this RUN only; the AITER step sets it again itself.
+RUN if [ "$BUILD_LLVM" = "1" ]; then \
+     export HIP_CLANG_PATH="/sgl-workspace/llvm-project/build/bin/" \
+     && git clone --single-branch ${LLVM_REPO} -b ${LLVM_BRANCH} \
+     && cd llvm-project \
+     && git checkout ${LLVM_COMMIT} \
+     && mkdir build \
+     && cd build \
+     && cmake -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=1 -DLLVM_TARGETS_TO_BUILD="AMDGPU;X86" -DLLVM_ENABLE_PROJECTS="clang;lld;" -DLLVM_ENABLE_RUNTIMES="compiler-rt" ../llvm \
+     && make -j$(nproc); \
+    fi
+
+# -----------------------
+# AITER (pinned to the stage's AITER_COMMIT). NOTE(review): setup.py is sed-edited before the checkout, and aiter.patch is applied after the build - confirm both orderings are intended.
+ENV MAX_JOBS=256
+RUN pip uninstall -y aiter
+RUN pip install psutil pybind11 # Required by AITER setup.py
+RUN git clone ${AITER_REPO} \
+ && cd aiter \
+ && sed -i setup.py -e 's/verbose.*/verbose=True,/' \
+ && git checkout ${AITER_COMMIT} \
+ && git submodule update --init --recursive
+ADD docker/aiter.patch ./aiter
+RUN cd aiter \
+     && echo "[AITER] GPU_ARCH=${GPU_ARCH}" \
+     && if [ "$BUILD_AITER_ALL" = "1" ] && [ "$BUILD_LLVM" = "1" ]; then \
+          sh -c "HIP_CLANG_PATH=/sgl-workspace/llvm-project/build/bin/ PREBUILD_KERNELS=1 GPU_ARCHS=$GPU_ARCH_LIST python setup.py develop"; \
+        elif [ "$BUILD_AITER_ALL" = "1" ]; then \
+          sh -c "PREBUILD_KERNELS=1 GPU_ARCHS=$GPU_ARCH_LIST python setup.py develop"; \
+        else \
+          sh -c "GPU_ARCHS=$GPU_ARCH_LIST python setup.py develop"; \
+        fi \
+     && sh -c "patch -p1 < ./aiter.patch;"
+
+# -----------------------
+# Build vLLM (only when BUILD_VLLM=1; every stage defined above sets it to "0"). VLLM_BRANCH holds a commit hash.
+ARG VLLM_REPO="https://github.com/ROCm/vllm.git"
+ARG VLLM_BRANCH="9f6b92db47c3444b7a7d67451ba0c3a2d6af4c2c"
+RUN if [ "$BUILD_VLLM" = "1" ]; then \
+        git clone ${VLLM_REPO} \
+     && cd vllm \
+     && git checkout ${VLLM_BRANCH} \
+     && python -m pip install -r requirements/rocm.txt \
+     && python setup.py clean --all \
+     && python setup.py develop; \
+    fi
+
+# -----------------------
+# Build Mooncake (only when BUILD_MOONCAKE=1): install RDMA/MPI packages, replace /usr/local/go with Go 1.22.2, then cmake with HIP and etcd enabled
+ENV PATH=$PATH:/usr/local/go/bin
+
+RUN if [ "$BUILD_MOONCAKE" = "1" ]; then \
+     apt update && apt install -y zip unzip wget && \
+     apt install -y gcc make libtool autoconf  librdmacm-dev rdmacm-utils infiniband-diags ibverbs-utils perftest ethtool  libibverbs-dev rdma-core && \
+     apt install -y openssh-server openmpi-bin openmpi-common libopenmpi-dev && \
+     git clone ${MOONCAKE_REPO} && \
+     cd Mooncake && \
+     git checkout ${MOONCAKE_COMMIT} && \
+     git submodule update --init --recursive && \
+     bash dependencies.sh -y && \
+     rm -rf /usr/local/go && \
+     wget https://go.dev/dl/go1.22.2.linux-amd64.tar.gz && \
+     tar -C /usr/local -xzf go1.22.2.linux-amd64.tar.gz && \
+     rm go1.22.2.linux-amd64.tar.gz && \
+     mkdir -p build && \
+     cd build && \
+     cmake .. -DUSE_HIP=ON -DUSE_ETCD=ON && \
+     make -j "$(nproc)" && make install; \
+    fi
+
+# -----------------------
+# Build SGLang: clone, check out SGL_BRANCH, apply docker/sglang.patch, build sgl-kernel for ROCm, then install the Python package
+ARG BUILD_TYPE=all
+
+RUN pip install IPython \
+    && pip install orjson \
+    && pip install python-multipart \
+    && pip install torchao==0.9.0 \
+    && pip install pybind11
+# The patch is staged under /tmp so that `git clone` still finds no pre-existing ./sglang path
+RUN pip uninstall -y sgl_kernel sglang
+ADD docker/sglang.patch /tmp/sglang.patch
+RUN git clone ${SGL_REPO} \
+    && cd sglang \
+    && if [ "${SGL_BRANCH}" = "${SGL_DEFAULT}" ]; then \
+         echo "Using ${SGL_DEFAULT}, default branch."; \
+         git checkout ${SGL_DEFAULT}; \
+       else \
+         echo "Using ${SGL_BRANCH} branch."; \
+         git checkout ${SGL_BRANCH}; \
+       fi \
+    && patch -p1 < /tmp/sglang.patch \
+    && cd sgl-kernel \
+    && rm -f pyproject.toml \
+    && mv pyproject_rocm.toml pyproject.toml \
+    && AMDGPU_TARGET=$GPU_ARCH_LIST python setup_rocm.py install \
+    && cd .. \
+    && rm -rf python/pyproject.toml && mv python/pyproject_other.toml python/pyproject.toml \
+    && if [ "$BUILD_TYPE" = "srt" ]; then \
+         python -m pip --no-cache-dir install -e "python[srt_hip,diffusion_hip]"; \
+       else \
+         python -m pip --no-cache-dir install -e "python[all_hip,diffusion_hip]"; \
+       fi
+
+RUN python -m pip cache purge
+
+# Copy every *MI300X* config file to an *MI300X_VF* twin to support MI300X in virtualized environments (MI300X_VF).  Symlinks will not be created in image build.
+RUN find /sgl-workspace/sglang/python/sglang/srt/layers/quantization/configs/ \
+         /sgl-workspace/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/ \
+         -type f -name '*MI300X*' | xargs -I {} sh -c 'vf_config=$(echo "$1" | sed "s/MI300X/MI300X_VF/"); cp "$1" "$vf_config"' -- {}
+
+# Install the Rust toolchain via rustup (needed to build sgl-model-gateway below) and print the installed versions
+ENV PATH="/root/.cargo/bin:${PATH}"
+RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y \
+    && rustc --version && cargo --version
+
+# Build and install the sgl-model-gateway Python bindings, then remove /root/.cache
+RUN python3 -m pip install --no-cache-dir setuptools-rust \
+    && cd /sgl-workspace/sglang/sgl-model-gateway/bindings/python \
+    && cargo build --release \
+    && python3 -m pip install --no-cache-dir . \
+    && rm -rf /root/.cache
+
+# -----------------------
+# TileLang (built only when GPU_ARCH without its trailing -suffix is gfx950; any other target exits 0 early)
+ENV DEBIAN_FRONTEND=noninteractive
+ENV LIBGL_ALWAYS_INDIRECT=1
+RUN echo "LC_ALL=en_US.UTF-8" >> /etc/environment
+
+RUN /bin/bash -lc 'set -euo pipefail; \
+  # Build TileLang only for gfx950
+  if [ "${GPU_ARCH%-*}" != "gfx950" ]; then \
+    echo "[TileLang] Skipping (GPU_ARCH=${GPU_ARCH:-unset})"; \
+    exit 0; \
+  fi; \
+  echo "[TileLang] Building TileLang for ${GPU_ARCH}"; \
+  \
+  # System dependencies (NO llvm-dev to avoid llvm-config-16 shadowing)
+  apt-get update && apt-get install -y --no-install-recommends \
+      build-essential git wget curl ca-certificates gnupg \
+      libgtest-dev libgmock-dev \
+      libprotobuf-dev protobuf-compiler libgflags-dev libsqlite3-dev \
+      python3 python3-dev python3-setuptools python3-pip \
+      gcc libtinfo-dev zlib1g-dev libedit-dev libxml2-dev \
+      cmake ninja-build pkg-config libstdc++6 \
+  && rm -rf /var/lib/apt/lists/*; \
+  \
+  # Build GoogleTest static libs (Ubuntu package ships sources only)
+  cmake -S /usr/src/googletest -B /tmp/build-gtest -DBUILD_GTEST=ON -DBUILD_GMOCK=ON -DCMAKE_BUILD_TYPE=Release && \
+  cmake --build /tmp/build-gtest -j"$(nproc)" && \
+  cp -v /tmp/build-gtest/lib/*.a /usr/lib/x86_64-linux-gnu/ && \
+  rm -rf /tmp/build-gtest; \
+  \
+  # Keep setuptools < 80 (compat with base image)
+  python3 -m pip install --upgrade "setuptools>=77.0.3,<80" wheel cmake ninja && \
+  python3 -m pip cache purge || true; \
+  \
+  # Locate ROCm llvm-config; fallback to installing LLVM 18 if missing
+  LLVM_CONFIG_PATH=""; \
+  for p in /opt/rocm/llvm/bin/llvm-config /opt/rocm/llvm-*/bin/llvm-config /opt/rocm-*/llvm*/bin/llvm-config; do \
+    if [ -x "$p" ]; then LLVM_CONFIG_PATH="$p"; break; fi; \
+  done; \
+  if [ -z "$LLVM_CONFIG_PATH" ]; then \
+    echo "[TileLang] ROCm llvm-config not found; installing LLVM 18..."; \
+    curl -fsSL https://apt.llvm.org/llvm.sh -o /tmp/llvm.sh; \
+    chmod +x /tmp/llvm.sh; \
+    /tmp/llvm.sh 18; \
+    LLVM_CONFIG_PATH="$(command -v llvm-config-18)"; \
+    if [ -z "$LLVM_CONFIG_PATH" ]; then echo "ERROR: llvm-config-18 not found after install"; exit 1; fi; \
+  fi; \
+  echo "[TileLang] Using LLVM_CONFIG at: $LLVM_CONFIG_PATH"; \
+  export PATH="$(dirname "$LLVM_CONFIG_PATH"):/usr/local/bin:${PATH}"; \
+  export LLVM_CONFIG="$LLVM_CONFIG_PATH"; \
+  \
+  # Optional shim for tools that expect llvm-config-16
+  mkdir -p /usr/local/bin && \
+  printf "#!/usr/bin/env bash\nexec \"%s\" \"\$@\"\n" "$LLVM_CONFIG_PATH" > /usr/local/bin/llvm-config-16 && \
+  chmod +x /usr/local/bin/llvm-config-16; \
+  \
+  # TVM Python bits need Cython
+  python3 -m pip install --no-cache-dir "cython>=0.29.36,<3.0"; \
+  \
+  # Clone + pin TileLang (bundled TVM), then build; the single-commit fetch is best-effort and its failure is ignored
+  git clone --recursive --branch "${TILELANG_BRANCH}" "${TILELANG_REPO}" /opt/tilelang && \
+  cd /opt/tilelang && \
+  git fetch --depth=1 origin "${TILELANG_COMMIT}" || true && \
+  git checkout -f "${TILELANG_COMMIT}" && \
+  git submodule update --init --recursive && \
+  export CMAKE_ARGS="-DLLVM_CONFIG=${LLVM_CONFIG} ${CMAKE_ARGS:-}" && \
+  bash ./install_rocm.sh'
+
+# -----------------------
+# Hadamard-transform (HIP build): pin to FHT_COMMIT, delete setup.py lines ending in torch", (presumably the torch requirement), print the installed torch, then install
+RUN /bin/bash -lc 'set -euo pipefail; \
+    git clone --branch "${FHT_BRANCH}" "${FHT_REPO}" fast-hadamard-transform; \
+    cd fast-hadamard-transform; \
+    git checkout -f "${FHT_COMMIT}"; \
+    sed -i setup.py -e "/^.*torch\",$/d"; \
+    pip show torch; \
+    python setup.py install'
+
+# -----------------------
+# Python tools installed into the image without keeping a pip cache
+RUN python3 -m pip install --no-cache-dir \
+    py-spy \
+    pre-commit \
+    tabulate
+
+# -----------------------
+# Triton: source build at TRITON_COMMIT, only when BUILD_TRITON=1. Package lists are refreshed here because the gfx950 TileLang step deletes them.
+RUN if [ "$BUILD_TRITON" = "1" ]; then \
+        pip uninstall -y triton \
+     && apt-get update && apt-get install -y cmake \
+     && git clone ${TRITON_REPO} triton-custom \
+     && cd triton-custom \
+     && git checkout ${TRITON_COMMIT} \
+     && pip install -r python/requirements.txt \
+     && pip install -e .; \
+    fi
+
+# -----------------------
+# Runtime/performance environment variables baked into the image as defaults (overridable per container, e.g. with `docker run -e`).
+
+# Skip CuDNN compatibility check - not applicable for ROCm (uses MIOpen instead)
+ENV SGLANG_DISABLE_CUDNN_CHECK=1
+
+ENV HIP_FORCE_DEV_KERNARG=1
+ENV HSA_NO_SCRATCH_RECLAIM=1
+ENV SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN=1
+ENV SGLANG_INT4_WEIGHT=0
+ENV SGLANG_MOE_PADDING=1
+ENV SGLANG_ROCM_DISABLE_LINEARQUANT=0
+ENV SGLANG_ROCM_FUSED_DECODE_MLA=1
+ENV SGLANG_SET_CPU_AFFINITY=1
+ENV SGLANG_USE_AITER=1
+ENV SGLANG_USE_ROCM700A=1
+
+ENV NCCL_MIN_NCHANNELS=112
+ENV VLLM_FP8_PADDING=1
+ENV VLLM_FP8_ACT_PADDING=1
+ENV VLLM_FP8_WEIGHT_PADDING=1
+ENV VLLM_FP8_REDUCE_CONV=1
+ENV TORCHINDUCTOR_MAX_AUTOTUNE=1
+ENV TORCHINDUCTOR_MAX_AUTOTUNE_POINTWISE=1
+
+CMD ["/bin/bash"]
diff --git a/docker/sglang.patch b/docker/sglang.patch
new file mode 100644
index 000000000000..3f9abb3d5177
--- /dev/null
+++ b/docker/sglang.patch
@@ -0,0 +1,189 @@
+From 82591a6aae07773677523ee715f14d20475906c0 Mon Sep 17 00:00:00 2001
+From: wunhuang <wunhuang@amd.com>
+Date: Wed, 21 Jan 2026 07:07:18 +0000
+Subject: [PATCH] Patch for #17735
+
+* Add aiter bias-MoE support for gpt-oss
+* Use helper function round_up to calculate padding size
+* Remove some commented-out code
+---
+ .../sglang/srt/layers/quantization/mxfp4.py   | 109 +++++++++++++++++-
+ python/sglang/srt/server_args.py              |   7 ++
+ 2 files changed, 115 insertions(+), 1 deletion(-)
+
+diff --git a/python/sglang/srt/layers/quantization/mxfp4.py b/python/sglang/srt/layers/quantization/mxfp4.py
+index 537405e2d..3690b4d59 100644
+--- a/python/sglang/srt/layers/quantization/mxfp4.py
++++ b/python/sglang/srt/layers/quantization/mxfp4.py
+@@ -51,6 +51,7 @@ from sglang.srt.utils import (
+     round_up,
+     set_weight_attrs,
+ )
++from sglang.srt.utils.common import get_bool_env_var
+ from sglang.srt.utils.custom_op import register_custom_op
+ 
+ _is_sm100_supported = is_cuda() and is_sm100_supported()
+@@ -75,6 +76,7 @@ if TYPE_CHECKING:
+     )
+ 
+ _is_hip = is_hip()
++_use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip
+ _is_shuffle_moe_mxfp4 = is_gfx95_supported()
+ 
+ if _is_hip:
+@@ -82,7 +84,11 @@ if _is_hip:
+     try:
+         from aiter import ActivationType, QuantType
+         from aiter.fused_moe import fused_moe
+-        from aiter.ops.shuffle import shuffle_weight
++        from aiter.ops.shuffle import (
++            shuffle_scale_a16w4,
++            shuffle_weight,
++            shuffle_weight_a16w4,
++        )
+         from aiter.ops.triton.quant import dynamic_mxfp4_quant
+         from aiter.utility.fp4_utils import e8m0_shuffle
+     except ImportError as err:
+@@ -292,6 +298,18 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
+                 intermediate_size_per_partition_after_pad = round_up(
+                     intermediate_size_per_partition, 64
+                 )
++        elif _use_aiter:
++
++            intermediate_size_per_partition_after_pad = round_up(
++                intermediate_size_per_partition, 256
++            )
++
++            hidden_size = round_up(hidden_size, 256)
++            self.hidden_pad = hidden_size - layer.hidden_size
++            self.intermediate_pad = (
++                intermediate_size_per_partition_after_pad
++                - layer.intermediate_size_per_partition
++            )
+         elif has_triton_kernels:
+             # TODO: this is a hack to make
+             # intermediate_size_per_partition_after_pad the same as the
+@@ -530,6 +548,58 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
+                 requires_grad=False,
+             )
+             return
++        if _use_aiter:
++            if layer.w13_weight_bias is not None:
++                layer.w13_weight_bias.data = layer.w13_weight_bias.data.to(
++                    torch.float32
++                )
++            if layer.w2_weight_bias is not None:
++                layer.w2_weight_bias.data = layer.w2_weight_bias.data.to(torch.float32)
++
++            e, n, k = layer.w13_weight.shape
++            layer.w13_weight.view(torch.uint8).copy_(
++                layer.w13_weight.data.view(torch.uint8)
++                .view(e, n // 2, 2, k)
++                .permute(0, 2, 1, 3)
++                .contiguous()
++                .view(e, n, k)
++            )
++            layer.w13_weight_scale.data = (
++                layer.w13_weight_scale.data.view(e, n // 2, 2, -1)
++                .permute(0, 2, 1, 3)
++                .contiguous()
++                .view(e, n, -1)
++            )
++
++            layer.w13_weight.data = shuffle_weight_a16w4(layer.w13_weight, 16, True)
++            shuffled_w13_scale = shuffle_scale_a16w4(
++                layer.w13_weight_scale.view(-1, layer.w13_weight_scale.shape[-1]),
++                self.num_experts,
++                True,
++            )
++
++            layer.w2_weight.data = shuffle_weight_a16w4(layer.w2_weight, 16, False)
++            shuffled_w2_scale = shuffle_scale_a16w4(
++                layer.w2_weight_scale.view(-1, layer.w2_weight_scale.shape[-1]),
++                self.num_experts,
++                False,
++            )
++
++            layer.w13_weight_bias.data = (
++                layer.w13_weight_bias.data.view(-1, n // 2, 2)
++                .permute(0, 2, 1)
++                .contiguous()
++                .view(-1, n)
++            )
++
++            layer.w13_weight_scale = torch.nn.Parameter(
++                shuffled_w13_scale, requires_grad=False
++            )
++            layer.w2_weight_scale = torch.nn.Parameter(
++                shuffled_w2_scale, requires_grad=False
++            )
++
++            return
+ 
+         if self.use_triton_kernels:
+ 
+@@ -680,6 +750,43 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
+                 output=symm_output,
+             )[0]
+             return StandardCombineInput(hidden_states=trtllm_gen_output)
++        if _use_aiter:
++            topk_weights, topk_ids, _ = topk_output
++
++            if hasattr(torch, "float4_e2m1fn_x2"):
++                w13_weight = layer.w13_weight.view(torch.float4_e2m1fn_x2)
++                w2_weight = layer.w2_weight.view(torch.float4_e2m1fn_x2)
++            else:
++                w13_weight = layer.w13_weight
++                w2_weight = layer.w2_weight
++
++            origi_hidden_size = self.hidden_size - self.hidden_pad
++
++            x = torch.nn.functional.pad(
++                x,
++                (0, self.hidden_pad),
++                mode="constant",
++                value=0.0,
++            )
++
++            output = fused_moe(
++                x,
++                w13_weight,
++                w2_weight,
++                topk_weights,
++                topk_ids,
++                expert_mask=layer.expert_mask_gpu,
++                activation=ActivationType.Swiglu,
++                quant_type=QuantType.per_1x32,
++                w1_scale=layer.w13_weight_scale,
++                w2_scale=layer.w2_weight_scale,
++                doweight_stage1=self.moe_runner_config.apply_router_weight_on_input,
++                hidden_pad=self.hidden_pad,
++                intermediate_pad=self.intermediate_pad,
++                bias1=layer.w13_weight_bias,
++                bias2=layer.w2_weight_bias,
++            )
++            return StandardCombineInput(hidden_states=output)
+ 
+         backend = self.runner.runner_backend
+         if backend.is_triton_kernels():
+diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py
+index 49975de64..2230a9dae 100644
+--- a/python/sglang/srt/server_args.py
++++ b/python/sglang/srt/server_args.py
+@@ -1358,6 +1358,13 @@ class ServerArgs:
+                     logger.warning(
+                         "Detected SM100 and MXFP4 quantization format for GPT-OSS model, enabling FlashInfer MXFP4 MOE kernel."
+                     )
++                elif (
++                    is_hip() and get_bool_env_var("SGLANG_USE_AITER")
++                ) and is_mxfp4_quant_format:
++                    self.moe_runner_backend = "auto"
++                    logger.warning(
++                        "Detected ROCm and MXFP4 quantization format for GPT-OSS model, enabling aiter MXFP4 MOE kernel."
++                    )
+                 elif self.ep_size == 1 and is_triton_kernels_available():
+                     self.moe_runner_backend = "triton_kernel"
+                     logger.warning(
+-- 
+2.34.1
+
diff --git a/python/sglang/srt/layers/layernorm.py b/python/sglang/srt/layers/layernorm.py
index 7fde05894b59..6b5b724c9cd6 100644
--- a/python/sglang/srt/layers/layernorm.py
+++ b/python/sglang/srt/layers/layernorm.py
@@ -64,11 +64,21 @@
         gemma_rmsnorm,
         rmsnorm,
     )
+_vllm_layernorm_available = False
+rms_norm = None
+fused_add_rms_norm = None
+
 if _use_aiter:
     from aiter import rmsnorm2d_fwd as rms_norm
     from aiter import rmsnorm2d_fwd_with_add as fused_add_rms_norm
 elif _is_hip:
-    from vllm._custom_ops import fused_add_rms_norm, rms_norm
+    try:
+        from vllm._custom_ops import fused_add_rms_norm, rms_norm
+
+        _vllm_layernorm_available = True
+    except ImportError:
+        # Will use forward_native as fallback
+        pass
 
 logger = logging.getLogger(__name__)
 
@@ -176,6 +186,10 @@ def forward_hip(
         residual: Optional[torch.Tensor] = None,
         post_residual_addition: Optional[torch.Tensor] = None,
     ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
+        # Fallback to native implementation if vllm is not available
+        if not _use_aiter and not _vllm_layernorm_available:
+            return self.forward_native(x, residual, post_residual_addition)
+
         if not x.is_contiguous():
             # NOTE: Remove this if aiter kernel supports discontinuous input
             x = x.contiguous()
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py b/python/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py
index a1885fade143..a21143b24705 100644
--- a/python/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py
@@ -53,8 +53,7 @@
             from aiter import moe_sum
         except ImportError:
             raise ImportError("aiter is required when SGLANG_USE_AITER is set to True")
-    else:
-        from vllm import _custom_ops as vllm_ops
+    # No vllm import needed - using triton/torch.compile fallback for moe_sum
 
 padding_size = 128 if bool(int(os.getenv("SGLANG_MOE_PADDING", "0"))) else 0
 
@@ -492,9 +491,10 @@ def fused_experts_impl(
                         activation,
                     )
             else:
-                vllm_ops.silu_and_mul(
-                    intermediate_cache2, intermediate_cache1.view(-1, N)
-                )
+                # Native PyTorch fallback for non-CUDA/HIP environments
+                x = intermediate_cache1.view(-1, N)
+                d = x.shape[-1] // 2
+                intermediate_cache2.copy_(F.silu(x[..., :d]) * x[..., d:])
         elif activation == "gelu" and is_gated:
             assert gemm1_alpha is None, "gemm1_alpha is not supported for gelu"
             assert gemm1_limit is None, "gemm1_limit is not supported for gelu"
@@ -512,9 +512,10 @@ def fused_experts_impl(
                         activation,
                     )
             else:
-                vllm_ops.gelu_and_mul(
-                    intermediate_cache2, intermediate_cache1.view(-1, N)
-                )
+                # Native PyTorch fallback for non-CUDA/HIP environments
+                x = intermediate_cache1.view(-1, N)
+                d = x.shape[-1] // 2
+                intermediate_cache2.copy_(F.gelu(x[..., :d]) * x[..., d:])
         # Activation function without multiplication
         elif activation == "silu" and not is_gated:
             intermediate_cache2 = F.silu(intermediate_cache1.view(-1, N))
@@ -607,9 +608,9 @@ def fused_experts_impl(
                         routed_scaling_factor,
                     )
         else:
-            vllm_ops.moe_sum(
-                intermediate_cache3.view(*intermediate_cache3.shape),
-                out_hidden_states[begin_chunk_idx:end_chunk_idx],
+            # Native PyTorch fallback for non-CUDA/HIP environments
+            out_hidden_states[begin_chunk_idx:end_chunk_idx].copy_(
+                intermediate_cache3.view(*intermediate_cache3.shape).sum(dim=1)
             )
 
     return out_hidden_states
diff --git a/python/sglang/srt/layers/moe/moe_runner/triton.py b/python/sglang/srt/layers/moe/moe_runner/triton.py
index cdf3e9a471f3..c527118c737a 100644
--- a/python/sglang/srt/layers/moe/moe_runner/triton.py
+++ b/python/sglang/srt/layers/moe/moe_runner/triton.py
@@ -36,6 +36,8 @@
 _MOE_PADDING_SIZE = 128 if bool(int(os.getenv("SGLANG_MOE_PADDING", "0"))) else 0
 
 
+_vllm_moe_available = False
+
 if _is_cuda or _is_hip:
     from sgl_kernel import gelu_and_mul, silu_and_mul
 
@@ -48,7 +50,13 @@
                     "aiter is required when SGLANG_USE_AITER is set to True"
                 )
         else:
-            from vllm import _custom_ops as vllm_ops  # moe_sum
+            try:
+                from vllm import _custom_ops as vllm_ops  # moe_sum
+
+                _vllm_moe_available = True
+            except ImportError:
+                # Will use triton fallback
+                pass
 elif _is_cpu and _is_cpu_amx_available:
     pass
 
@@ -307,11 +315,22 @@ def run(
                     intermediate_cache3.view(*intermediate_cache3.shape),
                     out_hidden_states,
                 )
-            else:
+            elif _vllm_moe_available:
                 vllm_ops.moe_sum(
                     intermediate_cache3.view(*intermediate_cache3.shape),
                     out_hidden_states,
                 )
+            else:
+                # Triton fallback when vllm is not available
+                from sglang.srt.layers.moe.fused_moe_triton.fused_moe_triton_kernels import (
+                    moe_sum_reduce_triton,
+                )
+
+                moe_sum_reduce_triton(
+                    intermediate_cache3.view(*intermediate_cache3.shape),
+                    out_hidden_states,
+                    1.0,  # routed_scaling_factor
+                )
         else:
             vllm_ops.moe_sum(
                 intermediate_cache3.view(*intermediate_cache3.shape),
diff --git a/python/sglang/srt/layers/quantization/fp8_kernel.py b/python/sglang/srt/layers/quantization/fp8_kernel.py
index 7701f9757f52..2ce09e76e0b1 100644
--- a/python/sglang/srt/layers/quantization/fp8_kernel.py
+++ b/python/sglang/srt/layers/quantization/fp8_kernel.py
@@ -58,6 +58,8 @@
 
         enable_sgl_per_token_group_quant_8bit = False
 
+_vllm_available = False
+
 if _is_hip:
     if _use_aiter:
         try:
@@ -69,10 +71,14 @@
         except ImportError:
             raise ImportError("aiter is required when SGLANG_USE_AITER is set to True")
     else:
+        # Try to import vllm for fp8 quant ops, but don't fail if it is missing:
+        # scaled_fp8_quant falls back to the local Triton/PyTorch helpers when vllm is not available
         try:
             import vllm._C  # noqa: F401
+
+            _vllm_available = True
         except ImportError:
-            raise ImportError("vllm is required when SGLANG_USE_AITER is set to False")
+            pass
 
 logger = logging.getLogger(__name__)
 
@@ -1393,6 +1399,54 @@ def per_token_group_quant_mla_deep_gemm_masked_fp8(
 """
 if _is_hip:
 
+    def _triton_dynamic_per_token_quant_fp8(output, input, scale):
+        """Triton fallback for dynamic per-token FP8 quantization: launches `_per_token_group_quant_8bit` with one program per input row, passing (input, output, scale)."""
+        M, N = input.shape  # unpacking requires a 2-D input
+        BLOCK = triton.next_power_of_2(N)  # N rounded up to a power of two
+        num_warps = min(max(BLOCK // 256, 1), 8)  # BLOCK // 256 clamped to [1, 8]
+        eps = 1e-10
+        if _is_hip:  # NOTE(review): this def already sits under a module-level `if _is_hip:`, so the else arm looks unreachable
+            bit8_max = 224.0
+        else:
+            bit8_max = fp8_max
+        bit8_min = -bit8_max  # symmetric clamp range
+        _per_token_group_quant_8bit[(M,)](  # grid of M programs
+            input,
+            output,
+            scale,
+            N,  # group_size = N (per token)
+            N,
+            eps,
+            bit8_min=bit8_min,
+            bit8_max=bit8_max,
+            BLOCK=BLOCK,
+            num_warps=num_warps,
+            num_stages=1,
+        )
+
+    def _triton_dynamic_per_tensor_quant_fp8(output, input, scale):
+        """Fallback for dynamic per-tensor FP8 quantization; despite the name, the body uses only torch ops. Fills `scale` and `output` in place."""
+        # Compute scale from input: max(|input|), floored at eps, divided by bit8_max
+        eps = 1e-10
+        if _is_hip:  # NOTE(review): this def already sits under a module-level `if _is_hip:`, so the else arm looks unreachable
+            bit8_max = 224.0
+        else:
+            bit8_max = fp8_max
+        absmax = torch.max(torch.abs(input)).item()  # .item() yields a Python scalar (device-to-host copy)
+        scale_val = max(absmax, eps) / bit8_max  # eps keeps the scale non-zero for an all-zero input
+        scale.fill_(scale_val)
+        # Quantize with computed scale: divide, clamp to [-bit8_max, bit8_max], cast to output's dtype, write in place
+        output.copy_((input / scale_val).clamp(-bit8_max, bit8_max).to(output.dtype))
+
+    def _triton_static_quant_fp8(output, input, scale):
+        """Fallback for static FP8 quantization with a caller-supplied scale; despite the name, the body uses only torch ops. Fills `output` in place."""
+        if _is_hip:  # NOTE(review): this def already sits under a module-level `if _is_hip:`, so the else arm looks unreachable
+            bit8_max = 224.0
+        else:
+            bit8_max = fp8_max
+        scale_val = scale.item()  # .item() needs a single-element tensor and yields a Python scalar
+        output.copy_((input / scale_val).clamp(-bit8_max, bit8_max).to(output.dtype))  # divide, clamp, cast, write in place
+
     def scaled_fp8_quant(
         input: torch.Tensor,
         scale: Optional[torch.Tensor] = None,
@@ -1413,16 +1467,22 @@ def scaled_fp8_quant(
                 )
                 if _use_aiter:
                     dynamic_per_token_scaled_quant(output, input, scale)
-                else:
+                elif _vllm_available:
                     torch.ops._C.dynamic_per_token_scaled_fp8_quant(
                         output, input.contiguous(), scale, None
                     )
+                else:
+                    _triton_dynamic_per_token_quant_fp8(
+                        output, input.contiguous(), scale
+                    )
             else:
                 scale = torch.zeros(1, device=input.device, dtype=torch.float32)
                 if _use_aiter:
                     dynamic_per_tensor_quant(output, input, scale)
-                else:
+                elif _vllm_available:
                     torch.ops._C.dynamic_scaled_fp8_quant(output, input, scale)
+                else:
+                    _triton_dynamic_per_tensor_quant_fp8(output, input, scale)
         else:
             # Static scaling
             assert (
@@ -1430,8 +1490,10 @@ def scaled_fp8_quant(
             ), f"Expected scalar scale, got numel={scale.numel()}"
             if _use_aiter:
                 static_per_tensor_quant(output, input, scale)
-            else:
+            elif _vllm_available:
                 torch.ops._C.static_scaled_fp8_quant(output, input, scale)
+            else:
+                _triton_static_quant_fp8(output, input, scale)
 
         return output, scale
 
diff --git a/scripts/ci/amd/amd_ci_start_container.sh b/scripts/ci/amd/amd_ci_start_container.sh
index ad6cc198bf89..7539a80ac938 100755
--- a/scripts/ci/amd/amd_ci_start_container.sh
+++ b/scripts/ci/amd/amd_ci_start_container.sh
@@ -27,13 +27,25 @@ DEFAULT_MI35X_BASE_TAG="${SGLANG_VERSION}-${ROCM_VERSION}-mi35x"
 # Parse command line arguments
 MI30X_BASE_TAG="${DEFAULT_MI30X_BASE_TAG}"
 MI35X_BASE_TAG="${DEFAULT_MI35X_BASE_TAG}"
+CUSTOM_IMAGE=""
+BUILD_FROM_DOCKERFILE=""
+GPU_ARCH_BUILD=""
 
 while [[ $# -gt 0 ]]; do
   case $1 in
     --mi30x-base-tag) MI30X_BASE_TAG="$2"; shift 2;;
     --mi35x-base-tag) MI35X_BASE_TAG="$2"; shift 2;;
+    --custom-image) CUSTOM_IMAGE="$2"; shift 2;;
+    --build-from-dockerfile) BUILD_FROM_DOCKERFILE="1"; shift;;
+    --gpu-arch) GPU_ARCH_BUILD="$2"; shift 2;;
     -h|--help)
-      echo "Usage: $0 [--mi30x-base-tag TAG] [--mi35x-base-tag TAG]"
+      echo "Usage: $0 [OPTIONS]"
+      echo "Options:"  # keep these lines in sync with the option parser and the image-selection logic
+      echo "  --mi30x-base-tag TAG       Override MI30x base image tag"
+      echo "  --mi35x-base-tag TAG       Override MI35x base image tag"
+      echo "  --custom-image IMAGE       Pull and use a specific Docker image (takes precedence over --build-from-dockerfile)"
+      echo "  --build-from-dockerfile    Build image from docker/rocm720.Dockerfile (ARCH contains rocm720) or docker/rocm.Dockerfile; requires --gpu-arch"
+      echo "  --gpu-arch ARCH            GPU architecture for Dockerfile build (e.g., gfx950-rocm720)"
       exit 0
       ;;
     *) echo "Unknown option $1"; exit 1;;
@@ -142,10 +154,50 @@ find_latest_image() {
   fi
 }
 
-# Pull and run the latest image
-IMAGE=$(find_latest_image "${GPU_ARCH}")
-echo "Pulling Docker image: ${IMAGE}"
-docker pull "${IMAGE}"
+# Determine which image to use. Precedence: --custom-image, then --build-from-dockerfile, then the latest pre-built image.
+if [[ -n "${CUSTOM_IMAGE}" ]]; then
+  # Use explicitly provided custom image (pulled as given; find_latest_image is not consulted)
+  IMAGE="${CUSTOM_IMAGE}"
+  echo "Using custom image: ${IMAGE}"
+  docker pull "${IMAGE}"
+elif [[ -n "${BUILD_FROM_DOCKERFILE}" ]]; then
+  # Build image locally from a Dockerfile; --gpu-arch is mandatory in this mode
+  if [[ -z "${GPU_ARCH_BUILD}" ]]; then
+    echo "Error: --gpu-arch is required when using --build-from-dockerfile" >&2
+    exit 1
+  fi
+  
+  DOCKERFILE_DIR="${GITHUB_WORKSPACE:-$PWD}/docker"  # falls back to the current directory when GITHUB_WORKSPACE is unset or empty
+  
+  # Use rocm720.Dockerfile for ROCm 7.2 builds (arch string contains "rocm720"), otherwise use rocm.Dockerfile
+  if [[ "${GPU_ARCH_BUILD}" == *"rocm720"* ]]; then
+    DOCKERFILE="${DOCKERFILE_DIR}/rocm720.Dockerfile"
+  else
+    DOCKERFILE="${DOCKERFILE_DIR}/rocm.Dockerfile"
+  fi
+  
+  if [[ ! -f "${DOCKERFILE}" ]]; then
+    echo "Error: Dockerfile not found at ${DOCKERFILE}" >&2
+    exit 1
+  fi
+  
+  IMAGE="sglang-ci:${GPU_ARCH_BUILD}-$(date +%Y%m%d)"  # local tag: arch plus today's date (YYYYMMDD)
+  echo "Building Docker image from ${DOCKERFILE} with GPU_ARCH=${GPU_ARCH_BUILD}..."
+  
+  # Pass full GPU_ARCH (e.g., gfx950-rocm720) - Dockerfile handles stripping suffix; SGL_BRANCH is fixed to "main" and the docker/ directory is the build context
+  docker build \
+    --build-arg GPU_ARCH="${GPU_ARCH_BUILD}" \
+    --build-arg SGL_BRANCH="main" \
+    -t "${IMAGE}" \
+    -f "${DOCKERFILE}" \
+    "${DOCKERFILE_DIR}"
+  echo "Successfully built image: ${IMAGE}"
+else
+  # Find the latest pre-built image for the detected GPU_ARCH and pull it
+  IMAGE=$(find_latest_image "${GPU_ARCH}")
+  echo "Pulling Docker image: ${IMAGE}"
+  docker pull "${IMAGE}"
+fi
 
 CACHE_HOST=/home/runner/sgl-data
 if [[ -d "$CACHE_HOST" ]]; then