diff --git a/.github/workflows/nightly-test-amd-rocm720.yml b/.github/workflows/nightly-test-amd-rocm720.yml
new file mode 100644
index 000000000000..5e9fe8d1ce8d
--- /dev/null
+++ b/.github/workflows/nightly-test-amd-rocm720.yml
@@ -0,0 +1,789 @@
+name: Nightly Test (AMD ROCm 7.2)
+
+on: []
+#  pull_request:
+#    branches:
+#      - main
+#    paths:
+#      - "docker/rocm720.Dockerfile"
+#      - "scripts/ci/amd/amd_ci_start_container.sh"
+#      - ".github/workflows/nightly-test-amd-rocm720.yml"
+#  workflow_dispatch:
+#    inputs:
+#      job_filter:
+#        description: 'Select which job to run (leave empty or "all" to run all jobs)'
+#        required: false
+#        type: choice
+#        default: 'all'
+#        options:
+#          - 'all'
+#          # MI30x ROCm 7.2 Tests
+#          - 'nightly-test-1-gpu-unit-rocm720'
+#          # MI30x Accuracy Tests (GSM8K / MMMU)
+#          - 'nightly-accuracy-2-gpu-rocm720'
+#          - 'nightly-accuracy-2-gpu-vlm-rocm720'
+#          - 'nightly-perf-2-gpu-text-rocm720'
+#          - 'nightly-perf-2-gpu-vlm-rocm720'
+#          - 'nightly-accuracy-8-gpu-rocm720'
+#          # MI30x Accuracy + Performance Tests (combined)
+#          - 'nightly-8-gpu-grok1-int4-rocm720'
+#          - 'nightly-8-gpu-grok2-rocm720'
+#          - 'nightly-8-gpu-deepseek-v31-rocm720'
+#          - 'nightly-8-gpu-deepseek-v32-rocm720'
+#          - 'nightly-8-gpu-deepseek-v32-mtp-rocm720'
+#          - 'nightly-8-gpu-kimi-k2-rocm720'
+#          # MI35x jobs
+#          - 'nightly-test-1-gpu-mi35x-rocm720'
+#          - 'nightly-accuracy-8-gpu-mi35x-rocm720'
+#          - 'nightly-8-gpu-mi35x-grok1-int4-rocm720'
+#          - 'nightly-8-gpu-mi35x-grok2-rocm720'
+#          - 'nightly-8-gpu-mi35x-deepseek-r1-mxfp4-rocm720'
+#          - 'nightly-accuracy-8-gpu-mi35x-deepseek-v32-rocm720'
+#          - 'nightly-accuracy-8-gpu-mi35x-deepseek-v32-mtp-rocm720'
+#          - 'nightly-perf-8-gpu-mi35x-deepseek-v32-basic-rocm720'
+#          - 'nightly-perf-8-gpu-mi35x-deepseek-v32-mtp-rocm720'
+#      ref:
+#        description: 'Git ref (branch, tag, or SHA) to test'
+#        required: false
+#        type: string
+#        default: ''
+
+concurrency:
+  group: nightly-test-amd-rocm720-${{ inputs.ref || github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  # ============================================== MI30x ROCm 7.2 Unit Tests ==============================================
+  # 1-GPU Unit Tests - LoRA, debug utils, scheduler, etc. (MI30x ROCm 7.2)
+  nightly-test-1-gpu-unit-rocm720:
+    if: inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-test-1-gpu-unit-rocm720'
+    runs-on: linux-mi325-gpu-1
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.ref || github.ref }}
+
+      - name: Setup docker (ROCm 7.2)
+        timeout-minutes: 120
+        run: |
+          touch github_summary.md
+          bash scripts/ci/amd/amd_ci_start_container.sh --custom-image rocm/sgl-dev:v0.5.8-rocm720-mi30x-20260129-preview --gpu-arch gfx942-rocm720
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+
+      - name: Nightly Unit Test ROCm 7.2 (1-GPU)
+        timeout-minutes: 60
+        run: |
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-amd-1-gpu --nightly --timeout-per-file 600 --continue-on-error || TEST_EXIT_CODE=$?
+          echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+  # ============================================== MI30x ROCm 7.2 Accuracy Tests ==============================================
+  # 2-GPU Accuracy Tests - GSM8K eval (MI30x ROCm 7.2)
+  nightly-accuracy-2-gpu-rocm720:
+    if: inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-accuracy-2-gpu-rocm720'
+    runs-on: linux-mi325-gpu-2
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.ref || github.ref }}
+
+      - name: Setup docker (ROCm 7.2)
+        timeout-minutes: 120
+        run: |
+          touch github_summary.md
+          bash scripts/ci/amd/amd_ci_start_container.sh --custom-image rocm/sgl-dev:v0.5.8-rocm720-mi30x-20260129-preview --gpu-arch gfx942-rocm720
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+
+      - name: Nightly Test ROCm 7.2 (2-GPU)
+        run: |
+          > github_summary.md  # Clear summary file
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-amd --nightly --timeout-per-file 7200 || TEST_EXIT_CODE=$?
+          echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+  # 2-GPU VLM Accuracy Tests - Vision-Language Models MMMU evaluation (ROCm 7.2)
+  nightly-accuracy-2-gpu-vlm-rocm720:
+    if: inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-accuracy-2-gpu-vlm-rocm720'
+    runs-on: linux-mi325-gpu-2
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.ref || github.ref }}
+
+      - name: Setup docker (ROCm 7.2)
+        timeout-minutes: 120
+        run: |
+          touch github_summary.md
+          bash scripts/ci/amd/amd_ci_start_container.sh --custom-image rocm/sgl-dev:v0.5.8-rocm720-mi30x-20260129-preview --gpu-arch gfx942-rocm720
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+
+      - name: Nightly Accuracy Test ROCm 7.2 (2-GPU VLM MMMU)
+        timeout-minutes: 180
+        run: |
+          > github_summary.md  # Clear summary file
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-amd-accuracy-2-gpu-vlm --nightly --timeout-per-file 7200 || TEST_EXIT_CODE=$?
+          echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+  # 2-GPU Text Models Performance Tests (ROCm 7.2)
+  nightly-perf-2-gpu-text-rocm720:
+    if: inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-perf-2-gpu-text-rocm720'
+    runs-on: linux-mi325-gpu-2
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.ref || github.ref }}
+
+      - name: Setup docker (ROCm 7.2)
+        timeout-minutes: 120
+        run: |
+          touch github_summary.md
+          bash scripts/ci/amd/amd_ci_start_container.sh --custom-image rocm/sgl-dev:v0.5.8-rocm720-mi30x-20260129-preview --gpu-arch gfx942-rocm720
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+
+      - name: Performance Test ROCm 7.2 (2-GPU Text Models)
+        timeout-minutes: 120
+        run: |
+          > github_summary.md  # Clear summary file
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e SGLANG_USE_AITER=1 \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-amd-perf-text-2-gpu --nightly --timeout-per-file 3600 || TEST_EXIT_CODE=$?
+          echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+  # 2-GPU VLM Performance Tests (ROCm 7.2)
+  nightly-perf-2-gpu-vlm-rocm720:
+    if: inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-perf-2-gpu-vlm-rocm720'
+    runs-on: linux-mi325-gpu-2
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.ref || github.ref }}
+
+      - name: Setup docker (ROCm 7.2)
+        timeout-minutes: 120
+        run: |
+          touch github_summary.md
+          bash scripts/ci/amd/amd_ci_start_container.sh --custom-image rocm/sgl-dev:v0.5.8-rocm720-mi30x-20260129-preview --gpu-arch gfx942-rocm720
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+
+      - name: Performance Test ROCm 7.2 (2-GPU VLM Models)
+        timeout-minutes: 180
+        run: |
+          > github_summary.md  # Clear summary file
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e SGLANG_USE_AITER=1 \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-amd-perf-vlm-2-gpu --nightly --timeout-per-file 7200 || TEST_EXIT_CODE=$?
+          echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+  # 8-GPU Accuracy Tests - GPT-OSS, Grok1-FP8 (ROCm 7.2)
+  nightly-accuracy-8-gpu-rocm720:
+    if: inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-accuracy-8-gpu-rocm720'
+    runs-on: linux-mi325-gpu-8
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.ref || github.ref }}
+
+      - name: Setup docker (ROCm 7.2)
+        timeout-minutes: 120
+        run: |
+          touch github_summary.md
+          bash scripts/ci/amd/amd_ci_start_container.sh --custom-image rocm/sgl-dev:v0.5.8-rocm720-mi30x-20260129-preview --gpu-arch gfx942-rocm720
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+
+      - name: Accuracy Test ROCm 7.2 (8-GPU GPT-OSS)
+        timeout-minutes: 180
+        run: |
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-gpt-oss --nightly --timeout-per-file 7200 || TEST_EXIT_CODE=$?
+          echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+      - name: Accuracy Test ROCm 7.2 (8-GPU Grok1-FP8)
+        timeout-minutes: 60
+        run: |
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e RCCL_MSCCL_ENABLE=0 \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-grok1-fp8 --nightly --timeout-per-file 3600 || TEST_EXIT_CODE=$?
+          echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+  # ============================================== MI30x ROCm 7.2 Combined Accuracy + Performance Tests ==============================================
+  # 8-GPU Grok1-INT4 (Accuracy + Performance) ROCm 7.2
+  nightly-8-gpu-grok1-int4-rocm720:
+    if: inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-grok1-int4-rocm720'
+    runs-on: linux-mi325-gpu-8
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.ref || github.ref }}
+
+      - name: Setup docker (ROCm 7.2)
+        timeout-minutes: 120
+        run: |
+          touch github_summary.md
+          bash scripts/ci/amd/amd_ci_start_container.sh --custom-image rocm/sgl-dev:v0.5.8-rocm720-mi30x-20260129-preview --gpu-arch gfx942-rocm720
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+
+      - name: Accuracy Test ROCm 7.2 (8-GPU Grok1-INT4)
+        timeout-minutes: 60
+        run: |
+          > github_summary.md  # Clear summary file
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e RCCL_MSCCL_ENABLE=0 \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-grok1-int4 --nightly --timeout-per-file 3600 || TEST_EXIT_CODE=$?
+          echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+      - name: Performance Test ROCm 7.2 (8-GPU Grok1-INT4)
+        timeout-minutes: 60
+        continue-on-error: true
+        run: |
+          > github_summary.md  # Clear summary file
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e RCCL_MSCCL_ENABLE=0 \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-grok1-int4 --nightly --timeout-per-file 3600 || TEST_EXIT_CODE=$?
+          echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+  # 8-GPU Grok2 (Accuracy + Performance) ROCm 7.2
+  nightly-8-gpu-grok2-rocm720:
+    if: inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-grok2-rocm720'
+    runs-on: linux-mi325-gpu-8
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.ref || github.ref }}
+
+      - name: Setup docker (ROCm 7.2)
+        timeout-minutes: 120
+        run: |
+          touch github_summary.md
+          bash scripts/ci/amd/amd_ci_start_container.sh --custom-image rocm/sgl-dev:v0.5.8-rocm720-mi30x-20260129-preview --gpu-arch gfx942-rocm720
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+
+      - name: Accuracy Test ROCm 7.2 (8-GPU Grok2)
+        timeout-minutes: 60
+        run: |
+          > github_summary.md  # Clear summary file
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e RCCL_MSCCL_ENABLE=0 \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-grok2 --nightly --timeout-per-file 3600 || TEST_EXIT_CODE=$?
+          echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+      - name: Performance Test ROCm 7.2 (8-GPU Grok2)
+        timeout-minutes: 60
+        continue-on-error: true
+        run: |
+          > github_summary.md  # Clear summary file
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e RCCL_MSCCL_ENABLE=0 \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-grok2 --nightly --timeout-per-file 3600 || TEST_EXIT_CODE=$?
+          echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+  # 8-GPU DeepSeek-V3.1 (Accuracy + Performance) ROCm 7.2
+  nightly-8-gpu-deepseek-v31-rocm720:
+    if: inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-deepseek-v31-rocm720'
+    runs-on: linux-mi325-gpu-8
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.ref || github.ref }}
+
+      - name: Setup docker (ROCm 7.2)
+        timeout-minutes: 120
+        run: |
+          touch github_summary.md
+          bash scripts/ci/amd/amd_ci_start_container.sh --custom-image rocm/sgl-dev:v0.5.8-rocm720-mi30x-20260129-preview --gpu-arch gfx942-rocm720
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+
+      - name: Accuracy Test ROCm 7.2 (8-GPU DeepSeek-V3.1)
+        timeout-minutes: 120
+        run: |
+          > github_summary.md  # Clear summary file
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e SGLANG_USE_AITER=1 \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-deepseek-v31 --nightly --timeout-per-file 3600 || TEST_EXIT_CODE=$?
+          echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+      - name: Performance Test ROCm 7.2 (8-GPU DeepSeek-V3.1)
+        timeout-minutes: 300
+        continue-on-error: true
+        run: |
+          > github_summary.md  # Clear summary file
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e SGLANG_USE_ROCM700A=1 \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-deepseek-v31 --nightly --timeout-per-file 18000 || TEST_EXIT_CODE=$?
+          echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+  # 8-GPU DeepSeek-V3.2 (Basic Accuracy + Perf)
+  nightly-8-gpu-deepseek-v32-rocm720:
+    if: inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-deepseek-v32-rocm720'
+    runs-on: linux-mi325-gpu-8
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.ref || github.ref }}
+
+      - name: Setup docker (ROCm 7.2)
+        timeout-minutes: 120
+        run: |
+          touch github_summary.md
+          bash scripts/ci/amd/amd_ci_start_container.sh --custom-image rocm/sgl-dev:v0.5.8-rocm720-mi30x-20260129-preview --gpu-arch gfx942-rocm720
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+
+      - name: Accuracy Test ROCm 7.2 (8-GPU DeepSeek-V3.2 Basic)
+        timeout-minutes: 120
+        run: |
+          > github_summary.md  # Clear summary file
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-deepseek-v32 --nightly --timeout-per-file 3600 || TEST_EXIT_CODE=$?
+          echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+      - name: Performance Test ROCm 7.2 (8-GPU DeepSeek-V3.2 Basic)
+        timeout-minutes: 150
+        continue-on-error: true  # Perf test failure doesn't fail the job if accuracy passed
+        run: |
+          > github_summary.md  # Clear summary file
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-deepseek-v32-basic --nightly --timeout-per-file 5400 || TEST_EXIT_CODE=$?
+          echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+  # 8-GPU DeepSeek-V3.2 MTP (MTP Accuracy + Perf)
+  nightly-8-gpu-deepseek-v32-mtp-rocm720:
+    if: inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-deepseek-v32-mtp-rocm720'
+    runs-on: linux-mi325-gpu-8
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.ref || github.ref }}
+
+      - name: Setup docker ROCm 7.2
+        run: |
+          touch github_summary.md
+          bash scripts/ci/amd/amd_ci_start_container.sh --custom-image rocm/sgl-dev:v0.5.8-rocm720-mi30x-20260129-preview --gpu-arch gfx942-rocm720
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+
+      - name: Accuracy Test ROCm 7.2 (8-GPU DeepSeek-V3.2 MTP)
+        timeout-minutes: 120
+        run: |
+          > github_summary.md  # Clear summary file
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-deepseek-v32-mtp --nightly --timeout-per-file 3600 || TEST_EXIT_CODE=$?
+          echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+      - name: Performance Test ROCm 7.2 (8-GPU DeepSeek-V3.2 MTP)
+        timeout-minutes: 180
+        continue-on-error: true  # Perf test failure doesn't fail the job if accuracy passed
+        run: |
+          > github_summary.md  # Clear summary file
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-deepseek-v32-mtp --nightly --timeout-per-file 7200 || TEST_EXIT_CODE=$?
+          echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+  # 8-GPU Kimi-K2 (Accuracy + Speed)
+  nightly-8-gpu-kimi-k2-rocm720:
+    if: inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-kimi-k2-rocm720'
+    runs-on: linux-mi325-gpu-8
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.ref || github.ref }}
+
+      - name: Setup docker ROCm 7.2
+        run: |
+          touch github_summary.md
+          bash scripts/ci/amd/amd_ci_start_container.sh --custom-image rocm/sgl-dev:v0.5.8-rocm720-mi30x-20260129-preview --gpu-arch gfx942-rocm720
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+
+      - name: Accuracy Test ROCm 7.2 (8-GPU Kimi-K2)
+        timeout-minutes: 120
+        run: |
+          > github_summary.md  # Clear summary file
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-kimi-k2 --nightly --timeout-per-file 3600 || TEST_EXIT_CODE=$?
+          echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+  # ============================================== MI35x ROCm 7.2 Tests ==============================================
+  # MI35x 1-GPU ROCm 7.2 tests - builds from Dockerfile with gfx950-rocm720
+  nightly-test-1-gpu-mi35x-rocm720:
+    if: inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-test-1-gpu-mi35x-rocm720'
+    runs-on: linux-mi35x-gpu-1
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.ref || github.ref }}
+
+      - name: Setup docker (ROCm 7.2)
+        timeout-minutes: 120
+        run: |
+          touch github_summary.md
+          bash scripts/ci/amd/amd_ci_start_container.sh --custom-image rocm/sgl-dev:v0.5.8-rocm720-mi35x-20260129-preview --gpu-arch gfx950-rocm720
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+
+      - name: Test MI35x ROCm 7.2 (1-GPU)
+        timeout-minutes: 60
+        run: |
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-amd-1-gpu-mi35x --nightly --timeout-per-file 600 --continue-on-error || TEST_EXIT_CODE=$?
+          echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+  # MI35x 8-GPU Accuracy Tests - GPT-OSS (accuracy only)
+  nightly-accuracy-8-gpu-mi35x-rocm720:
+    if: inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-accuracy-8-gpu-mi35x-rocm720'
+    runs-on: linux-mi35x-gpu-8
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.ref || github.ref }}
+
+      - name: Setup docker (ROCm 7.2)
+        timeout-minutes: 120
+        run: |
+          touch github_summary.md
+          bash scripts/ci/amd/amd_ci_start_container.sh --custom-image rocm/sgl-dev:v0.5.8-rocm720-mi35x-20260129-preview --gpu-arch gfx950-rocm720
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+
+      - name: Accuracy Test MI35x ROCm 7.2 (8-GPU GPT-OSS)
+        timeout-minutes: 180
+        run: |
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-amd-8-gpu-mi35x --nightly --timeout-per-file 7200 || TEST_EXIT_CODE=$?
+          echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+  # MI35x 8-GPU Grok1-INT4 (Accuracy + Performance) ROCm 7.2
+  nightly-8-gpu-mi35x-grok1-int4-rocm720:
+    if: inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-mi35x-grok1-int4-rocm720'
+    runs-on: linux-mi35x-gpu-8
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.ref || github.ref }}
+
+      - name: Setup docker (ROCm 7.2)
+        timeout-minutes: 120
+        run: |
+          touch github_summary.md
+          bash scripts/ci/amd/amd_ci_start_container.sh --custom-image rocm/sgl-dev:v0.5.8-rocm720-mi35x-20260129-preview --gpu-arch gfx950-rocm720
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+
+      - name: Accuracy Test MI35x ROCm 7.2 (8-GPU Grok1-INT4)
+        timeout-minutes: 60
+        run: |
+          > github_summary.md  # Clear summary file
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e RCCL_MSCCL_ENABLE=0 \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-mi35x-grok1-int4 --nightly --timeout-per-file 3600 || TEST_EXIT_CODE=$?
+          echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+      - name: Performance Test MI35x ROCm 7.2 (8-GPU Grok1-INT4)
+        timeout-minutes: 60
+        continue-on-error: true
+        run: |
+          > github_summary.md  # Clear summary file
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e RCCL_MSCCL_ENABLE=0 \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-mi35x-grok1-int4 --nightly --timeout-per-file 3600 || TEST_EXIT_CODE=$?
+          echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+  # MI35x 8-GPU Grok2 (Accuracy + Performance) ROCm 7.2
+  nightly-8-gpu-mi35x-grok2-rocm720:
+    if: inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-mi35x-grok2-rocm720'
+    runs-on: linux-mi35x-gpu-8
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.ref || github.ref }}
+
+      - name: Setup docker (ROCm 7.2)
+        timeout-minutes: 120
+        run: |
+          touch github_summary.md
+          bash scripts/ci/amd/amd_ci_start_container.sh --custom-image rocm/sgl-dev:v0.5.8-rocm720-mi35x-20260129-preview --gpu-arch gfx950-rocm720
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+
+      - name: Accuracy Test MI35x ROCm 7.2 (8-GPU Grok2)
+        timeout-minutes: 60
+        run: |
+          > github_summary.md  # Clear summary file
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e RCCL_MSCCL_ENABLE=0 \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-mi35x-grok2 --nightly --timeout-per-file 3600 || TEST_EXIT_CODE=$?
+          echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+      - name: Performance Test MI35x ROCm 7.2 (8-GPU Grok2)
+        timeout-minutes: 60
+        continue-on-error: true
+        run: |
+          > github_summary.md  # Clear summary file
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e RCCL_MSCCL_ENABLE=0 \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-mi35x-grok2 --nightly --timeout-per-file 3600 || TEST_EXIT_CODE=$?
+          echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+  # MI35x 8-GPU DeepSeek-R1-MXFP4 (Accuracy + Performance) ROCm 7.2
+  nightly-8-gpu-mi35x-deepseek-r1-mxfp4-rocm720:
+    if: inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-mi35x-deepseek-r1-mxfp4-rocm720'
+    runs-on: linux-mi35x-gpu-8
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.ref || github.ref }}
+
+      - name: Setup docker (ROCm 7.2)
+        timeout-minutes: 120
+        run: |
+          touch github_summary.md
+          bash scripts/ci/amd/amd_ci_start_container.sh --custom-image rocm/sgl-dev:v0.5.8-rocm720-mi35x-20260129-preview --gpu-arch gfx950-rocm720
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+
+      - name: Accuracy Test MI35x ROCm 7.2 (8-GPU DeepSeek-R1-MXFP4)
+        timeout-minutes: 180
+        run: |
+          > github_summary.md  # Clear summary file
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-amd-8-gpu-mi35x-deepseek-r1-mxfp4 --nightly --timeout-per-file 7200 || TEST_EXIT_CODE=$?
+          echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+      - name: Performance Test MI35x ROCm 7.2 (8-GPU DeepSeek-R1-MXFP4)
+        timeout-minutes: 300
+        continue-on-error: true
+        run: |
+          > github_summary.md  # Clear summary file
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 registered/amd/perf/mi35x/test_deepseek_r1_mxfp4_perf_mi35x.py || TEST_EXIT_CODE=$?
+          echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+  # MI35x 8-GPU DeepSeek-V3.2 Accuracy Test (ROCm 7.2)
+  nightly-accuracy-8-gpu-mi35x-deepseek-v32-rocm720:
+    if: inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-accuracy-8-gpu-mi35x-deepseek-v32-rocm720'
+    runs-on: linux-mi35x-gpu-8
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.ref || github.ref }}
+
+      - name: Setup docker (ROCm 7.2)
+        timeout-minutes: 120
+        run: |
+          touch github_summary.md
+          bash scripts/ci/amd/amd_ci_start_container.sh --custom-image rocm/sgl-dev:v0.5.8-rocm720-mi35x-20260129-preview --gpu-arch gfx950-rocm720
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+
+      - name: Accuracy Test MI35x ROCm 7.2 (8-GPU DeepSeek-V3.2)
+        timeout-minutes: 120
+        run: |
+          > github_summary.md  # Clear summary file
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-amd-8-gpu-mi35x-deepseek-v32 --nightly --timeout-per-file 3600 || TEST_EXIT_CODE=$?
+          echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+  # MI35x 8-GPU DeepSeek-V3.2 TP+MTP Accuracy Test
+  nightly-accuracy-8-gpu-mi35x-deepseek-v32-mtp-rocm720:
+    if: inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-accuracy-8-gpu-mi35x-deepseek-v32-mtp-rocm720'
+    runs-on: linux-mi35x-gpu-8
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.ref || github.ref }}
+
+      - name: Setup docker (ROCm 7.2)
+        run: |
+          touch github_summary.md
+          bash scripts/ci/amd/amd_ci_start_container.sh --custom-image rocm/sgl-dev:v0.5.8-rocm720-mi35x-20260129-preview --gpu-arch gfx950-rocm720
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+
+      - name: Accuracy Test ROCm 7.2 MI35x (8-GPU DeepSeek-V3.2 TP+MTP)
+        timeout-minutes: 120
+        run: |
+          > github_summary.md  # Clear summary file
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-mi35x-deepseek-v32-mtp --nightly --timeout-per-file 3600 || TEST_EXIT_CODE=$?
+          echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+  # MI35x 8-GPU DeepSeek-V3.2 Performance Test (Basic) ROCm 7.2
+  nightly-perf-8-gpu-mi35x-deepseek-v32-basic-rocm720:
+    if: inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-perf-8-gpu-mi35x-deepseek-v32-basic-rocm720'
+    runs-on: linux-mi35x-gpu-8
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.ref || github.ref }}
+
+      - name: Setup docker (ROCm 7.2)
+        timeout-minutes: 120
+        run: |
+          touch github_summary.md
+          bash scripts/ci/amd/amd_ci_start_container.sh --custom-image rocm/sgl-dev:v0.5.8-rocm720-mi35x-20260129-preview --gpu-arch gfx950-rocm720
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+
+      - name: Performance Test MI35x ROCm 7.2 (8-GPU DeepSeek-V3.2 Basic)
+        timeout-minutes: 150
+        run: |
+          > github_summary.md  # Clear summary file
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-mi35x-deepseek-v32-basic --nightly --timeout-per-file 5400 || TEST_EXIT_CODE=$?
+          echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+  # MI35x 8-GPU DeepSeek-V3.2 Performance Test (MTP) ROCm 7.2
+  nightly-perf-8-gpu-mi35x-deepseek-v32-mtp-rocm720:
+    if: inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-perf-8-gpu-mi35x-deepseek-v32-mtp-rocm720'
+    runs-on: linux-mi35x-gpu-8
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.ref || github.ref }}
+
+      - name: Setup docker (ROCm 7.2)
+        timeout-minutes: 120
+        run: |
+          touch github_summary.md
+          bash scripts/ci/amd/amd_ci_start_container.sh --custom-image rocm/sgl-dev:v0.5.8-rocm720-mi35x-20260129-preview --gpu-arch gfx950-rocm720
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+
+      - name: Performance Test MI35x ROCm 7.2 (8-GPU DeepSeek-V3.2 MTP)
+        timeout-minutes: 150
+        run: |
+          > github_summary.md  # Clear summary file
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-mi35x-deepseek-v32-mtp --nightly --timeout-per-file 5400 || TEST_EXIT_CODE=$?
+ echo "$(> $GITHUB_STEP_SUMMARY || true + exit ${TEST_EXIT_CODE:-0} + + check-all-jobs: + if: always() + needs: + # MI30x ROCm 7.2 Tests + - nightly-test-1-gpu-unit-rocm720 + - nightly-accuracy-2-gpu-rocm720 + - nightly-accuracy-2-gpu-vlm-rocm720 + # MI30x Performance Tests + - nightly-perf-2-gpu-text-rocm720 + - nightly-perf-2-gpu-vlm-rocm720 + - nightly-accuracy-8-gpu-rocm720 + - nightly-8-gpu-grok1-int4-rocm720 + - nightly-8-gpu-grok2-rocm720 + - nightly-8-gpu-deepseek-v31-rocm720 + - nightly-8-gpu-deepseek-v32-rocm720 + - nightly-8-gpu-deepseek-v32-mtp-rocm720 + - nightly-8-gpu-kimi-k2-rocm720 + # MI35x jobs + - nightly-test-1-gpu-mi35x-rocm720 + - nightly-accuracy-8-gpu-mi35x-rocm720 + - nightly-8-gpu-mi35x-grok1-int4-rocm720 + - nightly-8-gpu-mi35x-grok2-rocm720 + - nightly-8-gpu-mi35x-deepseek-r1-mxfp4-rocm720 + - nightly-accuracy-8-gpu-mi35x-deepseek-v32-rocm720 + - nightly-accuracy-8-gpu-mi35x-deepseek-v32-mtp-rocm720 + - nightly-perf-8-gpu-mi35x-deepseek-v32-basic-rocm720 + - nightly-perf-8-gpu-mi35x-deepseek-v32-mtp-rocm720 + runs-on: ubuntu-latest + steps: + - name: Check if any job failed + run: | + if [[ "${{ contains(needs.*.result, 'failure') }}" == "true" ]]; then + echo "One or more ROCm 7.2 test jobs failed" + exit 1 + fi + if [[ "${{ contains(needs.*.result, 'cancelled') }}" == "true" ]]; then + echo "One or more ROCm 7.2 test jobs were cancelled" + exit 1 + fi + echo "All ROCm 7.2 test jobs passed" diff --git a/.github/workflows/pr-test-amd-rocm720.yml b/.github/workflows/pr-test-amd-rocm720.yml new file mode 100644 index 000000000000..5ca23d7dc861 --- /dev/null +++ b/.github/workflows/pr-test-amd-rocm720.yml @@ -0,0 +1,944 @@ +name: PR Test ROCm 7.2 (AMD) +# Dynamic run-name for /rerun-stage commands to enable URL lookup +# Format: "[stage-name] sha" for fork PRs, "[stage-name]" for non-fork, default for normal runs +run-name: ${{ inputs.target_stage && (inputs.pr_head_sha && format('[{0}] {1}', inputs.target_stage, inputs.pr_head_sha) || 
format('[{0}]', inputs.target_stage)) || '' }}
+
+on: []
+#   push:
+#     branches: [ main ]
+#     paths:
+#       - "python/**"
+#       - "scripts/ci/**"
+#       - "test/**"
+#       - "sgl-kernel/**"
+#       - ".github/workflows/pr-test-amd-rocm720.yml"
+#       - "docker/rocm720.Dockerfile"
+#   pull_request:
+#     branches: [ main ]
+#     paths:
+#       - "python/**"
+#       - "scripts/ci/**"
+#       - "test/**"
+#       - "sgl-kernel/**"
+#       - ".github/workflows/pr-test-amd-rocm720.yml"
+#       - "docker/rocm720.Dockerfile"
+#   workflow_dispatch:
+#     inputs:
+#       target_stage:
+#         description: "Specific stage to run (optional, for quick testing)"
+#         required: false
+#         type: string
+#         default: ""
+#       pr_head_sha:
+#         description: "PR head SHA to checkout (for /rerun-stage on fork PRs)"
+#         required: false
+#         type: string
+#         default: ""
+#   workflow_call:
+#     inputs:
+#       ref:
+#         description: 'Git ref (branch, tag, or SHA) to test. If not provided, uses the default branch.'
+#         required: false
+#         type: string
+#         default: ''
+#       run_all_tests:
+#         description: "Run all tests (for releasing or testing purpose)"
+#         required: false
+#         type: boolean
+#         default: false
+
+concurrency:
+  # Workflow-specific prefix (concurrency groups are repository-wide); pr_head_sha keeps /rerun-stage dispatches apart from main branch runs
+  group: pr-test-amd-rocm720-${{ inputs.pr_head_sha || inputs.ref || github.ref }}
+  cancel-in-progress: ${{ github.event_name != 'workflow_call' }}  # NOTE(review): event_name is inherited from the caller, so this is likely always true - confirm
+
+jobs:
+  call-gate:
+    uses: ./.github/workflows/pr-gate.yml
+    secrets: inherit
+  check-changes:
+    needs: [call-gate]
+    runs-on: ubuntu-latest
+    outputs:
+      main_package: ${{ steps.filter.outputs.main_package || steps.run-mode.outputs.run_all_tests }}  # filter result, or run_all_tests when the filter step is skipped
+      sgl_kernel: ${{ steps.filter.outputs.sgl_kernel || steps.run-mode.outputs.run_all_tests }}
+      multimodal_gen: ${{ steps.filter.outputs.multimodal_gen || steps.run-mode.outputs.run_all_tests }}
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}
+
+      - name: Determine run mode
+        id: run-mode
+        run: |
+          # Run every suite when the run_all_tests input is true; otherwise rely on the path filter below
+          # Note: github.event_name is inherited from the caller, so it cannot be used to detect workflow_call
+          if [[ "${{ inputs.run_all_tests }}" == "true" ]]; then
+            echo "run_all_tests=true" >> "$GITHUB_OUTPUT"
+            echo "Run mode: ALL TESTS (run_all_tests=${{ inputs.run_all_tests }})"
+          else
+            echo "run_all_tests=false" >> "$GITHUB_OUTPUT"
+            echo "Run mode: FILTERED (triggered by ${{ github.event_name }})"
+          fi
+
+      - name: Detect file changes
+        id: filter
+        uses: dorny/paths-filter@v3
+        if: steps.run-mode.outputs.run_all_tests != 'true'  # skipped in run-all mode; the job outputs then fall back to run_all_tests
+        with:
+          filters: |
+            main_package:
+              - "python/sglang/!(multimodal_gen)/**"
+              - "python/pyproject_rocm.toml"
+              - "python/pyproject_other.toml"
+              - "scripts/ci/amd/*"
+              - "scripts/ci/utils/*"
+              - "test/**"
+              - ".github/workflows/pr-test-amd-rocm720.yml"
+            sgl_kernel:
+              - "sgl-kernel/**"
+              - ".github/workflows/pr-test-amd-rocm720.yml"
+            multimodal_gen:
+              - "python/sglang/multimodal_gen/**"
+              - "python/sglang/cli/**"
+              - "python/pyproject_rocm.toml"
+              - "python/pyproject_other.toml"
+
+  # =============================================== sgl-kernel ====================================================
+  sgl-kernel-unit-test-amd:
+    needs: [check-changes]
+    if: |
+      always() &&
+      (
+        (inputs.target_stage == 'sgl-kernel-unit-test-amd') ||
+        (
+          !inputs.target_stage &&
+          needs.check-changes.outputs.sgl_kernel == 'true'
+        )
+      )
+    strategy:
+      fail-fast: false
+      matrix:
+        runner: [linux-mi325-gpu-1]
+    runs-on: ${{matrix.runner}}
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}
+
+      - name: Ensure VRAM is clear
+        run: bash scripts/ensure_vram_clear.sh rocm
+
+      - name: Start CI container
+        run: bash scripts/ci/amd/amd_ci_start_container.sh --custom-image rocm/sgl-dev:v0.5.8-rocm720-mi30x-20260129-preview --gpu-arch gfx942-rocm720
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+
+      - name: Run test
+        timeout-minutes: 14
+        run: |
+          docker exec -w 
/sglang-checkout/sgl-kernel/tests ci_sglang python3 -m pytest test_moe_align.py + docker exec -w /sglang-checkout/sgl-kernel/tests ci_sglang python3 -m pytest test_moe_topk_softmax.py + docker exec -w /sglang-checkout/sgl-kernel/tests/speculative ci_sglang python3 -m pytest test_eagle_utils.py + docker exec -w /sglang-checkout/sgl-kernel/tests ci_sglang python3 -m pytest test_apply_token_bitmask_inplace.py + docker exec -w /sglang-checkout/sgl-kernel/tests ci_sglang python3 -m pytest test_activation.py + docker exec -w /sglang-checkout/sgl-kernel/tests ci_sglang python3 -m pytest test_topk.py + docker exec -w /sglang-checkout/sgl-kernel/tests ci_sglang python3 -m pytest test_kvcacheio.py + docker exec -w /sglang-checkout/sgl-kernel/tests/sgl_diffusion ci_sglang python3 -m pytest test_timestep_embedding.py + docker exec -w /sglang-checkout/sgl-kernel/tests ci_sglang python3 -m pytest test_moe_topk_sigmoid.py + docker exec -w /sglang-checkout/sgl-kernel/tests ci_sglang python3 -m pytest test_torch_defaults_reset.py + + sgl-kernel-unit-test-2-gpu-amd: + needs: [check-changes] + if: | + always() && + ( + (inputs.target_stage == 'sgl-kernel-unit-test-2-gpu-amd') || + ( + !inputs.target_stage && + needs.check-changes.outputs.sgl_kernel == 'true' + ) + ) + strategy: + fail-fast: false + matrix: + runner: [linux-mi325-gpu-2] + runs-on: ${{matrix.runner}} + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }} + + - name: Ensure VRAM is clear + run: bash scripts/ensure_vram_clear.sh rocm + + - name: Start CI container + run: bash scripts/ci/amd/amd_ci_start_container.sh --custom-image rocm/sgl-dev:v0.5.8-rocm720-mi30x-20260129-preview --gpu-arch gfx942-rocm720 + env: + GITHUB_WORKSPACE: ${{ github.workspace }} + + - name: Run test + timeout-minutes: 20 + run: | + docker exec -w /sglang-checkout/sgl-kernel/tests ci_sglang python3 -m pytest test_amd_deterministic_custom_allreduce.py + docker exec -w 
/sglang-checkout/sgl-kernel/tests ci_sglang python3 -m pytest test_amd_nccl_allreduce_determinism.py + + # =============================================== primary ==================================================== + + stage-a-test-1-amd: + needs: [check-changes] + if: | + always() && + ( + (inputs.target_stage == 'stage-a-test-1-amd') || + ( + !inputs.target_stage && + (!failure() && !cancelled()) && + ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true')) + ) + ) + strategy: + fail-fast: false + matrix: + runner: [linux-mi325-gpu-1] + runs-on: ${{matrix.runner}} + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }} + + - name: Ensure VRAM is clear + run: bash scripts/ensure_vram_clear.sh rocm + + - name: Start CI container + run: bash scripts/ci/amd/amd_ci_start_container.sh --custom-image rocm/sgl-dev:v0.5.8-rocm720-mi30x-20260129-preview --gpu-arch gfx942-rocm720 + env: + GITHUB_WORKSPACE: ${{ github.workspace }} + + - name: Run test + timeout-minutes: 10 + run: | + bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-a-test-1-amd + + stage-b-test-small-1-gpu-amd: + needs: [check-changes, stage-a-test-1-amd] + if: | + always() && + ( + (inputs.target_stage == 'stage-b-test-small-1-gpu-amd') || + ( + !inputs.target_stage && + (!failure() && !cancelled()) && + ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true')) + ) + ) + strategy: + fail-fast: false + matrix: + runner: [linux-mi325-gpu-1] + part: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12] + runs-on: ${{matrix.runner}} + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }} + + - name: Ensure VRAM is clear + run: bash scripts/ensure_vram_clear.sh rocm + + - name: Start CI container + run: bash 
scripts/ci/amd/amd_ci_start_container.sh --custom-image rocm/sgl-dev:v0.5.8-rocm720-mi30x-20260129-preview --gpu-arch gfx942-rocm720 + env: + GITHUB_WORKSPACE: ${{ github.workspace }} + + - name: Install dependencies + run: | + docker exec -w / ci_sglang pip install --force-reinstall /triton-3.5.1+rocm7.2.0.gita272dfa8-cp310-cp310-linux_x86_64.whl + docker exec -w / ci_sglang git clone --branch v0.4.1 --depth 1 https://github.com/EvolvingLMMs-Lab/lmms-eval.git || true + docker exec -w /lmms-eval ci_sglang pip install --cache-dir=/sgl-data/pip-cache -e . + docker exec -w / ci_sglang git clone --depth 1 https://github.com/merrymercy/human-eval.git || true + docker exec -w /human-eval ci_sglang pip install --cache-dir=/sgl-data/pip-cache -e . + + - name: Run test + timeout-minutes: 30 + run: | + bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-small-1-gpu-amd --auto-partition-id ${{ matrix.part }} --auto-partition-size 13 --timeout-per-file 1800 + + stage-b-test-small-1-gpu-amd-mi35x: + needs: [check-changes, stage-a-test-1-amd] + if: | + always() && + ( + (inputs.target_stage == 'stage-b-test-small-1-gpu-amd-mi35x') || + ( + !inputs.target_stage && + (!failure() && !cancelled()) && + ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true')) + ) + ) + strategy: + fail-fast: false + matrix: + runner: [linux-mi35x-gpu-1] + runs-on: ${{matrix.runner}} + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }} + + - name: Ensure VRAM is clear + run: bash scripts/ensure_vram_clear.sh rocm + + - name: Start CI container + run: bash scripts/ci/amd/amd_ci_start_container.sh --custom-image rocm/sgl-dev:v0.5.8-rocm720-mi35x-20260129-preview --gpu-arch gfx950-rocm720 + env: + GITHUB_WORKSPACE: ${{ github.workspace }} + + - name: Install dependencies + run: | + docker exec -w / ci_sglang pip 
install --force-reinstall /triton-3.5.1+rocm7.2.0.gita272dfa8-cp310-cp310-linux_x86_64.whl + docker exec -w / ci_sglang git clone --branch v0.4.1 --depth 1 https://github.com/EvolvingLMMs-Lab/lmms-eval.git || true + docker exec -w /lmms-eval ci_sglang pip install --cache-dir=/sgl-data/pip-cache -e . + docker exec -w / ci_sglang git clone --depth 1 https://github.com/merrymercy/human-eval.git || true + docker exec -w /human-eval ci_sglang pip install --cache-dir=/sgl-data/pip-cache -e . + + - name: Run test + timeout-minutes: 30 + run: | + bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-small-1-gpu-amd-mi35x + + stage-b-test-large-2-gpu-amd: + needs: [check-changes, stage-a-test-1-amd] + if: | + always() && + ( + (inputs.target_stage == 'stage-b-test-large-2-gpu-amd') || + ( + !inputs.target_stage && + (!failure() && !cancelled()) && + ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true')) + ) + ) + strategy: + fail-fast: false + matrix: + runner: [linux-mi325-gpu-2] + runs-on: ${{matrix.runner}} + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }} + + - name: Ensure VRAM is clear + run: bash scripts/ensure_vram_clear.sh rocm + + - name: Start CI container + run: bash scripts/ci/amd/amd_ci_start_container.sh --custom-image rocm/sgl-dev:v0.5.8-rocm720-mi30x-20260129-preview --gpu-arch gfx942-rocm720 + env: + GITHUB_WORKSPACE: ${{ github.workspace }} + + - name: Install dependencies + run: | + docker exec -w / ci_sglang pip install --force-reinstall /triton-3.5.1+rocm7.2.0.gita272dfa8-cp310-cp310-linux_x86_64.whl + docker exec -w / ci_sglang git clone --branch v0.4.1 --depth 1 https://github.com/EvolvingLMMs-Lab/lmms-eval.git || true + docker exec -w /lmms-eval ci_sglang pip install --cache-dir=/sgl-data/pip-cache -e . 
+ docker exec -w / ci_sglang git clone --depth 1 https://github.com/merrymercy/human-eval.git || true + docker exec -w /human-eval ci_sglang pip install --cache-dir=/sgl-data/pip-cache -e . + + - name: Run test + timeout-minutes: 30 + run: | + bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-large-2-gpu-amd + + multimodal-gen-test-1-gpu-amd: + needs: [check-changes] + if: needs.check-changes.outputs.multimodal_gen == 'true' + strategy: + fail-fast: false + max-parallel: 1 # Run one at a time to avoid eviction from resource exhaustion during AITER kernel JIT + matrix: + runner: [linux-mi325-gpu-1] + part: [0, 1] # 2 partitions: 11 tests ÷ 2 = ~5-6 tests each + runs-on: ${{matrix.runner}} + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }} + + - name: Ensure VRAM is clear + run: bash scripts/ensure_vram_clear.sh rocm + + - name: Download artifacts + if: needs.check-changes.outputs.sgl_kernel == 'true' + uses: actions/download-artifact@v4 + with: + path: sgl-kernel/dist/ + merge-multiple: true + pattern: wheel-python3.10-cuda12.9 + + - name: Start CI container + run: bash scripts/ci/amd/amd_ci_start_container.sh --custom-image rocm/sgl-dev:v0.5.8-rocm720-mi30x-20260129-preview --gpu-arch gfx942-rocm720 + env: + GITHUB_WORKSPACE: ${{ github.workspace }} + + - name: Setup kernel caches + run: | + # Use the persistent /sgl-data directory (mounted from /home/runner/sgl-data) + # This directory persists across container restarts on the self-hosted runner + docker exec ci_sglang mkdir -p /sgl-data/aiter-kernels /sgl-data/miopen-cache /sgl-data/hf-cache/hub + + # Clear pre-built AITER kernels from Docker image to avoid segfaults + # The image may have stale/incompatible kernels at /sgl-workspace/aiter/aiter/jit/ + echo "Clearing pre-built AITER kernels from Docker image..." 
+ docker exec ci_sglang rm -rf /sgl-workspace/aiter/aiter/jit/*.so 2>/dev/null || true + docker exec ci_sglang rm -rf /sgl-data/aiter-kernels/*.so 2>/dev/null || true + echo "AITER kernels cleared - will be rebuilt on first use" + + # Create persistent cache marker if /sgl-data is a real mount (not ephemeral) + # This tells the test cleanup code to NOT delete downloaded models + if docker exec ci_sglang test -d /sgl-data && docker exec ci_sglang mountpoint -q /sgl-data 2>/dev/null; then + docker exec ci_sglang touch /sgl-data/hf-cache/.persistent_cache + echo "Created .persistent_cache marker - HF cache will persist" + else + echo "WARNING: /sgl-data is not a mount point - models will be cleaned up after each test" + fi + + # Check MIOpen cache (VAE convolution kernels) + miopen_files=$(docker exec ci_sglang find /sgl-data/miopen-cache -name "*.udb" 2>/dev/null | wc -l || echo "0") + echo "Found ${miopen_files} MIOpen cache files" + + - name: Diagnose HF cache and system resources + run: | + echo "=== System Memory Status ===" + free -h + echo "" + echo "=== Disk Space ===" + df -h /home/runner/sgl-data 2>/dev/null || df -h + echo "" + echo "=== HF Cache Directory Structure ===" + docker exec ci_sglang ls -la /sgl-data/hf-cache/ 2>/dev/null || echo "HF cache dir not found" + docker exec ci_sglang ls -la /sgl-data/hf-cache/hub/ 2>/dev/null || echo "HF hub cache not found" + echo "" + echo "=== Checking for cached diffusion models (1-GPU tests) ===" + # Models used in 1-GPU tests: Wan2.1-T2V-1.3B, HunyuanVideo, Qwen-Image, FLUX.1, FLUX.2 + for model in "Wan-AI--Wan2.1-T2V-1.3B-Diffusers" "tencent--HunyuanVideo" "Qwen--Qwen-Image" "black-forest-labs--FLUX.1-dev" "black-forest-labs--FLUX.2-dev"; do + cache_path="/sgl-data/hf-cache/hub/models--${model}" + if docker exec ci_sglang test -d "$cache_path"; then + size=$(docker exec ci_sglang du -sh "$cache_path" 2>/dev/null | cut -f1) + echo "✓ CACHED: $model ($size)" + else + echo "✗ NOT CACHED: $model" + fi + done + echo 
"" + echo "=== GPU Memory Status ===" + docker exec ci_sglang rocm-smi --showmeminfo vram 2>/dev/null || echo "rocm-smi not available" + + - name: Run diffusion server tests (1-GPU) + timeout-minutes: 45 + run: | + # AMD CI: All 1-GPU tests except FLUX.2 (FLUX.1 covers same code path) + # Tests: T2V, T2I, I2V, LoRA + # + # HF download env vars: + # - HF_HUB_ENABLE_HF_TRANSFER=1: Use faster hf_transfer for downloads (if available) + # - HF_HUB_DISABLE_SYMLINKS_WARNING=1: Suppress symlink warnings + docker exec \ + -e SGLANG_E2E_TOLERANCE=0.3 \ + -e SGLANG_STAGE_TIME_TOLERANCE=0.2 \ + -e SGLANG_NON_DENOISE_STAGE_TIME_TOLERANCE=0.6 \ + -e SGLANG_DENOISE_STEP_TOLERANCE=0.6 \ + -e SGLANG_DENOISE_AGG_TOLERANCE=0.3 \ + -e SGLANG_TEST_NUM_INFERENCE_STEPS=5 \ + -e AITER_JIT_DIR=/sgl-data/aiter-kernels \ + -e MIOPEN_USER_DB_PATH=/sgl-data/miopen-cache \ + -e HF_HUB_ENABLE_HF_TRANSFER=1 \ + -e HF_HUB_DISABLE_SYMLINKS_WARNING=1 \ + -w /sglang-checkout/python \ + ci_sglang python3 sglang/multimodal_gen/test/run_suite.py \ + --suite 1-gpu \ + --partition-id ${{ matrix.part }} \ + --total-partitions 2 \ + -k "not flux_2" + + # Post-test diagnostics + echo "=== Post-test System Memory Status ===" + free -h + + multimodal-gen-test-2-gpu-amd: + needs: [check-changes] + if: needs.check-changes.outputs.multimodal_gen == 'true' + strategy: + fail-fast: false + max-parallel: 1 # Run one at a time to avoid eviction from resource exhaustion during AITER kernel JIT + matrix: + runner: [linux-mi325-gpu-2] + part: [0, 1] # 2 partitions: 9 tests ÷ 2 = ~4-5 tests each + runs-on: ${{matrix.runner}} + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }} + + - name: Ensure VRAM is clear + run: bash scripts/ensure_vram_clear.sh rocm + + - name: Download artifacts + if: needs.check-changes.outputs.sgl_kernel == 'true' + uses: actions/download-artifact@v4 + with: + path: sgl-kernel/dist/ + merge-multiple: true + pattern: 
wheel-python3.10-cuda12.9 + + - name: Start CI container + run: bash scripts/ci/amd/amd_ci_start_container.sh --custom-image rocm/sgl-dev:v0.5.8-rocm720-mi30x-20260129-preview --gpu-arch gfx942-rocm720 + env: + GITHUB_WORKSPACE: ${{ github.workspace }} + + - name: Setup kernel caches + run: | + # Use the persistent /sgl-data directory (mounted from /home/runner/sgl-data) + docker exec ci_sglang mkdir -p /sgl-data/aiter-kernels /sgl-data/miopen-cache /sgl-data/hf-cache/hub + + # Clear pre-built AITER kernels from Docker image to avoid segfaults + # The image may have stale/incompatible kernels at /sgl-workspace/aiter/aiter/jit/ + echo "Clearing pre-built AITER kernels from Docker image..." + docker exec ci_sglang rm -rf /sgl-workspace/aiter/aiter/jit/*.so 2>/dev/null || true + docker exec ci_sglang rm -rf /sgl-data/aiter-kernels/*.so 2>/dev/null || true + echo "AITER kernels cleared - will be rebuilt on first use" + + # Create persistent cache marker if /sgl-data is a real mount (not ephemeral) + # This tells the test cleanup code to NOT delete downloaded models + if docker exec ci_sglang test -d /sgl-data && docker exec ci_sglang mountpoint -q /sgl-data 2>/dev/null; then + docker exec ci_sglang touch /sgl-data/hf-cache/.persistent_cache + echo "Created .persistent_cache marker - HF cache will persist" + else + echo "WARNING: /sgl-data is not a mount point - models will be cleaned up after each test" + fi + + # Check MIOpen cache (VAE convolution kernels) + miopen_files=$(docker exec ci_sglang find /sgl-data/miopen-cache -name "*.udb" 2>/dev/null | wc -l || echo "0") + echo "Found ${miopen_files} MIOpen cache files" + + - name: Diagnose HF cache and system resources + run: | + echo "=== System Memory Status ===" + free -h + echo "" + echo "=== Disk Space ===" + df -h /home/runner/sgl-data 2>/dev/null || df -h + echo "" + echo "=== HF Cache Directory Structure ===" + docker exec ci_sglang ls -la /sgl-data/hf-cache/ 2>/dev/null || echo "HF cache dir not found" + 
docker exec ci_sglang ls -la /sgl-data/hf-cache/hub/ 2>/dev/null || echo "HF hub cache not found" + echo "" + echo "=== Checking for cached diffusion models (2-GPU tests) ===" + # Models used in 2-GPU tests: Wan2.2-T2V-A14B, Wan2.1-T2V-14B, Qwen-Image, FLUX.1 + for model in "Wan-AI--Wan2.2-T2V-A14B-Diffusers" "Wan-AI--Wan2.1-T2V-14B-Diffusers" "Qwen--Qwen-Image" "black-forest-labs--FLUX.1-dev"; do + cache_path="/sgl-data/hf-cache/hub/models--${model}" + if docker exec ci_sglang test -d "$cache_path"; then + size=$(docker exec ci_sglang du -sh "$cache_path" 2>/dev/null | cut -f1) + echo "✓ CACHED: $model ($size)" + else + echo "✗ NOT CACHED: $model" + fi + done + echo "" + echo "=== GPU Memory Status ===" + docker exec ci_sglang rocm-smi --showmeminfo vram 2>/dev/null || echo "rocm-smi not available" + + - name: Run diffusion server tests (2-GPU) + timeout-minutes: 80 + run: | + # AMD CI: All 2-GPU tests including LoRA + # Tests: T2V, T2I, I2V, LoRA + # + # HF download env vars: + # - HF_HUB_ENABLE_HF_TRANSFER=1: Use faster hf_transfer for downloads (if available) + # - HF_HUB_DISABLE_SYMLINKS_WARNING=1: Suppress symlink warnings + docker exec \ + -e SGLANG_E2E_TOLERANCE=0.3 \ + -e SGLANG_STAGE_TIME_TOLERANCE=0.2 \ + -e SGLANG_NON_DENOISE_STAGE_TIME_TOLERANCE=0.6 \ + -e SGLANG_DENOISE_STEP_TOLERANCE=0.6 \ + -e SGLANG_DENOISE_AGG_TOLERANCE=0.3 \ + -e SGLANG_TEST_NUM_INFERENCE_STEPS=5 \ + -e AITER_JIT_DIR=/sgl-data/aiter-kernels \ + -e MIOPEN_USER_DB_PATH=/sgl-data/miopen-cache \ + -e HF_HUB_ENABLE_HF_TRANSFER=1 \ + -e HF_HUB_DISABLE_SYMLINKS_WARNING=1 \ + -w /sglang-checkout/python \ + ci_sglang python3 sglang/multimodal_gen/test/run_suite.py \ + --suite 2-gpu \ + --partition-id ${{ matrix.part }} \ + --total-partitions 2 + + # Post-test diagnostics + echo "=== Post-test System Memory Status ===" + free -h + + + stage-c-test-large-8-gpu-amd: + needs: [check-changes, call-gate, stage-b-test-small-1-gpu-amd, stage-b-test-large-2-gpu-amd] + if: | + always() && + ( + 
(inputs.target_stage == 'stage-c-test-large-8-gpu-amd') || + ( + !inputs.target_stage && + (!failure() && !cancelled()) && + ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true')) + ) + ) + env: + RUNNER_LABELS: linux-mi325-gpu-8 + strategy: + fail-fast: false + matrix: + runner: [linux-mi325-gpu-8] + part: [0, 1] + runs-on: ${{matrix.runner}} + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }} + + - name: Ensure VRAM is clear + run: bash scripts/ensure_vram_clear.sh rocm + + - name: Start CI container + run: bash scripts/ci/amd/amd_ci_start_container.sh --custom-image rocm/sgl-dev:v0.5.8-rocm720-mi30x-20260129-preview --gpu-arch gfx942-rocm720 + env: + GITHUB_WORKSPACE: ${{ github.workspace }} + + - name: Test RCCL multi-GPU communication + timeout-minutes: 5 + run: | + echo "Testing RCCL multi-GPU communication with debug info..." + docker exec ci_sglang bash -c "cd /sglang-checkout && NCCL_DEBUG=INFO RCCL_DEBUG=INFO torchrun --nproc_per_node=8 scripts/ci/amd/test_rccl_multi_gpu.py" + + - name: Run test + timeout-minutes: 60 + run: | + bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-c-test-large-8-gpu-amd --auto-partition-id ${{ matrix.part }} --auto-partition-size 2 --timeout-per-file 3600 + + stage-c-test-large-8-gpu-amd-mi35x: + needs: [check-changes, call-gate, stage-b-test-small-1-gpu-amd, stage-b-test-large-2-gpu-amd] + if: | + always() && + ( + (inputs.target_stage == 'stage-c-test-large-8-gpu-amd-mi35x') || + ( + !inputs.target_stage && + (!failure() && !cancelled()) && + ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true')) + ) + ) + strategy: + fail-fast: false + matrix: + runner: [linux-mi35x-gpu-8] + part: [0, 1, 2] + runs-on: ${{matrix.runner}} + steps: + - name: Checkout code + uses: actions/checkout@v4 + 
with: + ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }} + + - name: Ensure VRAM is clear + run: bash scripts/ensure_vram_clear.sh rocm + + - name: Start CI container + run: bash scripts/ci/amd/amd_ci_start_container.sh --custom-image rocm/sgl-dev:v0.5.8-rocm720-mi35x-20260129-preview --gpu-arch gfx950-rocm720 + env: + GITHUB_WORKSPACE: ${{ github.workspace }} + + - name: Run test + timeout-minutes: 60 + run: | + bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-c-test-large-8-gpu-amd-mi35x --auto-partition-id ${{ matrix.part }} --auto-partition-size 3 --timeout-per-file 3600 + + stage-b-test-small-1-gpu-performance-amd: + needs: [check-changes, call-gate, stage-a-test-1-amd] + if: | + always() && + ( + (inputs.target_stage == 'stage-b-test-small-1-gpu-performance-amd') || + ( + !inputs.target_stage && + (!failure() && !cancelled()) && + ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true')) + ) + ) + strategy: + fail-fast: false + matrix: + runner: [linux-mi325-gpu-1] + runs-on: ${{matrix.runner}} + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }} + + - name: Ensure VRAM is clear + run: bash scripts/ensure_vram_clear.sh rocm + + - name: Start CI container + run: bash scripts/ci/amd/amd_ci_start_container.sh --custom-image rocm/sgl-dev:v0.5.8-rocm720-mi30x-20260129-preview --gpu-arch gfx942-rocm720 + env: + GITHUB_WORKSPACE: ${{ github.workspace }} + + - name: Install dependencies + run: | + docker exec -w / ci_sglang pip install --force-reinstall /triton-3.5.1+rocm7.2.0.gita272dfa8-cp310-cp310-linux_x86_64.whl + docker exec -w / ci_sglang git clone --branch v0.4.1 --depth 1 https://github.com/EvolvingLMMs-Lab/lmms-eval.git || true + docker exec -w /lmms-eval ci_sglang pip install --cache-dir=/sgl-data/pip-cache -e . 
+ docker exec -w / ci_sglang git clone --depth 1 https://github.com/merrymercy/human-eval.git || true + docker exec -w /human-eval ci_sglang pip install --cache-dir=/sgl-data/pip-cache -e . + + - name: Run test + timeout-minutes: 30 + run: | + bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-small-1-gpu-performance-amd --timeout-per-file 1200 + + stage-b-test-large-1-gpu-performance-amd: + needs: [check-changes, call-gate, stage-a-test-1-amd] + if: | + always() && + ( + (inputs.target_stage == 'stage-b-test-large-1-gpu-performance-amd') || + ( + !inputs.target_stage && + (!failure() && !cancelled()) && + ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true')) + ) + ) + strategy: + fail-fast: false + matrix: + runner: [linux-mi325-gpu-1] + part: [0, 1] + runs-on: ${{matrix.runner}} + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }} + + - name: Ensure VRAM is clear + run: bash scripts/ensure_vram_clear.sh rocm + + - name: Start CI container + run: bash scripts/ci/amd/amd_ci_start_container.sh --custom-image rocm/sgl-dev:v0.5.8-rocm720-mi30x-20260129-preview --gpu-arch gfx942-rocm720 + env: + GITHUB_WORKSPACE: ${{ github.workspace }} + + - name: Install dependencies + run: | + docker exec -w / ci_sglang pip install --force-reinstall /triton-3.5.1+rocm7.2.0.gita272dfa8-cp310-cp310-linux_x86_64.whl + docker exec -w / ci_sglang git clone --branch v0.4.1 --depth 1 https://github.com/EvolvingLMMs-Lab/lmms-eval.git || true + docker exec -w /lmms-eval ci_sglang pip install --cache-dir=/sgl-data/pip-cache -e . + docker exec -w / ci_sglang git clone --depth 1 https://github.com/merrymercy/human-eval.git || true + docker exec -w /human-eval ci_sglang pip install --cache-dir=/sgl-data/pip-cache -e . 
+ + - name: Run test + timeout-minutes: 30 + run: | + bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-large-1-gpu-performance-amd --auto-partition-id ${{ matrix.part }} --auto-partition-size 2 --timeout-per-file 1200 + + stage-b-test-large-2-gpu-performance-amd: + needs: [check-changes, call-gate, stage-a-test-1-amd] + if: | + always() && + ( + (inputs.target_stage == 'stage-b-test-large-2-gpu-performance-amd') || + ( + !inputs.target_stage && + (!failure() && !cancelled()) && + ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true')) + ) + ) + strategy: + fail-fast: false + matrix: + runner: [linux-mi325-gpu-2] + runs-on: ${{matrix.runner}} + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }} + + - name: Ensure VRAM is clear + run: bash scripts/ensure_vram_clear.sh rocm + + - name: Start CI container + run: bash scripts/ci/amd/amd_ci_start_container.sh --custom-image rocm/sgl-dev:v0.5.8-rocm720-mi30x-20260129-preview --gpu-arch gfx942-rocm720 + env: + GITHUB_WORKSPACE: ${{ github.workspace }} + + - name: Install dependencies + run: | + docker exec -w / ci_sglang pip install --force-reinstall /triton-3.5.1+rocm7.2.0.gita272dfa8-cp310-cp310-linux_x86_64.whl + docker exec -w / ci_sglang git clone --branch v0.4.1 --depth 1 https://github.com/EvolvingLMMs-Lab/lmms-eval.git || true + docker exec -w /lmms-eval ci_sglang pip install --cache-dir=/sgl-data/pip-cache -e . + docker exec -w / ci_sglang git clone --depth 1 https://github.com/merrymercy/human-eval.git || true + docker exec -w /human-eval ci_sglang pip install --cache-dir=/sgl-data/pip-cache -e . 
+ + - name: Run test + timeout-minutes: 30 + run: | + bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-large-2-gpu-performance-amd --timeout-per-file 1200 + + stage-b-test-small-1-gpu-accuracy-amd: + needs: [check-changes, stage-a-test-1-amd] + if: | + always() && + ( + (inputs.target_stage == 'stage-b-test-small-1-gpu-accuracy-amd') || + ( + !inputs.target_stage && + (!failure() && !cancelled()) && + ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true')) + ) + ) + strategy: + fail-fast: false + matrix: + runner: [linux-mi325-gpu-1] + runs-on: ${{matrix.runner}} + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }} + + - name: Ensure VRAM is clear + run: bash scripts/ensure_vram_clear.sh rocm + + - name: Start CI container + run: bash scripts/ci/amd/amd_ci_start_container.sh --custom-image rocm/sgl-dev:v0.5.8-rocm720-mi30x-20260129-preview --gpu-arch gfx942-rocm720 + env: + GITHUB_WORKSPACE: ${{ github.workspace }} + + - name: Install dependencies + run: | + docker exec -w / ci_sglang pip install --force-reinstall /triton-3.5.1+rocm7.2.0.gita272dfa8-cp310-cp310-linux_x86_64.whl + docker exec -w / ci_sglang git clone --branch v0.4.1 --depth 1 https://github.com/EvolvingLMMs-Lab/lmms-eval.git || true + docker exec -w /lmms-eval ci_sglang pip install --cache-dir=/sgl-data/pip-cache -e . + docker exec -w / ci_sglang git clone --depth 1 https://github.com/merrymercy/human-eval.git || true + docker exec -w /human-eval ci_sglang pip install --cache-dir=/sgl-data/pip-cache -e . 
+ + - name: Run test + timeout-minutes: 30 + run: | + bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" -e SGLANG_USE_AITER=0 python3 run_suite.py --hw amd --suite stage-b-test-small-1-gpu-accuracy-amd --timeout-per-file 1800 + + stage-b-test-large-2-gpu-accuracy-amd: + needs: [check-changes, stage-a-test-1-amd] + if: | + always() && + ( + (inputs.target_stage == 'stage-b-test-large-2-gpu-accuracy-amd') || + ( + !inputs.target_stage && + (!failure() && !cancelled()) && + ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true')) + ) + ) + strategy: + fail-fast: false + matrix: + runner: [linux-mi325-gpu-2] + runs-on: ${{matrix.runner}} + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }} + + - name: Ensure VRAM is clear + run: bash scripts/ensure_vram_clear.sh rocm + + - name: Start CI container + run: bash scripts/ci/amd/amd_ci_start_container.sh --custom-image rocm/sgl-dev:v0.5.8-rocm720-mi30x-20260129-preview --gpu-arch gfx942-rocm720 + env: + GITHUB_WORKSPACE: ${{ github.workspace }} + + - name: Install dependencies + run: | + docker exec -w / ci_sglang pip install --force-reinstall /triton-3.5.1+rocm7.2.0.gita272dfa8-cp310-cp310-linux_x86_64.whl + docker exec -w / ci_sglang git clone --branch v0.4.1 --depth 1 https://github.com/EvolvingLMMs-Lab/lmms-eval.git || true + docker exec -w /lmms-eval ci_sglang pip install --cache-dir=/sgl-data/pip-cache -e . + docker exec -w / ci_sglang git clone --depth 1 https://github.com/merrymercy/human-eval.git || true + docker exec -w /human-eval ci_sglang pip install --cache-dir=/sgl-data/pip-cache -e . 
+ + - name: Run test + timeout-minutes: 30 + run: | + bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" -e SGLANG_USE_AITER_AR=0 -e SGLANG_USE_AITER=0 -e HF_HUB_ENABLE_HF_TRANSFER=0 python3 run_suite.py --hw amd --suite stage-b-test-large-2-gpu-accuracy-amd --timeout-per-file 1800 + + pr-test-amd-finish: + needs: + [ + call-gate, + check-changes, + + sgl-kernel-unit-test-amd, + sgl-kernel-unit-test-2-gpu-amd, + multimodal-gen-test-1-gpu-amd, + multimodal-gen-test-2-gpu-amd, + + stage-a-test-1-amd, + stage-b-test-small-1-gpu-amd, + stage-b-test-small-1-gpu-amd-mi35x, + stage-b-test-large-2-gpu-amd, + stage-b-test-small-1-gpu-performance-amd, + stage-b-test-large-1-gpu-performance-amd, + stage-b-test-large-2-gpu-performance-amd, + stage-b-test-small-1-gpu-accuracy-amd, + stage-b-test-large-2-gpu-accuracy-amd, + stage-c-test-large-8-gpu-amd, + stage-c-test-large-8-gpu-amd-mi35x, + ] + if: always() + runs-on: ubuntu-latest + steps: + - name: Check all dependent job statuses + run: | + # Convert the 'needs' context to a JSON string + json_needs='${{ toJson(needs) }}' + + # Get a list of all job names from the JSON keys + job_names=$(echo "$json_needs" | jq -r 'keys_unsorted[]') + + for job in $job_names; do + # For each job, extract its result + result=$(echo "$json_needs" | jq -r --arg j "$job" '.[$j].result') + + # Print the job name and its result + echo "$job: $result" + + # Check for failure or cancellation and exit if found + if [[ "$result" == "failure" || "$result" == "cancelled" ]]; then + echo "The above jobs failed." 
+ exit 1 + fi + done + + # If the loop completes, all jobs were successful + echo "All jobs completed successfully" + exit 0 diff --git a/.github/workflows/pr-test-amd.yml b/.github/workflows/pr-test-amd.yml index a2e8e9988bfd..d701c6fe97e0 100644 --- a/.github/workflows/pr-test-amd.yml +++ b/.github/workflows/pr-test-amd.yml @@ -14,7 +14,7 @@ on: - ".github/workflows/pr-test-amd.yml" - "docker/rocm.Dockerfile" pull_request: - branches: [ main ] + branches: [ dont-trigger-this-one-anyway ] paths: - "python/**" - "scripts/ci/**" diff --git a/.github/workflows/release-docker-amd-rocm720-preview.yml b/.github/workflows/release-docker-amd-rocm720-preview.yml new file mode 100644 index 000000000000..bcf01907807a --- /dev/null +++ b/.github/workflows/release-docker-amd-rocm720-preview.yml @@ -0,0 +1,83 @@ +name: Release Docker Images ROCm 7.2.0 Preview (AMD) +on: + pull_request: + branches: + - main + paths: + - "docker/rocm720.Dockerfile" + - ".github/workflows/release-docker-amd-rocm720-preview.yml" + push: + tags: + - 'v[0-9]+.*' + workflow_dispatch: + inputs: + version: + description: 'Version to build (without v prefix, e.g., 0.5.7)' + required: true + +concurrency: + # A PR number if a pull request and otherwise the commit hash. This cancels + # queued and in-progress runs for the same PR (presubmit) or commit + # (postsubmit). The workflow name is prepended to avoid conflicts between + # different workflows. 
+ group: ${{ github.workflow }}-${{ github.event.number || github.sha }} + cancel-in-progress: true + +jobs: + publish: + if: github.repository == 'sgl-project/sglang' + runs-on: amd-docker-scale + environment: 'prod' + strategy: + fail-fast: false + matrix: + gpu_arch: ['gfx942-rocm720', 'gfx950-rocm720'] + build_type: ['all'] + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + fetch-depth: 0 # Required for git describe to find tags + + - name: "Set Date" + run: | + echo "DATE=$(date +%Y%m%d)" >> $GITHUB_ENV + + - name: Get version from latest tag + id: version + run: | + # Get the latest version tag sorted by version number (e.g., v0.5.7 -> 0.5.7) + VERSION=$(git tag -l 'v[0-9]*' --sort=-v:refname | head -1 | sed 's/^v//') + + if [ -z "$VERSION" ]; then + echo "::error::Could not determine version from git tags" + exit 1 + fi + + echo "version=${VERSION}" >> $GITHUB_OUTPUT + echo "Detected version: ${VERSION}" + + - name: Login to Docker Hub + uses: docker/login-action@v2 + with: + username: ${{ secrets.DOCKERHUB_AMD_USERNAME }} + password: ${{ secrets.DOCKERHUB_AMD_TOKEN }} + + - name: Build and Push + run: | + version=${{ steps.version.outputs.version }} + echo "Version: ${version}" + + if [ "${{ matrix.gpu_arch }}" = "gfx942-rocm720" ]; then + rocm_tag="rocm720-mi30x" + elif [ "${{ matrix.gpu_arch }}" = "gfx950-rocm720" ]; then + rocm_tag="rocm720-mi35x" + else + echo "Unsupported gfx arch" + exit 1 + fi + + tag=v${version}-${rocm_tag} + + docker build . 
-f docker/rocm720.Dockerfile --build-arg BUILD_TYPE=${{ matrix.build_type }} --build-arg GPU_ARCH=${{ matrix.gpu_arch }} -t rocm/sgl-dev:${tag}-${{ env.DATE }}-preview --no-cache + docker push rocm/sgl-dev:${tag}-${{ env.DATE }}-preview diff --git a/docker/aiter.patch b/docker/aiter.patch new file mode 100644 index 000000000000..43187689435b --- /dev/null +++ b/docker/aiter.patch @@ -0,0 +1,48 @@ +diff --git a/csrc/py_itfs_cu/asm_mla.cu b/csrc/py_itfs_cu/asm_mla.cu +index 995364105..0adab889e 100644 +--- a/csrc/py_itfs_cu/asm_mla.cu ++++ b/csrc/py_itfs_cu/asm_mla.cu +@@ -283,14 +283,14 @@ void mla_decode_stage1_asm_fwd( + else if(max_seqlen_q <= 4) + { + // assert(false); +- //sub_Q = 128; +- //static AiterAsmKernel impl_fp8( +- // "_ZN5aiter36mla_a8w8_qh16_qseqlen4_gqaratio16_psE", +- // "/mla/mla_a8w8_qh16_qseqlen4_gqaratio16_ps.co"); +- sub_Q = 64; ++ sub_Q = 128; + static AiterAsmKernel impl_fp8( +- "_ZN5aiter36mla_a8w8_qh64_qseqlen4_gqaratio16_psE", +- "/mla/mla_a8w8_qh64_qseqlen4_gqaratio16_ps.co"); ++ "_ZN5aiter36mla_a8w8_qh16_qseqlen4_gqaratio16_psE", ++ "/mla/mla_a8w8_qh16_qseqlen4_gqaratio16_ps.co"); ++ //sub_Q = 64; ++ //static AiterAsmKernel impl_fp8( ++ // "_ZN5aiter36mla_a8w8_qh64_qseqlen4_gqaratio16_psE", ++ // "/mla/mla_a8w8_qh64_qseqlen4_gqaratio16_ps.co"); + impl_ptr = &impl_fp8; + } + else +@@ -319,14 +319,14 @@ void mla_decode_stage1_asm_fwd( + else if(max_seqlen_q <= 4) + { + // assert(false); +- //sub_Q = 128; +- //static AiterAsmKernel impl_fp8( +- // "_ZN5aiter33mla_a8w8_qh16_qseqlen4_gqaratio16E", +- // "/mla/mla_a8w8_qh16_qseqlen4_gqaratio16.co"); +- sub_Q = 64; ++ sub_Q = 128; + static AiterAsmKernel impl_fp8( +- "_ZN5aiter33mla_a8w8_qh64_qseqlen4_gqaratio16E", +- "/mla/mla_a8w8_qh64_qseqlen4_gqaratio16.co"); ++ "_ZN5aiter33mla_a8w8_qh16_qseqlen4_gqaratio16E", ++ "/mla/mla_a8w8_qh16_qseqlen4_gqaratio16.co"); ++ //sub_Q = 64; ++ //static AiterAsmKernel impl_fp8( ++ // "_ZN5aiter33mla_a8w8_qh64_qseqlen4_gqaratio16E", ++ // 
"/mla/mla_a8w8_qh64_qseqlen4_gqaratio16.co"); + impl_ptr = &impl_fp8; + } + else diff --git a/docker/rocm720.Dockerfile b/docker/rocm720.Dockerfile new file mode 100644 index 000000000000..b7eee9424215 --- /dev/null +++ b/docker/rocm720.Dockerfile @@ -0,0 +1,351 @@ +# ROCm 7.2 Dockerfile for SGLang (copied from akao-amd's rocm.Dockerfile for testing) +# Usage: +# docker build --build-arg SGL_BRANCH=9409c4359 --build-arg GPU_ARCH=gfx942-rocm720 -t sglang:rocm720-mi30x -f docker/rocm720.Dockerfile . +# docker build --build-arg SGL_BRANCH=9409c4359 --build-arg GPU_ARCH=gfx950-rocm720 -t sglang:rocm720-mi35x -f docker/rocm720.Dockerfile . + +# Default base images +ARG BASE_IMAGE_942="rocm/sgl-dev:vllm20250114" +ARG BASE_IMAGE_942_ROCM700="rocm/sgl-dev:rocm7-vllm-20250904" +ARG BASE_IMAGE_942_ROCM720="rocm/pytorch:rocm7.2_ubuntu22.04_py3.10_pytorch_release_2.9.1" +ARG BASE_IMAGE_950="rocm/sgl-dev:rocm7-vllm-20250904" +ARG BASE_IMAGE_950_ROCM720="rocm/pytorch:rocm7.2_ubuntu22.04_py3.10_pytorch_release_2.9.1" + +# This is necessary for scope purpose +ARG GPU_ARCH=gfx950 + +# =============================== +# Base image 942 with rocm630 and args +FROM $BASE_IMAGE_942 AS gfx942 +ENV BUILD_VLLM="0" +ENV BUILD_TRITON="1" +ENV BUILD_LLVM="0" +ENV BUILD_AITER_ALL="1" +ENV BUILD_MOONCAKE="1" +ENV AITER_COMMIT="v0.1.4" + +# =============================== +# Base image 942 with rocm700 and args +FROM $BASE_IMAGE_942_ROCM700 AS gfx942-rocm700 +ENV BUILD_VLLM="0" +ENV BUILD_TRITON="0" +ENV BUILD_LLVM="0" +ENV BUILD_AITER_ALL="1" +ENV BUILD_MOONCAKE="1" +ENV AITER_COMMIT="v0.1.9.post1" + +# =============================== +# Base image 942 with rocm720 and args +FROM $BASE_IMAGE_942_ROCM720 AS gfx942-rocm720 +ENV BUILD_VLLM="0" +ENV BUILD_TRITON="1" +ENV BUILD_LLVM="0" +ENV BUILD_AITER_ALL="1" +ENV BUILD_MOONCAKE="1" +ENV AITER_COMMIT="v0.1.9.post1" + +# =============================== +# Base image 950 and args +FROM $BASE_IMAGE_950 AS gfx950 +ENV BUILD_VLLM="0" +ENV 
BUILD_TRITON="0" +ENV BUILD_LLVM="0" +ENV BUILD_AITER_ALL="0" +ENV BUILD_MOONCAKE="1" +ENV AITER_COMMIT="v0.1.9.post1" + +# =============================== +# Base image 950 with rocm720 and args +FROM $BASE_IMAGE_950_ROCM720 AS gfx950-rocm720 +ENV BUILD_VLLM="0" +ENV BUILD_TRITON="1" +ENV BUILD_LLVM="0" +ENV BUILD_AITER_ALL="0" +ENV BUILD_MOONCAKE="1" +ENV AITER_COMMIT="v0.1.9.post1" + +# =============================== +# Chosen arch and args +FROM ${GPU_ARCH} + +# This is necessary for scope purpose, again +ARG GPU_ARCH=gfx950 +ENV GPU_ARCH_LIST=${GPU_ARCH%-*} + +ARG SGL_REPO="https://github.com/sgl-project/sglang.git" +ARG SGL_DEFAULT="main" +ARG SGL_BRANCH=${SGL_DEFAULT} + +ARG TRITON_REPO="https://github.com/triton-lang/triton.git" +ARG TRITON_COMMIT="42270451990532c67e69d753fbd026f28fcc4840" + +ARG AITER_REPO="https://github.com/ROCm/aiter.git" + +ARG LLVM_REPO="https://github.com/jrbyrnes/llvm-project.git" +ARG LLVM_BRANCH="MainOpSelV2" +ARG LLVM_COMMIT="6520ace8227ffe2728148d5f3b9872a870b0a560" + +ARG MOONCAKE_REPO="https://github.com/kvcache-ai/Mooncake.git" +ARG MOONCAKE_COMMIT="b6a841dc78c707ec655a563453277d969fb8f38d" + +ARG TILELANG_REPO="https://github.com/HaiShaw/tilelang.git" +ARG TILELANG_BRANCH="dsv32-mi35x" +ARG TILELANG_COMMIT="ae938cf885743f165a19656d1122ad42bb0e30b8" + +ARG FHT_REPO="https://github.com/jeffdaily/fast-hadamard-transform.git" +ARG FHT_BRANCH="rocm" +ARG FHT_COMMIT="46efb7d776d38638fc39f3c803eaee3dd7016bd1" +USER root + +# Install some basic utilities +RUN python -m pip install --upgrade pip && pip install setuptools_scm +RUN apt-get purge -y sccache; python -m pip uninstall -y sccache; rm -f "$(which sccache)" + +WORKDIR /sgl-workspace + +# ----------------------- +# llvm (HIP_CLANG_PATH is a shell export valid for this RUN only; ENV is a Dockerfile instruction and cannot be used inside RUN) +RUN if [ "$BUILD_LLVM" = "1" ]; then \ + export HIP_CLANG_PATH="/sgl-workspace/llvm-project/build/bin/" \ + && git clone --single-branch ${LLVM_REPO} -b ${LLVM_BRANCH} \ + && cd llvm-project \ + && git checkout ${LLVM_COMMIT} \ + && mkdir build \ + && cd build \ +
&& cmake -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=1 -DLLVM_TARGETS_TO_BUILD="AMDGPU;X86" -DLLVM_ENABLE_PROJECTS="clang;lld;" -DLLVM_ENABLE_RUNTIMES="compiler-rt" ../llvm \ + && make -j$(nproc); \ + fi + +# ----------------------- +# AITER +ENV MAX_JOBS=256 +RUN pip uninstall -y aiter +RUN pip install psutil pybind11 # Required by AITER setup.py +RUN git clone ${AITER_REPO} \ + && cd aiter \ + && sed -i setup.py -e 's/verbose.*/verbose=True,/' \ + && git checkout ${AITER_COMMIT} \ + && git submodule update --init --recursive +ADD docker/aiter.patch ./aiter +RUN cd aiter \ + && echo "[AITER] GPU_ARCH=${GPU_ARCH}" \ + && if [ "$BUILD_AITER_ALL" = "1" ] && [ "$BUILD_LLVM" = "1" ]; then \ + sh -c "HIP_CLANG_PATH=/sgl-workspace/llvm-project/build/bin/ PREBUILD_KERNELS=1 GPU_ARCHS=$GPU_ARCH_LIST python setup.py develop"; \ + elif [ "$BUILD_AITER_ALL" = "1" ]; then \ + sh -c "PREBUILD_KERNELS=1 GPU_ARCHS=$GPU_ARCH_LIST python setup.py develop"; \ + else \ + sh -c "GPU_ARCHS=$GPU_ARCH_LIST python setup.py develop"; \ + fi \ + && sh -c "patch -p1 < ./aiter.patch;" + +# ----------------------- +# Build vLLM +ARG VLLM_REPO="https://github.com/ROCm/vllm.git" +ARG VLLM_BRANCH="9f6b92db47c3444b7a7d67451ba0c3a2d6af4c2c" +RUN if [ "$BUILD_VLLM" = "1" ]; then \ + git clone ${VLLM_REPO} \ + && cd vllm \ + && git checkout ${VLLM_BRANCH} \ + && python -m pip install -r requirements/rocm.txt \ + && python setup.py clean --all \ + && python setup.py develop; \ + fi + +# ----------------------- +# Build Mooncake +ENV PATH=$PATH:/usr/local/go/bin + +RUN if [ "$BUILD_MOONCAKE" = "1" ]; then \ + apt update && apt install -y zip unzip wget && \ + apt install -y gcc make libtool autoconf librdmacm-dev rdmacm-utils infiniband-diags ibverbs-utils perftest ethtool libibverbs-dev rdma-core && \ + apt install -y openssh-server openmpi-bin openmpi-common libopenmpi-dev && \ + git clone ${MOONCAKE_REPO} && \ + cd Mooncake && \ + git checkout ${MOONCAKE_COMMIT} && \ + git submodule update 
--init --recursive && \ + bash dependencies.sh -y && \ + rm -rf /usr/local/go && \ + wget https://go.dev/dl/go1.22.2.linux-amd64.tar.gz && \ + tar -C /usr/local -xzf go1.22.2.linux-amd64.tar.gz && \ + rm go1.22.2.linux-amd64.tar.gz && \ + mkdir -p build && \ + cd build && \ + cmake .. -DUSE_HIP=ON -DUSE_ETCD=ON && \ + make -j "$(nproc)" && make install; \ + fi + +# ----------------------- +# Build SGLang (sglang.patch is staged in /tmp so it cannot collide with the clone target, and is applied after the branch checkout) +ARG BUILD_TYPE=all + +RUN pip install IPython \ + && pip install orjson \ + && pip install python-multipart \ + && pip install torchao==0.9.0 \ + && pip install pybind11 + +RUN pip uninstall -y sgl_kernel sglang +ADD docker/sglang.patch /tmp/sglang.patch +RUN git clone ${SGL_REPO} \ + && cd sglang \ + && if [ "${SGL_BRANCH}" = "${SGL_DEFAULT}" ]; then \ + echo "Using ${SGL_DEFAULT}, default branch."; \ + git checkout ${SGL_DEFAULT}; \ + else \ + echo "Using ${SGL_BRANCH} branch."; \ + git checkout ${SGL_BRANCH}; \ + fi \ + && patch -p1 < /tmp/sglang.patch \ + && cd sgl-kernel \ + && rm -f pyproject.toml \ + && mv pyproject_rocm.toml pyproject.toml \ + && AMDGPU_TARGET=$GPU_ARCH_LIST python setup_rocm.py install \ + && cd .. \ + && rm -rf python/pyproject.toml && mv python/pyproject_other.toml python/pyproject.toml \ + && if [ "$BUILD_TYPE" = "srt" ]; then \ + python -m pip --no-cache-dir install -e "python[srt_hip,diffusion_hip]"; \ + else \ + python -m pip --no-cache-dir install -e "python[all_hip,diffusion_hip]"; \ + fi + +RUN python -m pip cache purge + +# Copy config files to support MI300X in virtualized environments (MI300X_VF). Symlinks will not be created in image build.
+RUN find /sgl-workspace/sglang/python/sglang/srt/layers/quantization/configs/ \ + /sgl-workspace/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/ \ + -type f -name '*MI300X*' | xargs -I {} sh -c 'vf_config=$(echo "$1" | sed "s/MI300X/MI300X_VF/"); cp "$1" "$vf_config"' -- {} + +# Install Rust toolchain for sgl-model-gateway +ENV PATH="/root/.cargo/bin:${PATH}" +RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y \ + && rustc --version && cargo --version + +# Build and install sgl-model-gateway +RUN python3 -m pip install --no-cache-dir setuptools-rust \ + && cd /sgl-workspace/sglang/sgl-model-gateway/bindings/python \ + && cargo build --release \ + && python3 -m pip install --no-cache-dir . \ + && rm -rf /root/.cache + +# ----------------------- +# TileLang +ENV DEBIAN_FRONTEND=noninteractive +ENV LIBGL_ALWAYS_INDIRECT=1 +RUN echo "LC_ALL=en_US.UTF-8" >> /etc/environment + +RUN /bin/bash -lc 'set -euo pipefail; \ + # Build TileLang only for gfx950 + if [ "${GPU_ARCH%-*}" != "gfx950" ]; then \ + echo "[TileLang] Skipping (GPU_ARCH=${GPU_ARCH:-unset})"; \ + exit 0; \ + fi; \ + echo "[TileLang] Building TileLang for ${GPU_ARCH}"; \ + \ + # System dependencies (NO llvm-dev to avoid llvm-config-16 shadowing) + apt-get update && apt-get install -y --no-install-recommends \ + build-essential git wget curl ca-certificates gnupg \ + libgtest-dev libgmock-dev \ + libprotobuf-dev protobuf-compiler libgflags-dev libsqlite3-dev \ + python3 python3-dev python3-setuptools python3-pip \ + gcc libtinfo-dev zlib1g-dev libedit-dev libxml2-dev \ + cmake ninja-build pkg-config libstdc++6 \ + && rm -rf /var/lib/apt/lists/*; \ + \ + # Build GoogleTest static libs (Ubuntu package ships sources only) + cmake -S /usr/src/googletest -B /tmp/build-gtest -DBUILD_GTEST=ON -DBUILD_GMOCK=ON -DCMAKE_BUILD_TYPE=Release && \ + cmake --build /tmp/build-gtest -j"$(nproc)" && \ + cp -v /tmp/build-gtest/lib/*.a /usr/lib/x86_64-linux-gnu/ && \ + rm -rf 
/tmp/build-gtest; \ + \ + # Keep setuptools < 80 (compat with base image) + python3 -m pip install --upgrade "setuptools>=77.0.3,<80" wheel cmake ninja && \ + python3 -m pip cache purge || true; \ + \ + # Locate ROCm llvm-config; fallback to installing LLVM 18 if missing + LLVM_CONFIG_PATH=""; \ + for p in /opt/rocm/llvm/bin/llvm-config /opt/rocm/llvm-*/bin/llvm-config /opt/rocm-*/llvm*/bin/llvm-config; do \ + if [ -x "$p" ]; then LLVM_CONFIG_PATH="$p"; break; fi; \ + done; \ + if [ -z "$LLVM_CONFIG_PATH" ]; then \ + echo "[TileLang] ROCm llvm-config not found; installing LLVM 18..."; \ + curl -fsSL https://apt.llvm.org/llvm.sh -o /tmp/llvm.sh; \ + chmod +x /tmp/llvm.sh; \ + /tmp/llvm.sh 18; \ + LLVM_CONFIG_PATH="$(command -v llvm-config-18)"; \ + if [ -z "$LLVM_CONFIG_PATH" ]; then echo "ERROR: llvm-config-18 not found after install"; exit 1; fi; \ + fi; \ + echo "[TileLang] Using LLVM_CONFIG at: $LLVM_CONFIG_PATH"; \ + export PATH="$(dirname "$LLVM_CONFIG_PATH"):/usr/local/bin:${PATH}"; \ + export LLVM_CONFIG="$LLVM_CONFIG_PATH"; \ + \ + # Optional shim for tools that expect llvm-config-16 + mkdir -p /usr/local/bin && \ + printf "#!/usr/bin/env bash\nexec \"%s\" \"\$@\"\n" "$LLVM_CONFIG_PATH" > /usr/local/bin/llvm-config-16 && \ + chmod +x /usr/local/bin/llvm-config-16; \ + \ + # TVM Python bits need Cython + python3 -m pip install --no-cache-dir "cython>=0.29.36,<3.0"; \ + \ + # Clone + pin TileLang (bundled TVM), then build + git clone --recursive --branch "${TILELANG_BRANCH}" "${TILELANG_REPO}" /opt/tilelang && \ + cd /opt/tilelang && \ + git fetch --depth=1 origin "${TILELANG_COMMIT}" || true && \ + git checkout -f "${TILELANG_COMMIT}" && \ + git submodule update --init --recursive && \ + export CMAKE_ARGS="-DLLVM_CONFIG=${LLVM_CONFIG} ${CMAKE_ARGS:-}" && \ + bash ./install_rocm.sh' + +# ----------------------- +# Hadamard-transform (HIP build) +RUN /bin/bash -lc 'set -euo pipefail; \ + git clone --branch "${FHT_BRANCH}" "${FHT_REPO}" fast-hadamard-transform; 
\ + cd fast-hadamard-transform; \ + git checkout -f "${FHT_COMMIT}"; \ + sed -i setup.py -e "/^.*torch\",$/d"; \ + pip show torch; \ + python setup.py install' + +# ----------------------- +# Python tools +RUN python3 -m pip install --no-cache-dir \ + py-spy \ + pre-commit \ + tabulate + +# ----------------------- +# Triton +RUN if [ "$BUILD_TRITON" = "1" ]; then \ + pip uninstall -y triton \ + && apt install -y cmake \ + && git clone ${TRITON_REPO} triton-custom \ + && cd triton-custom \ + && git checkout ${TRITON_COMMIT} \ + && pip install -r python/requirements.txt \ + && pip install -e .; \ + fi + +# ----------------------- +# Performance environment variable. + +# Skip CuDNN compatibility check - not applicable for ROCm (uses MIOpen instead) +ENV SGLANG_DISABLE_CUDNN_CHECK=1 + +ENV HIP_FORCE_DEV_KERNARG=1 +ENV HSA_NO_SCRATCH_RECLAIM=1 +ENV SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN=1 +ENV SGLANG_INT4_WEIGHT=0 +ENV SGLANG_MOE_PADDING=1 +ENV SGLANG_ROCM_DISABLE_LINEARQUANT=0 +ENV SGLANG_ROCM_FUSED_DECODE_MLA=1 +ENV SGLANG_SET_CPU_AFFINITY=1 +ENV SGLANG_USE_AITER=1 +ENV SGLANG_USE_ROCM700A=1 + +ENV NCCL_MIN_NCHANNELS=112 +ENV VLLM_FP8_PADDING=1 +ENV VLLM_FP8_ACT_PADDING=1 +ENV VLLM_FP8_WEIGHT_PADDING=1 +ENV VLLM_FP8_REDUCE_CONV=1 +ENV TORCHINDUCTOR_MAX_AUTOTUNE=1 +ENV TORCHINDUCTOR_MAX_AUTOTUNE_POINTWISE=1 + +CMD ["/bin/bash"] diff --git a/docker/sglang.patch b/docker/sglang.patch new file mode 100644 index 000000000000..3f9abb3d5177 --- /dev/null +++ b/docker/sglang.patch @@ -0,0 +1,189 @@ +From 82591a6aae07773677523ee715f14d20475906c0 Mon Sep 17 00:00:00 2001 +From: wunhuang +Date: Wed, 21 Jan 2026 07:07:18 +0000 +Subject: [PATCH] Patch for #17735 + +* Add aiter bias-MoE support for gpt-oss +* Use helper function round_up to calulate padding size +* Remove some comment code +--- + .../sglang/srt/layers/quantization/mxfp4.py | 109 +++++++++++++++++- + python/sglang/srt/server_args.py | 7 ++ + 2 files changed, 115 insertions(+), 1 deletion(-) + +diff --git 
a/python/sglang/srt/layers/quantization/mxfp4.py b/python/sglang/srt/layers/quantization/mxfp4.py +index 537405e2d..3690b4d59 100644 +--- a/python/sglang/srt/layers/quantization/mxfp4.py ++++ b/python/sglang/srt/layers/quantization/mxfp4.py +@@ -51,6 +51,7 @@ from sglang.srt.utils import ( + round_up, + set_weight_attrs, + ) ++from sglang.srt.utils.common import get_bool_env_var + from sglang.srt.utils.custom_op import register_custom_op + + _is_sm100_supported = is_cuda() and is_sm100_supported() +@@ -75,6 +76,7 @@ if TYPE_CHECKING: + ) + + _is_hip = is_hip() ++_use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip + _is_shuffle_moe_mxfp4 = is_gfx95_supported() + + if _is_hip: +@@ -82,7 +84,11 @@ if _is_hip: + try: + from aiter import ActivationType, QuantType + from aiter.fused_moe import fused_moe +- from aiter.ops.shuffle import shuffle_weight ++ from aiter.ops.shuffle import ( ++ shuffle_scale_a16w4, ++ shuffle_weight, ++ shuffle_weight_a16w4, ++ ) + from aiter.ops.triton.quant import dynamic_mxfp4_quant + from aiter.utility.fp4_utils import e8m0_shuffle + except ImportError as err: +@@ -292,6 +298,18 @@ class Mxfp4MoEMethod(FusedMoEMethodBase): + intermediate_size_per_partition_after_pad = round_up( + intermediate_size_per_partition, 64 + ) ++ elif _use_aiter: ++ ++ intermediate_size_per_partition_after_pad = round_up( ++ intermediate_size_per_partition, 256 ++ ) ++ ++ hidden_size = round_up(hidden_size, 256) ++ self.hidden_pad = hidden_size - layer.hidden_size ++ self.intermediate_pad = ( ++ intermediate_size_per_partition_after_pad ++ - layer.intermediate_size_per_partition ++ ) + elif has_triton_kernels: + # TODO: this is a hack to make + # intermediate_size_per_partition_after_pad the same as the +@@ -530,6 +548,58 @@ class Mxfp4MoEMethod(FusedMoEMethodBase): + requires_grad=False, + ) + return ++ if _use_aiter: ++ if layer.w13_weight_bias is not None: ++ layer.w13_weight_bias.data = layer.w13_weight_bias.data.to( ++ torch.float32 ++ ) ++ if 
layer.w2_weight_bias is not None: ++ layer.w2_weight_bias.data = layer.w2_weight_bias.data.to(torch.float32) ++ ++ e, n, k = layer.w13_weight.shape ++ layer.w13_weight.view(torch.uint8).copy_( ++ layer.w13_weight.data.view(torch.uint8) ++ .view(e, n // 2, 2, k) ++ .permute(0, 2, 1, 3) ++ .contiguous() ++ .view(e, n, k) ++ ) ++ layer.w13_weight_scale.data = ( ++ layer.w13_weight_scale.data.view(e, n // 2, 2, -1) ++ .permute(0, 2, 1, 3) ++ .contiguous() ++ .view(e, n, -1) ++ ) ++ ++ layer.w13_weight.data = shuffle_weight_a16w4(layer.w13_weight, 16, True) ++ shuffled_w13_scale = shuffle_scale_a16w4( ++ layer.w13_weight_scale.view(-1, layer.w13_weight_scale.shape[-1]), ++ self.num_experts, ++ True, ++ ) ++ ++ layer.w2_weight.data = shuffle_weight_a16w4(layer.w2_weight, 16, False) ++ shuffled_w2_scale = shuffle_scale_a16w4( ++ layer.w2_weight_scale.view(-1, layer.w2_weight_scale.shape[-1]), ++ self.num_experts, ++ False, ++ ) ++ ++ layer.w13_weight_bias.data = ( ++ layer.w13_weight_bias.data.view(-1, n // 2, 2) ++ .permute(0, 2, 1) ++ .contiguous() ++ .view(-1, n) ++ ) ++ ++ layer.w13_weight_scale = torch.nn.Parameter( ++ shuffled_w13_scale, requires_grad=False ++ ) ++ layer.w2_weight_scale = torch.nn.Parameter( ++ shuffled_w2_scale, requires_grad=False ++ ) ++ ++ return + + if self.use_triton_kernels: + +@@ -680,6 +750,43 @@ class Mxfp4MoEMethod(FusedMoEMethodBase): + output=symm_output, + )[0] + return StandardCombineInput(hidden_states=trtllm_gen_output) ++ if _use_aiter: ++ topk_weights, topk_ids, _ = topk_output ++ ++ if hasattr(torch, "float4_e2m1fn_x2"): ++ w13_weight = layer.w13_weight.view(torch.float4_e2m1fn_x2) ++ w2_weight = layer.w2_weight.view(torch.float4_e2m1fn_x2) ++ else: ++ w13_weight = layer.w13_weight ++ w2_weight = layer.w2_weight ++ ++ origi_hidden_size = self.hidden_size - self.hidden_pad ++ ++ x = torch.nn.functional.pad( ++ x, ++ (0, self.hidden_pad), ++ mode="constant", ++ value=0.0, ++ ) ++ ++ output = fused_moe( ++ x, ++ w13_weight, ++ 
w2_weight, ++ topk_weights, ++ topk_ids, ++ expert_mask=layer.expert_mask_gpu, ++ activation=ActivationType.Swiglu, ++ quant_type=QuantType.per_1x32, ++ w1_scale=layer.w13_weight_scale, ++ w2_scale=layer.w2_weight_scale, ++ doweight_stage1=self.moe_runner_config.apply_router_weight_on_input, ++ hidden_pad=self.hidden_pad, ++ intermediate_pad=self.intermediate_pad, ++ bias1=layer.w13_weight_bias, ++ bias2=layer.w2_weight_bias, ++ ) ++ return StandardCombineInput(hidden_states=output) + + backend = self.runner.runner_backend + if backend.is_triton_kernels(): +diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py +index 49975de64..2230a9dae 100644 +--- a/python/sglang/srt/server_args.py ++++ b/python/sglang/srt/server_args.py +@@ -1358,6 +1358,13 @@ class ServerArgs: + logger.warning( + "Detected SM100 and MXFP4 quantization format for GPT-OSS model, enabling FlashInfer MXFP4 MOE kernel." + ) ++ elif ( ++ is_hip() and get_bool_env_var("SGLANG_USE_AITER") ++ ) and is_mxfp4_quant_format: ++ self.moe_runner_backend = "auto" ++ logger.warning( ++ "Detected ROCm and MXFP4 quantization format for GPT-OSS model, enabling aiter MXFP4 MOE kernel." 
++ ) + elif self.ep_size == 1 and is_triton_kernels_available(): + self.moe_runner_backend = "triton_kernel" + logger.warning( +-- +2.34.1 + diff --git a/python/sglang/srt/layers/layernorm.py b/python/sglang/srt/layers/layernorm.py index 7fde05894b59..6b5b724c9cd6 100644 --- a/python/sglang/srt/layers/layernorm.py +++ b/python/sglang/srt/layers/layernorm.py @@ -64,11 +64,21 @@ gemma_rmsnorm, rmsnorm, ) +_vllm_layernorm_available = False +rms_norm = None +fused_add_rms_norm = None + if _use_aiter: from aiter import rmsnorm2d_fwd as rms_norm from aiter import rmsnorm2d_fwd_with_add as fused_add_rms_norm elif _is_hip: - from vllm._custom_ops import fused_add_rms_norm, rms_norm + try: + from vllm._custom_ops import fused_add_rms_norm, rms_norm + + _vllm_layernorm_available = True + except ImportError: + # Will use forward_native as fallback + pass logger = logging.getLogger(__name__) @@ -176,6 +186,10 @@ def forward_hip( residual: Optional[torch.Tensor] = None, post_residual_addition: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: + # Fallback to native implementation if vllm is not available + if not _use_aiter and not _vllm_layernorm_available: + return self.forward_native(x, residual, post_residual_addition) + if not x.is_contiguous(): # NOTE: Remove this if aiter kernel supports discontinuous input x = x.contiguous() diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py b/python/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py index a1885fade143..a21143b24705 100644 --- a/python/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +++ b/python/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py @@ -53,8 +53,7 @@ from aiter import moe_sum except ImportError: raise ImportError("aiter is required when SGLANG_USE_AITER is set to True") - else: - from vllm import _custom_ops as vllm_ops + # No vllm import needed - using triton/torch.compile fallback for moe_sum padding_size = 128 if 
bool(int(os.getenv("SGLANG_MOE_PADDING", "0"))) else 0 @@ -492,9 +491,10 @@ def fused_experts_impl( activation, ) else: - vllm_ops.silu_and_mul( - intermediate_cache2, intermediate_cache1.view(-1, N) - ) + # Native PyTorch fallback for non-CUDA/HIP environments + x = intermediate_cache1.view(-1, N) + d = x.shape[-1] // 2 + intermediate_cache2.copy_(F.silu(x[..., :d]) * x[..., d:]) elif activation == "gelu" and is_gated: assert gemm1_alpha is None, "gemm1_alpha is not supported for gelu" assert gemm1_limit is None, "gemm1_limit is not supported for gelu" @@ -512,9 +512,10 @@ def fused_experts_impl( activation, ) else: - vllm_ops.gelu_and_mul( - intermediate_cache2, intermediate_cache1.view(-1, N) - ) + # Native PyTorch fallback for non-CUDA/HIP environments + x = intermediate_cache1.view(-1, N) + d = x.shape[-1] // 2 + intermediate_cache2.copy_(F.gelu(x[..., :d]) * x[..., d:]) # Activation function without multiplication elif activation == "silu" and not is_gated: intermediate_cache2 = F.silu(intermediate_cache1.view(-1, N)) @@ -607,9 +608,9 @@ def fused_experts_impl( routed_scaling_factor, ) else: - vllm_ops.moe_sum( - intermediate_cache3.view(*intermediate_cache3.shape), - out_hidden_states[begin_chunk_idx:end_chunk_idx], + # Native PyTorch fallback for non-CUDA/HIP environments + out_hidden_states[begin_chunk_idx:end_chunk_idx].copy_( + intermediate_cache3.view(*intermediate_cache3.shape).sum(dim=1) ) return out_hidden_states diff --git a/python/sglang/srt/layers/moe/moe_runner/triton.py b/python/sglang/srt/layers/moe/moe_runner/triton.py index cdf3e9a471f3..c527118c737a 100644 --- a/python/sglang/srt/layers/moe/moe_runner/triton.py +++ b/python/sglang/srt/layers/moe/moe_runner/triton.py @@ -36,6 +36,8 @@ _MOE_PADDING_SIZE = 128 if bool(int(os.getenv("SGLANG_MOE_PADDING", "0"))) else 0 +_vllm_moe_available = False + if _is_cuda or _is_hip: from sgl_kernel import gelu_and_mul, silu_and_mul @@ -48,7 +50,13 @@ "aiter is required when SGLANG_USE_AITER is set to True" 
) + else: - from vllm import _custom_ops as vllm_ops # moe_sum + try: + from vllm import _custom_ops as vllm_ops # moe_sum + + _vllm_moe_available = True + except ImportError: + # Will use triton fallback + pass elif _is_cpu and _is_cpu_amx_available: pass @@ -307,11 +315,22 @@ def run( intermediate_cache3.view(*intermediate_cache3.shape), out_hidden_states, ) - else: + elif _vllm_moe_available: vllm_ops.moe_sum( intermediate_cache3.view(*intermediate_cache3.shape), out_hidden_states, ) + else: + # Triton fallback when vllm is not available + from sglang.srt.layers.moe.fused_moe_triton.fused_moe_triton_kernels import ( + moe_sum_reduce_triton, + ) + + moe_sum_reduce_triton( + intermediate_cache3.view(*intermediate_cache3.shape), + out_hidden_states, + 1.0, # routed_scaling_factor + ) else: vllm_ops.moe_sum( intermediate_cache3.view(*intermediate_cache3.shape), diff --git a/python/sglang/srt/layers/quantization/fp8_kernel.py b/python/sglang/srt/layers/quantization/fp8_kernel.py index 7701f9757f52..2ce09e76e0b1 100644 --- a/python/sglang/srt/layers/quantization/fp8_kernel.py +++ b/python/sglang/srt/layers/quantization/fp8_kernel.py @@ -58,6 +58,8 @@ enable_sgl_per_token_group_quant_8bit = False +_vllm_available = False + if _is_hip: if _use_aiter: try: @@ -69,10 +71,14 @@ except ImportError: raise ImportError("aiter is required when SGLANG_USE_AITER is set to True") else: + # Try to import vllm for fp8 quant ops, but don't fail if it is missing: + # scaled_fp8_quant falls back to the local _triton_* helpers when _vllm_available is False try: import vllm._C # noqa: F401 + + _vllm_available = True except ImportError: - raise ImportError("vllm is required when SGLANG_USE_AITER is set to False") + pass logger = logging.getLogger(__name__) @@ -1393,6 +1399,54 @@ def per_token_group_quant_mla_deep_gemm_masked_fp8( """ if _is_hip: + def _triton_dynamic_per_token_quant_fp8(output, input, scale): + """Triton fallback for dynamic per-token FP8 quantization (launches _per_token_group_quant_8bit with one group per row).""" + M, N = input.shape + BLOCK =
triton.next_power_of_2(N) + num_warps = min(max(BLOCK // 256, 1), 8) + eps = 1e-10 + if _is_hip: + bit8_max = 224.0 + else: + bit8_max = fp8_max + bit8_min = -bit8_max + _per_token_group_quant_8bit[(M,)]( + input, + output, + scale, + N, # group_size = N (per token) + N, + eps, + bit8_min=bit8_min, + bit8_max=bit8_max, + BLOCK=BLOCK, + num_warps=num_warps, + num_stages=1, + ) + + def _triton_dynamic_per_tensor_quant_fp8(output, input, scale): + """Fallback for dynamic per-tensor FP8 quantization; plain torch ops despite the _triton_ prefix.""" + # Compute scale from input + eps = 1e-10 + if _is_hip: + bit8_max = 224.0 + else: + bit8_max = fp8_max + absmax = torch.max(torch.abs(input)).item() + scale_val = max(absmax, eps) / bit8_max + scale.fill_(scale_val) + # Quantize with computed scale + output.copy_((input / scale_val).clamp(-bit8_max, bit8_max).to(output.dtype)) + + def _triton_static_quant_fp8(output, input, scale): + """Fallback for static FP8 quantization with a caller-provided scalar scale; plain torch ops despite the _triton_ prefix.""" + if _is_hip: + bit8_max = 224.0 + else: + bit8_max = fp8_max + scale_val = scale.item() + output.copy_((input / scale_val).clamp(-bit8_max, bit8_max).to(output.dtype)) + def scaled_fp8_quant( input: torch.Tensor, scale: Optional[torch.Tensor] = None, @@ -1413,16 +1467,22 @@ def scaled_fp8_quant( ) if _use_aiter: dynamic_per_token_scaled_quant(output, input, scale) - else: + elif _vllm_available: torch.ops._C.dynamic_per_token_scaled_fp8_quant( output, input.contiguous(), scale, None ) + else: + _triton_dynamic_per_token_quant_fp8( + output, input.contiguous(), scale + ) else: scale = torch.zeros(1, device=input.device, dtype=torch.float32) if _use_aiter: dynamic_per_tensor_quant(output, input, scale) - else: + elif _vllm_available: torch.ops._C.dynamic_scaled_fp8_quant(output, input, scale) + else: + _triton_dynamic_per_tensor_quant_fp8(output, input, scale) else: # Static scaling assert ( @@ -1430,8 +1490,10 @@ def scaled_fp8_quant( ), f"Expected scalar scale, got numel={scale.numel()}" if _use_aiter:
static_per_tensor_quant(output, input, scale) - else: + elif _vllm_available: torch.ops._C.static_scaled_fp8_quant(output, input, scale) + else: + _triton_static_quant_fp8(output, input, scale) return output, scale diff --git a/scripts/ci/amd/amd_ci_start_container.sh b/scripts/ci/amd/amd_ci_start_container.sh index ad6cc198bf89..7539a80ac938 100755 --- a/scripts/ci/amd/amd_ci_start_container.sh +++ b/scripts/ci/amd/amd_ci_start_container.sh @@ -27,13 +27,25 @@ DEFAULT_MI35X_BASE_TAG="${SGLANG_VERSION}-${ROCM_VERSION}-mi35x" # Parse command line arguments MI30X_BASE_TAG="${DEFAULT_MI30X_BASE_TAG}" MI35X_BASE_TAG="${DEFAULT_MI35X_BASE_TAG}" +CUSTOM_IMAGE="" +BUILD_FROM_DOCKERFILE="" +GPU_ARCH_BUILD="" while [[ $# -gt 0 ]]; do case $1 in --mi30x-base-tag) MI30X_BASE_TAG="$2"; shift 2;; --mi35x-base-tag) MI35X_BASE_TAG="$2"; shift 2;; + --custom-image) CUSTOM_IMAGE="$2"; shift 2;; + --build-from-dockerfile) BUILD_FROM_DOCKERFILE="1"; shift;; + --gpu-arch) GPU_ARCH_BUILD="$2"; shift 2;; -h|--help) - echo "Usage: $0 [--mi30x-base-tag TAG] [--mi35x-base-tag TAG]" + echo "Usage: $0 [OPTIONS]" + echo "Options:" + echo " --mi30x-base-tag TAG Override MI30x base image tag" + echo " --mi35x-base-tag TAG Override MI35x base image tag" + echo " --custom-image IMAGE Use a specific Docker image directly" + echo " --build-from-dockerfile Build image from docker/rocm.Dockerfile (docker/rocm720.Dockerfile if --gpu-arch contains rocm720)" + echo " --gpu-arch ARCH GPU architecture for Dockerfile build; required with --build-from-dockerfile (e.g., gfx950-rocm720)" exit 0 ;; *) echo "Unknown option $1"; exit 1;; @@ -142,10 +154,50 @@ find_latest_image() { fi } -# Pull and run the latest image -IMAGE=$(find_latest_image "${GPU_ARCH}") -echo "Pulling Docker image: ${IMAGE}" -docker pull "${IMAGE}" +# Determine which image to use: --custom-image first, then --build-from-dockerfile, else the latest pre-built image +if [[ -n "${CUSTOM_IMAGE}" ]]; then + # Use explicitly provided custom image + IMAGE="${CUSTOM_IMAGE}" + echo "Using custom image: ${IMAGE}" + docker pull "${IMAGE}" +elif [[ -n "${BUILD_FROM_DOCKERFILE}" ]]; then + # Build image from Dockerfile (the resulting tag is built locally, not pulled) + if [[ -z
"${GPU_ARCH_BUILD}" ]]; then + echo "Error: --gpu-arch is required when using --build-from-dockerfile" >&2 + exit 1 + fi + + DOCKERFILE_DIR="${GITHUB_WORKSPACE:-$PWD}/docker" + + # Use rocm720.Dockerfile when --gpu-arch contains "rocm720", otherwise use rocm.Dockerfile + if [[ "${GPU_ARCH_BUILD}" == *"rocm720"* ]]; then + DOCKERFILE="${DOCKERFILE_DIR}/rocm720.Dockerfile" + else + DOCKERFILE="${DOCKERFILE_DIR}/rocm.Dockerfile" + fi + + if [[ ! -f "${DOCKERFILE}" ]]; then + echo "Error: Dockerfile not found at ${DOCKERFILE}" >&2 + exit 1 + fi + + IMAGE="sglang-ci:${GPU_ARCH_BUILD}-$(date +%Y%m%d)" + echo "Building Docker image from ${DOCKERFILE} with GPU_ARCH=${GPU_ARCH_BUILD}..." + + # Pass full GPU_ARCH (e.g., gfx950-rocm720); the Dockerfile is expected to strip the suffix. NOTE(review): SGL_BRANCH is hard-coded to "main" - confirm the build should not follow the checked-out ref + docker build \ + --build-arg GPU_ARCH="${GPU_ARCH_BUILD}" \ + --build-arg SGL_BRANCH="main" \ + -t "${IMAGE}" \ + -f "${DOCKERFILE}" \ + "${DOCKERFILE_DIR}" + echo "Successfully built image: ${IMAGE}" +else + # Find the latest pre-built image + IMAGE=$(find_latest_image "${GPU_ARCH}") + echo "Pulling Docker image: ${IMAGE}" + docker pull "${IMAGE}" +fi CACHE_HOST=/home/runner/sgl-data if [[ -d "$CACHE_HOST" ]]; then