diff --git a/.github/workflows/nightly-test-amd-rocm720.yml b/.github/workflows/nightly-test-amd-rocm720.yml index bc6328a2a2db..80d6a1afa324 100644 --- a/.github/workflows/nightly-test-amd-rocm720.yml +++ b/.github/workflows/nightly-test-amd-rocm720.yml @@ -21,46 +21,10 @@ on: type: boolean default: true job_filter: - description: 'Select which job to run (leave empty or "all" to run all jobs)' + description: 'Comma-separated list of jobs to run (e.g. "nightly-8-gpu-grok2-rocm720,nightly-8-gpu-deepseek-v31-rocm720"). Leave empty or "all" to run all jobs.' required: false - type: choice + type: string default: 'all' - options: - - 'all' - # MI30x ROCm 7.2 Unit Tests - - 'nightly-test-1-gpu-unit-rocm720' - # MI30x ROCm 7.2 Accuracy Tests (GSM8K / MMMU) - - 'nightly-accuracy-2-gpu-rocm720' - - 'nightly-accuracy-2-gpu-vlm-rocm720' - - 'nightly-perf-2-gpu-text-rocm720' - - 'nightly-perf-2-gpu-vlm-rocm720' - - 'nightly-accuracy-8-gpu-rocm720' - # MI30x ROCm 7.2 Accuracy + Performance Tests (combined) - - 'nightly-8-gpu-grok1-int4-rocm720' - - 'nightly-8-gpu-grok2-rocm720' - - 'nightly-8-gpu-deepseek-v31-rocm720' - - 'nightly-8-gpu-deepseek-v32-rocm720' - - 'nightly-8-gpu-deepseek-v32-mtp-rocm720' - - 'nightly-8-gpu-kimi-k25-rocm720' - - 'nightly-8-gpu-qwen3-235b-rocm720' - - 'nightly-8-gpu-qwen35-rocm720' - - 'nightly-8-gpu-glm5-rocm720' - - 'nightly-8-gpu-minimax-m25-rocm720' - # MI35x ROCm 7.2 jobs - - 'nightly-test-1-gpu-mi35x-rocm720' - - 'nightly-8-gpu-mi35x-qwen3-235b-mxfp4-rocm720' - - 'nightly-8-gpu-mi35x-qwen35-rocm720' - - 'nightly-accuracy-8-gpu-mi35x-rocm720' - - 'nightly-8-gpu-mi35x-grok1-int4-rocm720' - - 'nightly-8-gpu-mi35x-grok2-rocm720' - - 'nightly-8-gpu-mi35x-deepseek-r1-mxfp4-rocm720' - - 'nightly-accuracy-8-gpu-mi35x-deepseek-v32-rocm720' - - 'nightly-accuracy-8-gpu-mi35x-deepseek-v32-mtp-rocm720' - - 'nightly-perf-8-gpu-mi35x-deepseek-v32-basic-rocm720' - - 'nightly-perf-8-gpu-mi35x-deepseek-v32-mtp-rocm720' - - 
'nightly-8-gpu-mi35x-kimi-k25-rocm720' - - 'nightly-8-gpu-mi35x-glm5-rocm720' - - 'nightly-8-gpu-mi35x-minimax-m25-rocm720' workflow_call: inputs: ref: @@ -98,7 +62,7 @@ jobs: # ============================================== MI30x ROCm 7.2 Unit Tests ============================================== # 1-GPU Unit Tests - LoRA, debug utils, scheduler, etc. (MI30x ROCm 7.2) nightly-test-1-gpu-unit-rocm720: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-test-1-gpu-unit-rocm720') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-test-1-gpu-unit-rocm720,')) runs-on: linux-mi325-1gpu-sglang steps: - name: Checkout code @@ -127,7 +91,7 @@ jobs: # ============================================== MI30x ROCm 7.2 Accuracy Tests ============================================== # 2-GPU Accuracy Tests - GSM8K eval (MI30x ROCm 7.2) nightly-accuracy-2-gpu-rocm720: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-accuracy-2-gpu-rocm720') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-accuracy-2-gpu-rocm720,')) runs-on: linux-mi325-2gpu-sglang steps: - name: Checkout code @@ -155,7 +119,7 @@ jobs: # 2-GPU VLM Accuracy Tests - Vision-Language Models MMMU evaluation (ROCm 7.2) nightly-accuracy-2-gpu-vlm-rocm720: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-accuracy-2-gpu-vlm-rocm720') + if: 
(github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-accuracy-2-gpu-vlm-rocm720,')) runs-on: linux-mi325-2gpu-sglang steps: - name: Checkout code @@ -184,7 +148,7 @@ jobs: # 2-GPU Text Models Performance Tests (ROCm 7.2) nightly-perf-2-gpu-text-rocm720: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-perf-2-gpu-text-rocm720') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-perf-2-gpu-text-rocm720,')) runs-on: linux-mi325-2gpu-sglang steps: - name: Checkout code @@ -214,7 +178,7 @@ jobs: # 2-GPU VLM Performance Tests (ROCm 7.2) nightly-perf-2-gpu-vlm-rocm720: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-perf-2-gpu-vlm-rocm720') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-perf-2-gpu-vlm-rocm720,')) runs-on: linux-mi325-2gpu-sglang steps: - name: Checkout code @@ -244,7 +208,7 @@ jobs: # 8-GPU Accuracy Tests - GPT-OSS, Grok1-FP8 (ROCm 7.2) nightly-accuracy-8-gpu-rocm720: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-accuracy-8-gpu-rocm720') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', 
inputs.job_filter), ',nightly-accuracy-8-gpu-rocm720,')) runs-on: linux-mi325-8gpu-sglang steps: - name: Checkout code @@ -284,7 +248,7 @@ jobs: # ============================================== MI30x ROCm 7.2 Combined Accuracy + Performance Tests ============================================== # 8-GPU Grok1-INT4 (Accuracy + Performance) ROCm 7.2 nightly-8-gpu-grok1-int4-rocm720: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-grok1-int4-rocm720') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-8-gpu-grok1-int4-rocm720,')) runs-on: linux-mi325-8gpu-sglang steps: - name: Checkout code @@ -327,7 +291,7 @@ jobs: # 8-GPU Grok2 (Accuracy + Performance) ROCm 7.2 nightly-8-gpu-grok2-rocm720: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-grok2-rocm720') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-8-gpu-grok2-rocm720,')) runs-on: linux-mi325-8gpu-sglang steps: - name: Checkout code @@ -370,7 +334,7 @@ jobs: # 8-GPU DeepSeek-V3.1 (Accuracy + Performance) ROCm 7.2 nightly-8-gpu-deepseek-v31-rocm720: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-deepseek-v31-rocm720') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', 
inputs.job_filter), ',nightly-8-gpu-deepseek-v31-rocm720,')) runs-on: linux-mi325-8gpu-sglang steps: - name: Checkout code @@ -413,7 +377,7 @@ jobs: # 8-GPU DeepSeek-V3.2 (Basic Accuracy + Perf) ROCm 7.2 nightly-8-gpu-deepseek-v32-rocm720: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-deepseek-v32-rocm720') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-8-gpu-deepseek-v32-rocm720,')) runs-on: linux-mi325-8gpu-sglang steps: - name: Checkout code @@ -454,7 +418,7 @@ jobs: # 8-GPU DeepSeek-V3.2 MTP (MTP Accuracy + Perf) ROCm 7.2 nightly-8-gpu-deepseek-v32-mtp-rocm720: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-deepseek-v32-mtp-rocm720') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-8-gpu-deepseek-v32-mtp-rocm720,')) runs-on: linux-mi325-8gpu-sglang steps: - name: Checkout code @@ -493,9 +457,39 @@ jobs: echo "$(> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} + # 8-GPU DeepSeek-V3 KV FP8 (Basic + MTP with --kv-cache-dtype fp8_e4m3) ROCm 7.2 + nightly-8-gpu-deepseek-v3-kv-fp8-rocm720: + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-8-gpu-deepseek-v3-kv-fp8-rocm720,')) + runs-on: linux-mi325-8gpu-sglang + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ inputs.ref || github.ref }} + + - 
name: Setup docker (ROCm 7.2) + run: | + touch github_summary.md + bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720 + env: + GITHUB_WORKSPACE: ${{ github.workspace }} + + - name: Install dependencies + run: bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-test-time-deps + + - name: DeepSeek-V3 KV FP8 Test ROCm 7.2 (8-GPU Basic + MTP) + timeout-minutes: 120 + run: | + > github_summary.md # Clear summary file + bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ + -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ + python3 run_suite.py --hw amd --suite nightly-amd-8-gpu-deepseek-v3-kv-fp8 --nightly --timeout-per-file 3600 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? + echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true + exit ${TEST_EXIT_CODE:-0} + + # 8-GPU Kimi-K2.5 (Accuracy) ROCm 7.2 nightly-8-gpu-kimi-k25-rocm720: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-kimi-k25-rocm720') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-8-gpu-kimi-k25-rocm720,')) runs-on: linux-mi325-8gpu-sglang steps: - name: Checkout code @@ -525,7 +519,7 @@ jobs: # 8-GPU Qwen3-235B (Accuracy + Performance) ROCm 7.2 nightly-8-gpu-qwen3-235b-rocm720: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-qwen3-235b-rocm720') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-8-gpu-qwen3-235b-rocm720,')) runs-on: linux-mi325-8gpu-sglang steps: - 
name: Checkout code @@ -555,7 +549,7 @@ jobs: # 8-GPU Qwen 3.5 (Accuracy) ROCm 7.2 nightly-8-gpu-qwen35-rocm720: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-qwen35-rocm720') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-8-gpu-qwen35-rocm720,')) runs-on: linux-mi325-8gpu-sglang steps: - name: Checkout code @@ -587,7 +581,7 @@ jobs: # 8-GPU GLM-5 (Accuracy) ROCm 7.2 nightly-8-gpu-glm5-rocm720: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-glm5-rocm720') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-8-gpu-glm5-rocm720,')) runs-on: linux-mi325-8gpu-sglang steps: - name: Checkout code @@ -620,7 +614,7 @@ jobs: # 8-GPU MiniMax-M2.5 (Accuracy) ROCm 7.2 nightly-8-gpu-minimax-m25-rocm720: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-minimax-m25-rocm720') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-8-gpu-minimax-m25-rocm720,')) runs-on: linux-mi325-8gpu-sglang steps: - name: Checkout code @@ -652,7 +646,7 @@ jobs: # ============================================== MI35x ROCm 7.2 Tests ============================================== # MI35x 1-GPU ROCm 7.2 tests nightly-test-1-gpu-mi35x-rocm720: 
- if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-test-1-gpu-mi35x-rocm720') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-test-1-gpu-mi35x-rocm720,')) runs-on: linux-mi35x-gpu-1 steps: - name: Checkout code @@ -681,7 +675,7 @@ jobs: # MI35x 8-GPU Accuracy Tests - GPT-OSS (ROCm 7.2) nightly-accuracy-8-gpu-mi35x-rocm720: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-accuracy-8-gpu-mi35x-rocm720') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-accuracy-8-gpu-mi35x-rocm720,')) runs-on: linux-mi35x-gpu-8 steps: - name: Checkout code @@ -713,7 +707,7 @@ jobs: # MI35x 8-GPU Grok1-INT4 (Accuracy + Performance) ROCm 7.2 nightly-8-gpu-mi35x-grok1-int4-rocm720: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-mi35x-grok1-int4-rocm720') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-8-gpu-mi35x-grok1-int4-rocm720,')) runs-on: linux-mi35x-gpu-8 steps: - name: Checkout code @@ -759,7 +753,7 @@ jobs: # MI35x 8-GPU Grok2 (Accuracy + Performance) ROCm 7.2 nightly-8-gpu-mi35x-grok2-rocm720: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || 
inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-mi35x-grok2-rocm720') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-8-gpu-mi35x-grok2-rocm720,')) runs-on: linux-mi35x-gpu-8 steps: - name: Checkout code @@ -805,7 +799,7 @@ jobs: # MI35x 8-GPU DeepSeek-R1-MXFP4 (Accuracy + Performance) ROCm 7.2 nightly-8-gpu-mi35x-deepseek-r1-mxfp4-rocm720: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-mi35x-deepseek-r1-mxfp4-rocm720') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-8-gpu-mi35x-deepseek-r1-mxfp4-rocm720,')) runs-on: linux-mi35x-gpu-8 steps: - name: Checkout code @@ -847,9 +841,97 @@ jobs: echo "$(> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} + # MI35x 8-GPU DeepSeek-R1-MXFP4 KV FP8 (Accuracy + Performance) ROCm 7.2 + nightly-8-gpu-mi35x-deepseek-r1-mxfp4-kv-fp8-rocm720: + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-8-gpu-mi35x-deepseek-r1-mxfp4-kv-fp8-rocm720,')) + runs-on: linux-mi35x-gpu-8 + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ inputs.ref || github.ref }} + + - name: Setup docker (ROCm 7.2) + run: | + touch github_summary.md + bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720 + env: + GITHUB_WORKSPACE: ${{ github.workspace }} + + - name: Install dependencies + run: | + bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-test-time-deps + # Install tabulate for 
run_suite.py (missing in MI35x container) + bash scripts/ci/amd/amd_ci_exec.sh pip install tabulate + + - name: Accuracy Test MI35x ROCm 7.2 (8-GPU DeepSeek-R1-MXFP4 KV FP8) + timeout-minutes: 180 + run: | + > github_summary.md # Clear summary file + bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ + -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ + python3 run_suite.py --hw amd --suite nightly-amd-8-gpu-mi35x-deepseek-r1-mxfp4-kv-fp8 --nightly --timeout-per-file 7200 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? + echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true + exit ${TEST_EXIT_CODE:-0} + + - name: Performance Test MI35x ROCm 7.2 (8-GPU DeepSeek-R1-MXFP4 KV FP8) + timeout-minutes: 300 + continue-on-error: true + run: | + > github_summary.md # Clear summary file + bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ + -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ + python3 registered/amd/perf/mi35x/test_deepseek_r1_mxfp4_kv_fp8_perf_mi35x.py || TEST_EXIT_CODE=$? 
+ echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true + exit ${TEST_EXIT_CODE:-0} + + # MI35x 8-GPU DeepSeek-R1-MXFP4 AllReduce Fusion (Accuracy + Performance) ROCm 7.2 + nightly-8-gpu-mi35x-deepseek-r1-mxfp4-ar-fusion-rocm720: + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-8-gpu-mi35x-deepseek-r1-mxfp4-ar-fusion-rocm720,')) + runs-on: linux-mi35x-gpu-8 + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ inputs.ref || github.ref }} + + - name: Setup docker (ROCm 7.2) + run: | + touch github_summary.md + bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720 + env: + GITHUB_WORKSPACE: ${{ github.workspace }} + + - name: Install dependencies + run: | + bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-test-time-deps + # Install tabulate for run_suite.py (missing in MI35x container) + bash scripts/ci/amd/amd_ci_exec.sh pip install tabulate + + - name: Accuracy Test MI35x ROCm 7.2 (8-GPU DeepSeek-R1-MXFP4 AllReduce Fusion) + timeout-minutes: 180 + run: | + > github_summary.md # Clear summary file + bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ + -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ + python3 run_suite.py --hw amd --suite nightly-amd-8-gpu-mi35x-deepseek-r1-mxfp4-ar-fusion --nightly --timeout-per-file 7200 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? 
+ echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true + exit ${TEST_EXIT_CODE:-0} + + - name: Performance Test MI35x ROCm 7.2 (8-GPU DeepSeek-R1-MXFP4 AllReduce Fusion) + timeout-minutes: 300 + continue-on-error: true + run: | + > github_summary.md # Clear summary file + bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ + -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ + python3 registered/amd/perf/mi35x/test_deepseek_r1_mxfp4_ar_fusion_perf_mi35x.py || TEST_EXIT_CODE=$? + echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true + exit ${TEST_EXIT_CODE:-0} + # MI35x 8-GPU DeepSeek-V3.2 Accuracy Test (ROCm 7.2) nightly-accuracy-8-gpu-mi35x-deepseek-v32-rocm720: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-accuracy-8-gpu-mi35x-deepseek-v32-rocm720') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-accuracy-8-gpu-mi35x-deepseek-v32-rocm720,')) runs-on: linux-mi35x-gpu-8 steps: - name: Checkout code @@ -882,7 +964,7 @@ jobs: # MI35x 8-GPU DeepSeek-V3.2 TP+MTP Accuracy Test (ROCm 7.2) nightly-accuracy-8-gpu-mi35x-deepseek-v32-mtp-rocm720: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-accuracy-8-gpu-mi35x-deepseek-v32-mtp-rocm720') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-accuracy-8-gpu-mi35x-deepseek-v32-mtp-rocm720,')) runs-on: linux-mi35x-gpu-8 steps: - name: Checkout code @@ -915,7 +997,7 @@ jobs: # MI35x 8-GPU DeepSeek-V3.2 Performance Test (Basic) ROCm 7.2 
nightly-perf-8-gpu-mi35x-deepseek-v32-basic-rocm720: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-perf-8-gpu-mi35x-deepseek-v32-basic-rocm720') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-perf-8-gpu-mi35x-deepseek-v32-basic-rocm720,')) runs-on: linux-mi35x-gpu-8 steps: - name: Checkout code @@ -948,7 +1030,7 @@ jobs: # MI35x 8-GPU Kimi-K2.5 (Accuracy) ROCm 7.2 nightly-8-gpu-mi35x-kimi-k25-rocm720: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-mi35x-kimi-k25-rocm720') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-8-gpu-mi35x-kimi-k25-rocm720,')) runs-on: linux-mi35x-gpu-8 steps: - name: Checkout code @@ -981,7 +1063,7 @@ jobs: # MI35x 8-GPU Qwen3-235B-MXFP4 (Accuracy + Performance) ROCm 7.2 nightly-8-gpu-mi35x-qwen3-235b-mxfp4-rocm720: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-mi35x-qwen3-235b-mxfp4-rocm720') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-8-gpu-mi35x-qwen3-235b-mxfp4-rocm720,')) runs-on: linux-mi35x-gpu-8 steps: - name: Checkout code @@ -1014,7 +1096,7 @@ jobs: # MI35x 8-GPU Qwen 3.5 (Accuracy) ROCm 7.2 nightly-8-gpu-mi35x-qwen35-rocm720: - if: (github.repository == 
'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-mi35x-qwen35-rocm720') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-8-gpu-mi35x-qwen35-rocm720,')) runs-on: linux-mi35x-gpu-8 steps: - name: Checkout code @@ -1046,7 +1128,7 @@ jobs: exit ${TEST_EXIT_CODE:-0} nightly-8-gpu-mi35x-glm5-rocm720: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-mi35x-glm5-rocm720') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-8-gpu-mi35x-glm5-rocm720,')) runs-on: linux-mi35x-gpu-8 steps: - name: Checkout code @@ -1081,7 +1163,7 @@ jobs: # MI35x 8-GPU MiniMax-M2.5 (Accuracy) ROCm 7.2 nightly-8-gpu-mi35x-minimax-m25-rocm720: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-mi35x-minimax-m25-rocm720') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-8-gpu-mi35x-minimax-m25-rocm720,')) runs-on: linux-mi35x-gpu-8 steps: - name: Checkout code @@ -1114,7 +1196,7 @@ jobs: # MI35x 8-GPU DeepSeek-V3.2 Performance Test (MTP) ROCm 7.2 nightly-perf-8-gpu-mi35x-deepseek-v32-mtp-rocm720: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 
'nightly-perf-8-gpu-mi35x-deepseek-v32-mtp-rocm720') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-perf-8-gpu-mi35x-deepseek-v32-mtp-rocm720,')) runs-on: linux-mi35x-gpu-8 steps: - name: Checkout code @@ -1163,6 +1245,7 @@ jobs: - nightly-8-gpu-deepseek-v31-rocm720 - nightly-8-gpu-deepseek-v32-rocm720 - nightly-8-gpu-deepseek-v32-mtp-rocm720 + - nightly-8-gpu-deepseek-v3-kv-fp8-rocm720 - nightly-8-gpu-kimi-k25-rocm720 - nightly-8-gpu-qwen3-235b-rocm720 - nightly-8-gpu-qwen35-rocm720 @@ -1174,6 +1257,8 @@ jobs: - nightly-8-gpu-mi35x-grok1-int4-rocm720 - nightly-8-gpu-mi35x-grok2-rocm720 - nightly-8-gpu-mi35x-deepseek-r1-mxfp4-rocm720 + - nightly-8-gpu-mi35x-deepseek-r1-mxfp4-kv-fp8-rocm720 + - nightly-8-gpu-mi35x-deepseek-r1-mxfp4-ar-fusion-rocm720 - nightly-accuracy-8-gpu-mi35x-deepseek-v32-rocm720 - nightly-accuracy-8-gpu-mi35x-deepseek-v32-mtp-rocm720 - nightly-perf-8-gpu-mi35x-deepseek-v32-basic-rocm720 diff --git a/.github/workflows/nightly-test-amd.yml b/.github/workflows/nightly-test-amd.yml index 3dc8f1270707..54e0202ccc5a 100644 --- a/.github/workflows/nightly-test-amd.yml +++ b/.github/workflows/nightly-test-amd.yml @@ -21,46 +21,10 @@ on: type: boolean default: true job_filter: - description: 'Select which job to run (leave empty or "all" to run all jobs)' + description: 'Comma-separated list of jobs to run (e.g. "nightly-8-gpu-grok2,nightly-8-gpu-deepseek-v31"). Leave empty or "all" to run all jobs.' 
required: false - type: choice + type: string default: 'all' - options: - - 'all' - # MI30x Unit Tests - - 'nightly-test-1-gpu-unit' - # MI30x Accuracy Tests (GSM8K / MMMU) - - 'nightly-accuracy-2-gpu' - - 'nightly-accuracy-2-gpu-vlm' - - 'nightly-perf-2-gpu-text' - - 'nightly-perf-2-gpu-vlm' - - 'nightly-accuracy-8-gpu' - # MI30x Accuracy + Performance Tests (combined) - - 'nightly-8-gpu-grok1-int4' - - 'nightly-8-gpu-grok2' - - 'nightly-8-gpu-deepseek-v31' - - 'nightly-8-gpu-deepseek-v32' - - 'nightly-8-gpu-deepseek-v32-mtp' - - 'nightly-8-gpu-kimi-k25' - - 'nightly-8-gpu-qwen3-235b' - - 'nightly-8-gpu-qwen35' - - 'nightly-8-gpu-glm5' - - 'nightly-8-gpu-minimax-m25' - # MI35x jobs - - 'nightly-test-1-gpu-mi35x' - - 'nightly-8-gpu-mi35x-qwen3-235b-mxfp4' - - 'nightly-8-gpu-mi35x-qwen35' - - 'nightly-8-gpu-mi35x-kimi-k25' - - 'nightly-8-gpu-mi35x-glm5' - - 'nightly-8-gpu-mi35x-minimax-m25' - - 'nightly-accuracy-8-gpu-mi35x' - - 'nightly-8-gpu-mi35x-grok1-int4' - - 'nightly-8-gpu-mi35x-grok2' - - 'nightly-8-gpu-mi35x-deepseek-r1-mxfp4' - - 'nightly-accuracy-8-gpu-mi35x-deepseek-v32' - - 'nightly-accuracy-8-gpu-mi35x-deepseek-v32-mtp' - - 'nightly-perf-8-gpu-mi35x-deepseek-v32-basic' - - 'nightly-perf-8-gpu-mi35x-deepseek-v32-mtp' workflow_call: inputs: ref: @@ -98,7 +62,7 @@ jobs: # ============================================== MI30x Unit Tests ============================================== # 1-GPU Unit Tests - LoRA, debug utils, scheduler, etc. 
(MI30x only) nightly-test-1-gpu-unit: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-test-1-gpu-unit') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-test-1-gpu-unit,')) runs-on: linux-mi325-1gpu-sglang steps: - name: Checkout code @@ -128,7 +92,7 @@ jobs: # ============================================== MI30x Accuracy Tests ============================================== # 2-GPU Accuracy Tests - GSM8K eval (MI30x only) nightly-accuracy-2-gpu: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-accuracy-2-gpu') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-accuracy-2-gpu,')) runs-on: linux-mi325-2gpu-sglang steps: - name: Checkout code @@ -157,7 +121,7 @@ jobs: # 2-GPU VLM Accuracy Tests - Vision-Language Models MMMU evaluation nightly-accuracy-2-gpu-vlm: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-accuracy-2-gpu-vlm') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-accuracy-2-gpu-vlm,')) runs-on: linux-mi325-2gpu-sglang steps: - name: Checkout code @@ -187,7 +151,7 @@ jobs: # 2-GPU Text Models Performance Tests nightly-perf-2-gpu-text: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 
'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-perf-2-gpu-text') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-perf-2-gpu-text,')) runs-on: linux-mi325-2gpu-sglang steps: - name: Checkout code @@ -218,7 +182,7 @@ jobs: # 2-GPU VLM Performance Tests nightly-perf-2-gpu-vlm: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-perf-2-gpu-vlm') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-perf-2-gpu-vlm,')) runs-on: linux-mi325-2gpu-sglang steps: - name: Checkout code @@ -249,7 +213,7 @@ jobs: # 8-GPU Accuracy Tests - GPT-OSS, Grok1-FP8 (accuracy only) nightly-accuracy-8-gpu: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-accuracy-8-gpu') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-accuracy-8-gpu,')) runs-on: linux-mi325-8gpu-sglang steps: - name: Checkout code @@ -289,7 +253,7 @@ jobs: # ============================================== MI30x Combined Accuracy + Performance Tests ============================================== # 8-GPU Grok1-INT4 (Accuracy + Performance combined) nightly-8-gpu-grok1-int4: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 
'nightly-8-gpu-grok1-int4') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-8-gpu-grok1-int4,')) runs-on: linux-mi325-8gpu-sglang steps: - name: Checkout code @@ -332,7 +296,7 @@ jobs: # 8-GPU Grok2 (Accuracy + Performance combined) nightly-8-gpu-grok2: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-grok2') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-8-gpu-grok2,')) runs-on: linux-mi325-8gpu-sglang steps: - name: Checkout code @@ -375,7 +339,7 @@ jobs: # 8-GPU DeepSeek-V3.1 (Accuracy + Performance combined) nightly-8-gpu-deepseek-v31: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-deepseek-v31') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-8-gpu-deepseek-v31,')) runs-on: linux-mi325-8gpu-sglang steps: - name: Checkout code @@ -418,7 +382,7 @@ jobs: # 8-GPU DeepSeek-V3.2 (Basic Accuracy + Perf) nightly-8-gpu-deepseek-v32: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-deepseek-v32') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), 
',nightly-8-gpu-deepseek-v32,')) runs-on: linux-mi325-8gpu-sglang steps: - name: Checkout code @@ -459,7 +423,7 @@ jobs: # 8-GPU DeepSeek-V3.2 MTP (MTP Accuracy + Perf) nightly-8-gpu-deepseek-v32-mtp: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-deepseek-v32-mtp') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-8-gpu-deepseek-v32-mtp,')) runs-on: linux-mi325-8gpu-sglang steps: - name: Checkout code @@ -498,9 +462,39 @@ jobs: echo "$(> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} + # 8-GPU DeepSeek-V3 KV FP8 (Basic + MTP with --kv-cache-dtype fp8_e4m3) + nightly-8-gpu-deepseek-v3-kv-fp8: + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-8-gpu-deepseek-v3-kv-fp8,')) + runs-on: linux-mi325-8gpu-sglang + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ inputs.ref || github.ref }} + + - name: Setup docker + run: | + touch github_summary.md + bash scripts/ci/amd/amd_ci_start_container.sh + env: + GITHUB_WORKSPACE: ${{ github.workspace }} + + - name: Install dependencies + run: bash scripts/ci/amd/amd_ci_install_dependency.sh + + - name: DeepSeek-V3 KV FP8 Test (8-GPU Basic + MTP) + timeout-minutes: 120 + run: | + > github_summary.md # Clear summary file + bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ + -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ + python3 run_suite.py --hw amd --suite nightly-amd-8-gpu-deepseek-v3-kv-fp8 --nightly --timeout-per-file 3600 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? 
+ echo "$(> $GITHUB_STEP_SUMMARY || true + exit ${TEST_EXIT_CODE:-0} + # 8-GPU Kimi-K2.5 (Accuracy) nightly-8-gpu-kimi-k25: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-kimi-k25') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-8-gpu-kimi-k25,')) runs-on: linux-mi325-8gpu-sglang steps: - name: Checkout code @@ -529,7 +523,7 @@ jobs: exit ${TEST_EXIT_CODE:-0} nightly-8-gpu-qwen3-235b: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-qwen3-235b') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-8-gpu-qwen3-235b,')) runs-on: linux-mi325-8gpu-sglang steps: - name: Checkout code @@ -559,7 +553,7 @@ jobs: # 8-GPU Qwen 3.5 (Accuracy) nightly-8-gpu-qwen35: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-qwen35') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-8-gpu-qwen35,')) runs-on: linux-mi325-8gpu-sglang steps: - name: Checkout code @@ -590,7 +584,7 @@ jobs: exit ${TEST_EXIT_CODE:-0} nightly-8-gpu-glm5: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-glm5') + if: 
(github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-8-gpu-glm5,')) runs-on: linux-mi325-8gpu-sglang steps: - name: Checkout code @@ -623,7 +617,7 @@ jobs: # 8-GPU MiniMax-M2.5 (Accuracy) nightly-8-gpu-minimax-m25: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-minimax-m25') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-8-gpu-minimax-m25,')) runs-on: linux-mi325-8gpu-sglang steps: - name: Checkout code @@ -655,7 +649,7 @@ jobs: # ============================================== MI35x Tests ============================================== # MI35x 1-GPU tests - platform-agnostic tests that may work on CDNA4 (gfx950) nightly-test-1-gpu-mi35x: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-test-1-gpu-mi35x') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-test-1-gpu-mi35x,')) runs-on: linux-mi35x-gpu-1 steps: - name: Checkout code @@ -687,7 +681,7 @@ jobs: # MI35x 8-GPU Accuracy Tests - GPT-OSS (accuracy only) nightly-accuracy-8-gpu-mi35x: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-accuracy-8-gpu-mi35x') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' 
|| inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-accuracy-8-gpu-mi35x,')) runs-on: linux-mi35x-gpu-8 steps: - name: Checkout code @@ -719,7 +713,7 @@ jobs: # MI35x 8-GPU Grok1-INT4 (Accuracy + Performance combined) nightly-8-gpu-mi35x-grok1-int4: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-mi35x-grok1-int4') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-8-gpu-mi35x-grok1-int4,')) runs-on: linux-mi35x-gpu-8 steps: - name: Checkout code @@ -765,7 +759,7 @@ jobs: # MI35x 8-GPU Grok2 (Accuracy + Performance combined) nightly-8-gpu-mi35x-grok2: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-mi35x-grok2') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-8-gpu-mi35x-grok2,')) runs-on: linux-mi35x-gpu-8 steps: - name: Checkout code @@ -811,7 +805,7 @@ jobs: # MI35x 8-GPU DeepSeek-R1-MXFP4 (Accuracy + Performance combined) nightly-8-gpu-mi35x-deepseek-r1-mxfp4: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-mi35x-deepseek-r1-mxfp4') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-8-gpu-mi35x-deepseek-r1-mxfp4,')) runs-on: linux-mi35x-gpu-8 steps: - name: 
Checkout code @@ -853,9 +847,97 @@ jobs: echo "$(> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} + # MI35x 8-GPU DeepSeek-R1-MXFP4 KV FP8 (Accuracy + Performance combined) + nightly-8-gpu-mi35x-deepseek-r1-mxfp4-kv-fp8: + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-8-gpu-mi35x-deepseek-r1-mxfp4-kv-fp8,')) + runs-on: linux-mi35x-gpu-8 + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ inputs.ref || github.ref }} + + - name: Setup docker + run: | + touch github_summary.md + bash scripts/ci/amd/amd_ci_start_container.sh + env: + GITHUB_WORKSPACE: ${{ github.workspace }} + + - name: Install dependencies + run: | + bash scripts/ci/amd/amd_ci_install_dependency.sh + # Install tabulate for run_suite.py (missing in MI35x container) + bash scripts/ci/amd/amd_ci_exec.sh pip install tabulate + + - name: Accuracy Test MI35x (8-GPU DeepSeek-R1-MXFP4 KV FP8) + timeout-minutes: 180 + run: | + > github_summary.md # Clear summary file + bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ + -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ + python3 run_suite.py --hw amd --suite nightly-amd-8-gpu-mi35x-deepseek-r1-mxfp4-kv-fp8 --nightly --timeout-per-file 7200 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? + echo "$(> $GITHUB_STEP_SUMMARY || true + exit ${TEST_EXIT_CODE:-0} + + - name: Performance Test MI35x (8-GPU DeepSeek-R1-MXFP4 KV FP8) + timeout-minutes: 300 + continue-on-error: true # Perf test failure doesn't fail the job if accuracy passed + run: | + > github_summary.md # Clear summary file + bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ + -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ + python3 registered/amd/perf/mi35x/test_deepseek_r1_mxfp4_kv_fp8_perf_mi35x.py || TEST_EXIT_CODE=$? 
+ echo "$(> $GITHUB_STEP_SUMMARY || true + exit ${TEST_EXIT_CODE:-0} + + # MI35x 8-GPU DeepSeek-R1-MXFP4 AllReduce Fusion (Accuracy + Performance combined) + nightly-8-gpu-mi35x-deepseek-r1-mxfp4-ar-fusion: + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-8-gpu-mi35x-deepseek-r1-mxfp4-ar-fusion,')) + runs-on: linux-mi35x-gpu-8 + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ inputs.ref || github.ref }} + + - name: Setup docker + run: | + touch github_summary.md + bash scripts/ci/amd/amd_ci_start_container.sh + env: + GITHUB_WORKSPACE: ${{ github.workspace }} + + - name: Install dependencies + run: | + bash scripts/ci/amd/amd_ci_install_dependency.sh + # Install tabulate for run_suite.py (missing in MI35x container) + bash scripts/ci/amd/amd_ci_exec.sh pip install tabulate + + - name: Accuracy Test MI35x (8-GPU DeepSeek-R1-MXFP4 AllReduce Fusion) + timeout-minutes: 180 + run: | + > github_summary.md # Clear summary file + bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ + -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ + python3 run_suite.py --hw amd --suite nightly-amd-8-gpu-mi35x-deepseek-r1-mxfp4-ar-fusion --nightly --timeout-per-file 7200 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? + echo "$(> $GITHUB_STEP_SUMMARY || true + exit ${TEST_EXIT_CODE:-0} + + - name: Performance Test MI35x (8-GPU DeepSeek-R1-MXFP4 AllReduce Fusion) + timeout-minutes: 300 + continue-on-error: true # Perf test failure doesn't fail the job if accuracy passed + run: | + > github_summary.md # Clear summary file + bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ + -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ + python3 registered/amd/perf/mi35x/test_deepseek_r1_mxfp4_ar_fusion_perf_mi35x.py || TEST_EXIT_CODE=$? 
+ echo "$(> $GITHUB_STEP_SUMMARY || true + exit ${TEST_EXIT_CODE:-0} + # MI35x 8-GPU DeepSeek-V3.2 Accuracy Test nightly-accuracy-8-gpu-mi35x-deepseek-v32: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-accuracy-8-gpu-mi35x-deepseek-v32') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-accuracy-8-gpu-mi35x-deepseek-v32,')) runs-on: linux-mi35x-gpu-8 steps: - name: Checkout code @@ -888,7 +970,7 @@ jobs: # MI35x 8-GPU DeepSeek-V3.2 TP+MTP Accuracy Test nightly-accuracy-8-gpu-mi35x-deepseek-v32-mtp: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-accuracy-8-gpu-mi35x-deepseek-v32-mtp') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-accuracy-8-gpu-mi35x-deepseek-v32-mtp,')) runs-on: linux-mi35x-gpu-8 steps: - name: Checkout code @@ -921,7 +1003,7 @@ jobs: # MI35x 8-GPU DeepSeek-V3.2 Performance Test (Basic) nightly-perf-8-gpu-mi35x-deepseek-v32-basic: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-perf-8-gpu-mi35x-deepseek-v32-basic') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-perf-8-gpu-mi35x-deepseek-v32-basic,')) runs-on: linux-mi35x-gpu-8 steps: - name: Checkout code @@ -954,7 +1036,7 @@ jobs: # MI35x 
8-GPU Kimi-K2.5 (Accuracy) nightly-8-gpu-mi35x-kimi-k25: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-mi35x-kimi-k25') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-8-gpu-mi35x-kimi-k25,')) runs-on: linux-mi35x-gpu-8 steps: - name: Checkout code @@ -987,7 +1069,7 @@ jobs: # MI35x 8-GPU Qwen3-235B-MXFP4 (Accuracy + Performance) nightly-8-gpu-mi35x-qwen3-235b-mxfp4: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-mi35x-qwen3-235b-mxfp4') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-8-gpu-mi35x-qwen3-235b-mxfp4,')) runs-on: linux-mi35x-gpu-8 steps: - name: Checkout code @@ -1020,7 +1102,7 @@ jobs: # MI35x 8-GPU Qwen 3.5 (Accuracy) nightly-8-gpu-mi35x-qwen35: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-mi35x-qwen35') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-8-gpu-mi35x-qwen35,')) runs-on: linux-mi35x-gpu-8 steps: - name: Checkout code @@ -1052,7 +1134,7 @@ jobs: exit ${TEST_EXIT_CODE:-0} nightly-8-gpu-mi35x-glm5: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 
'nightly-8-gpu-mi35x-glm5') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-8-gpu-mi35x-glm5,')) runs-on: linux-mi35x-gpu-8 steps: - name: Checkout code @@ -1087,7 +1169,7 @@ jobs: # MI35x 8-GPU MiniMax-M2.5 (Accuracy) nightly-8-gpu-mi35x-minimax-m25: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-mi35x-minimax-m25') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-8-gpu-mi35x-minimax-m25,')) runs-on: linux-mi35x-gpu-8 steps: - name: Checkout code @@ -1120,7 +1202,7 @@ jobs: # MI35x 8-GPU DeepSeek-V3.2 Performance Test (MTP) nightly-perf-8-gpu-mi35x-deepseek-v32-mtp: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-perf-8-gpu-mi35x-deepseek-v32-mtp') + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || contains(format(',{0},', inputs.job_filter), ',nightly-perf-8-gpu-mi35x-deepseek-v32-mtp,')) runs-on: linux-mi35x-gpu-8 steps: - name: Checkout code @@ -1169,6 +1251,7 @@ jobs: - nightly-8-gpu-deepseek-v31 - nightly-8-gpu-deepseek-v32 - nightly-8-gpu-deepseek-v32-mtp + - nightly-8-gpu-deepseek-v3-kv-fp8 - nightly-8-gpu-kimi-k25 - nightly-8-gpu-qwen3-235b - nightly-8-gpu-qwen35 @@ -1180,6 +1263,8 @@ jobs: - nightly-8-gpu-mi35x-grok1-int4 - nightly-8-gpu-mi35x-grok2 - nightly-8-gpu-mi35x-deepseek-r1-mxfp4 + - nightly-8-gpu-mi35x-deepseek-r1-mxfp4-kv-fp8 + - 
nightly-8-gpu-mi35x-deepseek-r1-mxfp4-ar-fusion - nightly-accuracy-8-gpu-mi35x-deepseek-v32 - nightly-accuracy-8-gpu-mi35x-deepseek-v32-mtp - nightly-8-gpu-mi35x-kimi-k25 diff --git a/.github/workflows/pr-test-amd-rocm720.yml b/.github/workflows/pr-test-amd-rocm720.yml index b566e1d84f5d..c8e848b99120 100644 --- a/.github/workflows/pr-test-amd-rocm720.yml +++ b/.github/workflows/pr-test-amd-rocm720.yml @@ -28,7 +28,7 @@ on: workflow_dispatch: inputs: target_stage: - description: "Specific stage to run (optional, for quick testing)" + description: "Specific stage(s) to run, comma-separated (e.g. 'stage-a-test-1-amd,stage-b-test-small-1-gpu-amd')" required: false type: string default: "" @@ -144,7 +144,7 @@ jobs: if: | always() && ( - (inputs.target_stage == 'sgl-kernel-unit-test-amd') || + (contains(format(',{0},', inputs.target_stage), ',sgl-kernel-unit-test-amd,')) || ( !inputs.target_stage && needs.check-changes.outputs.sgl_kernel == 'true' @@ -190,7 +190,7 @@ jobs: if: | always() && ( - (inputs.target_stage == 'sgl-kernel-unit-test-2-gpu-amd') || + (contains(format(',{0},', inputs.target_stage), ',sgl-kernel-unit-test-2-gpu-amd,')) || ( !inputs.target_stage && needs.check-changes.outputs.sgl_kernel == 'true' @@ -231,7 +231,7 @@ jobs: if: | always() && ( - (inputs.target_stage == 'stage-a-test-1-amd') || + (contains(format(',{0},', inputs.target_stage), ',stage-a-test-1-amd,')) || ( !inputs.target_stage && (!failure() && !cancelled()) && @@ -270,7 +270,7 @@ jobs: if: | always() && ( - (inputs.target_stage == 'jit-kernel-unit-test-amd') || + (contains(format(',{0},', inputs.target_stage), ',jit-kernel-unit-test-amd,')) || ( !inputs.target_stage && needs.check-changes.outputs.jit_kernel == 'true' @@ -308,7 +308,7 @@ jobs: if: | always() && ( - (inputs.target_stage == 'stage-b-test-small-1-gpu-amd') || + (contains(format(',{0},', inputs.target_stage), ',stage-b-test-small-1-gpu-amd,')) || ( !inputs.target_stage && (!failure() && !cancelled()) && @@ -319,7 +319,7 @@ 
jobs: fail-fast: false matrix: runner: [linux-mi325-1gpu-sglang] - part: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12] + part: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13] runs-on: ${{matrix.runner}} steps: - name: Checkout code @@ -340,14 +340,14 @@ jobs: - name: Run test timeout-minutes: 30 run: | - bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-small-1-gpu-amd --auto-partition-id ${{ matrix.part }} --auto-partition-size 13 --timeout-per-file 1800 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} + bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-small-1-gpu-amd --auto-partition-id ${{ matrix.part }} --auto-partition-size 14 --timeout-per-file 1800 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} stage-b-test-small-1-gpu-amd-nondeterministic: needs: [check-changes] if: | always() && ( - (inputs.target_stage == 'stage-b-test-small-1-gpu-amd-nondeterministic') || + (contains(format(',{0},', inputs.target_stage), ',stage-b-test-small-1-gpu-amd-nondeterministic,')) || ( !inputs.target_stage && (!failure() && !cancelled()) && @@ -385,7 +385,7 @@ jobs: if: | always() && ( - (inputs.target_stage == 'stage-b-test-small-1-gpu-amd-mi35x') || + (contains(format(',{0},', inputs.target_stage), ',stage-b-test-small-1-gpu-amd-mi35x,')) || ( !inputs.target_stage && (!failure() && !cancelled()) && @@ -423,7 +423,7 @@ jobs: if: | always() && ( - (inputs.target_stage == 'stage-b-test-large-1-gpu-amd') || + (contains(format(',{0},', inputs.target_stage), ',stage-b-test-large-1-gpu-amd,')) || ( !inputs.target_stage && (!failure() && !cancelled()) && @@ -462,7 +462,7 @@ jobs: if: | always() && ( - (inputs.target_stage == 'stage-b-test-large-2-gpu-amd') || + (contains(format(',{0},', inputs.target_stage), ',stage-b-test-large-2-gpu-amd,')) || ( !inputs.target_stage && (!failure() && !cancelled()) && @@ -501,7 +501,7 @@ jobs: if: | 
always() && ( - (inputs.target_stage == 'multimodal-gen-test-1-gpu-amd') || + (contains(format(',{0},', inputs.target_stage), ',multimodal-gen-test-1-gpu-amd,')) || ( !inputs.target_stage && (!failure() && !cancelled()) && @@ -631,7 +631,7 @@ jobs: if: | always() && ( - (inputs.target_stage == 'multimodal-gen-test-2-gpu-amd') || + (contains(format(',{0},', inputs.target_stage), ',multimodal-gen-test-2-gpu-amd,')) || ( !inputs.target_stage && (!failure() && !cancelled()) && @@ -760,7 +760,7 @@ jobs: if: | always() && ( - (inputs.target_stage == 'stage-c-test-large-8-gpu-amd') || + (contains(format(',{0},', inputs.target_stage), ',stage-c-test-large-8-gpu-amd,')) || ( !inputs.target_stage && (!failure() && !cancelled()) && @@ -807,7 +807,7 @@ jobs: if: | always() && ( - (inputs.target_stage == 'stage-c-test-large-8-gpu-amd-mi35x') || + (contains(format(',{0},', inputs.target_stage), ',stage-c-test-large-8-gpu-amd-mi35x,')) || ( !inputs.target_stage && (!failure() && !cancelled()) && @@ -841,6 +841,118 @@ jobs: run: | bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-c-test-large-8-gpu-amd-mi35x --auto-partition-id ${{ matrix.part }} --auto-partition-size 1 --timeout-per-file 3600 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} + # =============================================== Disaggregation ==================================================== + stage-b-test-large-8-gpu-35x-disaggregation-amd: + needs: [check-changes] + if: | + always() && + ( + (contains(format(',{0},', inputs.target_stage), ',stage-b-test-large-8-gpu-disaggregation-amd,')) || + ( + !inputs.target_stage && + (!failure() && !cancelled()) && + ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true')) + ) + ) + strategy: + fail-fast: false + matrix: + runner: [linux-mi35x-gpu-8.fabric] + + runs-on: ${{matrix.runner}} + + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ 
inputs.pr_head_sha || inputs.ref || github.sha }} + + - name: Ensure VRAM is clear + run: bash scripts/ensure_vram_clear.sh rocm + + - name: Check Host RDMA Environment + id: rdma_detect + run: | + set +e + echo "=== Checking Host RDMA Environment ===" + + echo "" + echo "=== 1. Ionic driver library check ===" + ls -l /usr/lib/x86_64-linux-gnu/libibverbs/libionic* 2>/dev/null || echo "libionic not found in standard path" + + echo "" + echo "=== 2. Infiniband devices ===" + ls -la /dev/infiniband/ 2>/dev/null || echo "/dev/infiniband not found" + ls -la /sys/class/infiniband/ 2>/dev/null || echo "/sys/class/infiniband not found" + + echo "" + echo "=== 3. ibv_devinfo ===" + which ibv_devinfo 2>/dev/null && ibv_devinfo 2>&1 || echo "ibv_devinfo not available" + + echo "" + echo "=== 4. Kernel modules ===" + lsmod 2>/dev/null | grep -E "ib_|rdma|ionic" || echo "No RDMA kernel modules loaded" + + echo "" + echo "=== 5. Detect RDMA Devices for test environment ===" + if [ -d "/sys/class/infiniband" ]; then + RDMA_DEVS=$(ls /sys/class/infiniband | paste -sd "," -) + echo "Detected RDMA Devices: $RDMA_DEVS" + echo "SGLANG_TEST_RDMA_DEVICE=$RDMA_DEVS" >> $GITHUB_ENV + else + echo "No RDMA devices found in /sys/class/infiniband" + echo "SGLANG_TEST_RDMA_DEVICE=" >> $GITHUB_ENV + fi + + echo "" + echo "=== Host RDMA Check Complete ===" + + - name: Start Special Container + run: bash scripts/ci/amd/amd_ci_start_container_disagg.sh --rocm-version rocm720 + env: + GITHUB_WORKSPACE: ${{ github.workspace }} + + - name: Install dependencies + run: bash scripts/ci/amd/amd_ci_install_dependency.sh + + - name: Verify RDMA in Container + run: | + docker exec -u root ci_sglang bash -c ' + echo "=== Container RDMA Verification ===" + echo "Device nodes:" + ls -la /dev/infiniband/ + echo "" + echo "Provider libraries:" + ls /usr/lib/x86_64-linux-gnu/libibverbs/ | grep -E "ionic|mlx" || echo "No Ionic/Mellanox providers" + echo "" + echo "HCA devices:" + HCA_COUNT=$(ibv_devinfo -list 2>&1 
| grep -oE "^[0-9]+ HCAs? found" | grep -oE "^[0-9]+" || echo "0") + ibv_devinfo -list + if [ "$HCA_COUNT" -gt 0 ]; then + echo "" + echo "=== SUCCESS: RDMA setup complete. Found $HCA_COUNT HCA(s) ===" + else + echo "" + echo "=== WARNING: No HCAs detected. RDMA tests may fail ===" + fi + ' + + - name: Run Aiter Op Test (RMSNorm) + timeout-minutes: 10 + run: | + echo "Running pre-check: test_rmsnorm2d.py" + docker exec \ + -e MAX_JOBS=192 \ + ci_sglang \ + python /sgl-workspace/aiter/op_tests/test_rmsnorm2d.py + + - name: Run test_disaggregation + timeout-minutes: 60 + run: | + bash scripts/ci/amd/amd_ci_exec.sh \ + -e SGLANG_TEST_RDMA_DEVICE="${{ env.SGLANG_TEST_RDMA_DEVICE }}" \ + -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-large-8-gpu-35x-disaggregation-amd --timeout-per-file 1800 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} + pr-test-amd-finish: needs: [ @@ -859,6 +971,7 @@ jobs: stage-b-test-small-1-gpu-amd-mi35x, stage-b-test-large-1-gpu-amd, stage-b-test-large-2-gpu-amd, + stage-b-test-large-8-gpu-35x-disaggregation-amd, stage-c-test-large-8-gpu-amd, stage-c-test-large-8-gpu-amd-mi35x, ] diff --git a/.github/workflows/pr-test-amd.yml b/.github/workflows/pr-test-amd.yml index a803a1ed1c45..415042b473e6 100644 --- a/.github/workflows/pr-test-amd.yml +++ b/.github/workflows/pr-test-amd.yml @@ -25,7 +25,7 @@ on: workflow_dispatch: inputs: target_stage: - description: "Specific stage to run (optional, for quick testing)" + description: "Specific stage(s) to run, comma-separated (e.g. 
'stage-a-test-1-amd,stage-b-test-small-1-gpu-amd')" required: false type: string default: "" @@ -141,7 +141,7 @@ jobs: if: | always() && ( - (inputs.target_stage == 'sgl-kernel-unit-test-amd') || + (contains(format(',{0},', inputs.target_stage), ',sgl-kernel-unit-test-amd,')) || ( !inputs.target_stage && needs.check-changes.outputs.sgl_kernel == 'true' @@ -188,7 +188,7 @@ jobs: if: | always() && ( - (inputs.target_stage == 'sgl-kernel-unit-test-2-gpu-amd') || + (contains(format(',{0},', inputs.target_stage), ',sgl-kernel-unit-test-2-gpu-amd,')) || ( !inputs.target_stage && needs.check-changes.outputs.sgl_kernel == 'true' @@ -230,7 +230,7 @@ jobs: if: | always() && ( - (inputs.target_stage == 'stage-a-test-1-amd') || + (contains(format(',{0},', inputs.target_stage), ',stage-a-test-1-amd,')) || ( !inputs.target_stage && (!failure() && !cancelled()) && @@ -270,7 +270,7 @@ jobs: if: | always() && ( - (inputs.target_stage == 'jit-kernel-unit-test-amd') || + (contains(format(',{0},', inputs.target_stage), ',jit-kernel-unit-test-amd,')) || ( !inputs.target_stage && needs.check-changes.outputs.jit_kernel == 'true' @@ -309,7 +309,7 @@ jobs: if: | always() && ( - (inputs.target_stage == 'stage-b-test-small-1-gpu-amd') || + (contains(format(',{0},', inputs.target_stage), ',stage-b-test-small-1-gpu-amd,')) || ( !inputs.target_stage && (!failure() && !cancelled()) && @@ -349,7 +349,7 @@ jobs: if: | always() && ( - (inputs.target_stage == 'stage-b-test-small-1-gpu-amd-nondeterministic') || + (contains(format(',{0},', inputs.target_stage), ',stage-b-test-small-1-gpu-amd-nondeterministic,')) || ( !inputs.target_stage && (!failure() && !cancelled()) && @@ -388,7 +388,7 @@ jobs: if: | always() && ( - (inputs.target_stage == 'stage-b-test-small-1-gpu-amd-mi35x') || + (contains(format(',{0},', inputs.target_stage), ',stage-b-test-small-1-gpu-amd-mi35x,')) || ( !inputs.target_stage && (!failure() && !cancelled()) && @@ -427,7 +427,7 @@ jobs: if: | always() && ( - (inputs.target_stage 
== 'stage-b-test-large-1-gpu-amd') || + (contains(format(',{0},', inputs.target_stage), ',stage-b-test-large-1-gpu-amd,')) || ( !inputs.target_stage && (!failure() && !cancelled()) && @@ -467,7 +467,7 @@ jobs: if: | always() && ( - (inputs.target_stage == 'stage-b-test-large-2-gpu-amd') || + (contains(format(',{0},', inputs.target_stage), ',stage-b-test-large-2-gpu-amd,')) || ( !inputs.target_stage && (!failure() && !cancelled()) && @@ -504,7 +504,15 @@ jobs: multimodal-gen-test-1-gpu-amd: needs: [check-changes] - if: needs.check-changes.outputs.multimodal_gen == 'true' + if: | + always() && + ( + (contains(format(',{0},', inputs.target_stage), ',multimodal-gen-test-1-gpu-amd,')) || + ( + !inputs.target_stage && + needs.check-changes.outputs.multimodal_gen == 'true' + ) + ) strategy: fail-fast: false max-parallel: 1 # Run one at a time to avoid eviction from resource exhaustion during AITER kernel JIT @@ -624,7 +632,15 @@ jobs: multimodal-gen-test-2-gpu-amd: needs: [check-changes] - if: needs.check-changes.outputs.multimodal_gen == 'true' + if: | + always() && + ( + (contains(format(',{0},', inputs.target_stage), ',multimodal-gen-test-2-gpu-amd,')) || + ( + !inputs.target_stage && + needs.check-changes.outputs.multimodal_gen == 'true' + ) + ) strategy: fail-fast: false max-parallel: 1 # Run one at a time to avoid eviction from resource exhaustion during AITER kernel JIT @@ -746,7 +762,7 @@ jobs: if: | always() && ( - (inputs.target_stage == 'stage-c-test-large-8-gpu-amd') || + (contains(format(',{0},', inputs.target_stage), ',stage-c-test-large-8-gpu-amd,')) || ( !inputs.target_stage && (!failure() && !cancelled()) && @@ -794,7 +810,7 @@ jobs: if: | always() && ( - (inputs.target_stage == 'stage-c-test-large-8-gpu-amd-mi35x') || + (contains(format(',{0},', inputs.target_stage), ',stage-c-test-large-8-gpu-amd-mi35x,')) || ( !inputs.target_stage && (!failure() && !cancelled()) && @@ -835,7 +851,7 @@ jobs: if: | always() && ( - (inputs.target_stage == 
'stage-b-test-large-8-gpu-disaggregation-amd') || + (contains(format(',{0},', inputs.target_stage), ',stage-b-test-large-8-gpu-disaggregation-amd,')) || ( !inputs.target_stage && (!failure() && !cancelled()) && diff --git a/scripts/ci/amd/amd_ci_start_container_disagg.sh b/scripts/ci/amd/amd_ci_start_container_disagg.sh index ecf24f652e9f..70de85dff91e 100755 --- a/scripts/ci/amd/amd_ci_start_container_disagg.sh +++ b/scripts/ci/amd/amd_ci_start_container_disagg.sh @@ -32,8 +32,14 @@ while [[ $# -gt 0 ]]; do case $1 in --mi30x-base-tag) MI30X_BASE_TAG="$2"; shift 2;; --mi35x-base-tag) MI35X_BASE_TAG="$2"; shift 2;; + --rocm-version) + ROCM_VERSION="$2" + MI30X_BASE_TAG="${SGLANG_VERSION}-${ROCM_VERSION}-mi30x" + MI35X_BASE_TAG="${SGLANG_VERSION}-${ROCM_VERSION}-mi35x" + echo "Using ROCm version override: ${ROCM_VERSION}" + shift 2;; -h|--help) - echo "Usage: $0 [--mi30x-base-tag TAG] [--mi35x-base-tag TAG]" + echo "Usage: $0 [--mi30x-base-tag TAG] [--mi35x-base-tag TAG] [--rocm-version VERSION]" exit 0 ;; *) echo "Unknown option $1"; exit 1;; @@ -134,12 +140,27 @@ find_latest_image() { fi echo "Error: no ${gpu_arch} image found in the last 7 days for base ${base_tag}" >&2 - echo "Using hard-coded fallback…" >&2 - if [[ "${gpu_arch}" == "mi35x" ]]; then - echo "rocm/sgl-dev:v0.5.5-rocm700-mi35x-20251110" - else - echo "rocm/sgl-dev:v0.5.5-rocm700-mi30x-20251110" - fi + echo "Using hard-coded fallback for ${ROCM_VERSION}…" >&2 + case "${ROCM_VERSION}" in + rocm720) + if [[ "${gpu_arch}" == "mi35x" ]]; then + echo "rocm/sgl-dev:v0.5.8.post1-rocm720-mi35x-20260211-preview" + else + echo "rocm/sgl-dev:v0.5.8.post1-rocm720-mi30x-20260211-preview" + fi + ;; + rocm700) + if [[ "${gpu_arch}" == "mi35x" ]]; then + echo "rocm/sgl-dev:v0.5.8.post1-rocm700-mi35x-20260211" + else + echo "rocm/sgl-dev:v0.5.8.post1-rocm700-mi30x-20260211" + fi + ;; + *) + echo "Error: no hard-coded fallback available for ${ROCM_VERSION}" >&2 + return 1 + ;; + esac } # Pull and run the latest image 
diff --git a/test/registered/amd/accuracy/mi35x/test_deepseek_r1_mxfp4_ar_fusion_eval_mi35x.py b/test/registered/amd/accuracy/mi35x/test_deepseek_r1_mxfp4_ar_fusion_eval_mi35x.py new file mode 100644 index 000000000000..1636d27cf27e --- /dev/null +++ b/test/registered/amd/accuracy/mi35x/test_deepseek_r1_mxfp4_ar_fusion_eval_mi35x.py @@ -0,0 +1,280 @@ +"""MI35x DeepSeek-R1-MXFP4 GSM8K Completion Evaluation Test with AIter AllReduce Fusion (8-GPU) + +Tests DeepSeek-R1-MXFP4 quantized model with --enable-aiter-allreduce-fusion +using few-shot completion benchmark on MI35x. + +Registry: nightly-amd-8-gpu-mi35x-deepseek-r1-mxfp4-ar-fusion suite +""" + +import ast +import os + +# Set HF cache for MI35x +os.environ.setdefault("HF_HOME", "/data2/models/huggingface") +os.environ.setdefault("HF_HUB_CACHE", "/data2/models/huggingface/hub") + +import re +import time +import unittest +from dataclasses import dataclass +from typing import List, Optional, Tuple + +import numpy as np + +from sglang.srt.utils import kill_process_tree +from sglang.test.ci.ci_register import register_amd_ci +from sglang.test.test_utils import ( + DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + DEFAULT_URL_FOR_TEST, + is_in_ci, + popen_launch_server, + write_github_step_summary, +) +from sglang.utils import download_and_cache_file, read_jsonl + +# Register for AMD CI - MI35x DeepSeek-R1-MXFP4 AllReduce Fusion accuracy test (~60 min) +register_amd_ci( + est_time=3600, + suite="nightly-amd-8-gpu-mi35x-deepseek-r1-mxfp4-ar-fusion", + nightly=True, +) + +INVALID = -9999999 + +# Model path configuration for MI35x DeepSeek-R1-MXFP4 +# Priority: 1) env var, 2) local path, 3) HuggingFace model ID +DEEPSEEK_R1_MXFP4_LOCAL_PATH = "/data2/models/amd-DeepSeek-R1-MXFP4-Preview" +DEEPSEEK_R1_MXFP4_HF_MODEL_ID = "amd/DeepSeek-R1-MXFP4-Preview" + + +def get_model_path() -> str: + """Get effective model path: env var > local path > HF model ID.""" + env_path = os.environ.get("DEEPSEEK_R1_MXFP4_MODEL_PATH") + if env_path: + 
return env_path + if os.path.exists(DEEPSEEK_R1_MXFP4_LOCAL_PATH): + return DEEPSEEK_R1_MXFP4_LOCAL_PATH + return DEEPSEEK_R1_MXFP4_HF_MODEL_ID + + +@dataclass +class ModelConfig: + """Configuration for a model to test.""" + + model_path: str + tp_size: int = 8 + accuracy_threshold: float = 0.50 + other_args: Optional[List[str]] = None + env_vars: Optional[dict] = None + timeout: Optional[int] = None + variant: Optional[str] = None + + def __post_init__(self): + if self.other_args is None: + self.other_args = [] + if self.env_vars is None: + self.env_vars = {} + + def get_display_name(self) -> str: + if self.variant: + return f"{self.model_path} ({self.variant})" + return self.model_path + + +def get_mxfp4_models() -> List[ModelConfig]: + """Get DeepSeek-R1-MXFP4 model configurations for MI35x with AllReduce Fusion.""" + model_path = get_model_path() + return [ + ModelConfig( + model_path=model_path, + tp_size=8, + accuracy_threshold=0.93, + timeout=3600, + variant="ar-fusion", + other_args=[ + "--attention-backend", + "aiter", + "--chunked-prefill-size", + "131072", + "--disable-radix-cache", + "--mem-fraction-static", + "0.85", + "--trust-remote-code", + "--enable-aiter-allreduce-fusion", + ], + env_vars={"SGLANG_USE_AITER": "1"}, + ), + ] + + +def get_one_example(lines, i, include_answer): + """Format a single GSM8K example.""" + ret = "Question: " + lines[i]["question"] + "\nAnswer:" + if include_answer: + ret += " " + lines[i]["answer"] + return ret + + +def get_few_shot_examples(lines, k): + """Get k few-shot examples for prompting.""" + ret = "" + for i in range(k): + ret += get_one_example(lines, i, True) + "\n\n" + return ret + + +def get_answer_value(answer_str): + """Extract numerical answer from response.""" + answer_str = answer_str.replace(",", "") + numbers = re.findall(r"\d+", answer_str) + if len(numbers) < 1: + return INVALID + try: + return ast.literal_eval(numbers[-1]) + except SyntaxError: + return INVALID + + +def run_gsm8k_benchmark( + 
base_url: str, + num_questions: int = 200, + num_shots: int = 5, + parallel: int = 64, +) -> Tuple[float, float, float]: + """Run GSM8K few-shot completion benchmark.""" + import sglang as sgl + from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint + + url = "https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/test.jsonl" + data_path = download_and_cache_file(url) + lines = list(read_jsonl(data_path)) + + few_shot_examples = get_few_shot_examples(lines, num_shots) + + questions = [] + labels = [] + for i in range(len(lines[:num_questions])): + questions.append(get_one_example(lines, i, False)) + labels.append(get_answer_value(lines[i]["answer"])) + assert all(l != INVALID for l in labels) + arguments = [{"question": q} for q in questions] + + @sgl.function + def few_shot_gsm8k(s, question): + s += few_shot_examples + question + s += sgl.gen( + "answer", max_tokens=512, stop=["Question", "Assistant:", "<|separator|>"] + ) + + backend = RuntimeEndpoint(base_url) + sgl.set_default_backend(backend) + + tic = time.perf_counter() + states = few_shot_gsm8k.run_batch( + arguments, temperature=0, num_threads=parallel, progress_bar=True + ) + latency = time.perf_counter() - tic + + preds = [get_answer_value(states[i]["answer"]) for i in range(len(states))] + acc = np.mean(np.array(preds) == np.array(labels)) + invalid = np.mean(np.array(preds) == INVALID) + + return float(acc), float(invalid), float(latency) + + +class TestDeepSeekR1MXFP4ArFusionEvalMI35x(unittest.TestCase): + """DeepSeek-R1-MXFP4 GSM8K Evaluation with AllReduce Fusion for AMD MI35x.""" + + @classmethod + def setUpClass(cls): + cls.models = get_mxfp4_models() + cls.base_url = DEFAULT_URL_FOR_TEST + cls.num_questions = int(os.environ.get("GSM8K_NUM_QUESTIONS", "200")) + + def test_deepseek_r1_mxfp4_ar_fusion_accuracy(self): + """Test DeepSeek-R1-MXFP4 models with AllReduce Fusion on GSM8K.""" + # Check if model exists + model_path = get_model_path() + 
is_local_path = model_path.startswith("/") + if is_local_path and not os.path.exists(model_path): + print(f"\n⏭️ SKIPPING: Local model not found at {model_path}") + self.skipTest(f"Local model not found at {model_path}") + return + + if is_local_path: + print(f"📁 Using local model: {model_path}") + else: + print(f"📥 Using HuggingFace model: {model_path}") + + all_results = [] + summary = "### DeepSeek-R1-MXFP4 AllReduce Fusion Models (MI35x)\n\n" + summary += "| Model | Variant | TP | Accuracy | Threshold | Status |\n" + summary += "| ----- | ------- | -- | -------- | --------- | ------ |\n" + + for config in self.models: + display_name = config.get_display_name() + with self.subTest(model=display_name): + print(f"\n{'='*60}") + print(f"Testing: {display_name}") + print(f"{'='*60}") + + env = os.environ.copy() + for key, value in config.env_vars.items(): + env[key] = value + + other_args = list(config.other_args) + other_args.extend(["--tp", str(config.tp_size)]) + timeout = config.timeout or DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH + + try: + process = popen_launch_server( + model=config.model_path, + base_url=self.base_url, + timeout=timeout, + other_args=other_args, + env=env, + ) + + try: + acc, invalid, latency = run_gsm8k_benchmark( + self.base_url, num_questions=self.num_questions + ) + passed = acc >= config.accuracy_threshold + status = "✅ PASS" if passed else "❌ FAIL" + print( + f" accuracy={acc:.3f} threshold={config.accuracy_threshold} {status}" + ) + + all_results.append( + { + "model": display_name, + "accuracy": acc, + "passed": passed, + } + ) + summary += f"| {config.model_path} | {config.variant or 'N/A'} | {config.tp_size} | {acc:.3f} | {config.accuracy_threshold} | {status} |\n" + + finally: + kill_process_tree(process.pid) + + except Exception as e: + summary += f"| {config.model_path} | {config.variant or 'N/A'} | {config.tp_size} | N/A | {config.accuracy_threshold} | ❌ ERROR |\n" + all_results.append( + { + "model": display_name, + "accuracy": None, 
+ "passed": False, + "error": str(e), + } + ) + + if is_in_ci(): + write_github_step_summary(summary) + + failed = [r for r in all_results if not r["passed"]] + if failed: + raise AssertionError(f"Failed models: {[r['model'] for r in failed]}") + + +if __name__ == "__main__": + unittest.main() diff --git a/test/registered/amd/accuracy/mi35x/test_deepseek_r1_mxfp4_kv_fp8_eval_mi35x.py b/test/registered/amd/accuracy/mi35x/test_deepseek_r1_mxfp4_kv_fp8_eval_mi35x.py new file mode 100644 index 000000000000..cb54e77528fa --- /dev/null +++ b/test/registered/amd/accuracy/mi35x/test_deepseek_r1_mxfp4_kv_fp8_eval_mi35x.py @@ -0,0 +1,281 @@ +"""MI35x DeepSeek-R1-MXFP4 GSM8K Completion Evaluation Test with KV Cache FP8 (8-GPU) + +Tests DeepSeek-R1-MXFP4 quantized model with --kv-cache-dtype fp8_e4m3 +using few-shot completion benchmark on MI35x. + +Registry: nightly-amd-8-gpu-mi35x-deepseek-r1-mxfp4-kv-fp8 suite +""" + +import ast +import os + +# Set HF cache for MI35x +os.environ.setdefault("HF_HOME", "/data2/models/huggingface") +os.environ.setdefault("HF_HUB_CACHE", "/data2/models/huggingface/hub") + +import re +import time +import unittest +from dataclasses import dataclass +from typing import List, Optional, Tuple + +import numpy as np + +from sglang.srt.utils import kill_process_tree +from sglang.test.ci.ci_register import register_amd_ci +from sglang.test.test_utils import ( + DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + DEFAULT_URL_FOR_TEST, + is_in_ci, + popen_launch_server, + write_github_step_summary, +) +from sglang.utils import download_and_cache_file, read_jsonl + +# Register for AMD CI - MI35x DeepSeek-R1-MXFP4 KV FP8 accuracy test (~60 min) +register_amd_ci( + est_time=3600, + suite="nightly-amd-8-gpu-mi35x-deepseek-r1-mxfp4-kv-fp8", + nightly=True, +) + +INVALID = -9999999 + +# Model path configuration for MI35x DeepSeek-R1-MXFP4 +# Priority: 1) env var, 2) local path, 3) HuggingFace model ID +DEEPSEEK_R1_MXFP4_LOCAL_PATH = 
"/data2/models/amd-DeepSeek-R1-MXFP4-Preview" +DEEPSEEK_R1_MXFP4_HF_MODEL_ID = "amd/DeepSeek-R1-MXFP4-Preview" + + +def get_model_path() -> str: + """Get effective model path: env var > local path > HF model ID.""" + env_path = os.environ.get("DEEPSEEK_R1_MXFP4_MODEL_PATH") + if env_path: + return env_path + if os.path.exists(DEEPSEEK_R1_MXFP4_LOCAL_PATH): + return DEEPSEEK_R1_MXFP4_LOCAL_PATH + return DEEPSEEK_R1_MXFP4_HF_MODEL_ID + + +@dataclass +class ModelConfig: + """Configuration for a model to test.""" + + model_path: str + tp_size: int = 8 + accuracy_threshold: float = 0.50 + other_args: Optional[List[str]] = None + env_vars: Optional[dict] = None + timeout: Optional[int] = None + variant: Optional[str] = None + + def __post_init__(self): + if self.other_args is None: + self.other_args = [] + if self.env_vars is None: + self.env_vars = {} + + def get_display_name(self) -> str: + if self.variant: + return f"{self.model_path} ({self.variant})" + return self.model_path + + +def get_mxfp4_models() -> List[ModelConfig]: + """Get DeepSeek-R1-MXFP4 model configurations for MI35x with KV cache FP8.""" + model_path = get_model_path() + return [ + ModelConfig( + model_path=model_path, + tp_size=8, + accuracy_threshold=0.93, + timeout=3600, + variant="kv-fp8", + other_args=[ + "--attention-backend", + "aiter", + "--chunked-prefill-size", + "131072", + "--disable-radix-cache", + "--mem-fraction-static", + "0.85", + "--trust-remote-code", + "--kv-cache-dtype", + "fp8_e4m3", + ], + env_vars={"SGLANG_USE_AITER": "1"}, + ), + ] + + +def get_one_example(lines, i, include_answer): + """Format a single GSM8K example.""" + ret = "Question: " + lines[i]["question"] + "\nAnswer:" + if include_answer: + ret += " " + lines[i]["answer"] + return ret + + +def get_few_shot_examples(lines, k): + """Get k few-shot examples for prompting.""" + ret = "" + for i in range(k): + ret += get_one_example(lines, i, True) + "\n\n" + return ret + + +def get_answer_value(answer_str): + """Extract 
numerical answer from response.""" + answer_str = answer_str.replace(",", "") + numbers = re.findall(r"\d+", answer_str) + if len(numbers) < 1: + return INVALID + try: + return ast.literal_eval(numbers[-1]) + except SyntaxError: + return INVALID + + +def run_gsm8k_benchmark( + base_url: str, + num_questions: int = 200, + num_shots: int = 5, + parallel: int = 64, +) -> Tuple[float, float, float]: + """Run GSM8K few-shot completion benchmark.""" + import sglang as sgl + from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint + + url = "https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/test.jsonl" + data_path = download_and_cache_file(url) + lines = list(read_jsonl(data_path)) + + few_shot_examples = get_few_shot_examples(lines, num_shots) + + questions = [] + labels = [] + for i in range(len(lines[:num_questions])): + questions.append(get_one_example(lines, i, False)) + labels.append(get_answer_value(lines[i]["answer"])) + assert all(l != INVALID for l in labels) + arguments = [{"question": q} for q in questions] + + @sgl.function + def few_shot_gsm8k(s, question): + s += few_shot_examples + question + s += sgl.gen( + "answer", max_tokens=512, stop=["Question", "Assistant:", "<|separator|>"] + ) + + backend = RuntimeEndpoint(base_url) + sgl.set_default_backend(backend) + + tic = time.perf_counter() + states = few_shot_gsm8k.run_batch( + arguments, temperature=0, num_threads=parallel, progress_bar=True + ) + latency = time.perf_counter() - tic + + preds = [get_answer_value(states[i]["answer"]) for i in range(len(states))] + acc = np.mean(np.array(preds) == np.array(labels)) + invalid = np.mean(np.array(preds) == INVALID) + + return float(acc), float(invalid), float(latency) + + +class TestDeepSeekR1MXFP4KvFp8EvalMI35x(unittest.TestCase): + """DeepSeek-R1-MXFP4 GSM8K Evaluation with KV Cache FP8 for AMD MI35x.""" + + @classmethod + def setUpClass(cls): + cls.models = get_mxfp4_models() + cls.base_url = 
DEFAULT_URL_FOR_TEST + cls.num_questions = int(os.environ.get("GSM8K_NUM_QUESTIONS", "200")) + + def test_deepseek_r1_mxfp4_kv_fp8_accuracy(self): + """Test DeepSeek-R1-MXFP4 models with KV cache FP8 on GSM8K.""" + # Check if model exists + model_path = get_model_path() + is_local_path = model_path.startswith("/") + if is_local_path and not os.path.exists(model_path): + print(f"\n⏭️ SKIPPING: Local model not found at {model_path}") + self.skipTest(f"Local model not found at {model_path}") + return + + if is_local_path: + print(f"📁 Using local model: {model_path}") + else: + print(f"📥 Using HuggingFace model: {model_path}") + + all_results = [] + summary = "### DeepSeek-R1-MXFP4 KV FP8 Models (MI35x)\n\n" + summary += "| Model | Variant | TP | Accuracy | Threshold | Status |\n" + summary += "| ----- | ------- | -- | -------- | --------- | ------ |\n" + + for config in self.models: + display_name = config.get_display_name() + with self.subTest(model=display_name): + print(f"\n{'='*60}") + print(f"Testing: {display_name}") + print(f"{'='*60}") + + env = os.environ.copy() + for key, value in config.env_vars.items(): + env[key] = value + + other_args = list(config.other_args) + other_args.extend(["--tp", str(config.tp_size)]) + timeout = config.timeout or DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH + + try: + process = popen_launch_server( + model=config.model_path, + base_url=self.base_url, + timeout=timeout, + other_args=other_args, + env=env, + ) + + try: + acc, invalid, latency = run_gsm8k_benchmark( + self.base_url, num_questions=self.num_questions + ) + passed = acc >= config.accuracy_threshold + status = "✅ PASS" if passed else "❌ FAIL" + print( + f" accuracy={acc:.3f} threshold={config.accuracy_threshold} {status}" + ) + + all_results.append( + { + "model": display_name, + "accuracy": acc, + "passed": passed, + } + ) + summary += f"| {config.model_path} | {config.variant or 'N/A'} | {config.tp_size} | {acc:.3f} | {config.accuracy_threshold} | {status} |\n" + + finally: + 
kill_process_tree(process.pid) + + except Exception as e: + summary += f"| {config.model_path} | {config.variant or 'N/A'} | {config.tp_size} | N/A | {config.accuracy_threshold} | ❌ ERROR |\n" + all_results.append( + { + "model": display_name, + "accuracy": None, + "passed": False, + "error": str(e), + } + ) + + if is_in_ci(): + write_github_step_summary(summary) + + failed = [r for r in all_results if not r["passed"]] + if failed: + raise AssertionError(f"Failed models: {[r['model'] for r in failed]}") + + +if __name__ == "__main__": + unittest.main() diff --git a/test/registered/amd/perf/mi35x/test_deepseek_r1_mxfp4_ar_fusion_perf_mi35x.py b/test/registered/amd/perf/mi35x/test_deepseek_r1_mxfp4_ar_fusion_perf_mi35x.py new file mode 100644 index 000000000000..a4104cad5ed2 --- /dev/null +++ b/test/registered/amd/perf/mi35x/test_deepseek_r1_mxfp4_ar_fusion_perf_mi35x.py @@ -0,0 +1,177 @@ +"""MI35x Nightly performance benchmark for DeepSeek-R1-MXFP4 model with AIter AllReduce Fusion. + +This test benchmarks the DeepSeek-R1-MXFP4 quantized model on MI35x with 8 GPUs +using --enable-aiter-allreduce-fusion. + +The model path can be configured via DEEPSEEK_R1_MXFP4_MODEL_PATH environment variable. 
+ +Registry: nightly-perf-8-gpu-mi35x-deepseek-r1-mxfp4-ar-fusion suite + +Example usage: + DEEPSEEK_R1_MXFP4_MODEL_PATH=/data2/models/amd-DeepSeek-R1-MXFP4-Preview python -m pytest test_deepseek_r1_mxfp4_ar_fusion_perf_mi35x.py -v +""" + +import os + +# Set HF cache to /data2/models/ for MI35x so HF models download there +os.environ.setdefault("HF_HOME", "/data2/models/huggingface") +os.environ.setdefault("HF_HUB_CACHE", "/data2/models/huggingface/hub") +import unittest +from typing import List + +from sglang.test.ci.ci_register import register_amd_ci +from sglang.test.nightly_bench_utils import BenchmarkResult +from sglang.test.nightly_utils import NightlyBenchmarkRunner +from sglang.test.test_utils import DEFAULT_URL_FOR_TEST, _parse_int_list_env + +# Register for AMD CI - DeepSeek-R1-MXFP4 AllReduce Fusion benchmark on MI35x (~300 min) +register_amd_ci( + est_time=18000, + suite="nightly-perf-8-gpu-mi35x-deepseek-r1-mxfp4-ar-fusion", + nightly=True, +) + + +def generate_simple_markdown_report(results: List[BenchmarkResult]) -> str: + """Generate a simplified markdown report without traces and cost columns. + + Skips the first result if it's a warmup run (duplicate batch_size). 
+ """ + model_header = results[0].model_path + if results[0].run_name and results[0].run_name != "default": + model_header += f" ({results[0].run_name})" + + gpu_config = os.getenv("GPU_CONFIG", "MI35x") + if gpu_config: + model_header += f" [{gpu_config}]" + + summary = f"### {model_header}\n" + summary += "| batch size | input len | latency (s) | input throughput (tok/s) | output throughput (tok/s) | ITL (ms) |\n" + summary += "| ---------- | --------- | ----------- | ------------------------ | ------------------------- | -------- |\n" + + # Skip first result if it's a warmup (same batch_size as second result) + report_results = ( + results[1:] + if len(results) > 1 and results[0].batch_size == results[1].batch_size + else results + ) + + for result in report_results: + itl = 1 / (result.output_throughput / result.batch_size) * 1000 + summary += f"| {result.batch_size} | {result.input_len} | {result.latency:.2f} | {result.input_throughput:.2f} | {result.output_throughput:.2f} | {itl:.2f} |\n" + + return summary + + +# Model path configuration for MI35x DeepSeek-R1-MXFP4 +# Priority: 1) env var, 2) local path, 3) HuggingFace model ID +DEEPSEEK_R1_MXFP4_LOCAL_PATH = "/data2/models/amd-DeepSeek-R1-MXFP4-Preview" +DEEPSEEK_R1_MXFP4_HF_MODEL_ID = "amd/DeepSeek-R1-MXFP4-Preview" +PROFILE_DIR = "performance_profiles_deepseek_r1_mxfp4_ar_fusion_mi35x" + + +def get_model_path() -> str: + """Get effective model path: env var > local path > HF model ID.""" + # Check env var first + env_path = os.environ.get("DEEPSEEK_R1_MXFP4_MODEL_PATH") + if env_path: + return env_path + # Check local path + if os.path.exists(DEEPSEEK_R1_MXFP4_LOCAL_PATH): + return DEEPSEEK_R1_MXFP4_LOCAL_PATH + # Fall back to HF model ID + return DEEPSEEK_R1_MXFP4_HF_MODEL_ID + + +class TestDeepseekR1MXFP4ArFusionPerfMI35x(unittest.TestCase): + """MI35x Nightly performance benchmark for DeepSeek-R1-MXFP4 with AllReduce Fusion. 
+ + Tests the DeepSeek-R1-MXFP4 quantized model on TP=8 with --enable-aiter-allreduce-fusion. + Uses local path if available, otherwise downloads from HuggingFace. + """ + + @classmethod + def setUpClass(cls): + cls.model = get_model_path() + print(f"Using model path: {cls.model}") + cls.base_url = DEFAULT_URL_FOR_TEST + cls.batch_sizes = [1, 8, 16, 64] + cls.input_lens = tuple(_parse_int_list_env("NIGHTLY_INPUT_LENS", "4096")) + cls.output_lens = tuple(_parse_int_list_env("NIGHTLY_OUTPUT_LENS", "512")) + + cls.variants = [ + { + "name": "ar-fusion", + "other_args": [ + "--trust-remote-code", + "--tp", + "8", + "--chunked-prefill-size", + "131072", + "--disable-radix-cache", + "--mem-fraction-static", + "0.85", + "--enable-aiter-allreduce-fusion", + ], + }, + ] + + cls.runner = NightlyBenchmarkRunner(PROFILE_DIR, cls.__name__, cls.base_url) + cls.runner.setup_profile_directory() + cls.runner.full_report = f"## {cls.__name__}\n" + + def test_bench_one_batch(self): + """Run benchmark across all configured variants.""" + failed_variants = [] + + is_local_path = self.model.startswith("/") + if is_local_path and not os.path.exists(self.model): + print(f"\n⏭️ SKIPPING: Local model not found at {self.model}") + self.runner.full_report += ( + f"\n⏭️ Test skipped: Local model not found at {self.model}\n" + ) + self.runner.write_final_report() + return + + if is_local_path: + print(f"📁 Using local model: {self.model}") + else: + print( + f"📥 Using HuggingFace model: {self.model} (will download if not cached)" + ) + + try: + for variant_config in self.variants: + with self.subTest(variant=variant_config["name"]): + result_tuple = self.runner.run_benchmark_for_model( + model_path=self.model, + batch_sizes=self.batch_sizes, + input_lens=self.input_lens, + output_lens=self.output_lens, + other_args=variant_config["other_args"], + variant=variant_config["name"], + extra_bench_args=["--trust-remote-code"], + enable_profile=False, + ) + results = result_tuple[0] + success = 
result_tuple[1] + + if not success: + failed_variants.append(variant_config["name"]) + + if results: + self.runner.full_report += ( + generate_simple_markdown_report(results) + "\n" + ) + finally: + self.runner.write_final_report() + + if failed_variants: + raise AssertionError( + f"Benchmark failed for {self.model} with the following variants: " + f"{', '.join(failed_variants)}" + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/test/registered/amd/perf/mi35x/test_deepseek_r1_mxfp4_kv_fp8_perf_mi35x.py b/test/registered/amd/perf/mi35x/test_deepseek_r1_mxfp4_kv_fp8_perf_mi35x.py new file mode 100644 index 000000000000..fe77478a2de9 --- /dev/null +++ b/test/registered/amd/perf/mi35x/test_deepseek_r1_mxfp4_kv_fp8_perf_mi35x.py @@ -0,0 +1,178 @@ +"""MI35x Nightly performance benchmark for DeepSeek-R1-MXFP4 model with KV Cache FP8. + +This test benchmarks the DeepSeek-R1-MXFP4 quantized model on MI35x with 8 GPUs +using --kv-cache-dtype fp8_e4m3. + +The model path can be configured via DEEPSEEK_R1_MXFP4_MODEL_PATH environment variable. 
+ +Registry: nightly-perf-8-gpu-mi35x-deepseek-r1-mxfp4-kv-fp8 suite + +Example usage: + DEEPSEEK_R1_MXFP4_MODEL_PATH=/data2/models/amd-DeepSeek-R1-MXFP4-Preview python -m pytest test_deepseek_r1_mxfp4_kv_fp8_perf_mi35x.py -v +""" + +import os + +# Set HF cache to /data2/models/ for MI35x so HF models download there +os.environ.setdefault("HF_HOME", "/data2/models/huggingface") +os.environ.setdefault("HF_HUB_CACHE", "/data2/models/huggingface/hub") +import unittest +from typing import List + +from sglang.test.ci.ci_register import register_amd_ci +from sglang.test.nightly_bench_utils import BenchmarkResult +from sglang.test.nightly_utils import NightlyBenchmarkRunner +from sglang.test.test_utils import DEFAULT_URL_FOR_TEST, _parse_int_list_env + +# Register for AMD CI - DeepSeek-R1-MXFP4 KV FP8 benchmark on MI35x (~300 min) +register_amd_ci( + est_time=18000, + suite="nightly-perf-8-gpu-mi35x-deepseek-r1-mxfp4-kv-fp8", + nightly=True, +) + + +def generate_simple_markdown_report(results: List[BenchmarkResult]) -> str: + """Generate a simplified markdown report without traces and cost columns. + + Skips the first result if it's a warmup run (duplicate batch_size). 
+ """ + model_header = results[0].model_path + if results[0].run_name and results[0].run_name != "default": + model_header += f" ({results[0].run_name})" + + gpu_config = os.getenv("GPU_CONFIG", "MI35x") + if gpu_config: + model_header += f" [{gpu_config}]" + + summary = f"### {model_header}\n" + summary += "| batch size | input len | latency (s) | input throughput (tok/s) | output throughput (tok/s) | ITL (ms) |\n" + summary += "| ---------- | --------- | ----------- | ------------------------ | ------------------------- | -------- |\n" + + # Skip first result if it's a warmup (same batch_size as second result) + report_results = ( + results[1:] + if len(results) > 1 and results[0].batch_size == results[1].batch_size + else results + ) + + for result in report_results: + itl = 1 / (result.output_throughput / result.batch_size) * 1000 + summary += f"| {result.batch_size} | {result.input_len} | {result.latency:.2f} | {result.input_throughput:.2f} | {result.output_throughput:.2f} | {itl:.2f} |\n" + + return summary + + +# Model path configuration for MI35x DeepSeek-R1-MXFP4 +# Priority: 1) env var, 2) local path, 3) HuggingFace model ID +DEEPSEEK_R1_MXFP4_LOCAL_PATH = "/data2/models/amd-DeepSeek-R1-MXFP4-Preview" +DEEPSEEK_R1_MXFP4_HF_MODEL_ID = "amd/DeepSeek-R1-MXFP4-Preview" +PROFILE_DIR = "performance_profiles_deepseek_r1_mxfp4_kv_fp8_mi35x" + + +def get_model_path() -> str: + """Get effective model path: env var > local path > HF model ID.""" + # Check env var first + env_path = os.environ.get("DEEPSEEK_R1_MXFP4_MODEL_PATH") + if env_path: + return env_path + # Check local path + if os.path.exists(DEEPSEEK_R1_MXFP4_LOCAL_PATH): + return DEEPSEEK_R1_MXFP4_LOCAL_PATH + # Fall back to HF model ID + return DEEPSEEK_R1_MXFP4_HF_MODEL_ID + + +class TestDeepseekR1MXFP4KvFp8PerfMI35x(unittest.TestCase): + """MI35x Nightly performance benchmark for DeepSeek-R1-MXFP4 with KV Cache FP8. + + Tests the DeepSeek-R1-MXFP4 quantized model on TP=8 with --kv-cache-dtype fp8_e4m3. 
+ Uses local path if available, otherwise downloads from HuggingFace. + """ + + @classmethod + def setUpClass(cls): + cls.model = get_model_path() + print(f"Using model path: {cls.model}") + cls.base_url = DEFAULT_URL_FOR_TEST + cls.batch_sizes = [1, 8, 16, 64] + cls.input_lens = tuple(_parse_int_list_env("NIGHTLY_INPUT_LENS", "4096")) + cls.output_lens = tuple(_parse_int_list_env("NIGHTLY_OUTPUT_LENS", "512")) + + cls.variants = [ + { + "name": "kv-fp8", + "other_args": [ + "--trust-remote-code", + "--tp", + "8", + "--chunked-prefill-size", + "131072", + "--disable-radix-cache", + "--mem-fraction-static", + "0.85", + "--kv-cache-dtype", + "fp8_e4m3", + ], + }, + ] + + cls.runner = NightlyBenchmarkRunner(PROFILE_DIR, cls.__name__, cls.base_url) + cls.runner.setup_profile_directory() + cls.runner.full_report = f"## {cls.__name__}\n" + + def test_bench_one_batch(self): + """Run benchmark across all configured variants.""" + failed_variants = [] + + is_local_path = self.model.startswith("/") + if is_local_path and not os.path.exists(self.model): + print(f"\n⏭️ SKIPPING: Local model not found at {self.model}") + self.runner.full_report += ( + f"\n⏭️ Test skipped: Local model not found at {self.model}\n" + ) + self.runner.write_final_report() + return + + if is_local_path: + print(f"📁 Using local model: {self.model}") + else: + print( + f"📥 Using HuggingFace model: {self.model} (will download if not cached)" + ) + + try: + for variant_config in self.variants: + with self.subTest(variant=variant_config["name"]): + result_tuple = self.runner.run_benchmark_for_model( + model_path=self.model, + batch_sizes=self.batch_sizes, + input_lens=self.input_lens, + output_lens=self.output_lens, + other_args=variant_config["other_args"], + variant=variant_config["name"], + extra_bench_args=["--trust-remote-code"], + enable_profile=False, + ) + results = result_tuple[0] + success = result_tuple[1] + + if not success: + failed_variants.append(variant_config["name"]) + + if results: + 
self.runner.full_report += ( + generate_simple_markdown_report(results) + "\n" + ) + finally: + self.runner.write_final_report() + + if failed_variants: + raise AssertionError( + f"Benchmark failed for {self.model} with the following variants: " + f"{', '.join(failed_variants)}" + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/test/registered/amd/test_deepseek_v3_basic_kv_fp8.py b/test/registered/amd/test_deepseek_v3_basic_kv_fp8.py new file mode 100644 index 000000000000..601c07cee183 --- /dev/null +++ b/test/registered/amd/test_deepseek_v3_basic_kv_fp8.py @@ -0,0 +1,86 @@ +import unittest +from types import SimpleNamespace + +from sglang.srt.utils import kill_process_tree +from sglang.test.ci.ci_register import register_amd_ci +from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k +from sglang.test.send_one import BenchArgs, send_one_prompt +from sglang.test.test_utils import ( + DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + DEFAULT_URL_FOR_TEST, + CustomTestCase, + is_in_amd_ci, + is_in_ci, + popen_launch_server, + write_github_step_summary, +) + +register_amd_ci( + est_time=1200, suite="nightly-amd-8-gpu-deepseek-v3-kv-fp8", nightly=True +) + +FULL_DEEPSEEK_V3_MODEL_PATH = "deepseek-ai/DeepSeek-V3-0324" + + +class TestDeepseekV3BasicKvFp8(CustomTestCase): + @classmethod + def setUpClass(cls): + cls.model = FULL_DEEPSEEK_V3_MODEL_PATH + cls.base_url = DEFAULT_URL_FOR_TEST + other_args = [ + "--trust-remote-code", + "--tp", + "8", + "--kv-cache-dtype", + "fp8_e4m3", + "--model-loader-extra-config", + '{"enable_multithread_load": true, "num_threads": 64}', + ] + cls.process = popen_launch_server( + cls.model, + cls.base_url, + timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH * 5, + other_args=other_args, + ) + + @classmethod + def tearDownClass(cls): + kill_process_tree(cls.process.pid) + + def test_a_gsm8k( + self, + ): # Append an "a" to make this test run first (alphabetically) to warm up the server + args = SimpleNamespace( + 
num_shots=8, + data_path=None, + num_questions=1400, + parallel=1400, + max_new_tokens=512, + host="http://127.0.0.1", + port=int(self.base_url.split(":")[-1]), + ) + metrics = run_eval_few_shot_gsm8k(args) + print(f"{metrics=}") + + if is_in_ci(): + write_github_step_summary( + f"### test_gsm8k (deepseek-v3 kv-fp8)\n" f'{metrics["accuracy"]=:.3f}\n' + ) + self.assertGreater(metrics["accuracy"], 0.93) + + def test_bs_1_speed(self): + args = BenchArgs(port=int(self.base_url.split(":")[-1]), max_new_tokens=2048) + acc_length, speed = send_one_prompt(args) + + print(f"{speed=:.2f}") + + if is_in_ci(): + write_github_step_summary( + f"### test_bs_1_speed (deepseek-v3 kv-fp8)\n" f"{speed=:.2f} token/s\n" + ) + if is_in_amd_ci(): + self.assertGreater(speed, 40) + + +if __name__ == "__main__": + unittest.main() diff --git a/test/registered/amd/test_deepseek_v3_mtp_kv_fp8.py b/test/registered/amd/test_deepseek_v3_mtp_kv_fp8.py new file mode 100644 index 000000000000..a62eadf7a587 --- /dev/null +++ b/test/registered/amd/test_deepseek_v3_mtp_kv_fp8.py @@ -0,0 +1,116 @@ +import unittest +from types import SimpleNamespace + +import requests + +from sglang.srt.utils import kill_process_tree +from sglang.test.ci.ci_register import register_amd_ci +from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k +from sglang.test.send_one import BenchArgs, send_one_prompt +from sglang.test.test_utils import ( + DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + DEFAULT_URL_FOR_TEST, + CustomTestCase, + is_in_amd_ci, + is_in_ci, + popen_launch_server, + write_github_step_summary, +) + +register_amd_ci( + est_time=1200, suite="nightly-amd-8-gpu-deepseek-v3-kv-fp8", nightly=True +) + +FULL_DEEPSEEK_V3_MODEL_PATH = "deepseek-ai/DeepSeek-V3-0324" + + +class TestDeepseekV3MTPKvFp8(CustomTestCase): + @classmethod + def setUpClass(cls): + cls.model = FULL_DEEPSEEK_V3_MODEL_PATH + cls.base_url = DEFAULT_URL_FOR_TEST + other_args = [ + "--tp", + "8", + "--trust-remote-code", + 
"--kv-cache-dtype", + "fp8_e4m3", + "--speculative-algorithm", + "EAGLE", + "--speculative-num-steps", + "3", + "--speculative-eagle-topk", + "1", + "--speculative-num-draft-tokens", + "4", + "--model-loader-extra-config", + '{"enable_multithread_load": true, "num_threads": 64}', + ] + if not is_in_amd_ci(): + other_args += ["--mem-frac", "0.7"] + cls.process = popen_launch_server( + cls.model, + cls.base_url, + timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH * 5, + other_args=other_args, + ) + + @classmethod + def tearDownClass(cls): + kill_process_tree(cls.process.pid) + + def test_a_gsm8k( + self, + ): # Append an "a" to make this test run first (alphabetically) to warm up the server + requests.get(self.base_url + "/flush_cache") + + args = SimpleNamespace( + num_shots=5, + data_path=None, + num_questions=200, + max_new_tokens=512, + parallel=128, + host="http://127.0.0.1", + port=int(self.base_url.split(":")[-1]), + ) + metrics = run_eval_few_shot_gsm8k(args) + print(f"{metrics=}") + + server_info = requests.get(self.base_url + "/get_server_info") + avg_spec_accept_length = server_info.json()["internal_states"][0][ + "avg_spec_accept_length" + ] + print(f"{avg_spec_accept_length=}") + + if is_in_ci(): + write_github_step_summary( + f"### test_gsm8k (deepseek-v3 mtp kv-fp8)\n" + f'{metrics["accuracy"]=:.3f}\n' + f"{avg_spec_accept_length=:.2f}\n" + ) + self.assertGreater(metrics["accuracy"], 0.93) + if is_in_amd_ci(): + self.assertGreater(avg_spec_accept_length, 2.8) + + def test_bs_1_speed(self): + args = BenchArgs(port=int(self.base_url.split(":")[-1]), max_new_tokens=2048) + acc_length, speed = send_one_prompt(args) + + print(f"{acc_length=:.2f} {speed=:.2f}") + + if is_in_ci(): + write_github_step_summary( + f"### test_bs_1_speed (deepseek-v3 mtp kv-fp8)\n" + f"{acc_length=:.2f}\n" + f"{speed=:.2f} token/s\n" + ) + if is_in_amd_ci(): + self.assertGreater(acc_length, 2.8) + else: + self.assertGreater(acc_length, 2.9) + if is_in_amd_ci(): + self.assertGreater(speed, 
90) + + +if __name__ == "__main__": + unittest.main() diff --git a/test/registered/ops/test_aiter_allreduce_fusion_amd.py b/test/registered/ops/test_aiter_allreduce_fusion_amd.py index 5813ab09348b..3fe3e9b19753 100644 --- a/test/registered/ops/test_aiter_allreduce_fusion_amd.py +++ b/test/registered/ops/test_aiter_allreduce_fusion_amd.py @@ -10,8 +10,7 @@ from sglang.test.ci.ci_register import register_amd_ci -# Dedicated AMD 8-GPU suite for AITER fused allreduce+rmsnorm validation. -register_amd_ci(est_time=240, suite="stage-c-test-aiter-fusion-8-gpu-amd") +register_amd_ci(est_time=240, suite="stage-c-test-large-8-gpu-amd") class TestAiterAllreduceFusionAmd(unittest.TestCase): diff --git a/test/run_suite.py b/test/run_suite.py index 2f45522aa9b0..d4092dd73d41 100644 --- a/test/run_suite.py +++ b/test/run_suite.py @@ -26,7 +26,7 @@ "stage-b-test-large-8-gpu-35x-disaggregation-amd", "stage-b-test-large-1-gpu-amd", "stage-b-test-large-2-gpu-amd", - "stage-c-test-aiter-fusion-8-gpu-amd", + "stage-c-test-large-8-gpu-amd", "stage-c-test-large-8-gpu-amd-mi35x", ], HWBackend.CUDA: [