diff --git a/.github/workflows/nightly-test-amd.yml b/.github/workflows/nightly-test-amd.yml index 3902971c2de8..e4250fbc10e0 100644 --- a/.github/workflows/nightly-test-amd.yml +++ b/.github/workflows/nightly-test-amd.yml @@ -22,12 +22,17 @@ on: - 'nightly-test-8-gpu-gpt-oss' - 'nightly-test-8-gpu-grok' - 'nightly-test-8-gpu-deepseek-r1' - - 'nightly-test-8-gpu-deepseek-v3-dp' - - 'nightly-test-8-gpu-deepseek-v3-tc' - - 'nightly-test-8-gpu-deepseek-v3-mtp' - 'nightly-perf-8-gpu-grok' - 'nightly-perf-8-gpu-deepseek-v3' - 'nightly-perf-8-gpu-deepseek-v31' + # MI35x jobs + - 'nightly-test-2-gpu-mi35x' + - 'nightly-test-2-gpu-vlm-mi35x' + - 'nightly-test-8-gpu-mi35x-gpt-oss' + - 'nightly-test-8-gpu-mi35x-grok' + - 'nightly-test-8-gpu-mi35x-deepseek-r1' + - 'nightly-perf-8-gpu-mi35x-grok' + - 'nightly-perf-8-gpu-mi35x-deepseek-r1-mxfp4' workflow_call: inputs: ref: @@ -68,7 +73,9 @@ jobs: - name: Nightly Test (2-GPU) run: | - bash scripts/ci/amd_ci_exec.sh -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" python3 run_suite.py --suite nightly-amd --timeout-per-file 7200 || TEST_EXIT_CODE=$? + bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test \ + -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ + python3 run_suite.py --hw amd --suite nightly-amd --nightly --timeout-per-file 7200 || TEST_EXIT_CODE=$? echo "$(> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} @@ -95,7 +102,9 @@ jobs: - name: Nightly Test (2-GPU VLM MMMU) timeout-minutes: 180 run: | - bash scripts/ci/amd_ci_exec.sh -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" python3 run_suite.py --suite nightly-amd-vlm --timeout-per-file 7200 || TEST_EXIT_CODE=$? + bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test \ + -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ + python3 run_suite.py --hw amd --suite nightly-amd-vlm --nightly --timeout-per-file 7200 || TEST_EXIT_CODE=$? 
echo "$(> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} @@ -121,7 +130,10 @@ jobs: - name: Nightly Test (8-GPU GPT-OSS) run: | - bash scripts/ci/amd_ci_exec.sh -e AMD_TEST_MODEL_GROUP=gpt-oss -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" python3 run_suite.py --suite nightly-amd-8-gpu --timeout-per-file 7200 || TEST_EXIT_CODE=$? + bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test \ + -e AMD_TEST_MODEL_GROUP=gpt-oss \ + -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ + python3 run_suite.py --hw amd --suite nightly-amd-8-gpu --nightly --timeout-per-file 7200 || TEST_EXIT_CODE=$? echo "$(> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} @@ -147,11 +159,14 @@ jobs: - name: Nightly Test (8-GPU GROK) run: | - bash scripts/ci/amd_ci_exec.sh -e AMD_TEST_MODEL_GROUP=grok -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" python3 run_suite.py --suite nightly-amd-8-gpu --timeout-per-file 7200 || TEST_EXIT_CODE=$? + bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test \ + -e AMD_TEST_MODEL_GROUP=grok \ + -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ + python3 run_suite.py --hw amd --suite nightly-amd-8-gpu --nightly --timeout-per-file 7200 || TEST_EXIT_CODE=$? 
echo "$(> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} - # 8-GPU tests (TP=8) - DeepSeek-R1 (reasoning model) + # 8-GPU tests (TP=8) - DeepSeek-R1 all variants (basic, MTP, DP, TC) nightly-test-8-gpu-deepseek-r1: if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-test-8-gpu-deepseek-r1') runs-on: linux-mi325-gpu-8 @@ -171,15 +186,18 @@ jobs: - name: Install dependencies run: bash scripts/ci/amd_ci_install_dependency.sh - - name: Nightly Test (8-GPU DeepSeek-R1) + - name: Nightly Test (8-GPU DeepSeek-R1 all variants) run: | - bash scripts/ci/amd_ci_exec.sh -e AMD_TEST_MODEL_GROUP=deepseek-r1 -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" python3 run_suite.py --suite nightly-amd-8-gpu --timeout-per-file 7200 || TEST_EXIT_CODE=$? + bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test \ + -e AMD_TEST_MODEL_GROUP=deepseek-r1-all \ + -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ + python3 run_suite.py --hw amd --suite nightly-amd-8-gpu --nightly --timeout-per-file 7200 || TEST_EXIT_CODE=$? 
echo "$(> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} - # 8-GPU tests (TP=8) - DeepSeek-V3 + DP Attention (requires ROCm 7.0+) - nightly-test-8-gpu-deepseek-v3-dp: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-test-8-gpu-deepseek-v3-dp') + # 8-GPU Performance Tests (TP=8) - Grok (Grok-1 + Grok-2) performance benchmarks + nightly-perf-8-gpu-grok: + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-perf-8-gpu-grok') runs-on: linux-mi325-gpu-8 steps: - name: Checkout code @@ -197,15 +215,16 @@ jobs: - name: Install dependencies run: bash scripts/ci/amd_ci_install_dependency.sh - - name: Nightly Test (8-GPU DeepSeek-V3 + DP Attention) + - name: Nightly Perf Test (8-GPU Grok-1 + Grok-2) + timeout-minutes: 60 run: | - bash scripts/ci/amd_ci_exec.sh -e AMD_TEST_MODEL_GROUP=deepseek-v3-dp -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" python3 run_suite.py --suite nightly-amd-8-gpu --timeout-per-file 7200 || TEST_EXIT_CODE=$? + bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test -e RCCL_MSCCL_ENABLE=0 -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" python3 registered/amd/test_grok_perf.py || TEST_EXIT_CODE=$? 
echo "$(> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} - # 8-GPU tests (TP=8) - DeepSeek-V3 + Torch Compile (requires ROCm 7.0+) - nightly-test-8-gpu-deepseek-v3-tc: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-test-8-gpu-deepseek-v3-tc') + # 8-GPU Performance Tests (TP=8) - DeepSeek-V3 performance benchmarks + nightly-perf-8-gpu-deepseek-v3: + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-perf-8-gpu-deepseek-v3') runs-on: linux-mi325-gpu-8 steps: - name: Checkout code @@ -223,15 +242,16 @@ jobs: - name: Install dependencies run: bash scripts/ci/amd_ci_install_dependency.sh - - name: Nightly Test (8-GPU DeepSeek-V3 + Torch Compile) + - name: Nightly Perf Test (8-GPU DeepSeek-V3) + timeout-minutes: 300 run: | - bash scripts/ci/amd_ci_exec.sh -e AMD_TEST_MODEL_GROUP=deepseek-v3-tc -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" python3 run_suite.py --suite nightly-amd-8-gpu --timeout-per-file 7200 || TEST_EXIT_CODE=$? + bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test -e SGLANG_USE_ROCM700A=1 -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" python3 registered/amd/test_deepseek_v3_perf.py || TEST_EXIT_CODE=$? 
echo "$(> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} - # 8-GPU tests (TP=8) - DeepSeek-V3 + MTP/EAGLE (requires ROCm 7.0+) - nightly-test-8-gpu-deepseek-v3-mtp: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-test-8-gpu-deepseek-v3-mtp') + # 8-GPU Performance Tests (TP=8) - DeepSeek-V3.1 performance benchmarks + nightly-perf-8-gpu-deepseek-v31: + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-perf-8-gpu-deepseek-v31') runs-on: linux-mi325-gpu-8 steps: - name: Checkout code @@ -249,16 +269,18 @@ jobs: - name: Install dependencies run: bash scripts/ci/amd_ci_install_dependency.sh - - name: Nightly Test (8-GPU DeepSeek-V3 + MTP) + - name: Nightly Perf Test (8-GPU DeepSeek-V3.1) + timeout-minutes: 300 run: | - bash scripts/ci/amd_ci_exec.sh -e AMD_TEST_MODEL_GROUP=deepseek-v3-mtp -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" python3 run_suite.py --suite nightly-amd-8-gpu --timeout-per-file 7200 || TEST_EXIT_CODE=$? + bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test -e SGLANG_USE_ROCM700A=1 -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" python3 registered/amd/test_deepseek_v31_perf.py || TEST_EXIT_CODE=$? 
echo "$(> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} - # 8-GPU Performance Tests (TP=8) - Grok (Grok-1 + Grok-2) performance benchmarks - nightly-perf-8-gpu-grok: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-perf-8-gpu-grok') - runs-on: linux-mi325-gpu-8 + # ============================================== MI35x Tests ============================================== + # MI35x 2-GPU tests (TP=2) - Reuses nightly-amd suite + nightly-test-2-gpu-mi35x: + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-test-2-gpu-mi35x') + runs-on: linux-mi35x-gpu-2 steps: - name: Checkout code uses: actions/checkout@v4 @@ -273,19 +295,23 @@ jobs: GITHUB_WORKSPACE: ${{ github.workspace }} - name: Install dependencies - run: bash scripts/ci/amd_ci_install_dependency.sh + run: | + bash scripts/ci/amd_ci_install_dependency.sh + # Install tabulate for run_suite.py (missing in MI35x container) + bash scripts/ci/amd_ci_exec.sh pip install tabulate - - name: Nightly Perf Test (8-GPU Grok-1 + Grok-2) - timeout-minutes: 60 + - name: Nightly Test (2-GPU) run: | - bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test -e RCCL_MSCCL_ENABLE=0 -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" python3 registered/amd/test_grok_perf.py || TEST_EXIT_CODE=$? + bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test \ + -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ + python3 run_suite.py --hw amd --suite nightly-amd --nightly --timeout-per-file 7200 || TEST_EXIT_CODE=$? 
echo "$(> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} - # 8-GPU Performance Tests (TP=8) - DeepSeek-V3 performance benchmarks - nightly-perf-8-gpu-deepseek-v3: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-perf-8-gpu-deepseek-v3') - runs-on: linux-mi325-gpu-8 + # MI35x 2-GPU VLM tests - Reuses nightly-amd-vlm suite + nightly-test-2-gpu-vlm-mi35x: + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-test-2-gpu-vlm-mi35x') + runs-on: linux-mi35x-gpu-2 steps: - name: Checkout code uses: actions/checkout@v4 @@ -300,19 +326,24 @@ jobs: GITHUB_WORKSPACE: ${{ github.workspace }} - name: Install dependencies - run: bash scripts/ci/amd_ci_install_dependency.sh + run: | + bash scripts/ci/amd_ci_install_dependency.sh + # Install tabulate for run_suite.py (missing in MI35x container) + bash scripts/ci/amd_ci_exec.sh pip install tabulate - - name: Nightly Perf Test (8-GPU DeepSeek-V3) - timeout-minutes: 300 + - name: Nightly Test (2-GPU VLM MMMU) + timeout-minutes: 180 run: | - bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test -e SGLANG_USE_ROCM700A=1 -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" python3 registered/amd/test_deepseek_v3_perf.py || TEST_EXIT_CODE=$? + bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test \ + -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ + python3 run_suite.py --hw amd --suite nightly-amd-vlm --nightly --timeout-per-file 7200 || TEST_EXIT_CODE=$? 
echo "$(> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} - # 8-GPU Performance Tests (TP=8) - DeepSeek-V3.1 performance benchmarks - nightly-perf-8-gpu-deepseek-v31: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-perf-8-gpu-deepseek-v31') - runs-on: linux-mi325-gpu-8 + # MI35x 8-GPU tests (TP=8) - GPT-OSS models (MI35x uses openai/* paths) + nightly-test-8-gpu-mi35x-gpt-oss: + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-test-8-gpu-mi35x-gpt-oss') + runs-on: linux-mi35x-gpu-8 steps: - name: Checkout code uses: actions/checkout@v4 @@ -327,29 +358,168 @@ jobs: GITHUB_WORKSPACE: ${{ github.workspace }} - name: Install dependencies - run: bash scripts/ci/amd_ci_install_dependency.sh + run: | + bash scripts/ci/amd_ci_install_dependency.sh + # Install tabulate for run_suite.py (missing in MI35x container) + bash scripts/ci/amd_ci_exec.sh pip install tabulate - - name: Nightly Perf Test (8-GPU DeepSeek-V3.1) + - name: Nightly Test MI35x (8-GPU GPT-OSS) + timeout-minutes: 180 + run: | + bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test \ + -e AMD_TEST_MODEL_GROUP=gpt-oss \ + -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ + python3 run_suite.py --hw amd --suite nightly-amd-8-gpu-mi35x --nightly --timeout-per-file 7200 || TEST_EXIT_CODE=$? 
+ echo "$(> $GITHUB_STEP_SUMMARY || true + exit ${TEST_EXIT_CODE:-0} + + # MI35x 8-GPU tests (TP=8) - GROK models + nightly-test-8-gpu-mi35x-grok: + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-test-8-gpu-mi35x-grok') + runs-on: linux-mi35x-gpu-8 + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ inputs.ref || github.ref }} + + - name: Setup docker + run: | + touch github_summary.md + bash scripts/ci/amd_ci_start_container.sh + env: + GITHUB_WORKSPACE: ${{ github.workspace }} + + - name: Install dependencies + run: | + bash scripts/ci/amd_ci_install_dependency.sh + # Install tabulate for run_suite.py (missing in MI35x container) + bash scripts/ci/amd_ci_exec.sh pip install tabulate + + - name: Nightly Test MI35x (8-GPU GROK) + run: | + bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test \ + -e AMD_TEST_MODEL_GROUP=grok \ + -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ + python3 run_suite.py --hw amd --suite nightly-amd-8-gpu --nightly --timeout-per-file 7200 || TEST_EXIT_CODE=$? 
+ echo "$(> $GITHUB_STEP_SUMMARY || true + exit ${TEST_EXIT_CODE:-0} + + # MI35x 8-GPU tests (TP=8) - DeepSeek-R1-0528 basic + MTP only + # Same model as MI300X for consistency; MXFP4 only used for perf tests + # Note: DP/TC variants disabled for MI35x due to initialization timeouts + nightly-test-8-gpu-mi35x-deepseek-r1: + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-test-8-gpu-mi35x-deepseek-r1') + runs-on: linux-mi35x-gpu-8 + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ inputs.ref || github.ref }} + + - name: Setup docker + run: | + touch github_summary.md + bash scripts/ci/amd_ci_start_container.sh + env: + GITHUB_WORKSPACE: ${{ github.workspace }} + + - name: Install dependencies + run: | + bash scripts/ci/amd_ci_install_dependency.sh + # Install tabulate for run_suite.py (missing in MI35x container) + bash scripts/ci/amd_ci_exec.sh pip install tabulate + + - name: Nightly Test MI35x (8-GPU DeepSeek-R1-0528 basic + MTP) + timeout-minutes: 180 + run: | + bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test \ + -e AMD_TEST_MODEL_GROUP=deepseek-r1 \ + -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ + python3 run_suite.py --hw amd --suite nightly-amd-8-gpu-mi35x --nightly --timeout-per-file 7200 || TEST_EXIT_CODE=$? 
+ echo "$(> $GITHUB_STEP_SUMMARY || true + exit ${TEST_EXIT_CODE:-0} + + # MI35x 8-GPU Performance Tests (TP=8) - Grok performance benchmarks + nightly-perf-8-gpu-mi35x-grok: + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-perf-8-gpu-mi35x-grok') + runs-on: linux-mi35x-gpu-8 + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ inputs.ref || github.ref }} + + - name: Setup docker + run: | + touch github_summary.md + bash scripts/ci/amd_ci_start_container.sh + env: + GITHUB_WORKSPACE: ${{ github.workspace }} + + - name: Install dependencies + run: | + bash scripts/ci/amd_ci_install_dependency.sh + # Install tabulate for run_suite.py (missing in MI35x container) + bash scripts/ci/amd_ci_exec.sh pip install tabulate + + - name: Nightly Perf Test MI35x (8-GPU Grok) + timeout-minutes: 60 + run: | + bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test -e RCCL_MSCCL_ENABLE=0 -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" python3 registered/amd/test_grok_perf.py || TEST_EXIT_CODE=$? 
+ echo "$(> $GITHUB_STEP_SUMMARY || true + exit ${TEST_EXIT_CODE:-0} + + # MI35x 8-GPU Performance Tests (TP=8) - DeepSeek-R1-MXFP4 performance benchmarks + nightly-perf-8-gpu-mi35x-deepseek-r1-mxfp4: + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-perf-8-gpu-mi35x-deepseek-r1-mxfp4') + runs-on: linux-mi35x-gpu-8 + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ inputs.ref || github.ref }} + + - name: Setup docker + run: | + touch github_summary.md + bash scripts/ci/amd_ci_start_container.sh + env: + GITHUB_WORKSPACE: ${{ github.workspace }} + + - name: Install dependencies + run: | + bash scripts/ci/amd_ci_install_dependency.sh + # Install tabulate for run_suite.py (missing in MI35x container) + bash scripts/ci/amd_ci_exec.sh pip install tabulate + + - name: Nightly Perf Test MI35x (8-GPU DeepSeek-R1-MXFP4) timeout-minutes: 300 run: | - bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test -e SGLANG_USE_ROCM700A=1 -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" python3 registered/amd/test_deepseek_v31_perf.py || TEST_EXIT_CODE=$? + bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" python3 registered/amd/test_deepseek_r1_mxfp4_perf.py || TEST_EXIT_CODE=$? 
echo "$(> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} check-all-jobs: if: always() && (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request' || github.event_name == 'workflow_dispatch') needs: + # MI325 jobs - nightly-test-2-gpu - nightly-test-2-gpu-vlm - nightly-test-8-gpu-gpt-oss - nightly-test-8-gpu-grok - - nightly-test-8-gpu-deepseek-v3-dp - - nightly-test-8-gpu-deepseek-v3-tc - - nightly-test-8-gpu-deepseek-v3-mtp - nightly-test-8-gpu-deepseek-r1 - nightly-perf-8-gpu-grok - nightly-perf-8-gpu-deepseek-v3 - nightly-perf-8-gpu-deepseek-v31 + # MI35x jobs + - nightly-test-2-gpu-mi35x + - nightly-test-2-gpu-vlm-mi35x + - nightly-test-8-gpu-mi35x-gpt-oss + - nightly-test-8-gpu-mi35x-grok + - nightly-test-8-gpu-mi35x-deepseek-r1 + - nightly-perf-8-gpu-mi35x-grok + - nightly-perf-8-gpu-mi35x-deepseek-r1-mxfp4 runs-on: ubuntu-latest steps: - name: Check if any job failed diff --git a/test/srt/nightly/test_gsm8k_completion_eval_amd.py b/test/registered/amd/nightly/test_gsm8k_completion_eval_amd.py similarity index 91% rename from test/srt/nightly/test_gsm8k_completion_eval_amd.py rename to test/registered/amd/nightly/test_gsm8k_completion_eval_amd.py index bf5a55f853f8..b44eb04efc5f 100644 --- a/test/srt/nightly/test_gsm8k_completion_eval_amd.py +++ b/test/registered/amd/nightly/test_gsm8k_completion_eval_amd.py @@ -1,5 +1,5 @@ """ -AMD GSM8K Completion Evaluation Test +AMD GSM8K Completion Evaluation Test (Migrated from test/srt/nightly/) This test uses the completion-based gsm8k benchmark (few-shot prompting) which works with base models that don't have chat templates. 
@@ -20,6 +20,8 @@ - "deepseek-v3-mtp": DeepSeek-V3 with MTP/EAGLE (nightly-amd-8-gpu-deepseek-v3-mtp) - "deepseek-r1": DeepSeek-R1 reasoning model (nightly-amd-8-gpu-deepseek-r1) - "all": All models + +Registry: nightly-amd-8-gpu suite (8-GPU tests) """ import ast @@ -44,6 +46,7 @@ print("[WARNING] huggingface_hub not available - model cache checking disabled") from sglang.srt.utils import kill_process_tree +from sglang.test.ci.ci_register import register_amd_ci from sglang.test.test_utils import ( DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, DEFAULT_URL_FOR_TEST, @@ -53,6 +56,9 @@ ) from sglang.utils import download_and_cache_file, read_jsonl +# Register for AMD CI - GSM8K completion tests (~120 min) +register_amd_ci(est_time=7200, suite="nightly-amd-8-gpu", nightly=True) + INVALID = -9999999 @@ -67,6 +73,9 @@ class BaseModelConfig: env_vars: Optional[dict] = None tokenizer_path: Optional[str] = None timeout: Optional[int] = None # Custom timeout for server launch (seconds) + variant: Optional[str] = ( + None # Test variant name (e.g., "basic", "MTP", "DP", "TC") + ) def __post_init__(self): if self.other_args is None: @@ -74,6 +83,12 @@ def __post_init__(self): if self.env_vars is None: self.env_vars = {} + def get_display_name(self) -> str: + """Return display name for logs/summary (model + variant if set).""" + if self.variant: + return f"{self.model_path} ({self.variant})" + return self.model_path + # ============================================================================= # MODEL GROUPS - Each group runs on a separate 8-GPU runner @@ -193,84 +208,80 @@ def __post_init__(self): ), ] -# Group 3: DeepSeek-V3 with DP Attention -# Runner: nightly-amd-8-gpu-deepseek-v3-dp -# Note: Uses DP attention (dp-size=8) for better performance, requires ROCm 7.0+ -AMD_DEEPSEEK_V3_DP_MODELS = [ - # DeepSeek-V3-0324 with DP attention +# Note: DeepSeek-V3 accuracy tests removed - V3 only used for perf tests +# See test_deepseek_v3_perf.py and test_deepseek_v31_perf.py for V3 perf 
tests + +# Group 3: DeepSeek-R1 (reasoning model) - Basic + MTP combined +# Runner: nightly-amd-8-gpu-deepseek-r1 +AMD_DEEPSEEK_R1_MODELS = [ + # DeepSeek-R1-0528 basic - reasoning model, ~80GB per GPU BaseModelConfig( - model_path="deepseek-ai/DeepSeek-V3-0324", + model_path="deepseek-ai/DeepSeek-R1-0528", tp_size=8, accuracy_threshold=0.93, timeout=3600, # 1 hour for large model + variant="basic", other_args=[ + "--attention-backend", + "aiter", "--chunked-prefill-size", "131072", - "--dp-size", - "8", - "--enable-dp-attention", + "--disable-radix-cache", "--mem-fraction-static", "0.85", "--trust-remote-code", ], env_vars={ - "SGLANG_USE_ROCM700A": "1", "SGLANG_USE_AITER": "1", }, ), -] - -# Group 3b: DeepSeek-V3 with Torch Compile -# Runner: nightly-amd-8-gpu-deepseek-v3-tc -# Note: Uses torch compile for performance optimization, requires ROCm 7.0+ -AMD_DEEPSEEK_V3_TC_MODELS = [ - # DeepSeek-V3-0324 with torch compile + # DeepSeek-R1-0528 with MTP (EAGLE speculative decoding) BaseModelConfig( - model_path="deepseek-ai/DeepSeek-V3-0324", + model_path="deepseek-ai/DeepSeek-R1-0528", tp_size=8, accuracy_threshold=0.93, - timeout=7200, # 2 hours for compilation + large model + timeout=3600, + variant="MTP", other_args=[ "--chunked-prefill-size", "131072", + "--speculative-algorithm", + "EAGLE", + "--speculative-num-steps", + "3", + "--speculative-eagle-topk", + "1", + "--speculative-num-draft-tokens", + "4", "--mem-fraction-static", - "0.70", # Reduced further for torch compile - "--cuda-graph-max-bs", - "8", # Reduced from 16 to reduce memory - "--enable-torch-compile", - "--disable-cuda-graph", # Disable cuda graph to avoid memory issues + "0.7", "--trust-remote-code", ], env_vars={ - "SGLANG_USE_ROCM700A": "1", "SGLANG_USE_AITER": "1", }, ), ] -# Group 3c: DeepSeek-V3 with MTP (EAGLE speculative decoding) -# Runner: nightly-amd-8-gpu-deepseek-v3-mtp -# Note: Uses MTP for improved throughput, requires ROCm 7.0+ -AMD_DEEPSEEK_V3_MTP_MODELS = [ - # DeepSeek-V3-0324 
with MTP (EAGLE speculative decoding) +# Group 5: DeepSeek-R1 with DP + TC combined +# Runner: nightly-amd-8-gpu-deepseek-r1-dp-tc +# Combines DP attention and Torch Compile tests for DeepSeek-R1 +AMD_DEEPSEEK_R1_DP_TC_MODELS = [ + # DeepSeek-R1-0528 with DP attention BaseModelConfig( - model_path="deepseek-ai/DeepSeek-V3-0324", + model_path="deepseek-ai/DeepSeek-R1-0528", tp_size=8, accuracy_threshold=0.93, - timeout=3600, # 1 hour for large model + timeout=3600, + variant="DP", other_args=[ "--chunked-prefill-size", "131072", - "--speculative-algorithm", - "EAGLE", - "--speculative-num-steps", - "3", - "--speculative-eagle-topk", - "1", - "--speculative-num-draft-tokens", - "4", + "--dp-size", + "8", + "--enable-dp-attention", "--mem-fraction-static", - "0.7", + "0.85", "--trust-remote-code", ], env_vars={ @@ -278,28 +289,26 @@ def __post_init__(self): "SGLANG_USE_AITER": "1", }, ), -] - -# Group 4: DeepSeek-R1 (reasoning model) -# Runner: nightly-amd-8-gpu-deepseek-r1 -AMD_DEEPSEEK_R1_MODELS = [ - # DeepSeek-R1-0528 - reasoning model, ~80GB per GPU + # DeepSeek-R1-0528 with torch compile BaseModelConfig( model_path="deepseek-ai/DeepSeek-R1-0528", tp_size=8, accuracy_threshold=0.93, - timeout=3600, # 1 hour for large model + timeout=7200, # 2 hours for compilation + variant="TC", other_args=[ - "--attention-backend", - "aiter", "--chunked-prefill-size", "131072", - "--disable-radix-cache", "--mem-fraction-static", - "0.85", + "0.70", + "--cuda-graph-max-bs", + "8", + "--enable-torch-compile", + "--disable-cuda-graph", "--trust-remote-code", ], env_vars={ + "SGLANG_USE_ROCM700A": "1", "SGLANG_USE_AITER": "1", }, ), @@ -312,27 +321,28 @@ def get_model_group() -> str: def get_models_for_group(group: str) -> List[BaseModelConfig]: - """Get the list of models for a given group.""" + """Get the list of models for a given group. + + Note: DeepSeek-V3 is only used for perf tests, not accuracy tests. + See test_deepseek_v3_perf.py and test_deepseek_v31_perf.py. 
+ """ if group == "gpt-oss": return AMD_GPT_OSS_MODELS elif group == "grok": return AMD_GROK_MODELS - elif group == "deepseek-v3-dp": - return AMD_DEEPSEEK_V3_DP_MODELS - elif group == "deepseek-v3-tc": - return AMD_DEEPSEEK_V3_TC_MODELS - elif group == "deepseek-v3-mtp": - return AMD_DEEPSEEK_V3_MTP_MODELS elif group == "deepseek-r1": return AMD_DEEPSEEK_R1_MODELS + elif group == "deepseek-r1-dp-tc": + return AMD_DEEPSEEK_R1_DP_TC_MODELS + elif group == "deepseek-r1-all": + # All DeepSeek-R1 variants: basic, MTP, DP, TC + return AMD_DEEPSEEK_R1_MODELS + AMD_DEEPSEEK_R1_DP_TC_MODELS elif group == "all": return ( AMD_GPT_OSS_MODELS + AMD_GROK_MODELS - + AMD_DEEPSEEK_V3_DP_MODELS - + AMD_DEEPSEEK_V3_TC_MODELS - + AMD_DEEPSEEK_V3_MTP_MODELS + AMD_DEEPSEEK_R1_MODELS + + AMD_DEEPSEEK_R1_DP_TC_MODELS ) else: print(f"[WARNING] Unknown model group '{group}', using 'gpt-oss'") @@ -671,9 +681,10 @@ def test_gsm8k_completion_all_models(self): ) for config in self.models: - with self.subTest(model=config.model_path): + display_name = config.get_display_name() + with self.subTest(model=display_name): print(f"\n{'='*60}") - print(f"Testing: {config.model_path} (TP={config.tp_size})") + print(f"Testing: {display_name} (TP={config.tp_size})") print(f"{'='*60}") error_message = None @@ -687,12 +698,12 @@ def test_gsm8k_completion_all_models(self): if not is_available: print(f"\n❌ MODEL NOT AVAILABLE: {status_msg}") - print(f"⏭️ SKIPPING: {config.model_path}") + print(f"⏭️ SKIPPING: {display_name}") status = f"⏭️ SKIP" skipped = True all_results.append( { - "model": config.model_path, + "model": display_name, "tp_size": config.tp_size, "accuracy": None, "threshold": config.accuracy_threshold, @@ -709,7 +720,7 @@ def test_gsm8k_completion_all_models(self): else: try: # Launch server with timing - print(f"\n🚀 Launching server for {config.model_path}...") + print(f"\n🚀 Launching server for {display_name}...") server_start = time.time() process = popen_launch_server_for_base_model( 
self.base_url, config @@ -747,7 +758,7 @@ def test_gsm8k_completion_all_models(self): total_time = time.time() - model_start - print(f"\n📈 Results for {config.model_path}:") + print(f"\n📈 Results for {display_name}:") print( f" Accuracy: {acc:.3f} (threshold: {config.accuracy_threshold})" ) @@ -768,7 +779,7 @@ def test_gsm8k_completion_all_models(self): all_results.append( { - "model": config.model_path, + "model": display_name, "tp_size": config.tp_size, "accuracy": acc, "threshold": config.accuracy_threshold, @@ -790,7 +801,7 @@ def test_gsm8k_completion_all_models(self): status = "❌ ERROR" all_results.append( { - "model": config.model_path, + "model": display_name, "tp_size": config.tp_size, "accuracy": None, "threshold": config.accuracy_threshold, @@ -806,7 +817,7 @@ def test_gsm8k_completion_all_models(self): ) finally: - print(f"\n🛑 Stopping server for {config.model_path}...") + print(f"\n🛑 Stopping server for {display_name}...") kill_process_tree(process.pid) except Exception as e: @@ -816,7 +827,7 @@ def test_gsm8k_completion_all_models(self): status = "❌ ERROR" all_results.append( { - "model": config.model_path, + "model": display_name, "tp_size": config.tp_size, "accuracy": None, "threshold": config.accuracy_threshold, @@ -831,14 +842,14 @@ def test_gsm8k_completion_all_models(self): } ) - # Add to summary with runtime + # Add to summary with runtime (use display name to show variant) acc_str = f"{acc:.3f}" if acc is not None else "N/A" startup_str = ( f"{startup_time:.0f}s" if startup_time is not None else "N/A" ) bench_str = f"{bench_time:.0f}s" if bench_time is not None else "N/A" total_str = f"{total_time:.0f}s" if total_time is not None else "N/A" - summary += f"| {config.model_path} | {config.tp_size} | {acc_str} | {config.accuracy_threshold} | {startup_str} | {bench_str} | {total_str} | {status} |\n" + summary += f"| {display_name} | {config.tp_size} | {acc_str} | {config.accuracy_threshold} | {startup_str} | {bench_str} | {total_str} | {status} 
|\n" # Calculate total test runtime total_test_time = time.time() - total_test_start diff --git a/test/registered/amd/nightly/test_gsm8k_completion_eval_mi35x.py b/test/registered/amd/nightly/test_gsm8k_completion_eval_mi35x.py new file mode 100644 index 000000000000..87a95c023dd7 --- /dev/null +++ b/test/registered/amd/nightly/test_gsm8k_completion_eval_mi35x.py @@ -0,0 +1,726 @@ +""" +MI35x GSM8K Completion Evaluation Test (8-GPU) + +This test uses the completion-based gsm8k benchmark (few-shot prompting) +for MI35x-specific models that differ from MI300X configurations. + +MI35x-specific models: +- GPT-OSS series: Uses openai/gpt-oss-* (not lmsys/gpt-oss-*-bf16) +- DeepSeek-R1-0528: Same model as MI300X (MXFP4 only used for perf tests) + +Model groups are selected via AMD_TEST_MODEL_GROUP environment variable: +- "gpt-oss" (default): GPT-OSS models with MI35x paths +- "deepseek-r1": DeepSeek-R1-0528 basic + MTP (same as MI300X) +- "deepseek-r1-dp-tc": DeepSeek-R1-0528 DP + TC (same as MI300X) +- "deepseek-r1-all": All DeepSeek-R1-0528 variants (basic, MTP, DP, TC) + +Registry: nightly-amd-8-gpu-mi35x suite (8-GPU tests on MI35x) +""" + +import ast +import os + +# Set HF cache to /data2/models/ for MI35x so HF models download there +os.environ.setdefault("HF_HOME", "/data2/models/huggingface") +os.environ.setdefault("HF_HUB_CACHE", "/data2/models/huggingface/hub") +import re +import subprocess +import time +import unittest +from dataclasses import dataclass +from typing import List, Optional, Tuple + +import numpy as np + +# HuggingFace Hub for model cache checking and download progress +try: + from huggingface_hub import HfFileSystem + from huggingface_hub.utils import GatedRepoError, RepositoryNotFoundError + + HF_HUB_AVAILABLE = True +except ImportError: + HF_HUB_AVAILABLE = False + print("[WARNING] huggingface_hub not available - model cache checking disabled") + +from sglang.srt.utils import kill_process_tree +from sglang.test.ci.ci_register import 
register_amd_ci +from sglang.test.test_utils import ( + DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + DEFAULT_URL_FOR_TEST, + is_in_ci, + popen_launch_server, + write_github_step_summary, +) +from sglang.utils import download_and_cache_file, read_jsonl + +# Register for AMD CI - MI35x 8-GPU GSM8K completion tests (~120 min) +register_amd_ci(est_time=7200, suite="nightly-amd-8-gpu-mi35x", nightly=True) + +INVALID = -9999999 + + +@dataclass +class BaseModelConfig: + """Configuration for a base model to test.""" + + model_path: str # HuggingFace model ID (e.g., "amd/DeepSeek-R1-MXFP4-Preview") + tp_size: int = 8 + accuracy_threshold: float = 0.50 + other_args: Optional[List[str]] = None + env_vars: Optional[dict] = None + tokenizer_path: Optional[str] = None + timeout: Optional[int] = None + local_path: Optional[str] = None # Preferred local path (checked first before HF) + variant: Optional[str] = ( + None # Test variant name (e.g., "basic", "MTP", "DP", "TC") + ) + + def __post_init__(self): + if self.other_args is None: + self.other_args = [] + if self.env_vars is None: + self.env_vars = {} + + def get_effective_model_path(self) -> str: + """Return local_path if it exists, otherwise model_path (HF ID).""" + if self.local_path and os.path.exists(self.local_path): + return self.local_path + return self.model_path + + def get_display_name(self) -> str: + """Return display name for logs/summary (model + variant if set).""" + if self.variant: + return f"{self.model_path} ({self.variant})" + return self.model_path + + +# ============================================================================= +# MI35x MODEL GROUPS - Different from MI300X configurations +# ============================================================================= + +# Group 1: GPT-OSS models (MI35x uses openai/* paths, not lmsys/*) +MI35X_GPT_OSS_MODELS = [ + # GPT-OSS-20B - MI35x specific path + BaseModelConfig( + model_path="openai/gpt-oss-20b", + tp_size=8, + accuracy_threshold=0.47, + other_args=[ + 
"--chunked-prefill-size", + "130172", + "--max-running-requests", + "128", + "--mem-fraction-static", + "0.85", + "--attention-backend", + "triton", + "--trust-remote-code", + ], + env_vars={"SGLANG_USE_AITER": "1"}, + ), + # GPT-OSS-120B - MI35x specific path + BaseModelConfig( + model_path="openai/gpt-oss-120b", + tp_size=8, + accuracy_threshold=0.79, + timeout=900, # 15 minutes for 120B model + other_args=[ + "--chunked-prefill-size", + "130172", + "--max-running-requests", + "128", + "--mem-fraction-static", + "0.85", + "--attention-backend", + "triton", + "--trust-remote-code", + ], + env_vars={"SGLANG_USE_AITER": "1"}, + ), +] + +# Group 2: DeepSeek-R1-0528 basic + MTP (same model as MI300X for consistency) +# Runner: nightly-test-8-gpu-mi35x-deepseek-r1 +# Note: MXFP4 variant only used for perf tests (test_deepseek_r1_mxfp4_perf.py) +MI35X_DEEPSEEK_R1_MODELS = [ + # DeepSeek-R1-0528 basic - reasoning model, ~80GB per GPU + BaseModelConfig( + model_path="deepseek-ai/DeepSeek-R1-0528", + tp_size=8, + accuracy_threshold=0.93, + timeout=3600, # 1 hour for large model + variant="basic", + other_args=[ + "--attention-backend", + "aiter", + "--chunked-prefill-size", + "131072", + "--disable-radix-cache", + "--mem-fraction-static", + "0.85", + "--trust-remote-code", + ], + env_vars={ + "SGLANG_USE_AITER": "1", + }, + ), + # DeepSeek-R1-0528 with MTP (EAGLE speculative decoding) + BaseModelConfig( + model_path="deepseek-ai/DeepSeek-R1-0528", + tp_size=8, + accuracy_threshold=0.93, + timeout=3600, + variant="MTP", + other_args=[ + "--chunked-prefill-size", + "131072", + "--speculative-algorithm", + "EAGLE", + "--speculative-num-steps", + "3", + "--speculative-eagle-topk", + "1", + "--speculative-num-draft-tokens", + "4", + "--mem-fraction-static", + "0.7", + "--trust-remote-code", + ], + env_vars={ + "SGLANG_USE_AITER": "1", + }, + ), +] + +# Group 3: DeepSeek-R1-0528 with DP + TC (requires ROCm 7.0+) +# Runner: nightly-test-8-gpu-mi35x-deepseek-r1-dp-tc 
+MI35X_DEEPSEEK_R1_DP_TC_MODELS = [ + # DeepSeek-R1-0528 with DP attention + BaseModelConfig( + model_path="deepseek-ai/DeepSeek-R1-0528", + tp_size=8, + accuracy_threshold=0.93, + timeout=3600, + variant="DP", + other_args=[ + "--chunked-prefill-size", + "131072", + "--dp-size", + "8", + "--enable-dp-attention", + "--mem-fraction-static", + "0.85", + "--trust-remote-code", + ], + env_vars={ + "SGLANG_USE_ROCM700A": "1", + "SGLANG_USE_AITER": "1", + }, + ), + # DeepSeek-R1-0528 with torch compile + BaseModelConfig( + model_path="deepseek-ai/DeepSeek-R1-0528", + tp_size=8, + accuracy_threshold=0.93, + timeout=7200, # 2 hours for compilation + variant="TC", + other_args=[ + "--chunked-prefill-size", + "131072", + "--mem-fraction-static", + "0.70", + "--cuda-graph-max-bs", + "8", + "--enable-torch-compile", + "--disable-cuda-graph", + "--trust-remote-code", + ], + env_vars={ + "SGLANG_USE_ROCM700A": "1", + "SGLANG_USE_AITER": "1", + }, + ), +] + + +def get_model_group() -> str: + """Get the model group to test from environment variable.""" + return os.environ.get("AMD_TEST_MODEL_GROUP", "gpt-oss") + + +def get_models_for_group(group: str) -> List[BaseModelConfig]: + """Get the list of models for a given group. + + Note: DeepSeek-R1-MXFP4 is only used for perf tests, not accuracy tests. + See test_deepseek_r1_mxfp4_perf.py for MXFP4 perf tests. 
+ """ + if group == "gpt-oss": + return MI35X_GPT_OSS_MODELS + elif group == "deepseek-r1": + return MI35X_DEEPSEEK_R1_MODELS + elif group == "deepseek-r1-dp-tc": + return MI35X_DEEPSEEK_R1_DP_TC_MODELS + elif group == "deepseek-r1-all": + # All DeepSeek-R1-0528 variants: basic, MTP, DP, TC + return MI35X_DEEPSEEK_R1_MODELS + MI35X_DEEPSEEK_R1_DP_TC_MODELS + elif group == "all": + return ( + MI35X_GPT_OSS_MODELS + + MI35X_DEEPSEEK_R1_MODELS + + MI35X_DEEPSEEK_R1_DP_TC_MODELS + ) + else: + print(f"[WARNING] Unknown model group '{group}', using 'gpt-oss'") + return MI35X_GPT_OSS_MODELS + + +# ============================================================================= +# MODEL CACHE AND DOWNLOAD UTILITIES +# ============================================================================= + + +def check_local_cache(model_path: str) -> Tuple[bool, str]: + """ + Check if model is cached locally. + + Returns: + Tuple of (is_cached, cache_path_or_message) + """ + # Check common HF cache locations for MI35x + cache_dirs = [ + os.path.expanduser("~/.cache/huggingface/hub"), + "/data2/models/huggingface/hub", + os.environ.get("HF_HUB_CACHE", ""), + ] + cache_dirs = [d for d in cache_dirs if d] # Remove empty + + # Convert model_path to cache directory format (org--model) + cache_name = f"models--{model_path.replace('/', '--')}" + + for cache_dir in cache_dirs: + cache_path = os.path.join(cache_dir, cache_name) + if os.path.exists(cache_path): + # Check if there are snapshots + snapshots_dir = os.path.join(cache_path, "snapshots") + if os.path.exists(snapshots_dir) and os.listdir(snapshots_dir): + return True, cache_path + + return False, f"Not found in: {', '.join(cache_dirs)}" + + +def check_hf_repo_access(model_path: str) -> Tuple[bool, str]: + """ + Check if HuggingFace repository is accessible. 
+ + Returns: + Tuple of (is_accessible, message) + """ + if not HF_HUB_AVAILABLE: + return True, "huggingface_hub not available, skipping access check" + + try: + fs = HfFileSystem() + # Try to list files in the repo + files = fs.ls(model_path, detail=False) + if files: + return True, f"Repository accessible ({len(files)} files)" + else: + return False, "Repository exists but is empty" + except GatedRepoError: + return False, "GATED REPO - requires authentication/approval" + except RepositoryNotFoundError: + return False, "REPO NOT FOUND on HuggingFace" + except Exception as e: + error_msg = str(e) + if "401" in error_msg or "unauthorized" in error_msg.lower(): + return False, f"AUTH ERROR - may need HF_TOKEN: {error_msg[:100]}" + elif "404" in error_msg: + return False, f"NOT FOUND: {error_msg[:100]}" + elif "timeout" in error_msg.lower() or "connection" in error_msg.lower(): + return False, f"NETWORK ERROR: {error_msg[:100]}" + else: + return False, f"ERROR: {error_msg[:100]}" + + +def log_model_status(config: "BaseModelConfig") -> Tuple[bool, str]: + """ + Log detailed model availability status. + + Checks in order: + 1. local_path (if specified) - preferred local path + 2. model_path as local path (if starts with /) + 3. 
model_path as HF model ID - check cache then HF access + + Returns: + Tuple of (is_available, status_message) + """ + model_path = config.model_path + local_path = config.local_path + + print(f"\n📦 Checking model: {model_path}") + if local_path: + print(f" (preferred local: {local_path})") + print("-" * 50) + + # Step 1: Check preferred local_path first (if specified) + if local_path: + if os.path.exists(local_path): + print(f" ✅ LOCAL PATH: Found at {local_path}") + return True, f"Local path exists at {local_path}" + else: + print(f" ⚠️ LOCAL PATH: Not found at {local_path}, trying HF fallback...") + + # Step 2: For absolute paths (starting with /), check if exists + if model_path.startswith("/"): + if os.path.exists(model_path): + print(f" ✅ LOCAL PATH: Found at {model_path}") + return True, f"Local path exists at {model_path}" + else: + print(f" ❌ LOCAL PATH: Not found at {model_path}") + return False, f"Local path not found at {model_path}" + + # Step 3: For HF model IDs, check local cache first + is_cached, cache_msg = check_local_cache(model_path) + if is_cached: + print(f" ✅ LOCAL CACHE: Found at {cache_msg}") + return True, f"Cached locally at {cache_msg}" + else: + print(f" ⚠️ LOCAL CACHE: {cache_msg}") + + # Step 4: Check HF repo access (will download if accessible) + is_accessible, access_msg = check_hf_repo_access(model_path) + if is_accessible: + print(f" ✅ HF ACCESS: {access_msg}") + print( + f" 📥 Model will be downloaded from HuggingFace to {os.environ.get('HF_HOME', '~/.cache/huggingface')}" + ) + return True, f"Will download from HF: {access_msg}" + else: + print(f" ❌ HF ACCESS: {access_msg}") + return False, access_msg + + +# ============================================================================= +# BENCHMARK UTILITIES +# ============================================================================= + + +def get_one_example(lines, i, include_answer): + """Format a single GSM8K example.""" + ret = "Question: " + lines[i]["question"] + 
"\nAnswer:" + if include_answer: + ret += " " + lines[i]["answer"] + return ret + + +def get_few_shot_examples(lines, k): + """Get k few-shot examples for prompting.""" + ret = "" + for i in range(k): + ret += get_one_example(lines, i, True) + "\n\n" + return ret + + +def get_answer_value(answer_str): + """Extract numerical answer from response.""" + answer_str = answer_str.replace(",", "") + numbers = re.findall(r"\d+", answer_str) + if len(numbers) < 1: + return INVALID + try: + return ast.literal_eval(numbers[-1]) + except SyntaxError: + return INVALID + + +def run_gsm8k_benchmark( + base_url: str, + num_questions: int = 200, + num_shots: int = 5, + parallel: int = 64, +) -> Tuple[float, float, float]: + """Run GSM8K few-shot completion benchmark.""" + import sglang as sgl + from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint + + url = "https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/test.jsonl" + data_path = download_and_cache_file(url) + lines = list(read_jsonl(data_path)) + + few_shot_examples = get_few_shot_examples(lines, num_shots) + + questions = [] + labels = [] + for i in range(len(lines[:num_questions])): + questions.append(get_one_example(lines, i, False)) + labels.append(get_answer_value(lines[i]["answer"])) + assert all(l != INVALID for l in labels) + arguments = [{"question": q} for q in questions] + + @sgl.function + def few_shot_gsm8k(s, question): + s += few_shot_examples + question + s += sgl.gen( + "answer", max_tokens=512, stop=["Question", "Assistant:", "<|separator|>"] + ) + + backend = RuntimeEndpoint(base_url) + sgl.set_default_backend(backend) + + tic = time.perf_counter() + states = few_shot_gsm8k.run_batch( + arguments, + temperature=0, + num_threads=parallel, + progress_bar=True, + ) + latency = time.perf_counter() - tic + + preds = [] + for i in range(len(states)): + preds.append(get_answer_value(states[i]["answer"])) + + acc = np.mean(np.array(preds) == np.array(labels)) + 
invalid = np.mean(np.array(preds) == INVALID) + + return float(acc), float(invalid), float(latency) + + +def popen_launch_server_for_base_model( + base_url: str, + config: BaseModelConfig, +) -> "subprocess.Popen": + """Launch server for a base model with appropriate configuration.""" + env = os.environ.copy() + for key, value in config.env_vars.items(): + env[key] = value + print(f"Setting env: {key}={value}") + + other_args = list(config.other_args) + other_args.extend(["--tp", str(config.tp_size)]) + other_args.extend(["--log-level-http", "warning"]) + + if config.tokenizer_path: + other_args.extend(["--tokenizer-path", config.tokenizer_path]) + + timeout = config.timeout if config.timeout else DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH + + # Use effective model path (local if exists, else HF model ID) + effective_model_path = config.get_effective_model_path() + print(f"Using model path: {effective_model_path}") + + process = popen_launch_server( + model=effective_model_path, + base_url=base_url, + timeout=timeout, + other_args=other_args, + env=env, + ) + return process + + +class TestMI35xGsm8kCompletionEval(unittest.TestCase): + """MI35x GSM8K Completion Evaluation Test (8-GPU) + + Tests MI35x-specific base models using few-shot completion benchmark. 
+ """ + + @classmethod + def setUpClass(cls): + cls.model_group = get_model_group() + cls.models = get_models_for_group(cls.model_group) + cls.base_url = DEFAULT_URL_FOR_TEST + cls.num_questions = int(os.environ.get("GSM8K_NUM_QUESTIONS", "200")) + + print(f"\n{'='*60}") + print(f"MI35x GSM8K Completion Evaluation Test (8-GPU)") + print(f"{'='*60}") + print(f"Model group: {cls.model_group}") + print(f"Models to test: {len(cls.models)}") + for m in cls.models: + print(f" - {m.model_path}") + print(f"Questions per model: {cls.num_questions}") + print(f"{'='*60}\n") + + def test_gsm8k_completion_all_models(self): + """Test all configured MI35x models with GSM8K completion benchmark.""" + all_results = [] + total_test_start = time.time() + + summary = f"### MI35x Model Group: {self.model_group}\n\n" + summary += ( + "| Model | TP | Accuracy | Threshold | Startup | Bench | Total | Status |\n" + ) + summary += ( + "| ----- | -- | -------- | --------- | ------- | ----- | ----- | ------ |\n" + ) + + for config in self.models: + display_name = config.get_display_name() + with self.subTest(model=display_name): + print(f"\n{'='*60}") + print(f"Testing: {display_name} (TP={config.tp_size})") + print(f"{'='*60}") + + error_message = None + acc, invalid, latency = None, None, None + startup_time, bench_time, total_time = None, None, None + model_start = time.time() + + # Check model availability with detailed logging + is_available, status_msg = log_model_status(config) + + if not is_available: + print(f"\n❌ MODEL NOT AVAILABLE: {status_msg}") + print(f"⏭️ SKIPPING: {display_name}") + status = "⏭️ SKIP" + all_results.append( + { + "model": display_name, + "tp_size": config.tp_size, + "accuracy": None, + "threshold": config.accuracy_threshold, + "passed": True, + "skipped": True, + "error": status_msg, + } + ) + else: + try: + print(f"\n🚀 Launching server for {display_name}...") + server_start = time.time() + process = popen_launch_server_for_base_model( + self.base_url, config + 
) + startup_time = time.time() - server_start + print(f"⏱️ Server startup: {startup_time:.1f}s") + + try: + print( + f"📊 Running GSM8K benchmark ({self.num_questions} questions)..." + ) + bench_start = time.time() + for attempt in range(3): + try: + acc, invalid, latency = run_gsm8k_benchmark( + self.base_url, + num_questions=self.num_questions, + num_shots=5, + parallel=64, + ) + print( + f" Attempt {attempt + 1}: accuracy={acc:.3f}" + ) + if acc >= config.accuracy_threshold: + break + except Exception as e: + print(f" Attempt {attempt + 1} failed: {e}") + if attempt == 2: + raise + bench_time = time.time() - bench_start + total_time = time.time() - model_start + + passed = acc >= config.accuracy_threshold + status = "✅ PASS" if passed else "❌ FAIL" + + print( + f"\n📈 Results: accuracy={acc:.3f} (threshold: {config.accuracy_threshold})" + ) + print(f"⏱️ Total: {total_time:.1f}s") + + all_results.append( + { + "model": display_name, + "tp_size": config.tp_size, + "accuracy": acc, + "threshold": config.accuracy_threshold, + "startup_time": startup_time, + "bench_time": bench_time, + "total_time": total_time, + "passed": passed, + "skipped": False, + "error": None, + } + ) + + except Exception as e: + error_message = str(e) + total_time = time.time() - model_start + print(f"\n❌ Error: {error_message}") + status = "❌ ERROR" + all_results.append( + { + "model": display_name, + "tp_size": config.tp_size, + "accuracy": None, + "threshold": config.accuracy_threshold, + "passed": False, + "skipped": False, + "error": error_message, + } + ) + + finally: + print(f"\n🛑 Stopping server...") + kill_process_tree(process.pid) + + except Exception as e: + error_message = str(e) + total_time = time.time() - model_start + print(f"\n❌ Error launching server: {error_message}") + status = "❌ ERROR" + all_results.append( + { + "model": display_name, + "tp_size": config.tp_size, + "accuracy": None, + "threshold": config.accuracy_threshold, + "passed": False, + "skipped": False, + 
"error": error_message, + } + ) + + # Add to summary (use display name to show variant) + acc_str = f"{acc:.3f}" if acc is not None else "N/A" + startup_str = ( + f"{startup_time:.0f}s" if startup_time is not None else "N/A" + ) + bench_str = f"{bench_time:.0f}s" if bench_time is not None else "N/A" + total_str = f"{total_time:.0f}s" if total_time is not None else "N/A" + summary += f"| {display_name} | {config.tp_size} | {acc_str} | {config.accuracy_threshold} | {startup_str} | {bench_str} | {total_str} | {status} |\n" + + # Final summary + total_test_time = time.time() - total_test_start + failed_models = [ + r for r in all_results if not r["passed"] and not r.get("skipped", False) + ] + skipped_models = [r for r in all_results if r.get("skipped", False)] + passed_models = [ + r for r in all_results if r["passed"] and not r.get("skipped", False) + ] + + print(f"\n{'='*60}") + print(f"SUMMARY - MI35x Model Group: {self.model_group}") + print(f"{'='*60}") + print(summary) + print( + f"\n📊 Passed: {len(passed_models)} | Failed: {len(failed_models)} | Skipped: {len(skipped_models)}" + ) + print(f"⏱️ Total: {total_test_time:.1f}s ({total_test_time/60:.1f} min)") + + if is_in_ci(): + write_github_step_summary(summary) + + if failed_models: + failure_msg = "\n".join( + [ + f"- {r['model']}: {r.get('error', 'below threshold')}" + for r in failed_models + ] + ) + raise AssertionError(f"The following models failed:\n{failure_msg}") + + +if __name__ == "__main__": + unittest.main() diff --git a/test/srt/nightly/test_gsm8k_eval_amd.py b/test/registered/amd/nightly/test_gsm8k_eval_amd.py similarity index 96% rename from test/srt/nightly/test_gsm8k_eval_amd.py rename to test/registered/amd/nightly/test_gsm8k_eval_amd.py index 5a94a7780b35..5918c6e6e1f6 100644 --- a/test/srt/nightly/test_gsm8k_eval_amd.py +++ b/test/registered/amd/nightly/test_gsm8k_eval_amd.py @@ -1,3 +1,12 @@ +""" +AMD GSM8K Evaluation Test (Migrated from test/srt/nightly/) + +This test evaluates 
instruction-tuned models on the mgsm_en benchmark using chat completions. +Models are tested with various TP configurations on AMD GPUs. + +Registry: nightly-amd suite (2-GPU tests) +""" + import json import os import time @@ -6,6 +15,7 @@ from types import SimpleNamespace from sglang.srt.utils import kill_process_tree +from sglang.test.ci.ci_register import register_amd_ci from sglang.test.run_eval import run_eval from sglang.test.test_utils import ( DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1, @@ -21,6 +31,9 @@ write_results_to_json, ) +# Register for AMD CI - GSM8K evaluation tests (~60 min) +register_amd_ci(est_time=3600, suite="nightly-amd", nightly=True) + MODEL_SCORE_THRESHOLDS = { # Llama 3.1 series "meta-llama/Llama-3.1-8B-Instruct": 0.82, diff --git a/test/srt/nightly/test_vlms_mmmu_eval_amd.py b/test/registered/amd/nightly/test_vlms_mmmu_eval_amd.py similarity index 97% rename from test/srt/nightly/test_vlms_mmmu_eval_amd.py rename to test/registered/amd/nightly/test_vlms_mmmu_eval_amd.py index 1df3b15bc437..d9438e6ca602 100644 --- a/test/srt/nightly/test_vlms_mmmu_eval_amd.py +++ b/test/registered/amd/nightly/test_vlms_mmmu_eval_amd.py @@ -1,5 +1,5 @@ """ -AMD VLM MMMU Evaluation Test +AMD VLM MMMU Evaluation Test (Migrated from test/srt/nightly/) This test evaluates Vision-Language Models (VLMs) on the MMMU benchmark on AMD GPUs. Models are selected based on compatibility with AMD/ROCm platform. @@ -11,6 +11,8 @@ - deepseek-vl2-small Note: Some VLMs from the Nvidia test are excluded due to AMD compatibility issues. 
+ +Registry: nightly-amd-vlm suite (2-GPU VLM tests) """ import os @@ -20,6 +22,7 @@ from types import SimpleNamespace from sglang.srt.utils import kill_process_tree +from sglang.test.ci.ci_register import register_amd_ci from sglang.test.run_eval import run_eval from sglang.test.test_utils import ( DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, @@ -30,6 +33,9 @@ write_results_to_json, ) +# Register for AMD CI - VLM MMMU evaluation tests (~120 min) +register_amd_ci(est_time=7200, suite="nightly-amd-vlm", nightly=True) + # AMD-verified VLM models with conservative thresholds on 100 MMMU samples # Format: (model_path, tp_size, accuracy_threshold, extra_args) AMD_VLM_MODELS = [ diff --git a/test/registered/amd/test_deepseek_r1_mxfp4_perf.py b/test/registered/amd/test_deepseek_r1_mxfp4_perf.py new file mode 100644 index 000000000000..e1a0de107d92 --- /dev/null +++ b/test/registered/amd/test_deepseek_r1_mxfp4_perf.py @@ -0,0 +1,166 @@ +"""Nightly performance benchmark for DeepSeek-R1-MXFP4 model (MI35x). + +This test benchmarks the DeepSeek-R1-MXFP4 quantized model on MI35x with 8 GPUs. + +The model path can be configured via DEEPSEEK_R1_MXFP4_MODEL_PATH environment variable. 
+ +Example usage: + DEEPSEEK_R1_MXFP4_MODEL_PATH=/data2/models/amd-DeepSeek-R1-MXFP4-Preview python -m pytest test_deepseek_r1_mxfp4_perf.py -v +""" + +import os + +# Set HF cache to /data2/models/ for MI35x so HF models download there +os.environ.setdefault("HF_HOME", "/data2/models/huggingface") +os.environ.setdefault("HF_HUB_CACHE", "/data2/models/huggingface/hub") +import unittest +from typing import List + +from sglang.test.ci.ci_register import register_amd_ci +from sglang.test.nightly_bench_utils import BenchmarkResult +from sglang.test.nightly_utils import NightlyBenchmarkRunner +from sglang.test.test_utils import DEFAULT_URL_FOR_TEST, _parse_int_list_env + +# Register for AMD CI - DeepSeek-R1-MXFP4 benchmark (~300 min) +register_amd_ci( + est_time=18000, suite="nightly-perf-8-gpu-deepseek-r1-mxfp4", nightly=True +) + + +def generate_simple_markdown_report(results: List[BenchmarkResult]) -> str: + """Generate a simplified markdown report without traces and cost columns.""" + model_header = results[0].model_path + if results[0].run_name and results[0].run_name != "default": + model_header += f" ({results[0].run_name})" + + gpu_config = os.getenv("GPU_CONFIG", "") + if gpu_config: + model_header += f" [{gpu_config}]" + + summary = f"### {model_header}\n" + summary += "| batch size | input len | latency (s) | input throughput (tok/s) | output throughput (tok/s) | ITL (ms) |\n" + summary += "| ---------- | --------- | ----------- | ------------------------ | ------------------------- | -------- |\n" + + for result in results: + itl = 1 / (result.output_throughput / result.batch_size) * 1000 + summary += f"| {result.batch_size} | {result.input_len} | {result.latency:.2f} | {result.input_throughput:.2f} | {result.output_throughput:.2f} | {itl:.2f} |\n" + + return summary + + +# Model path configuration for MI35x DeepSeek-R1-MXFP4 +# Priority: 1) env var, 2) local path, 3) HuggingFace model ID +DEEPSEEK_R1_MXFP4_LOCAL_PATH = 
"/data2/models/amd-DeepSeek-R1-MXFP4-Preview"
+DEEPSEEK_R1_MXFP4_HF_MODEL_ID = "amd/DeepSeek-R1-MXFP4-Preview"
+PROFILE_DIR = "performance_profiles_deepseek_r1_mxfp4"
+
+
+def get_model_path() -> str:
+    """Get effective model path: env var > local path > HF model ID."""
+    # Check env var first
+    env_path = os.environ.get("DEEPSEEK_R1_MXFP4_MODEL_PATH")
+    if env_path:
+        return env_path
+    # Check local path
+    if os.path.exists(DEEPSEEK_R1_MXFP4_LOCAL_PATH):
+        return DEEPSEEK_R1_MXFP4_LOCAL_PATH
+    # Fall back to HF model ID
+    return DEEPSEEK_R1_MXFP4_HF_MODEL_ID
+
+
+class TestNightlyDeepseekR1MXFP4Performance(unittest.TestCase):
+    """Nightly performance benchmark for DeepSeek-R1-MXFP4 model (MI35x).
+
+    Tests the DeepSeek-R1-MXFP4 quantized model with TP=8 (basic variant only).
+    Uses local path if available, otherwise downloads from HuggingFace.
+    """
+
+    @classmethod
+    def setUpClass(cls):
+        cls.model = get_model_path()
+        print(f"Using model path: {cls.model}")
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.batch_sizes = [1, 1, 8, 16, 64]
+        cls.input_lens = tuple(_parse_int_list_env("NIGHTLY_INPUT_LENS", "4096"))
+        cls.output_lens = tuple(_parse_int_list_env("NIGHTLY_OUTPUT_LENS", "512"))
+
+        # Define variant configurations for DeepSeek-R1-MXFP4 on MI35x
+        # Only run basic variant for perf (DP/TC/MTP covered in accuracy tests)
+        cls.variants = [
+            {
+                "name": "basic",
+                "other_args": [
+                    "--trust-remote-code",
+                    "--tp",
+                    "8",
+                    "--chunked-prefill-size",
+                    "131072",
+                    "--disable-radix-cache",
+                    "--mem-fraction-static",
+                    "0.85",
+                ],
+            },
+        ]
+
+        cls.runner = NightlyBenchmarkRunner(PROFILE_DIR, cls.__name__, cls.base_url)
+        cls.runner.setup_profile_directory()
+        # Override full_report to remove traces help text
+        cls.runner.full_report = f"## {cls.__name__}\n"
+
+    def test_bench_one_batch(self):
+        """Run benchmark across all configured variants."""
+        failed_variants = []
+
+        # For local paths, check if exists. HF model IDs will download automatically.
+ is_local_path = self.model.startswith("/") + if is_local_path and not os.path.exists(self.model): + print(f"\n⏭️ SKIPPING: Local model not found at {self.model}") + self.runner.full_report += ( + f"\n⏭️ Test skipped: Local model not found at {self.model}\n" + ) + self.runner.write_final_report() + return + + # Log model source + if is_local_path: + print(f"📁 Using local model: {self.model}") + else: + print( + f"📥 Using HuggingFace model: {self.model} (will download if not cached)" + ) + + try: + for variant_config in self.variants: + with self.subTest(variant=variant_config["name"]): + result_tuple = self.runner.run_benchmark_for_model( + model_path=self.model, + batch_sizes=self.batch_sizes, + input_lens=self.input_lens, + output_lens=self.output_lens, + other_args=variant_config["other_args"], + variant=variant_config["name"], + extra_bench_args=["--trust-remote-code"], + ) + results = result_tuple[0] + success = result_tuple[1] + + if not success: + failed_variants.append(variant_config["name"]) + + # Use simplified report format without traces + if results: + self.runner.full_report += ( + generate_simple_markdown_report(results) + "\n" + ) + finally: + self.runner.write_final_report() + + if failed_variants: + raise AssertionError( + f"Benchmark failed for {self.model} with the following variants: " + f"{', '.join(failed_variants)}" + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/test/run_suite.py b/test/run_suite.py index e2944781cb84..1a687b0f864e 100644 --- a/test/run_suite.py +++ b/test/run_suite.py @@ -56,7 +56,13 @@ "nightly-perf-text-2-gpu", "nightly-perf-vlm-2-gpu", ], - HWBackend.AMD: ["nightly-amd", "nightly-amd-8-gpu"], + HWBackend.AMD: [ + "nightly-amd", + "nightly-amd-8-gpu", + "nightly-amd-vlm", + # MI35x 8-GPU suite (different model configs) + "nightly-amd-8-gpu-mi35x", + ], HWBackend.CPU: [], HWBackend.NPU: [ "nightly-1-npu-a3", diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py index 958c939b95d1..edd33cd18c85 
100644 --- a/test/srt/run_suite.py +++ b/test/srt/run_suite.py @@ -115,17 +115,9 @@ "per-commit-8-gpu-amd-mi35x": [ TestFile("test_deepseek_r1_mxfp4_8gpu.py", 3600), ], - "nightly-amd": [ - TestFile("nightly/test_gsm8k_eval_amd.py"), - ], - # AMD VLM tests using MMMU benchmark (2-GPU runner) - "nightly-amd-vlm": [ - TestFile("nightly/test_vlms_mmmu_eval_amd.py"), - ], - # AMD 8-GPU tests for base models using gsm8k completion benchmark - "nightly-amd-8-gpu": [ - TestFile("nightly/test_gsm8k_completion_eval_amd.py"), - ], + # NOTE: AMD nightly suites (nightly-amd, nightly-amd-vlm, nightly-amd-8-gpu) + # have been migrated to test/registered/amd/nightly/ and are now managed + # by test/run_suite.py using the registry system. } # Add Intel Xeon tests