diff --git a/.github/workflows/nightly-test-amd-aiter-latest.yml b/.github/workflows/nightly-test-amd-aiter-latest.yml
new file mode 100644
index 000000000000..0a985e4a8eaf
--- /dev/null
+++ b/.github/workflows/nightly-test-amd-aiter-latest.yml
@@ -0,0 +1,1354 @@
+name: Test AITER Latest (AMD Nightly)
+
+# Manually dispatched workflow: resolves an AITER git ref to a commit, then
+# runs the selected AMD nightly test job(s) with that commit written into the
+# ROCm Dockerfiles.
+on:
+  workflow_dispatch:
+    inputs:
+      aiter_ref:
+        description: 'AITER git ref to test (branch, tag, or SHA). Default: main (latest commit)'
+        required: false
+        type: string
+        default: 'main'
+      job_filter:
+        description: 'Select which job to run (leave empty or "all" to run all jobs)'
+        required: false
+        type: choice
+        default: 'all'
+        options:
+          - 'all'
+          # MI30x Unit Tests
+          - 'nightly-test-1-gpu-unit'
+          # MI30x Accuracy Tests (GSM8K / MMMU)
+          - 'nightly-accuracy-2-gpu'
+          - 'nightly-accuracy-2-gpu-vlm'
+          - 'nightly-perf-2-gpu-text'
+          - 'nightly-perf-2-gpu-vlm'
+          - 'nightly-accuracy-8-gpu'
+          # MI30x Accuracy + Performance Tests (combined)
+          - 'nightly-8-gpu-grok1-int4'
+          - 'nightly-8-gpu-grok2'
+          - 'nightly-8-gpu-deepseek-v31'
+          - 'nightly-8-gpu-deepseek-v32'
+          - 'nightly-8-gpu-deepseek-v32-mtp'
+          - 'nightly-8-gpu-kimi-k25'
+          - 'nightly-8-gpu-qwen3-235b'
+          - 'nightly-8-gpu-glm5'
+          # MI35x jobs
+          - 'nightly-test-1-gpu-mi35x'
+          - 'nightly-8-gpu-mi35x-qwen3-235b-mxfp4'
+          - 'nightly-8-gpu-mi35x-kimi-k25'
+          - 'nightly-8-gpu-mi35x-glm5'
+          - 'nightly-accuracy-8-gpu-mi35x'
+          - 'nightly-8-gpu-mi35x-grok1-int4'
+          - 'nightly-8-gpu-mi35x-grok2'
+          - 'nightly-8-gpu-mi35x-deepseek-r1-mxfp4'
+          - 'nightly-accuracy-8-gpu-mi35x-deepseek-v32'
+          - 'nightly-accuracy-8-gpu-mi35x-deepseek-v32-mtp'
+          - 'nightly-perf-8-gpu-mi35x-deepseek-v32-basic'
+          - 'nightly-perf-8-gpu-mi35x-deepseek-v32-mtp'
+
+# NOTE(review): the group key contains github.run_id, which differs for every
+# run, so no two runs ever share a group and cancel-in-progress has nothing to
+# cancel. Confirm whether per-run isolation is the intent.
+concurrency:
+  group: nightly-test-amd-aiter-latest-${{ github.run_id }}
+  cancel-in-progress: true
+
+jobs:
+  # ============================================== Resolve AITER Ref ==============================================
+  # Turns the requested ref into a commit SHA and exposes it (full SHA,
+  # 12-character prefix, and the requested ref) as job outputs.
+  resolve-aiter:
+    runs-on: ubuntu-latest
+    outputs:
+      aiter_sha: ${{ steps.resolve.outputs.sha }}
+      aiter_short_sha: ${{ steps.resolve.outputs.short_sha }}
+      aiter_ref: ${{ steps.resolve.outputs.ref }}
+    steps:
+      - name: Resolve AITER commit
+        id: resolve
+        env:
+          # Hand the user-supplied ref to the script through the environment
+          # instead of expanding it into the script text, so shell
+          # metacharacters in the input are never executed.
+          AITER_REF_INPUT: ${{ inputs.aiter_ref || 'main' }}
+        run: |
+          REF="${AITER_REF_INPUT}"
+
+          # Downstream jobs interpolate the ref and SHA into shell strings and
+          # sed expressions, so only accept conventional ref characters.
+          REF_PATTERN='^[A-Za-z0-9_./+@-]+$'
+          if ! [[ "${REF}" =~ ${REF_PATTERN} ]]; then
+            echo "::error::AITER ref contains unsupported characters"
+            exit 1
+          fi
+          echo "Resolving AITER ref: ${REF}"
+
+          # Try the ref as given, then as a branch, then as a tag.
+          SHA=$(git ls-remote https://github.com/ROCm/aiter.git "${REF}" | head -1 | cut -f1)
+          if [ -z "$SHA" ]; then
+            SHA=$(git ls-remote https://github.com/ROCm/aiter.git "refs/heads/${REF}" | head -1 | cut -f1)
+          fi
+          if [ -z "$SHA" ]; then
+            SHA=$(git ls-remote https://github.com/ROCm/aiter.git "refs/tags/${REF}" | head -1 | cut -f1)
+          fi
+          # Nothing matched on the remote: treat the input itself as a commit SHA.
+          if [ -z "$SHA" ]; then
+            SHA="${REF}"
+          fi
+
+          # Fail early on a mistyped branch/tag instead of propagating a bogus
+          # "SHA" to every test job.
+          SHA_PATTERN='^[0-9a-fA-F]{7,64}$'
+          if ! [[ "${SHA}" =~ ${SHA_PATTERN} ]]; then
+            echo "::error::Could not resolve AITER ref '${REF}' to a commit SHA"
+            exit 1
+          fi
+
+          SHORT_SHA="${SHA:0:12}"
+          echo "sha=${SHA}" >> $GITHUB_OUTPUT
+          echo "short_sha=${SHORT_SHA}" >> $GITHUB_OUTPUT
+          echo "ref=${REF}" >> $GITHUB_OUTPUT
+
+          echo "### AITER Ref Resolution" >> $GITHUB_STEP_SUMMARY
+          echo "- **Requested ref:** \`${REF}\`" >> $GITHUB_STEP_SUMMARY
+          echo "- **Resolved SHA:** \`${SHA}\`" >> $GITHUB_STEP_SUMMARY
+          echo "- **AITER repo:** https://github.com/ROCm/aiter/commit/${SHA}" >> $GITHUB_STEP_SUMMARY
+
+  # ============================================== MI30x Unit Tests ==============================================
+  # Runs suite `nightly-amd-1-gpu` on a `linux-mi325-gpu-1` runner.
+  nightly-test-1-gpu-unit:
+    needs: resolve-aiter
+    if: inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-test-1-gpu-unit'
+    runs-on: linux-mi325-gpu-1
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Override AITER commit in Dockerfile
+        run: |
+          AITER_SHA="${{ needs.resolve-aiter.outputs.aiter_sha }}"
+          sed -i "s/AITER_COMMIT=\"[^\"]*\"/AITER_COMMIT=\"${AITER_SHA}\"/g" docker/rocm.Dockerfile
+          sed -i "s/AITER_COMMIT=\"[^\"]*\"/AITER_COMMIT=\"${AITER_SHA}\"/g" docker/rocm720.Dockerfile
+          echo "Overriding AITER_COMMIT to: ${AITER_SHA}"
+
+      - name: Setup docker
+        run: |
+          touch github_summary.md
+          bash scripts/ci/amd/amd_ci_start_container.sh
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+
+      - name: Install dependencies
+        run: bash scripts/ci/amd/amd_ci_install_dependency.sh
+
+      - name: Verify AITER version
+        run: |
+          echo "### AITER Version Info" >> $GITHUB_STEP_SUMMARY
+          echo "- **Target:** \`${{ needs.resolve-aiter.outputs.aiter_ref }}\` (\`${{ needs.resolve-aiter.outputs.aiter_short_sha }}\`)" >> $GITHUB_STEP_SUMMARY
+          echo '```' >> $GITHUB_STEP_SUMMARY
+          docker exec ci_sglang pip show amd-aiter 2>/dev/null >> $GITHUB_STEP_SUMMARY || echo "amd-aiter not installed" >> $GITHUB_STEP_SUMMARY
+          echo '```' >> $GITHUB_STEP_SUMMARY
+          docker exec ci_sglang bash -c "cd /sgl-workspace/aiter 2>/dev/null && git log -1 --format='%H %s'" >> $GITHUB_STEP_SUMMARY 2>/dev/null || true
+
+      - name: Nightly Unit Test (1-GPU)
+        timeout-minutes: 90
+        run: |
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-amd-1-gpu --nightly --timeout-per-file 900 --continue-on-error || TEST_EXIT_CODE=$?
+          # Append the suite's summary file to the job summary (best effort),
+          # then report the suite's own exit status.
+          echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+  # ============================================== MI30x Accuracy Tests ==============================================
+  # Runs suite `nightly-amd` on a `linux-mi325-gpu-2` runner.
+  nightly-accuracy-2-gpu:
+    needs: resolve-aiter
+    if: inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-accuracy-2-gpu'
+    runs-on: linux-mi325-gpu-2
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Override AITER commit in Dockerfile
+        run: |
+          AITER_SHA="${{ needs.resolve-aiter.outputs.aiter_sha }}"
+          sed -i "s/AITER_COMMIT=\"[^\"]*\"/AITER_COMMIT=\"${AITER_SHA}\"/g" docker/rocm.Dockerfile
+          sed -i "s/AITER_COMMIT=\"[^\"]*\"/AITER_COMMIT=\"${AITER_SHA}\"/g" docker/rocm720.Dockerfile
+
+      - name: Setup docker
+        run: |
+          touch github_summary.md
+          bash scripts/ci/amd/amd_ci_start_container.sh
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+
+      - name: Install dependencies
+        run: bash scripts/ci/amd/amd_ci_install_dependency.sh
+
+      - name: Verify AITER version
+        run: |
+          echo "### AITER Version Info" >> $GITHUB_STEP_SUMMARY
+          echo "- **Target:** \`${{ needs.resolve-aiter.outputs.aiter_ref }}\` (\`${{ needs.resolve-aiter.outputs.aiter_short_sha }}\`)" >> $GITHUB_STEP_SUMMARY
+          docker exec ci_sglang bash -c "cd /sgl-workspace/aiter 2>/dev/null && git log -1 --oneline" >> $GITHUB_STEP_SUMMARY 2>/dev/null || true
+
+      - name: Nightly Test (2-GPU)
+        run: |
+          # Start from an empty summary file for this step.
+          > github_summary.md
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-amd --nightly --timeout-per-file 7200 || TEST_EXIT_CODE=$?
+          # Append the suite's summary file to the job summary (best effort),
+          # then report the suite's own exit status.
+          echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+  # Runs suite `nightly-amd-accuracy-2-gpu-vlm` on a `linux-mi325-gpu-2` runner.
+  nightly-accuracy-2-gpu-vlm:
+    needs: resolve-aiter
+    if: inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-accuracy-2-gpu-vlm'
+    runs-on: linux-mi325-gpu-2
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Override AITER commit in Dockerfile
+        run: |
+          AITER_SHA="${{ needs.resolve-aiter.outputs.aiter_sha }}"
+          sed -i "s/AITER_COMMIT=\"[^\"]*\"/AITER_COMMIT=\"${AITER_SHA}\"/g" docker/rocm.Dockerfile
+          sed -i "s/AITER_COMMIT=\"[^\"]*\"/AITER_COMMIT=\"${AITER_SHA}\"/g" docker/rocm720.Dockerfile
+
+      - name: Setup docker
+        run: |
+          touch github_summary.md
+          bash scripts/ci/amd/amd_ci_start_container.sh
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+
+      - name: Install dependencies
+        run: bash scripts/ci/amd/amd_ci_install_dependency.sh
+
+      - name: Verify AITER version
+        run: |
+          echo "### AITER Version Info" >> $GITHUB_STEP_SUMMARY
+          echo "- **Target:** \`${{ needs.resolve-aiter.outputs.aiter_ref }}\` (\`${{ needs.resolve-aiter.outputs.aiter_short_sha }}\`)" >> $GITHUB_STEP_SUMMARY
+          docker exec ci_sglang bash -c "cd /sgl-workspace/aiter 2>/dev/null && git log -1 --oneline" >> $GITHUB_STEP_SUMMARY 2>/dev/null || true
+
+      - name: Nightly Accuracy Test (2-GPU VLM MMMU)
+        timeout-minutes: 180
+        run: |
+          # Start from an empty summary file for this step.
+          > github_summary.md
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-amd-accuracy-2-gpu-vlm --nightly --timeout-per-file 7200 || TEST_EXIT_CODE=$?
+          # Append the suite's summary file to the job summary (best effort),
+          # then report the suite's own exit status.
+          echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+  # Runs suite `nightly-amd-perf-text-2-gpu` on a `linux-mi325-gpu-2` runner
+  # with SGLANG_USE_AITER=1.
+  nightly-perf-2-gpu-text:
+    needs: resolve-aiter
+    if: inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-perf-2-gpu-text'
+    runs-on: linux-mi325-gpu-2
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Override AITER commit in Dockerfile
+        run: |
+          AITER_SHA="${{ needs.resolve-aiter.outputs.aiter_sha }}"
+          sed -i "s/AITER_COMMIT=\"[^\"]*\"/AITER_COMMIT=\"${AITER_SHA}\"/g" docker/rocm.Dockerfile
+          sed -i "s/AITER_COMMIT=\"[^\"]*\"/AITER_COMMIT=\"${AITER_SHA}\"/g" docker/rocm720.Dockerfile
+
+      - name: Setup docker
+        run: |
+          touch github_summary.md
+          bash scripts/ci/amd/amd_ci_start_container.sh
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+
+      - name: Install dependencies
+        run: bash scripts/ci/amd/amd_ci_install_dependency.sh
+
+      - name: Verify AITER version
+        run: |
+          echo "### AITER Version Info" >> $GITHUB_STEP_SUMMARY
+          echo "- **Target:** \`${{ needs.resolve-aiter.outputs.aiter_ref }}\` (\`${{ needs.resolve-aiter.outputs.aiter_short_sha }}\`)" >> $GITHUB_STEP_SUMMARY
+          docker exec ci_sglang bash -c "cd /sgl-workspace/aiter 2>/dev/null && git log -1 --oneline" >> $GITHUB_STEP_SUMMARY 2>/dev/null || true
+
+      - name: Performance Test (2-GPU Text Models)
+        timeout-minutes: 120
+        run: |
+          # Start from an empty summary file for this step.
+          > github_summary.md
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e SGLANG_USE_AITER=1 \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-amd-perf-text-2-gpu --nightly --timeout-per-file 3600 || TEST_EXIT_CODE=$?
+          # Append the suite's summary file to the job summary (best effort),
+          # then report the suite's own exit status.
+          echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+  # Runs suite `nightly-amd-perf-vlm-2-gpu` on a `linux-mi325-gpu-2` runner
+  # with SGLANG_USE_AITER=1.
+  nightly-perf-2-gpu-vlm:
+    needs: resolve-aiter
+    if: inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-perf-2-gpu-vlm'
+    runs-on: linux-mi325-gpu-2
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Override AITER commit in Dockerfile
+        run: |
+          AITER_SHA="${{ needs.resolve-aiter.outputs.aiter_sha }}"
+          sed -i "s/AITER_COMMIT=\"[^\"]*\"/AITER_COMMIT=\"${AITER_SHA}\"/g" docker/rocm.Dockerfile
+          sed -i "s/AITER_COMMIT=\"[^\"]*\"/AITER_COMMIT=\"${AITER_SHA}\"/g" docker/rocm720.Dockerfile
+
+      - name: Setup docker
+        run: |
+          touch github_summary.md
+          bash scripts/ci/amd/amd_ci_start_container.sh
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+
+      - name: Install dependencies
+        run: bash scripts/ci/amd/amd_ci_install_dependency.sh
+
+      - name: Verify AITER version
+        run: |
+          echo "### AITER Version Info" >> $GITHUB_STEP_SUMMARY
+          echo "- **Target:** \`${{ needs.resolve-aiter.outputs.aiter_ref }}\` (\`${{ needs.resolve-aiter.outputs.aiter_short_sha }}\`)" >> $GITHUB_STEP_SUMMARY
+          docker exec ci_sglang bash -c "cd /sgl-workspace/aiter 2>/dev/null && git log -1 --oneline" >> $GITHUB_STEP_SUMMARY 2>/dev/null || true
+
+      - name: Performance Test (2-GPU VLM Models)
+        timeout-minutes: 180
+        run: |
+          # Start from an empty summary file for this step.
+          > github_summary.md
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e SGLANG_USE_AITER=1 \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-amd-perf-vlm-2-gpu --nightly --timeout-per-file 7200 || TEST_EXIT_CODE=$?
+          # Append the suite's summary file to the job summary (best effort),
+          # then report the suite's own exit status.
+          echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+  # Runs the 8-GPU accuracy suites on a `linux-mi325-gpu-8` runner, one suite
+  # per step.
+  nightly-accuracy-8-gpu:
+    needs: resolve-aiter
+    if: inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-accuracy-8-gpu'
+    runs-on: linux-mi325-gpu-8
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Override AITER commit in Dockerfile
+        run: |
+          AITER_SHA="${{ needs.resolve-aiter.outputs.aiter_sha }}"
+          sed -i "s/AITER_COMMIT=\"[^\"]*\"/AITER_COMMIT=\"${AITER_SHA}\"/g" docker/rocm.Dockerfile
+          sed -i "s/AITER_COMMIT=\"[^\"]*\"/AITER_COMMIT=\"${AITER_SHA}\"/g" docker/rocm720.Dockerfile
+
+      - name: Setup docker
+        run: |
+          touch github_summary.md
+          bash scripts/ci/amd/amd_ci_start_container.sh
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+
+      - name: Install dependencies
+        run: bash scripts/ci/amd/amd_ci_install_dependency.sh
+
+      - name: Verify AITER version
+        run: |
+          echo "### AITER Version Info" >> $GITHUB_STEP_SUMMARY
+          echo "- **Target:** \`${{ needs.resolve-aiter.outputs.aiter_ref }}\` (\`${{ needs.resolve-aiter.outputs.aiter_short_sha }}\`)" >> $GITHUB_STEP_SUMMARY
+          docker exec ci_sglang bash -c "cd /sgl-workspace/aiter 2>/dev/null && git log -1 --oneline" >> $GITHUB_STEP_SUMMARY 2>/dev/null || true
+
+      - name: Accuracy Test (8-GPU GPT-OSS)
+        timeout-minutes: 180
+        run: |
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-gpt-oss --nightly --timeout-per-file 7200 || TEST_EXIT_CODE=$?
+          # Append the suite's summary file to the job summary (best effort),
+          # then report the suite's own exit status.
+          echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+      - name: Accuracy Test (8-GPU Grok1-FP8)
+        timeout-minutes: 60
+        run: |
+          # Empty the summary file first so this step does not publish the
+          # previous step's content a second time.
+          > github_summary.md
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e RCCL_MSCCL_ENABLE=0 \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-grok1-fp8 --nightly --timeout-per-file 3600 || TEST_EXIT_CODE=$?
+          echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+  # ============================================== MI30x Combined Accuracy + Performance Tests ==============================================
+  # Each job below runs an accuracy suite and then a performance suite; the
+  # performance step is marked continue-on-error.
+  nightly-8-gpu-grok1-int4:
+    needs: resolve-aiter
+    if: inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-grok1-int4'
+    runs-on: linux-mi325-gpu-8
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Override AITER commit in Dockerfile
+        run: |
+          AITER_SHA="${{ needs.resolve-aiter.outputs.aiter_sha }}"
+          sed -i "s/AITER_COMMIT=\"[^\"]*\"/AITER_COMMIT=\"${AITER_SHA}\"/g" docker/rocm.Dockerfile
+          sed -i "s/AITER_COMMIT=\"[^\"]*\"/AITER_COMMIT=\"${AITER_SHA}\"/g" docker/rocm720.Dockerfile
+
+      - name: Setup docker
+        run: |
+          touch github_summary.md
+          bash scripts/ci/amd/amd_ci_start_container.sh
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+
+      - name: Install dependencies
+        run: bash scripts/ci/amd/amd_ci_install_dependency.sh
+
+      - name: Verify AITER version
+        run: |
+          echo "### AITER Version Info" >> $GITHUB_STEP_SUMMARY
+          echo "- **Target:** \`${{ needs.resolve-aiter.outputs.aiter_ref }}\` (\`${{ needs.resolve-aiter.outputs.aiter_short_sha }}\`)" >> $GITHUB_STEP_SUMMARY
+          docker exec ci_sglang bash -c "cd /sgl-workspace/aiter 2>/dev/null && git log -1 --oneline" >> $GITHUB_STEP_SUMMARY 2>/dev/null || true
+
+      - name: Accuracy Test (8-GPU Grok1-INT4)
+        timeout-minutes: 60
+        run: |
+          > github_summary.md
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e RCCL_MSCCL_ENABLE=0 \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-grok1-int4 --nightly --timeout-per-file 3600 || TEST_EXIT_CODE=$?
+          echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+      - name: Performance Test (8-GPU Grok1-INT4)
+        timeout-minutes: 60
+        continue-on-error: true
+        run: |
+          > github_summary.md
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e RCCL_MSCCL_ENABLE=0 \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-grok1-int4 --nightly --timeout-per-file 3600 || TEST_EXIT_CODE=$?
+          echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+  nightly-8-gpu-grok2:
+    needs: resolve-aiter
+    if: inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-grok2'
+    runs-on: linux-mi325-gpu-8
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Override AITER commit in Dockerfile
+        run: |
+          AITER_SHA="${{ needs.resolve-aiter.outputs.aiter_sha }}"
+          sed -i "s/AITER_COMMIT=\"[^\"]*\"/AITER_COMMIT=\"${AITER_SHA}\"/g" docker/rocm.Dockerfile
+          sed -i "s/AITER_COMMIT=\"[^\"]*\"/AITER_COMMIT=\"${AITER_SHA}\"/g" docker/rocm720.Dockerfile
+
+      - name: Setup docker
+        run: |
+          touch github_summary.md
+          bash scripts/ci/amd/amd_ci_start_container.sh
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+
+      - name: Install dependencies
+        run: bash scripts/ci/amd/amd_ci_install_dependency.sh
+
+      - name: Verify AITER version
+        run: |
+          echo "### AITER Version Info" >> $GITHUB_STEP_SUMMARY
+          echo "- **Target:** \`${{ needs.resolve-aiter.outputs.aiter_ref }}\` (\`${{ needs.resolve-aiter.outputs.aiter_short_sha }}\`)" >> $GITHUB_STEP_SUMMARY
+          docker exec ci_sglang bash -c "cd /sgl-workspace/aiter 2>/dev/null && git log -1 --oneline" >> $GITHUB_STEP_SUMMARY 2>/dev/null || true
+
+      - name: Accuracy Test (8-GPU Grok2)
+        timeout-minutes: 60
+        run: |
+          > github_summary.md
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e RCCL_MSCCL_ENABLE=0 \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-grok2 --nightly --timeout-per-file 3600 || TEST_EXIT_CODE=$?
+          echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+      - name: Performance Test (8-GPU Grok2)
+        timeout-minutes: 60
+        continue-on-error: true
+        run: |
+          > github_summary.md
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e RCCL_MSCCL_ENABLE=0 \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-grok2 --nightly --timeout-per-file 3600 || TEST_EXIT_CODE=$?
+          echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+  nightly-8-gpu-deepseek-v31:
+    needs: resolve-aiter
+    if: inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-deepseek-v31'
+    runs-on: linux-mi325-gpu-8
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Override AITER commit in Dockerfile
+        run: |
+          AITER_SHA="${{ needs.resolve-aiter.outputs.aiter_sha }}"
+          sed -i "s/AITER_COMMIT=\"[^\"]*\"/AITER_COMMIT=\"${AITER_SHA}\"/g" docker/rocm.Dockerfile
+          sed -i "s/AITER_COMMIT=\"[^\"]*\"/AITER_COMMIT=\"${AITER_SHA}\"/g" docker/rocm720.Dockerfile
+
+      - name: Setup docker
+        run: |
+          touch github_summary.md
+          bash scripts/ci/amd/amd_ci_start_container.sh
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+
+      - name: Install dependencies
+        run: bash scripts/ci/amd/amd_ci_install_dependency.sh
+
+      - name: Verify AITER version
+        run: |
+          echo "### AITER Version Info" >> $GITHUB_STEP_SUMMARY
+          echo "- **Target:** \`${{ needs.resolve-aiter.outputs.aiter_ref }}\` (\`${{ needs.resolve-aiter.outputs.aiter_short_sha }}\`)" >> $GITHUB_STEP_SUMMARY
+          docker exec ci_sglang bash -c "cd /sgl-workspace/aiter 2>/dev/null && git log -1 --oneline" >> $GITHUB_STEP_SUMMARY 2>/dev/null || true
+
+      - name: Accuracy Test (8-GPU DeepSeek-V3.1)
+        timeout-minutes: 120
+        run: |
+          > github_summary.md
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e SGLANG_USE_AITER=1 \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-deepseek-v31 --nightly --timeout-per-file 3600 || TEST_EXIT_CODE=$?
+          echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+      - name: Performance Test (8-GPU DeepSeek-V3.1)
+        timeout-minutes: 300
+        continue-on-error: true
+        run: |
+          > github_summary.md
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e SGLANG_USE_ROCM700A=1 \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-deepseek-v31 --nightly --timeout-per-file 18000 || TEST_EXIT_CODE=$?
+          echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+  nightly-8-gpu-deepseek-v32:
+    needs: resolve-aiter
+    if: inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-deepseek-v32'
+    runs-on: linux-mi325-gpu-8
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Override AITER commit in Dockerfile
+        run: |
+          AITER_SHA="${{ needs.resolve-aiter.outputs.aiter_sha }}"
+          sed -i "s/AITER_COMMIT=\"[^\"]*\"/AITER_COMMIT=\"${AITER_SHA}\"/g" docker/rocm.Dockerfile
+          sed -i "s/AITER_COMMIT=\"[^\"]*\"/AITER_COMMIT=\"${AITER_SHA}\"/g" docker/rocm720.Dockerfile
+
+      - name: Setup docker
+        run: |
+          touch github_summary.md
+          bash scripts/ci/amd/amd_ci_start_container.sh
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+
+      - name: Install dependencies
+        run: bash scripts/ci/amd/amd_ci_install_dependency.sh
+
+      - name: Verify AITER version
+        run: |
+          echo "### AITER Version Info" >> $GITHUB_STEP_SUMMARY
+          echo "- **Target:** \`${{ needs.resolve-aiter.outputs.aiter_ref }}\` (\`${{ needs.resolve-aiter.outputs.aiter_short_sha }}\`)" >> $GITHUB_STEP_SUMMARY
+          docker exec ci_sglang bash -c "cd /sgl-workspace/aiter 2>/dev/null && git log -1 --oneline" >> $GITHUB_STEP_SUMMARY 2>/dev/null || true
+
+      - name: Accuracy Test (8-GPU DeepSeek-V3.2 Basic)
+        timeout-minutes: 120
+        run: |
+          > github_summary.md
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-deepseek-v32 --nightly --timeout-per-file 3600 || TEST_EXIT_CODE=$?
+          echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+      - name: Performance Test (8-GPU DeepSeek-V3.2 Basic)
+        timeout-minutes: 150
+        continue-on-error: true
+        run: |
+          > github_summary.md
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-deepseek-v32-basic --nightly --timeout-per-file 5400 || TEST_EXIT_CODE=$?
+          # Append the suite's summary file to the job summary (best effort),
+          # then report the suite's own exit status.
+          echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+  # Runs the DeepSeek-V3.2 MTP accuracy suite, then its performance suite, on
+  # a `linux-mi325-gpu-8` runner.
+  nightly-8-gpu-deepseek-v32-mtp:
+    needs: resolve-aiter
+    if: inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-deepseek-v32-mtp'
+    runs-on: linux-mi325-gpu-8
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Override AITER commit in Dockerfile
+        run: |
+          AITER_SHA="${{ needs.resolve-aiter.outputs.aiter_sha }}"
+          sed -i "s/AITER_COMMIT=\"[^\"]*\"/AITER_COMMIT=\"${AITER_SHA}\"/g" docker/rocm.Dockerfile
+          sed -i "s/AITER_COMMIT=\"[^\"]*\"/AITER_COMMIT=\"${AITER_SHA}\"/g" docker/rocm720.Dockerfile
+
+      - name: Setup docker
+        run: |
+          touch github_summary.md
+          bash scripts/ci/amd/amd_ci_start_container.sh
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+
+      - name: Install dependencies
+        run: bash scripts/ci/amd/amd_ci_install_dependency.sh
+
+      - name: Verify AITER version
+        run: |
+          echo "### AITER Version Info" >> $GITHUB_STEP_SUMMARY
+          echo "- **Target:** \`${{ needs.resolve-aiter.outputs.aiter_ref }}\` (\`${{ needs.resolve-aiter.outputs.aiter_short_sha }}\`)" >> $GITHUB_STEP_SUMMARY
+          docker exec ci_sglang bash -c "cd /sgl-workspace/aiter 2>/dev/null && git log -1 --oneline" >> $GITHUB_STEP_SUMMARY 2>/dev/null || true
+
+      - name: Accuracy Test (8-GPU DeepSeek-V3.2 MTP)
+        timeout-minutes: 120
+        run: |
+          # Start from an empty summary file for this step.
+          > github_summary.md
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-deepseek-v32-mtp --nightly --timeout-per-file 3600 || TEST_EXIT_CODE=$?
+          # Append the suite's summary file to the job summary (best effort),
+          # then report the suite's own exit status.
+          echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+      - name: Performance Test (8-GPU DeepSeek-V3.2 MTP)
+        timeout-minutes: 180
+        continue-on-error: true
+        run: |
+          > github_summary.md
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-deepseek-v32-mtp --nightly --timeout-per-file 7200 || TEST_EXIT_CODE=$?
+          echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+  # Runs suite `nightly-amd-accuracy-8-gpu-kimi-k25` on a `linux-mi325-gpu-8` runner.
+  nightly-8-gpu-kimi-k25:
+    needs: resolve-aiter
+    if: inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-kimi-k25'
+    runs-on: linux-mi325-gpu-8
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Override AITER commit in Dockerfile
+        run: |
+          AITER_SHA="${{ needs.resolve-aiter.outputs.aiter_sha }}"
+          sed -i "s/AITER_COMMIT=\"[^\"]*\"/AITER_COMMIT=\"${AITER_SHA}\"/g" docker/rocm.Dockerfile
+          sed -i "s/AITER_COMMIT=\"[^\"]*\"/AITER_COMMIT=\"${AITER_SHA}\"/g" docker/rocm720.Dockerfile
+
+      - name: Setup docker
+        run: |
+          touch github_summary.md
+          bash scripts/ci/amd/amd_ci_start_container.sh
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+
+      - name: Install dependencies
+        run: bash scripts/ci/amd/amd_ci_install_dependency.sh
+
+      - name: Verify AITER version
+        run: |
+          echo "### AITER Version Info" >> $GITHUB_STEP_SUMMARY
+          echo "- **Target:** \`${{ needs.resolve-aiter.outputs.aiter_ref }}\` (\`${{ needs.resolve-aiter.outputs.aiter_short_sha }}\`)" >> $GITHUB_STEP_SUMMARY
+          docker exec ci_sglang bash -c "cd /sgl-workspace/aiter 2>/dev/null && git log -1 --oneline" >> $GITHUB_STEP_SUMMARY 2>/dev/null || true
+
+      - name: Accuracy Test (8-GPU Kimi-K2.5)
+        timeout-minutes: 120
+        run: |
+          > github_summary.md
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-kimi-k25 --nightly --timeout-per-file 3600 || TEST_EXIT_CODE=$?
+          echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+  # Runs suite `nightly-8-gpu-qwen3-235b` on a `linux-mi325-gpu-8` runner.
+  nightly-8-gpu-qwen3-235b:
+    needs: resolve-aiter
+    if: inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-qwen3-235b'
+    runs-on: linux-mi325-gpu-8
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Override AITER commit in Dockerfile
+        run: |
+          AITER_SHA="${{ needs.resolve-aiter.outputs.aiter_sha }}"
+          sed -i "s/AITER_COMMIT=\"[^\"]*\"/AITER_COMMIT=\"${AITER_SHA}\"/g" docker/rocm.Dockerfile
+          sed -i "s/AITER_COMMIT=\"[^\"]*\"/AITER_COMMIT=\"${AITER_SHA}\"/g" docker/rocm720.Dockerfile
+
+      - name: Setup docker
+        run: |
+          touch github_summary.md
+          bash scripts/ci/amd/amd_ci_start_container.sh
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+
+      - name: Install dependencies
+        run: bash scripts/ci/amd/amd_ci_install_dependency.sh
+
+      - name: Verify AITER version
+        run: |
+          echo "### AITER Version Info" >> $GITHUB_STEP_SUMMARY
+          echo "- **Target:** \`${{ needs.resolve-aiter.outputs.aiter_ref }}\` (\`${{ needs.resolve-aiter.outputs.aiter_short_sha }}\`)" >> $GITHUB_STEP_SUMMARY
+          docker exec ci_sglang bash -c "cd /sgl-workspace/aiter 2>/dev/null && git log -1 --oneline" >> $GITHUB_STEP_SUMMARY 2>/dev/null || true
+
+      - name: Accuracy Test + Performance Test (8-GPU Qwen3)
+        timeout-minutes: 120
+        run: |
+          > github_summary.md
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-8-gpu-qwen3-235b --nightly --timeout-per-file 3600 || TEST_EXIT_CODE=$?
+          # Append the suite's summary file to the job summary (best effort),
+          # then report the suite's own exit status.
+          echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+  # Runs suite `nightly-amd-accuracy-8-gpu-glm5` on a `linux-mi325-gpu-8`
+  # runner; additionally installs transformers from its git repository.
+  nightly-8-gpu-glm5:
+    needs: resolve-aiter
+    if: inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-glm5'
+    runs-on: linux-mi325-gpu-8
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Override AITER commit in Dockerfile
+        run: |
+          AITER_SHA="${{ needs.resolve-aiter.outputs.aiter_sha }}"
+          sed -i "s/AITER_COMMIT=\"[^\"]*\"/AITER_COMMIT=\"${AITER_SHA}\"/g" docker/rocm.Dockerfile
+          sed -i "s/AITER_COMMIT=\"[^\"]*\"/AITER_COMMIT=\"${AITER_SHA}\"/g" docker/rocm720.Dockerfile
+
+      - name: Setup docker
+        run: |
+          touch github_summary.md
+          bash scripts/ci/amd/amd_ci_start_container.sh
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+
+      - name: Install dependencies
+        run: |
+          bash scripts/ci/amd/amd_ci_install_dependency.sh
+          bash scripts/ci/amd/amd_ci_exec.sh pip install git+https://github.com/huggingface/transformers.git
+
+      - name: Verify AITER version
+        run: |
+          echo "### AITER Version Info" >> $GITHUB_STEP_SUMMARY
+          echo "- **Target:** \`${{ needs.resolve-aiter.outputs.aiter_ref }}\` (\`${{ needs.resolve-aiter.outputs.aiter_short_sha }}\`)" >> $GITHUB_STEP_SUMMARY
+          docker exec ci_sglang bash -c "cd /sgl-workspace/aiter 2>/dev/null && git log -1 --oneline" >> $GITHUB_STEP_SUMMARY 2>/dev/null || true
+
+      - name: Accuracy Test (8-GPU GLM-5 NSA)
+        timeout-minutes: 120
+        run: |
+          # Start from an empty summary file for this step.
+          > github_summary.md
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-glm5 --nightly --timeout-per-file 3600 || TEST_EXIT_CODE=$?
+          # Append the suite's summary file to the job summary (best effort),
+          # then report the suite's own exit status.
+          echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+  # ============================================== MI35x Tests ==============================================
+  # Runs suite `nightly-amd-1-gpu-mi35x` on a `linux-mi35x-gpu-1` runner;
+  # additionally installs `tabulate` in the container.
+  nightly-test-1-gpu-mi35x:
+    needs: resolve-aiter
+    if: inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-test-1-gpu-mi35x'
+    runs-on: linux-mi35x-gpu-1
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Override AITER commit in Dockerfile
+        run: |
+          AITER_SHA="${{ needs.resolve-aiter.outputs.aiter_sha }}"
+          sed -i "s/AITER_COMMIT=\"[^\"]*\"/AITER_COMMIT=\"${AITER_SHA}\"/g" docker/rocm.Dockerfile
+          sed -i "s/AITER_COMMIT=\"[^\"]*\"/AITER_COMMIT=\"${AITER_SHA}\"/g" docker/rocm720.Dockerfile
+
+      - name: Setup docker
+        run: |
+          touch github_summary.md
+          bash scripts/ci/amd/amd_ci_start_container.sh
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+
+      - name: Install dependencies
+        run: |
+          bash scripts/ci/amd/amd_ci_install_dependency.sh
+          bash scripts/ci/amd/amd_ci_exec.sh pip install tabulate
+
+      - name: Verify AITER version
+        run: |
+          echo "### AITER Version Info" >> $GITHUB_STEP_SUMMARY
+          echo "- **Target:** \`${{ needs.resolve-aiter.outputs.aiter_ref }}\` (\`${{ needs.resolve-aiter.outputs.aiter_short_sha }}\`)" >> $GITHUB_STEP_SUMMARY
+          docker exec ci_sglang bash -c "cd /sgl-workspace/aiter 2>/dev/null && git log -1 --oneline" >> $GITHUB_STEP_SUMMARY 2>/dev/null || true
+
+      - name: Nightly Test MI35x (1-GPU)
+        timeout-minutes: 90
+        run: |
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-amd-1-gpu-mi35x --nightly --timeout-per-file 900 --continue-on-error || TEST_EXIT_CODE=$?
+          # Append the suite's summary file to the job summary (best effort),
+          # then report the suite's own exit status.
+          echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+  # Runs suite `nightly-amd-8-gpu-mi35x` on a `linux-mi35x-gpu-8` runner;
+  # additionally installs `tabulate` in the container.
+  nightly-accuracy-8-gpu-mi35x:
+    needs: resolve-aiter
+    if: inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-accuracy-8-gpu-mi35x'
+    runs-on: linux-mi35x-gpu-8
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Override AITER commit in Dockerfile
+        run: |
+          AITER_SHA="${{ needs.resolve-aiter.outputs.aiter_sha }}"
+          sed -i "s/AITER_COMMIT=\"[^\"]*\"/AITER_COMMIT=\"${AITER_SHA}\"/g" docker/rocm.Dockerfile
+          sed -i "s/AITER_COMMIT=\"[^\"]*\"/AITER_COMMIT=\"${AITER_SHA}\"/g" docker/rocm720.Dockerfile
+
+      - name: Setup docker
+        run: |
+          touch github_summary.md
+          bash scripts/ci/amd/amd_ci_start_container.sh
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+
+      - name: Install dependencies
+        run: |
+          bash scripts/ci/amd/amd_ci_install_dependency.sh
+          bash scripts/ci/amd/amd_ci_exec.sh pip install tabulate
+
+      - name: Verify AITER version
+        run: |
+          echo "### AITER Version Info" >> $GITHUB_STEP_SUMMARY
+          echo "- **Target:** \`${{ needs.resolve-aiter.outputs.aiter_ref }}\` (\`${{ needs.resolve-aiter.outputs.aiter_short_sha }}\`)" >> $GITHUB_STEP_SUMMARY
+          docker exec ci_sglang bash -c "cd /sgl-workspace/aiter 2>/dev/null && git log -1 --oneline" >> $GITHUB_STEP_SUMMARY 2>/dev/null || true
+
+      - name: Accuracy Test MI35x (8-GPU GPT-OSS)
+        timeout-minutes: 180
+        run: |
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-amd-8-gpu-mi35x --nightly --timeout-per-file 7200 || TEST_EXIT_CODE=$?
+          # Append github_summary.md to the job summary (best effort), then exit with the suite's status.
+          echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+  # 8-GPU MI35x Grok1-INT4 accuracy + performance, run with the AITER commit resolved by resolve-aiter.
+  nightly-8-gpu-mi35x-grok1-int4:
+    needs: resolve-aiter
+    if: inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-mi35x-grok1-int4'
+    runs-on: linux-mi35x-gpu-8
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      # Rewrite the pinned AITER_COMMIT in both Dockerfiles to the resolved SHA.
+      - name: Override AITER commit in Dockerfile
+        run: |
+          AITER_SHA="${{ needs.resolve-aiter.outputs.aiter_sha }}"
+          sed -i "s/AITER_COMMIT=\"[^\"]*\"/AITER_COMMIT=\"${AITER_SHA}\"/g" docker/rocm.Dockerfile
+          sed -i "s/AITER_COMMIT=\"[^\"]*\"/AITER_COMMIT=\"${AITER_SHA}\"/g" docker/rocm720.Dockerfile
+
+      - name: Setup docker
+        run: |
+          touch github_summary.md
+          bash scripts/ci/amd/amd_ci_start_container.sh
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+
+      - name: Install dependencies
+        run: |
+          bash scripts/ci/amd/amd_ci_install_dependency.sh
+          bash scripts/ci/amd/amd_ci_exec.sh pip install tabulate
+
+      # Record the requested ref next to whatever commit is checked out in the container (best effort).
+      - name: Verify AITER version
+        run: |
+          echo "### AITER Version Info" >> $GITHUB_STEP_SUMMARY
+          echo "- **Target:** \`${{ needs.resolve-aiter.outputs.aiter_ref }}\` (\`${{ needs.resolve-aiter.outputs.aiter_short_sha }}\`)" >> $GITHUB_STEP_SUMMARY
+          docker exec ci_sglang bash -c "cd /sgl-workspace/aiter 2>/dev/null && git log -1 --oneline" >> $GITHUB_STEP_SUMMARY 2>/dev/null || true
+
+      - name: Accuracy Test MI35x (8-GPU Grok1-INT4)
+        timeout-minutes: 90
+        run: |
+          # Truncate the summary file so this step only publishes its own output.
+          > github_summary.md
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e RCCL_MSCCL_ENABLE=0 \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-mi35x-grok1-int4 --nightly --timeout-per-file 5400 || TEST_EXIT_CODE=$?
+          # Append github_summary.md to the job summary (best effort), then exit with the suite's status.
+          echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+      # continue-on-error: a failure in this step does not fail the job.
+      - name: Performance Test MI35x (8-GPU Grok1-INT4)
+        timeout-minutes: 60
+        continue-on-error: true
+        run: |
+          > github_summary.md
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e RCCL_MSCCL_ENABLE=0 \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-mi35x-grok1-int4 --nightly --timeout-per-file 3600 || TEST_EXIT_CODE=$?
+          echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+  # 8-GPU MI35x Grok2 accuracy + performance, run with the AITER commit resolved by resolve-aiter.
+  nightly-8-gpu-mi35x-grok2:
+    needs: resolve-aiter
+    if: inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-mi35x-grok2'
+    runs-on: linux-mi35x-gpu-8
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      # Rewrite the pinned AITER_COMMIT in both Dockerfiles to the resolved SHA.
+      - name: Override AITER commit in Dockerfile
+        run: |
+          AITER_SHA="${{ needs.resolve-aiter.outputs.aiter_sha }}"
+          sed -i "s/AITER_COMMIT=\"[^\"]*\"/AITER_COMMIT=\"${AITER_SHA}\"/g" docker/rocm.Dockerfile
+          sed -i "s/AITER_COMMIT=\"[^\"]*\"/AITER_COMMIT=\"${AITER_SHA}\"/g" docker/rocm720.Dockerfile
+
+      - name: Setup docker
+        run: |
+          touch github_summary.md
+          bash scripts/ci/amd/amd_ci_start_container.sh
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+
+      - name: Install dependencies
+        run: |
+          bash scripts/ci/amd/amd_ci_install_dependency.sh
+          bash scripts/ci/amd/amd_ci_exec.sh pip install tabulate
+
+      - name: Verify AITER version
+        run: |
+          echo "### AITER Version Info" >> $GITHUB_STEP_SUMMARY
+          echo "- **Target:** \`${{ needs.resolve-aiter.outputs.aiter_ref }}\` (\`${{ needs.resolve-aiter.outputs.aiter_short_sha }}\`)" >> $GITHUB_STEP_SUMMARY
+          docker exec ci_sglang bash -c "cd /sgl-workspace/aiter 2>/dev/null && git log -1 --oneline" >> $GITHUB_STEP_SUMMARY 2>/dev/null || true
+
+      - name: Accuracy Test MI35x (8-GPU Grok2)
+        timeout-minutes: 60
+        run: |
+          > github_summary.md
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e RCCL_MSCCL_ENABLE=0 \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-mi35x-grok2 --nightly --timeout-per-file 3600 || TEST_EXIT_CODE=$?
+          echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+      # continue-on-error: a failure in this step does not fail the job.
+      - name: Performance Test MI35x (8-GPU Grok2)
+        timeout-minutes: 60
+        continue-on-error: true
+        run: |
+          > github_summary.md
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e RCCL_MSCCL_ENABLE=0 \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-mi35x-grok2 --nightly --timeout-per-file 3600 || TEST_EXIT_CODE=$?
+          echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+  # 8-GPU MI35x DeepSeek-R1-MXFP4 accuracy + performance, run with the AITER commit resolved by resolve-aiter.
+  nightly-8-gpu-mi35x-deepseek-r1-mxfp4:
+    needs: resolve-aiter
+    if: inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-mi35x-deepseek-r1-mxfp4'
+    runs-on: linux-mi35x-gpu-8
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      # Rewrite the pinned AITER_COMMIT in both Dockerfiles to the resolved SHA.
+      - name: Override AITER commit in Dockerfile
+        run: |
+          AITER_SHA="${{ needs.resolve-aiter.outputs.aiter_sha }}"
+          sed -i "s/AITER_COMMIT=\"[^\"]*\"/AITER_COMMIT=\"${AITER_SHA}\"/g" docker/rocm.Dockerfile
+          sed -i "s/AITER_COMMIT=\"[^\"]*\"/AITER_COMMIT=\"${AITER_SHA}\"/g" docker/rocm720.Dockerfile
+
+      - name: Setup docker
+        run: |
+          touch github_summary.md
+          bash scripts/ci/amd/amd_ci_start_container.sh
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+
+      - name: Install dependencies
+        run: |
+          bash scripts/ci/amd/amd_ci_install_dependency.sh
+          bash scripts/ci/amd/amd_ci_exec.sh pip install tabulate
+
+      - name: Verify AITER version
+        run: |
+          echo "### AITER Version Info" >> $GITHUB_STEP_SUMMARY
+          echo "- **Target:** \`${{ needs.resolve-aiter.outputs.aiter_ref }}\` (\`${{ needs.resolve-aiter.outputs.aiter_short_sha }}\`)" >> $GITHUB_STEP_SUMMARY
+          docker exec ci_sglang bash -c "cd /sgl-workspace/aiter 2>/dev/null && git log -1 --oneline" >> $GITHUB_STEP_SUMMARY 2>/dev/null || true
+
+      - name: Accuracy Test MI35x (8-GPU DeepSeek-R1-MXFP4)
+        timeout-minutes: 180
+        run: |
+          > github_summary.md
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-amd-8-gpu-mi35x-deepseek-r1-mxfp4 --nightly --timeout-per-file 7200 || TEST_EXIT_CODE=$?
+          echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+      # Runs the perf test file directly rather than through run_suite.py; failures do not fail the job.
+      - name: Performance Test MI35x (8-GPU DeepSeek-R1-MXFP4)
+        timeout-minutes: 300
+        continue-on-error: true
+        run: |
+          > github_summary.md
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 registered/amd/perf/mi35x/test_deepseek_r1_mxfp4_perf_mi35x.py || TEST_EXIT_CODE=$?
+          echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+  # 8-GPU MI35x DeepSeek-V3.2 accuracy, run with the AITER commit resolved by resolve-aiter.
+  nightly-accuracy-8-gpu-mi35x-deepseek-v32:
+    needs: resolve-aiter
+    if: inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-accuracy-8-gpu-mi35x-deepseek-v32'
+    runs-on: linux-mi35x-gpu-8
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      # Rewrite the pinned AITER_COMMIT in both Dockerfiles to the resolved SHA.
+      - name: Override AITER commit in Dockerfile
+        run: |
+          AITER_SHA="${{ needs.resolve-aiter.outputs.aiter_sha }}"
+          sed -i "s/AITER_COMMIT=\"[^\"]*\"/AITER_COMMIT=\"${AITER_SHA}\"/g" docker/rocm.Dockerfile
+          sed -i "s/AITER_COMMIT=\"[^\"]*\"/AITER_COMMIT=\"${AITER_SHA}\"/g" docker/rocm720.Dockerfile
+
+      - name: Setup docker
+        run: |
+          touch github_summary.md
+          bash scripts/ci/amd/amd_ci_start_container.sh
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+
+      - name: Install dependencies
+        run: |
+          bash scripts/ci/amd/amd_ci_install_dependency.sh
+          bash scripts/ci/amd/amd_ci_exec.sh pip install tabulate
+
+      - name: Verify AITER version
+        run: |
+          echo "### AITER Version Info" >> $GITHUB_STEP_SUMMARY
+          echo "- **Target:** \`${{ needs.resolve-aiter.outputs.aiter_ref }}\` (\`${{ needs.resolve-aiter.outputs.aiter_short_sha }}\`)" >> $GITHUB_STEP_SUMMARY
+          docker exec ci_sglang bash -c "cd /sgl-workspace/aiter 2>/dev/null && git log -1 --oneline" >> $GITHUB_STEP_SUMMARY 2>/dev/null || true
+
+      - name: Accuracy Test MI35x (8-GPU DeepSeek-V3.2)
+        timeout-minutes: 120
+        run: |
+          > github_summary.md
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-amd-8-gpu-mi35x-deepseek-v32 --nightly --timeout-per-file 3600 || TEST_EXIT_CODE=$?
+          echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+  # 8-GPU MI35x DeepSeek-V3.2 TP+MTP accuracy, run with the AITER commit resolved by resolve-aiter.
+  nightly-accuracy-8-gpu-mi35x-deepseek-v32-mtp:
+    needs: resolve-aiter
+    if: inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-accuracy-8-gpu-mi35x-deepseek-v32-mtp'
+    runs-on: linux-mi35x-gpu-8
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      # Rewrite the pinned AITER_COMMIT in both Dockerfiles to the resolved SHA.
+      - name: Override AITER commit in Dockerfile
+        run: |
+          AITER_SHA="${{ needs.resolve-aiter.outputs.aiter_sha }}"
+          sed -i "s/AITER_COMMIT=\"[^\"]*\"/AITER_COMMIT=\"${AITER_SHA}\"/g" docker/rocm.Dockerfile
+          sed -i "s/AITER_COMMIT=\"[^\"]*\"/AITER_COMMIT=\"${AITER_SHA}\"/g" docker/rocm720.Dockerfile
+
+      - name: Setup docker
+        run: |
+          touch github_summary.md
+          bash scripts/ci/amd/amd_ci_start_container.sh
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+
+      - name: Install dependencies
+        run: |
+          bash scripts/ci/amd/amd_ci_install_dependency.sh
+          bash scripts/ci/amd/amd_ci_exec.sh pip install tabulate
+
+      - name: Verify AITER version
+        run: |
+          echo "### AITER Version Info" >> $GITHUB_STEP_SUMMARY
+          echo "- **Target:** \`${{ needs.resolve-aiter.outputs.aiter_ref }}\` (\`${{ needs.resolve-aiter.outputs.aiter_short_sha }}\`)" >> $GITHUB_STEP_SUMMARY
+          docker exec ci_sglang bash -c "cd /sgl-workspace/aiter 2>/dev/null && git log -1 --oneline" >> $GITHUB_STEP_SUMMARY 2>/dev/null || true
+
+      - name: Accuracy Test MI35x (8-GPU DeepSeek-V3.2 TP+MTP)
+        timeout-minutes: 120
+        run: |
+          > github_summary.md
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-mi35x-deepseek-v32-mtp --nightly --timeout-per-file 3600 || TEST_EXIT_CODE=$?
+          echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+  # 8-GPU MI35x DeepSeek-V3.2 basic performance, run with the AITER commit resolved by resolve-aiter.
+  nightly-perf-8-gpu-mi35x-deepseek-v32-basic:
+    needs: resolve-aiter
+    if: inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-perf-8-gpu-mi35x-deepseek-v32-basic'
+    runs-on: linux-mi35x-gpu-8
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      # Rewrite the pinned AITER_COMMIT in both Dockerfiles to the resolved SHA.
+      - name: Override AITER commit in Dockerfile
+        run: |
+          AITER_SHA="${{ needs.resolve-aiter.outputs.aiter_sha }}"
+          sed -i "s/AITER_COMMIT=\"[^\"]*\"/AITER_COMMIT=\"${AITER_SHA}\"/g" docker/rocm.Dockerfile
+          sed -i "s/AITER_COMMIT=\"[^\"]*\"/AITER_COMMIT=\"${AITER_SHA}\"/g" docker/rocm720.Dockerfile
+
+      - name: Setup docker
+        run: |
+          touch github_summary.md
+          bash scripts/ci/amd/amd_ci_start_container.sh
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+
+      - name: Install dependencies
+        run: |
+          bash scripts/ci/amd/amd_ci_install_dependency.sh
+          bash scripts/ci/amd/amd_ci_exec.sh pip install tabulate
+
+      - name: Verify AITER version
+        run: |
+          echo "### AITER Version Info" >> $GITHUB_STEP_SUMMARY
+          echo "- **Target:** \`${{ needs.resolve-aiter.outputs.aiter_ref }}\` (\`${{ needs.resolve-aiter.outputs.aiter_short_sha }}\`)" >> $GITHUB_STEP_SUMMARY
+          docker exec ci_sglang bash -c "cd /sgl-workspace/aiter 2>/dev/null && git log -1 --oneline" >> $GITHUB_STEP_SUMMARY 2>/dev/null || true
+
+      - name: Performance Test MI35x (8-GPU DeepSeek-V3.2 Basic)
+        timeout-minutes: 150
+        run: |
+          > github_summary.md
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-mi35x-deepseek-v32-basic --nightly --timeout-per-file 5400 || TEST_EXIT_CODE=$?
+          echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+  # 8-GPU MI35x Kimi-K2.5 accuracy, run with the AITER commit resolved by resolve-aiter.
+  nightly-8-gpu-mi35x-kimi-k25:
+    needs: resolve-aiter
+    if: inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-mi35x-kimi-k25'
+    runs-on: linux-mi35x-gpu-8
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      # Rewrite the pinned AITER_COMMIT in both Dockerfiles to the resolved SHA.
+      - name: Override AITER commit in Dockerfile
+        run: |
+          AITER_SHA="${{ needs.resolve-aiter.outputs.aiter_sha }}"
+          sed -i "s/AITER_COMMIT=\"[^\"]*\"/AITER_COMMIT=\"${AITER_SHA}\"/g" docker/rocm.Dockerfile
+          sed -i "s/AITER_COMMIT=\"[^\"]*\"/AITER_COMMIT=\"${AITER_SHA}\"/g" docker/rocm720.Dockerfile
+
+      - name: Setup docker
+        run: |
+          touch github_summary.md
+          bash scripts/ci/amd/amd_ci_start_container.sh
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+
+      - name: Install dependencies
+        run: |
+          bash scripts/ci/amd/amd_ci_install_dependency.sh
+          bash scripts/ci/amd/amd_ci_exec.sh pip install tabulate
+
+      - name: Verify AITER version
+        run: |
+          echo "### AITER Version Info" >> $GITHUB_STEP_SUMMARY
+          echo "- **Target:** \`${{ needs.resolve-aiter.outputs.aiter_ref }}\` (\`${{ needs.resolve-aiter.outputs.aiter_short_sha }}\`)" >> $GITHUB_STEP_SUMMARY
+          docker exec ci_sglang bash -c "cd /sgl-workspace/aiter 2>/dev/null && git log -1 --oneline" >> $GITHUB_STEP_SUMMARY 2>/dev/null || true
+
+      - name: Accuracy Test MI35x (8-GPU Kimi-K2.5)
+        timeout-minutes: 180
+        run: |
+          > github_summary.md
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-mi35x-kimi-k25 --nightly --timeout-per-file 7200 || TEST_EXIT_CODE=$?
+          # Append github_summary.md to the job summary (best effort), then exit with the suite's status.
+          echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+  # 8-GPU MI35x Qwen3-235B-MXFP4 accuracy + performance, run with the AITER commit resolved by resolve-aiter.
+  nightly-8-gpu-mi35x-qwen3-235b-mxfp4:
+    needs: resolve-aiter
+    if: inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-mi35x-qwen3-235b-mxfp4'
+    runs-on: linux-mi35x-gpu-8
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      # Rewrite the pinned AITER_COMMIT in both Dockerfiles to the resolved SHA.
+      - name: Override AITER commit in Dockerfile
+        run: |
+          AITER_SHA="${{ needs.resolve-aiter.outputs.aiter_sha }}"
+          sed -i "s/AITER_COMMIT=\"[^\"]*\"/AITER_COMMIT=\"${AITER_SHA}\"/g" docker/rocm.Dockerfile
+          sed -i "s/AITER_COMMIT=\"[^\"]*\"/AITER_COMMIT=\"${AITER_SHA}\"/g" docker/rocm720.Dockerfile
+
+      - name: Setup docker
+        run: |
+          touch github_summary.md
+          bash scripts/ci/amd/amd_ci_start_container.sh
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+
+      - name: Install dependencies
+        run: |
+          bash scripts/ci/amd/amd_ci_install_dependency.sh
+          bash scripts/ci/amd/amd_ci_exec.sh pip install tabulate
+
+      # Record the requested ref next to whatever commit is checked out in the container (best effort).
+      - name: Verify AITER version
+        run: |
+          echo "### AITER Version Info" >> $GITHUB_STEP_SUMMARY
+          echo "- **Target:** \`${{ needs.resolve-aiter.outputs.aiter_ref }}\` (\`${{ needs.resolve-aiter.outputs.aiter_short_sha }}\`)" >> $GITHUB_STEP_SUMMARY
+          docker exec ci_sglang bash -c "cd /sgl-workspace/aiter 2>/dev/null && git log -1 --oneline" >> $GITHUB_STEP_SUMMARY 2>/dev/null || true
+
+      - name: Accuracy Test + Performance Test MI35x (8-GPU Qwen3-235B-MXFP4)
+        timeout-minutes: 120
+        run: |
+          # Truncate the summary file so this step only publishes its own output.
+          > github_summary.md
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-8-gpu-mi35x-qwen3-235b-mxfp4 --nightly --timeout-per-file 3600 || TEST_EXIT_CODE=$?
+          # Append github_summary.md to the job summary (best effort), then exit with the suite's status.
+          echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+  # 8-GPU MI35x GLM-5 accuracy, run with the AITER commit resolved by resolve-aiter.
+  nightly-8-gpu-mi35x-glm5:
+    needs: resolve-aiter
+    if: inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-mi35x-glm5'
+    runs-on: linux-mi35x-gpu-8
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      # Rewrite the pinned AITER_COMMIT in both Dockerfiles to the resolved SHA.
+      - name: Override AITER commit in Dockerfile
+        run: |
+          AITER_SHA="${{ needs.resolve-aiter.outputs.aiter_sha }}"
+          sed -i "s/AITER_COMMIT=\"[^\"]*\"/AITER_COMMIT=\"${AITER_SHA}\"/g" docker/rocm.Dockerfile
+          sed -i "s/AITER_COMMIT=\"[^\"]*\"/AITER_COMMIT=\"${AITER_SHA}\"/g" docker/rocm720.Dockerfile
+
+      - name: Setup docker
+        run: |
+          touch github_summary.md
+          bash scripts/ci/amd/amd_ci_start_container.sh
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+
+      # NOTE(review): transformers is installed from the unpinned default branch, so results can drift between runs.
+      - name: Install dependencies
+        run: |
+          bash scripts/ci/amd/amd_ci_install_dependency.sh
+          bash scripts/ci/amd/amd_ci_exec.sh pip install tabulate
+          bash scripts/ci/amd/amd_ci_exec.sh pip install git+https://github.com/huggingface/transformers.git
+
+      # Record the requested ref next to whatever commit is checked out in the container (best effort).
+      - name: Verify AITER version
+        run: |
+          echo "### AITER Version Info" >> $GITHUB_STEP_SUMMARY
+          echo "- **Target:** \`${{ needs.resolve-aiter.outputs.aiter_ref }}\` (\`${{ needs.resolve-aiter.outputs.aiter_short_sha }}\`)" >> $GITHUB_STEP_SUMMARY
+          docker exec ci_sglang bash -c "cd /sgl-workspace/aiter 2>/dev/null && git log -1 --oneline" >> $GITHUB_STEP_SUMMARY 2>/dev/null || true
+
+      - name: Accuracy Test MI35x (8-GPU GLM-5 NSA)
+        timeout-minutes: 180
+        run: |
+          # Truncate the summary file so this step only publishes its own output.
+          > github_summary.md
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-amd-8-gpu-mi35x-glm5 --nightly --timeout-per-file 7200 || TEST_EXIT_CODE=$?
+          # Append github_summary.md to the job summary (best effort), then exit with the suite's status.
+          echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
+  # 8-GPU MI35x DeepSeek-V3.2 MTP performance, run with the AITER commit resolved by resolve-aiter.
+  nightly-perf-8-gpu-mi35x-deepseek-v32-mtp:
+    needs: resolve-aiter
+    if: inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-perf-8-gpu-mi35x-deepseek-v32-mtp'
+    runs-on: linux-mi35x-gpu-8
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      # Rewrite the pinned AITER_COMMIT in both Dockerfiles to the resolved SHA.
+      - name: Override AITER commit in Dockerfile
+        run: |
+          AITER_SHA="${{ needs.resolve-aiter.outputs.aiter_sha }}"
+          sed -i "s/AITER_COMMIT=\"[^\"]*\"/AITER_COMMIT=\"${AITER_SHA}\"/g" docker/rocm.Dockerfile
+          sed -i "s/AITER_COMMIT=\"[^\"]*\"/AITER_COMMIT=\"${AITER_SHA}\"/g" docker/rocm720.Dockerfile
+
+      - name: Setup docker
+        run: |
+          touch github_summary.md
+          bash scripts/ci/amd/amd_ci_start_container.sh
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+
+      - name: Install dependencies
+        run: |
+          bash scripts/ci/amd/amd_ci_install_dependency.sh
+          bash scripts/ci/amd/amd_ci_exec.sh pip install tabulate
+
+      # Record the requested ref next to whatever commit is checked out in the container (best effort).
+      - name: Verify AITER version
+        run: |
+          echo "### AITER Version Info" >> $GITHUB_STEP_SUMMARY
+          echo "- **Target:** \`${{ needs.resolve-aiter.outputs.aiter_ref }}\` (\`${{ needs.resolve-aiter.outputs.aiter_short_sha }}\`)" >> $GITHUB_STEP_SUMMARY
+          docker exec ci_sglang bash -c "cd /sgl-workspace/aiter 2>/dev/null && git log -1 --oneline" >> $GITHUB_STEP_SUMMARY 2>/dev/null || true
+
+      - name: Performance Test MI35x (8-GPU DeepSeek-V3.2 MTP)
+        timeout-minutes: 180
+        run: |
+          # Truncate the summary file so this step only publishes its own output.
+          > github_summary.md
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-mi35x-deepseek-v32-mtp --nightly --timeout-per-file 7200 || TEST_EXIT_CODE=$?
+ echo "$(> $GITHUB_STEP_SUMMARY || true + exit ${TEST_EXIT_CODE:-0} + + # ============================================== Summary ============================================== + check-all-jobs: + if: always() + needs: + - resolve-aiter + # MI30x Unit Tests + - nightly-test-1-gpu-unit + # MI30x Accuracy Tests + - nightly-accuracy-2-gpu + - nightly-accuracy-2-gpu-vlm + - nightly-accuracy-8-gpu + # MI30x Combined Accuracy + Performance Tests + - nightly-8-gpu-grok1-int4 + - nightly-8-gpu-grok2 + - nightly-8-gpu-deepseek-v31 + - nightly-8-gpu-deepseek-v32 + - nightly-8-gpu-deepseek-v32-mtp + - nightly-8-gpu-kimi-k25 + - nightly-8-gpu-qwen3-235b + - nightly-8-gpu-glm5 + # MI35x jobs + - nightly-test-1-gpu-mi35x + - nightly-accuracy-8-gpu-mi35x + - nightly-8-gpu-mi35x-grok1-int4 + - nightly-8-gpu-mi35x-grok2 + - nightly-8-gpu-mi35x-deepseek-r1-mxfp4 + - nightly-accuracy-8-gpu-mi35x-deepseek-v32 + - nightly-accuracy-8-gpu-mi35x-deepseek-v32-mtp + - nightly-8-gpu-mi35x-kimi-k25 + - nightly-8-gpu-mi35x-qwen3-235b-mxfp4 + - nightly-8-gpu-mi35x-glm5 + runs-on: ubuntu-latest + steps: + - name: Summary + run: | + echo "## AITER Latest Test Results" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "- **AITER ref:** \`${{ needs.resolve-aiter.outputs.aiter_ref }}\`" >> $GITHUB_STEP_SUMMARY + echo "- **AITER SHA:** \`${{ needs.resolve-aiter.outputs.aiter_sha }}\`" >> $GITHUB_STEP_SUMMARY + echo "- **AITER commit:** https://github.com/ROCm/aiter/commit/${{ needs.resolve-aiter.outputs.aiter_sha }}" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "### MI30x Results" >> $GITHUB_STEP_SUMMARY + echo "| Job | Result |" >> $GITHUB_STEP_SUMMARY + echo "|-----|--------|" >> $GITHUB_STEP_SUMMARY + echo "| nightly-test-1-gpu-unit | \`${{ needs.nightly-test-1-gpu-unit.result }}\` |" >> $GITHUB_STEP_SUMMARY + echo "| nightly-accuracy-2-gpu | \`${{ needs.nightly-accuracy-2-gpu.result }}\` |" >> $GITHUB_STEP_SUMMARY + echo "| nightly-accuracy-2-gpu-vlm | \`${{ 
needs.nightly-accuracy-2-gpu-vlm.result }}\` |" >> $GITHUB_STEP_SUMMARY + echo "| nightly-perf-2-gpu-text | \`${{ needs.nightly-perf-2-gpu-text.result }}\` |" >> $GITHUB_STEP_SUMMARY + echo "| nightly-perf-2-gpu-vlm | \`${{ needs.nightly-perf-2-gpu-vlm.result }}\` |" >> $GITHUB_STEP_SUMMARY + echo "| nightly-accuracy-8-gpu | \`${{ needs.nightly-accuracy-8-gpu.result }}\` |" >> $GITHUB_STEP_SUMMARY + echo "| nightly-8-gpu-grok1-int4 | \`${{ needs.nightly-8-gpu-grok1-int4.result }}\` |" >> $GITHUB_STEP_SUMMARY + echo "| nightly-8-gpu-grok2 | \`${{ needs.nightly-8-gpu-grok2.result }}\` |" >> $GITHUB_STEP_SUMMARY + echo "| nightly-8-gpu-deepseek-v31 | \`${{ needs.nightly-8-gpu-deepseek-v31.result }}\` |" >> $GITHUB_STEP_SUMMARY + echo "| nightly-8-gpu-deepseek-v32 | \`${{ needs.nightly-8-gpu-deepseek-v32.result }}\` |" >> $GITHUB_STEP_SUMMARY + echo "| nightly-8-gpu-deepseek-v32-mtp | \`${{ needs.nightly-8-gpu-deepseek-v32-mtp.result }}\` |" >> $GITHUB_STEP_SUMMARY + echo "| nightly-8-gpu-kimi-k25 | \`${{ needs.nightly-8-gpu-kimi-k25.result }}\` |" >> $GITHUB_STEP_SUMMARY + echo "| nightly-8-gpu-qwen3-235b | \`${{ needs.nightly-8-gpu-qwen3-235b.result }}\` |" >> $GITHUB_STEP_SUMMARY + echo "| nightly-8-gpu-glm5 | \`${{ needs.nightly-8-gpu-glm5.result }}\` |" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "### MI35x Results" >> $GITHUB_STEP_SUMMARY + echo "| Job | Result |" >> $GITHUB_STEP_SUMMARY + echo "|-----|--------|" >> $GITHUB_STEP_SUMMARY + echo "| nightly-test-1-gpu-mi35x | \`${{ needs.nightly-test-1-gpu-mi35x.result }}\` |" >> $GITHUB_STEP_SUMMARY + echo "| nightly-accuracy-8-gpu-mi35x | \`${{ needs.nightly-accuracy-8-gpu-mi35x.result }}\` |" >> $GITHUB_STEP_SUMMARY + echo "| nightly-8-gpu-mi35x-grok1-int4 | \`${{ needs.nightly-8-gpu-mi35x-grok1-int4.result }}\` |" >> $GITHUB_STEP_SUMMARY + echo "| nightly-8-gpu-mi35x-grok2 | \`${{ needs.nightly-8-gpu-mi35x-grok2.result }}\` |" >> $GITHUB_STEP_SUMMARY + echo "| 
nightly-8-gpu-mi35x-deepseek-r1-mxfp4 | \`${{ needs.nightly-8-gpu-mi35x-deepseek-r1-mxfp4.result }}\` |" >> $GITHUB_STEP_SUMMARY + echo "| nightly-accuracy-8-gpu-mi35x-deepseek-v32 | \`${{ needs.nightly-accuracy-8-gpu-mi35x-deepseek-v32.result }}\` |" >> $GITHUB_STEP_SUMMARY + echo "| nightly-accuracy-8-gpu-mi35x-deepseek-v32-mtp | \`${{ needs.nightly-accuracy-8-gpu-mi35x-deepseek-v32-mtp.result }}\` |" >> $GITHUB_STEP_SUMMARY + echo "| nightly-perf-8-gpu-mi35x-deepseek-v32-basic | \`${{ needs.nightly-perf-8-gpu-mi35x-deepseek-v32-basic.result }}\` |" >> $GITHUB_STEP_SUMMARY + echo "| nightly-perf-8-gpu-mi35x-deepseek-v32-mtp | \`${{ needs.nightly-perf-8-gpu-mi35x-deepseek-v32-mtp.result }}\` |" >> $GITHUB_STEP_SUMMARY + echo "| nightly-8-gpu-mi35x-kimi-k25 | \`${{ needs.nightly-8-gpu-mi35x-kimi-k25.result }}\` |" >> $GITHUB_STEP_SUMMARY + echo "| nightly-8-gpu-mi35x-qwen3-235b-mxfp4 | \`${{ needs.nightly-8-gpu-mi35x-qwen3-235b-mxfp4.result }}\` |" >> $GITHUB_STEP_SUMMARY + echo "| nightly-8-gpu-mi35x-glm5 | \`${{ needs.nightly-8-gpu-mi35x-glm5.result }}\` |" >> $GITHUB_STEP_SUMMARY + + - name: Check if any job failed + run: | + if [[ "${{ contains(needs.*.result, 'failure') }}" == "true" ]]; then + echo "One or more test jobs failed with AITER ref: ${{ needs.resolve-aiter.outputs.aiter_ref }}" + exit 1 + fi + if [[ "${{ contains(needs.*.result, 'cancelled') }}" == "true" ]]; then + echo "One or more test jobs were cancelled" + exit 1 + fi + echo "All test jobs passed with AITER ${{ needs.resolve-aiter.outputs.aiter_ref }} (${{ needs.resolve-aiter.outputs.aiter_short_sha }})" diff --git a/.github/workflows/nightly-test-amd-rocm720.yml b/.github/workflows/nightly-test-amd-rocm720.yml index a1d921ef4447..6f35b4abac17 100644 --- a/.github/workflows/nightly-test-amd-rocm720.yml +++ b/.github/workflows/nightly-test-amd-rocm720.yml @@ -10,6 +10,16 @@ on: - "python/sglang/version.py" workflow_dispatch: inputs: + aiter_ref: + description: 'Override AITER commit (optional, 
leave empty to use Dockerfile default)' + required: false + type: string + default: '' + continue_on_error: + description: 'Continue on error (do not fail the workflow on test failures)' + required: false + type: boolean + default: true job_filter: description: 'Select which job to run (leave empty or "all" to run all jobs)' required: false @@ -56,11 +66,24 @@ on: required: false type: string default: '' + aiter_ref: + description: 'Override AITER commit (optional, leave empty to use Dockerfile default)' + required: false + type: string + default: '' job_filter: description: 'Select which job to run (leave empty or "all" to run all jobs)' required: false type: string default: 'all' + continue_on_error: + description: 'Continue on error (do not fail the workflow on test failures)' + required: false + type: boolean + default: true + +env: + AITER_COMMIT_OVERRIDE: ${{ inputs.aiter_ref }} concurrency: group: nightly-test-amd-rocm720-${{ inputs.ref || github.ref }} @@ -86,14 +109,13 @@ jobs: GITHUB_WORKSPACE: ${{ github.workspace }} - name: Install dependencies - run: bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-aiter-build - + run: bash scripts/ci/amd/amd_ci_install_dependency.sh - name: Nightly Unit Test ROCm 7.2 (1-GPU) timeout-minutes: 90 run: | bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ - python3 run_suite.py --hw amd --suite nightly-amd-1-gpu --nightly --timeout-per-file 900 --continue-on-error || TEST_EXIT_CODE=$? + python3 run_suite.py --hw amd --suite nightly-amd-1-gpu --nightly --timeout-per-file 900 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? 
echo "$(> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} @@ -116,14 +138,13 @@ jobs: GITHUB_WORKSPACE: ${{ github.workspace }} - name: Install dependencies - run: bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-aiter-build - + run: bash scripts/ci/amd/amd_ci_install_dependency.sh - name: Nightly Test ROCm 7.2 (2-GPU) run: | > github_summary.md # Clear summary file bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ - python3 run_suite.py --hw amd --suite nightly-amd --nightly --timeout-per-file 7200 --continue-on-error || TEST_EXIT_CODE=$? + python3 run_suite.py --hw amd --suite nightly-amd --nightly --timeout-per-file 7200 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? echo "$(> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} @@ -145,15 +166,14 @@ jobs: GITHUB_WORKSPACE: ${{ github.workspace }} - name: Install dependencies - run: bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-aiter-build - + run: bash scripts/ci/amd/amd_ci_install_dependency.sh - name: Nightly Accuracy Test ROCm 7.2 (2-GPU VLM MMMU) timeout-minutes: 180 run: | > github_summary.md # Clear summary file bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ - python3 run_suite.py --hw amd --suite nightly-amd-accuracy-2-gpu-vlm --nightly --timeout-per-file 7200 --continue-on-error || TEST_EXIT_CODE=$? + python3 run_suite.py --hw amd --suite nightly-amd-accuracy-2-gpu-vlm --nightly --timeout-per-file 7200 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? 
echo "$(> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} @@ -175,8 +195,7 @@ jobs: GITHUB_WORKSPACE: ${{ github.workspace }} - name: Install dependencies - run: bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-aiter-build - + run: bash scripts/ci/amd/amd_ci_install_dependency.sh - name: Performance Test ROCm 7.2 (2-GPU Text Models) timeout-minutes: 120 run: | @@ -184,7 +203,7 @@ jobs: bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ -e SGLANG_USE_AITER=1 \ -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ - python3 run_suite.py --hw amd --suite nightly-amd-perf-text-2-gpu --nightly --timeout-per-file 3600 --continue-on-error || TEST_EXIT_CODE=$? + python3 run_suite.py --hw amd --suite nightly-amd-perf-text-2-gpu --nightly --timeout-per-file 3600 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? echo "$(> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} @@ -206,8 +225,7 @@ jobs: GITHUB_WORKSPACE: ${{ github.workspace }} - name: Install dependencies - run: bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-aiter-build - + run: bash scripts/ci/amd/amd_ci_install_dependency.sh - name: Performance Test ROCm 7.2 (2-GPU VLM Models) timeout-minutes: 180 run: | @@ -215,7 +233,7 @@ jobs: bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ -e SGLANG_USE_AITER=1 \ -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ - python3 run_suite.py --hw amd --suite nightly-amd-perf-vlm-2-gpu --nightly --timeout-per-file 7200 --continue-on-error || TEST_EXIT_CODE=$? + python3 run_suite.py --hw amd --suite nightly-amd-perf-vlm-2-gpu --nightly --timeout-per-file 7200 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? 
echo "$(> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} @@ -237,14 +255,14 @@ jobs: GITHUB_WORKSPACE: ${{ github.workspace }} - name: Install dependencies - run: bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-aiter-build --skip-test-time-deps + run: bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-test-time-deps - name: Accuracy Test ROCm 7.2 (8-GPU GPT-OSS) timeout-minutes: 180 run: | bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ - python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-gpt-oss --nightly --timeout-per-file 7200 --continue-on-error || TEST_EXIT_CODE=$? + python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-gpt-oss --nightly --timeout-per-file 7200 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? echo "$(> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} @@ -254,7 +272,7 @@ jobs: bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ -e RCCL_MSCCL_ENABLE=0 \ -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ - python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-grok1-fp8 --nightly --timeout-per-file 3600 --continue-on-error || TEST_EXIT_CODE=$? + python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-grok1-fp8 --nightly --timeout-per-file 3600 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? 
echo "$(> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} @@ -277,7 +295,7 @@ jobs: GITHUB_WORKSPACE: ${{ github.workspace }} - name: Install dependencies - run: bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-aiter-build --skip-test-time-deps + run: bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-test-time-deps - name: Accuracy Test ROCm 7.2 (8-GPU Grok1-INT4) timeout-minutes: 60 @@ -286,7 +304,7 @@ jobs: bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ -e RCCL_MSCCL_ENABLE=0 \ -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ - python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-grok1-int4 --nightly --timeout-per-file 3600 --continue-on-error || TEST_EXIT_CODE=$? + python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-grok1-int4 --nightly --timeout-per-file 3600 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? echo "$(> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} @@ -298,7 +316,7 @@ jobs: bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ -e RCCL_MSCCL_ENABLE=0 \ -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ - python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-grok1-int4 --nightly --timeout-per-file 3600 --continue-on-error || TEST_EXIT_CODE=$? + python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-grok1-int4 --nightly --timeout-per-file 3600 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? 
echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} @@ -320,7 +338,7 @@ jobs: GITHUB_WORKSPACE: ${{ github.workspace }} - name: Install dependencies - run: bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-aiter-build --skip-test-time-deps + run: bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-test-time-deps - name: Accuracy Test ROCm 7.2 (8-GPU Grok2) timeout-minutes: 60 @@ -329,7 +347,7 @@ jobs: bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ -e RCCL_MSCCL_ENABLE=0 \ -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ - python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-grok2 --nightly --timeout-per-file 3600 --continue-on-error || TEST_EXIT_CODE=$? + python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-grok2 --nightly --timeout-per-file 3600 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} @@ -341,7 +359,7 @@ jobs: bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ -e RCCL_MSCCL_ENABLE=0 \ -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ - python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-grok2 --nightly --timeout-per-file 3600 --continue-on-error || TEST_EXIT_CODE=$? + python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-grok2 --nightly --timeout-per-file 3600 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$?
echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} @@ -363,7 +381,7 @@ jobs: GITHUB_WORKSPACE: ${{ github.workspace }} - name: Install dependencies - run: bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-aiter-build --skip-test-time-deps + run: bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-test-time-deps - name: Accuracy Test ROCm 7.2 (8-GPU DeepSeek-V3.1) timeout-minutes: 120 @@ -372,7 +390,7 @@ jobs: bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ -e SGLANG_USE_AITER=1 \ -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ - python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-deepseek-v31 --nightly --timeout-per-file 3600 --continue-on-error || TEST_EXIT_CODE=$? + python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-deepseek-v31 --nightly --timeout-per-file 3600 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} @@ -384,7 +402,7 @@ jobs: bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ -e SGLANG_USE_ROCM700A=1 \ -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ - python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-deepseek-v31 --nightly --timeout-per-file 18000 --continue-on-error || TEST_EXIT_CODE=$? + python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-deepseek-v31 --nightly --timeout-per-file 18000 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$?
echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} @@ -406,7 +424,7 @@ jobs: GITHUB_WORKSPACE: ${{ github.workspace }} - name: Install dependencies - run: bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-aiter-build --skip-test-time-deps + run: bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-test-time-deps - name: Accuracy Test ROCm 7.2 (8-GPU DeepSeek-V3.2 Basic) timeout-minutes: 120 @@ -414,7 +432,7 @@ jobs: > github_summary.md # Clear summary file bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ - python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-deepseek-v32 --nightly --timeout-per-file 3600 --continue-on-error || TEST_EXIT_CODE=$? + python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-deepseek-v32 --nightly --timeout-per-file 3600 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} @@ -425,7 +443,7 @@ jobs: > github_summary.md # Clear summary file bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ - python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-deepseek-v32-basic --nightly --timeout-per-file 5400 --continue-on-error || TEST_EXIT_CODE=$? + python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-deepseek-v32-basic --nightly --timeout-per-file 5400 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$?
echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} @@ -447,7 +465,7 @@ jobs: GITHUB_WORKSPACE: ${{ github.workspace }} - name: Install dependencies - run: bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-aiter-build --skip-test-time-deps + run: bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-test-time-deps - name: Accuracy Test ROCm 7.2 (8-GPU DeepSeek-V3.2 MTP) timeout-minutes: 120 @@ -455,7 +473,7 @@ jobs: > github_summary.md # Clear summary file bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ - python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-deepseek-v32-mtp --nightly --timeout-per-file 3600 --continue-on-error || TEST_EXIT_CODE=$? + python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-deepseek-v32-mtp --nightly --timeout-per-file 3600 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} @@ -466,7 +484,7 @@ jobs: > github_summary.md # Clear summary file bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ - python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-deepseek-v32-mtp --nightly --timeout-per-file 7200 --continue-on-error || TEST_EXIT_CODE=$? + python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-deepseek-v32-mtp --nightly --timeout-per-file 7200 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$?
echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} @@ -488,7 +506,7 @@ jobs: GITHUB_WORKSPACE: ${{ github.workspace }} - name: Install dependencies - run: bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-aiter-build --skip-test-time-deps + run: bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-test-time-deps - name: Accuracy Test ROCm 7.2 (8-GPU Kimi-K2.5) timeout-minutes: 120 @@ -496,7 +514,7 @@ jobs: > github_summary.md # Clear summary file bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ - python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-kimi-k25 --nightly --timeout-per-file 3600 --continue-on-error || TEST_EXIT_CODE=$? + python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-kimi-k25 --nightly --timeout-per-file 3600 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} @@ -518,7 +536,7 @@ jobs: GITHUB_WORKSPACE: ${{ github.workspace }} - name: Install dependencies - run: bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-aiter-build --skip-test-time-deps + run: bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-test-time-deps - name: Accuracy Test + Performance Test ROCm 7.2 (8-GPU Qwen3) timeout-minutes: 120 @@ -526,7 +544,7 @@ jobs: > github_summary.md # Clear summary file bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ - python3 run_suite.py --hw amd --suite nightly-8-gpu-qwen3-235b --nightly --timeout-per-file 3600 || TEST_EXIT_CODE=$? + python3 run_suite.py --hw amd --suite nightly-8-gpu-qwen3-235b --nightly --timeout-per-file 3600 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$?
echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} @@ -549,7 +567,7 @@ jobs: - name: Install dependencies run: | - bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-aiter-build --skip-test-time-deps + bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-test-time-deps # GLM-5 requires latest transformers for glm_moe_dsa architecture bash scripts/ci/amd/amd_ci_exec.sh pip install git+https://github.com/huggingface/transformers.git @@ -559,7 +577,7 @@ jobs: > github_summary.md # Clear summary file bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ - python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-glm5 --nightly --timeout-per-file 3600 --continue-on-error || TEST_EXIT_CODE=$? + python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-glm5 --nightly --timeout-per-file 3600 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} @@ -581,7 +599,7 @@ jobs: GITHUB_WORKSPACE: ${{ github.workspace }} - name: Install dependencies - run: bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-aiter-build --skip-test-time-deps + run: bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-test-time-deps - name: Accuracy Test ROCm 7.2 (8-GPU MiniMax-M2.5) timeout-minutes: 120 @@ -590,7 +608,7 @@ jobs: bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ -e SGLANG_USE_AITER=1 \ -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ - python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-minimax-m25 --nightly --timeout-per-file 3600 --continue-on-error || TEST_EXIT_CODE=$? + python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-minimax-m25 --nightly --timeout-per-file 3600 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$?
echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} @@ -614,14 +632,13 @@ jobs: - name: Install dependencies run: | - bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-aiter-build - + bash scripts/ci/amd/amd_ci_install_dependency.sh - name: Nightly Test MI35x ROCm 7.2 (1-GPU) timeout-minutes: 90 run: | bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ - python3 run_suite.py --hw amd --suite nightly-amd-1-gpu-mi35x --nightly --timeout-per-file 900 --continue-on-error || TEST_EXIT_CODE=$? + python3 run_suite.py --hw amd --suite nightly-amd-1-gpu-mi35x --nightly --timeout-per-file 900 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} @@ -644,7 +661,7 @@ jobs: - name: Install dependencies run: | - bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-aiter-build --skip-test-time-deps + bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-test-time-deps # Install tabulate for run_suite.py (missing in MI35x container) bash scripts/ci/amd/amd_ci_exec.sh pip install tabulate @@ -653,7 +670,7 @@ jobs: run: | bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ - python3 run_suite.py --hw amd --suite nightly-amd-8-gpu-mi35x --nightly --timeout-per-file 7200 --continue-on-error || TEST_EXIT_CODE=$? + python3 run_suite.py --hw amd --suite nightly-amd-8-gpu-mi35x --nightly --timeout-per-file 7200 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$?
echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} @@ -676,7 +693,7 @@ jobs: - name: Install dependencies run: | - bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-aiter-build --skip-test-time-deps + bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-test-time-deps # Install tabulate for run_suite.py (missing in MI35x container) bash scripts/ci/amd/amd_ci_exec.sh pip install tabulate @@ -687,7 +704,7 @@ jobs: bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ -e RCCL_MSCCL_ENABLE=0 \ -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ - python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-mi35x-grok1-int4 --nightly --timeout-per-file 3600 --continue-on-error || TEST_EXIT_CODE=$? + python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-mi35x-grok1-int4 --nightly --timeout-per-file 3600 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} @@ -699,7 +716,7 @@ jobs: bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ -e RCCL_MSCCL_ENABLE=0 \ -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ - python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-mi35x-grok1-int4 --nightly --timeout-per-file 3600 --continue-on-error || TEST_EXIT_CODE=$? + python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-mi35x-grok1-int4 --nightly --timeout-per-file 3600 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$?
echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} @@ -722,7 +739,7 @@ jobs: - name: Install dependencies run: | - bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-aiter-build --skip-test-time-deps + bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-test-time-deps # Install tabulate for run_suite.py (missing in MI35x container) bash scripts/ci/amd/amd_ci_exec.sh pip install tabulate @@ -733,7 +750,7 @@ jobs: bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ -e RCCL_MSCCL_ENABLE=0 \ -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ - python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-mi35x-grok2 --nightly --timeout-per-file 3600 --continue-on-error || TEST_EXIT_CODE=$? + python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-mi35x-grok2 --nightly --timeout-per-file 3600 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} @@ -745,7 +762,7 @@ jobs: bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ -e RCCL_MSCCL_ENABLE=0 \ -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ - python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-mi35x-grok2 --nightly --timeout-per-file 3600 --continue-on-error || TEST_EXIT_CODE=$? + python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-mi35x-grok2 --nightly --timeout-per-file 3600 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$?
echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} @@ -768,7 +785,7 @@ jobs: - name: Install dependencies run: | - bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-aiter-build --skip-test-time-deps + bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-test-time-deps # Install tabulate for run_suite.py (missing in MI35x container) bash scripts/ci/amd/amd_ci_exec.sh pip install tabulate @@ -778,7 +795,7 @@ jobs: > github_summary.md # Clear summary file bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ - python3 run_suite.py --hw amd --suite nightly-amd-8-gpu-mi35x-deepseek-r1-mxfp4 --nightly --timeout-per-file 7200 --continue-on-error || TEST_EXIT_CODE=$? + python3 run_suite.py --hw amd --suite nightly-amd-8-gpu-mi35x-deepseek-r1-mxfp4 --nightly --timeout-per-file 7200 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} @@ -812,7 +829,7 @@ jobs: - name: Install dependencies run: | - bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-aiter-build --skip-test-time-deps + bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-test-time-deps # Install tabulate for run_suite.py (missing in MI35x container) bash scripts/ci/amd/amd_ci_exec.sh pip install tabulate @@ -822,7 +839,7 @@ jobs: > github_summary.md # Clear summary file bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ - python3 run_suite.py --hw amd --suite nightly-amd-8-gpu-mi35x-deepseek-v32 --nightly --timeout-per-file 3600 --continue-on-error || TEST_EXIT_CODE=$? + python3 run_suite.py --hw amd --suite nightly-amd-8-gpu-mi35x-deepseek-v32 --nightly --timeout-per-file 3600 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$?
echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} @@ -845,7 +862,7 @@ jobs: - name: Install dependencies run: | - bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-aiter-build --skip-test-time-deps + bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-test-time-deps # Install tabulate for run_suite.py (missing in MI35x container) bash scripts/ci/amd/amd_ci_exec.sh pip install tabulate @@ -855,7 +872,7 @@ jobs: > github_summary.md # Clear summary file bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ - python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-mi35x-deepseek-v32-mtp --nightly --timeout-per-file 3600 --continue-on-error || TEST_EXIT_CODE=$? + python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-mi35x-deepseek-v32-mtp --nightly --timeout-per-file 3600 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} @@ -878,7 +895,7 @@ jobs: - name: Install dependencies run: | - bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-aiter-build --skip-test-time-deps + bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-test-time-deps # Install tabulate for run_suite.py (missing in MI35x container) bash scripts/ci/amd/amd_ci_exec.sh pip install tabulate @@ -888,7 +905,7 @@ jobs: > github_summary.md # Clear summary file bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ - python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-mi35x-deepseek-v32-basic --nightly --timeout-per-file 5400 --continue-on-error || TEST_EXIT_CODE=$? + python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-mi35x-deepseek-v32-basic --nightly --timeout-per-file 5400 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$?
echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} @@ -911,7 +928,7 @@ jobs: - name: Install dependencies run: | - bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-aiter-build --skip-test-time-deps + bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-test-time-deps # Install tabulate for run_suite.py (missing in MI35x container) bash scripts/ci/amd/amd_ci_exec.sh pip install tabulate @@ -921,7 +938,7 @@ jobs: > github_summary.md # Clear summary file bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ - python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-mi35x-kimi-k25 --nightly --timeout-per-file 7200 --continue-on-error || TEST_EXIT_CODE=$? + python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-mi35x-kimi-k25 --nightly --timeout-per-file 7200 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} @@ -944,7 +961,7 @@ jobs: - name: Install dependencies run: | - bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-aiter-build --skip-test-time-deps + bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-test-time-deps # Install tabulate for run_suite.py (missing in MI35x container) bash scripts/ci/amd/amd_ci_exec.sh pip install tabulate @@ -954,7 +971,7 @@ jobs: > github_summary.md # Clear summary file bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ - python3 run_suite.py --hw amd --suite nightly-8-gpu-mi35x-qwen3-235b-mxfp4 --nightly --timeout-per-file 3600 || TEST_EXIT_CODE=$? + python3 run_suite.py --hw amd --suite nightly-8-gpu-mi35x-qwen3-235b-mxfp4 --nightly --timeout-per-file 3600 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$?
echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} @@ -976,7 +993,7 @@ jobs: - name: Install dependencies run: | - bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-aiter-build --skip-test-time-deps + bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-test-time-deps # Install tabulate for run_suite.py (missing in MI35x container) bash scripts/ci/amd/amd_ci_exec.sh pip install tabulate # GLM-5 requires latest transformers for glm_moe_dsa architecture @@ -988,7 +1005,7 @@ jobs: > github_summary.md # Clear summary file bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ - python3 run_suite.py --hw amd --suite nightly-amd-8-gpu-mi35x-glm5 --nightly --timeout-per-file 7200 --continue-on-error || TEST_EXIT_CODE=$? + python3 run_suite.py --hw amd --suite nightly-amd-8-gpu-mi35x-glm5 --nightly --timeout-per-file 7200 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} @@ -1011,7 +1028,7 @@ jobs: - name: Install dependencies run: | - bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-aiter-build --skip-test-time-deps + bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-test-time-deps bash scripts/ci/amd/amd_ci_exec.sh pip install tabulate - name: Accuracy Test MI35x ROCm 7.2 (8-GPU MiniMax-M2.5) @@ -1021,7 +1038,7 @@ jobs: bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ -e SGLANG_USE_AITER=1 \ -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ - python3 run_suite.py --hw amd --suite nightly-amd-8-gpu-mi35x-minimax-m25 --nightly --timeout-per-file 3600 --continue-on-error || TEST_EXIT_CODE=$? + python3 run_suite.py --hw amd --suite nightly-amd-8-gpu-mi35x-minimax-m25 --nightly --timeout-per-file 3600 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$?
echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} @@ -1044,7 +1061,7 @@ jobs: - name: Install dependencies run: | - bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-aiter-build --skip-test-time-deps + bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-test-time-deps # Install tabulate for run_suite.py (missing in MI35x container) bash scripts/ci/amd/amd_ci_exec.sh pip install tabulate @@ -1054,7 +1071,7 @@ jobs: > github_summary.md # Clear summary file bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ - python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-mi35x-deepseek-v32-mtp --nightly --timeout-per-file 7200 --continue-on-error || TEST_EXIT_CODE=$? + python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-mi35x-deepseek-v32-mtp --nightly --timeout-per-file 7200 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} diff --git a/.github/workflows/nightly-test-amd.yml b/.github/workflows/nightly-test-amd.yml index fee10263b62e..410478caeac1 100644 --- a/.github/workflows/nightly-test-amd.yml +++ b/.github/workflows/nightly-test-amd.yml @@ -10,6 +10,16 @@ on: - "python/sglang/version.py" workflow_dispatch: inputs: + aiter_ref: + description: 'Override AITER commit (optional, leave empty to use Dockerfile default)' + required: false + type: string + default: '' + continue_on_error: + description: 'Continue on error (do not fail the workflow on test failures)' + required: false + type: boolean + default: true job_filter: description: 'Select which job to run (leave empty or "all" to run all jobs)' required: false @@ -56,11 +66,24 @@ on: required: false type: string default: '' + aiter_ref: + description: 'Override AITER commit (optional, leave empty to use Dockerfile default)' + required: false + type: string + default: '' job_filter: description: 'Select which job to
run (leave empty or "all" to run all jobs)' required: false type: string default: 'all' + continue_on_error: + description: 'Continue on error (do not fail the workflow on test failures)' + required: false + type: boolean + default: true + +env: + AITER_COMMIT_OVERRIDE: ${{ inputs.aiter_ref }} concurrency: group: nightly-test-amd-${{ inputs.ref || github.ref }} @@ -93,7 +116,7 @@ jobs: run: | bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ - python3 run_suite.py --hw amd --suite nightly-amd-1-gpu --nightly --timeout-per-file 900 --continue-on-error || TEST_EXIT_CODE=$? + python3 run_suite.py --hw amd --suite nightly-amd-1-gpu --nightly --timeout-per-file 900 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} @@ -123,7 +146,7 @@ jobs: > github_summary.md # Clear summary file bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ - python3 run_suite.py --hw amd --suite nightly-amd --nightly --timeout-per-file 7200 || TEST_EXIT_CODE=$? + python3 run_suite.py --hw amd --suite nightly-amd --nightly --timeout-per-file 7200 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} @@ -153,7 +176,7 @@ jobs: > github_summary.md # Clear summary file bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ - python3 run_suite.py --hw amd --suite nightly-amd-accuracy-2-gpu-vlm --nightly --timeout-per-file 7200 || TEST_EXIT_CODE=$? + python3 run_suite.py --hw amd --suite nightly-amd-accuracy-2-gpu-vlm --nightly --timeout-per-file 7200 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$?
echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} @@ -184,7 +207,7 @@ jobs: bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ -e SGLANG_USE_AITER=1 \ -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ - python3 run_suite.py --hw amd --suite nightly-amd-perf-text-2-gpu --nightly --timeout-per-file 3600 || TEST_EXIT_CODE=$? + python3 run_suite.py --hw amd --suite nightly-amd-perf-text-2-gpu --nightly --timeout-per-file 3600 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} @@ -215,7 +238,7 @@ jobs: bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ -e SGLANG_USE_AITER=1 \ -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ - python3 run_suite.py --hw amd --suite nightly-amd-perf-vlm-2-gpu --nightly --timeout-per-file 7200 || TEST_EXIT_CODE=$? + python3 run_suite.py --hw amd --suite nightly-amd-perf-vlm-2-gpu --nightly --timeout-per-file 7200 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} @@ -244,7 +267,7 @@ jobs: run: | bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ - python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-gpt-oss --nightly --timeout-per-file 7200 || TEST_EXIT_CODE=$? + python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-gpt-oss --nightly --timeout-per-file 7200 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$?
echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} @@ -254,7 +277,7 @@ jobs: bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ -e RCCL_MSCCL_ENABLE=0 \ -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ - python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-grok1-fp8 --nightly --timeout-per-file 3600 || TEST_EXIT_CODE=$? + python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-grok1-fp8 --nightly --timeout-per-file 3600 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} @@ -286,7 +309,7 @@ jobs: bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ -e RCCL_MSCCL_ENABLE=0 \ -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ - python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-grok1-int4 --nightly --timeout-per-file 3600 || TEST_EXIT_CODE=$? + python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-grok1-int4 --nightly --timeout-per-file 3600 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} @@ -298,7 +321,7 @@ jobs: bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ -e RCCL_MSCCL_ENABLE=0 \ -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ - python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-grok1-int4 --nightly --timeout-per-file 3600 || TEST_EXIT_CODE=$? + python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-grok1-int4 --nightly --timeout-per-file 3600 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$?
echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} @@ -329,7 +352,7 @@ jobs: bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ -e RCCL_MSCCL_ENABLE=0 \ -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ - python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-grok2 --nightly --timeout-per-file 3600 || TEST_EXIT_CODE=$? + python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-grok2 --nightly --timeout-per-file 3600 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} @@ -341,7 +364,7 @@ jobs: bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ -e RCCL_MSCCL_ENABLE=0 \ -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ - python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-grok2 --nightly --timeout-per-file 3600 || TEST_EXIT_CODE=$? + python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-grok2 --nightly --timeout-per-file 3600 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} @@ -372,7 +395,7 @@ jobs: bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ -e SGLANG_USE_AITER=1 \ -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ - python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-deepseek-v31 --nightly --timeout-per-file 3600 || TEST_EXIT_CODE=$? + python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-deepseek-v31 --nightly --timeout-per-file 3600 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$?
echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} @@ -384,7 +407,7 @@ jobs: bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ -e SGLANG_USE_ROCM700A=1 \ -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ - python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-deepseek-v31 --nightly --timeout-per-file 18000 || TEST_EXIT_CODE=$? + python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-deepseek-v31 --nightly --timeout-per-file 18000 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} @@ -414,7 +437,7 @@ jobs: > github_summary.md # Clear summary file bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ - python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-deepseek-v32 --nightly --timeout-per-file 3600 || TEST_EXIT_CODE=$? + python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-deepseek-v32 --nightly --timeout-per-file 3600 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? echo "$(<github_summary.md)" >> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} @@ -425,7 +448,7 @@ jobs: > github_summary.md # Clear summary file bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ - python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-deepseek-v32-basic --nightly --timeout-per-file 5400 || TEST_EXIT_CODE=$? + python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-deepseek-v32-basic --nightly --timeout-per-file 5400 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$?
echo "$(> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} @@ -455,7 +478,7 @@ jobs: > github_summary.md # Clear summary file bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ - python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-deepseek-v32-mtp --nightly --timeout-per-file 3600 || TEST_EXIT_CODE=$? + python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-deepseek-v32-mtp --nightly --timeout-per-file 3600 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? echo "$(> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} @@ -466,7 +489,7 @@ jobs: > github_summary.md # Clear summary file bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ - python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-deepseek-v32-mtp --nightly --timeout-per-file 7200 || TEST_EXIT_CODE=$? + python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-deepseek-v32-mtp --nightly --timeout-per-file 7200 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? echo "$(> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} @@ -496,7 +519,7 @@ jobs: > github_summary.md # Clear summary file bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ - python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-kimi-k25 --nightly --timeout-per-file 3600 || TEST_EXIT_CODE=$? + python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-kimi-k25 --nightly --timeout-per-file 3600 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? 
echo "$(> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} @@ -525,7 +548,7 @@ jobs: > github_summary.md # Clear summary file bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ - python3 run_suite.py --hw amd --suite nightly-8-gpu-qwen3-235b --nightly --timeout-per-file 3600 || TEST_EXIT_CODE=$? + python3 run_suite.py --hw amd --suite nightly-8-gpu-qwen3-235b --nightly --timeout-per-file 3600 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? echo "$(> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} @@ -557,7 +580,7 @@ jobs: > github_summary.md # Clear summary file bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ - python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-glm5 --nightly --timeout-per-file 3600 || TEST_EXIT_CODE=$? + python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-glm5 --nightly --timeout-per-file 3600 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? echo "$(> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} @@ -588,7 +611,7 @@ jobs: bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ -e SGLANG_USE_AITER=1 \ -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ - python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-minimax-m25 --nightly --timeout-per-file 3600 || TEST_EXIT_CODE=$? + python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-minimax-m25 --nightly --timeout-per-file 3600 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? 
echo "$(> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} @@ -621,7 +644,7 @@ jobs: run: | bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ - python3 run_suite.py --hw amd --suite nightly-amd-1-gpu-mi35x --nightly --timeout-per-file 900 --continue-on-error || TEST_EXIT_CODE=$? + python3 run_suite.py --hw amd --suite nightly-amd-1-gpu-mi35x --nightly --timeout-per-file 900 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? echo "$(> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} @@ -653,7 +676,7 @@ jobs: run: | bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ - python3 run_suite.py --hw amd --suite nightly-amd-8-gpu-mi35x --nightly --timeout-per-file 7200 || TEST_EXIT_CODE=$? + python3 run_suite.py --hw amd --suite nightly-amd-8-gpu-mi35x --nightly --timeout-per-file 7200 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? echo "$(> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} @@ -687,7 +710,7 @@ jobs: bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ -e RCCL_MSCCL_ENABLE=0 \ -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ - python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-mi35x-grok1-int4 --nightly --timeout-per-file 5400 || TEST_EXIT_CODE=$? + python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-mi35x-grok1-int4 --nightly --timeout-per-file 5400 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? 
echo "$(> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} @@ -699,7 +722,7 @@ jobs: bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ -e RCCL_MSCCL_ENABLE=0 \ -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ - python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-mi35x-grok1-int4 --nightly --timeout-per-file 3600 || TEST_EXIT_CODE=$? + python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-mi35x-grok1-int4 --nightly --timeout-per-file 3600 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? echo "$(> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} @@ -733,7 +756,7 @@ jobs: bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ -e RCCL_MSCCL_ENABLE=0 \ -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ - python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-mi35x-grok2 --nightly --timeout-per-file 3600 || TEST_EXIT_CODE=$? + python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-mi35x-grok2 --nightly --timeout-per-file 3600 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? echo "$(> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} @@ -745,7 +768,7 @@ jobs: bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ -e RCCL_MSCCL_ENABLE=0 \ -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ - python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-mi35x-grok2 --nightly --timeout-per-file 3600 || TEST_EXIT_CODE=$? + python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-mi35x-grok2 --nightly --timeout-per-file 3600 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? 
echo "$(> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} @@ -778,7 +801,7 @@ jobs: > github_summary.md # Clear summary file bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ - python3 run_suite.py --hw amd --suite nightly-amd-8-gpu-mi35x-deepseek-r1-mxfp4 --nightly --timeout-per-file 7200 || TEST_EXIT_CODE=$? + python3 run_suite.py --hw amd --suite nightly-amd-8-gpu-mi35x-deepseek-r1-mxfp4 --nightly --timeout-per-file 7200 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? echo "$(> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} @@ -822,7 +845,7 @@ jobs: > github_summary.md # Clear summary file bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ - python3 run_suite.py --hw amd --suite nightly-amd-8-gpu-mi35x-deepseek-v32 --nightly --timeout-per-file 3600 || TEST_EXIT_CODE=$? + python3 run_suite.py --hw amd --suite nightly-amd-8-gpu-mi35x-deepseek-v32 --nightly --timeout-per-file 3600 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? echo "$(> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} @@ -855,7 +878,7 @@ jobs: > github_summary.md # Clear summary file bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ - python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-mi35x-deepseek-v32-mtp --nightly --timeout-per-file 3600 || TEST_EXIT_CODE=$? + python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-mi35x-deepseek-v32-mtp --nightly --timeout-per-file 3600 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? 
echo "$(> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} @@ -888,7 +911,7 @@ jobs: > github_summary.md # Clear summary file bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ - python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-mi35x-deepseek-v32-basic --nightly --timeout-per-file 5400 || TEST_EXIT_CODE=$? + python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-mi35x-deepseek-v32-basic --nightly --timeout-per-file 5400 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? echo "$(> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} @@ -921,7 +944,7 @@ jobs: > github_summary.md # Clear summary file bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ - python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-mi35x-kimi-k25 --nightly --timeout-per-file 7200 || TEST_EXIT_CODE=$? + python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-mi35x-kimi-k25 --nightly --timeout-per-file 7200 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? echo "$(> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} @@ -954,7 +977,7 @@ jobs: > github_summary.md # Clear summary file bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ - python3 run_suite.py --hw amd --suite nightly-8-gpu-mi35x-qwen3-235b-mxfp4 --nightly --timeout-per-file 3600 || TEST_EXIT_CODE=$? + python3 run_suite.py --hw amd --suite nightly-8-gpu-mi35x-qwen3-235b-mxfp4 --nightly --timeout-per-file 3600 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? 
echo "$(> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} @@ -988,7 +1011,7 @@ jobs: > github_summary.md # Clear summary file bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ - python3 run_suite.py --hw amd --suite nightly-amd-8-gpu-mi35x-glm5 --nightly --timeout-per-file 7200 || TEST_EXIT_CODE=$? + python3 run_suite.py --hw amd --suite nightly-amd-8-gpu-mi35x-glm5 --nightly --timeout-per-file 7200 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? echo "$(> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} @@ -1021,7 +1044,7 @@ jobs: bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ -e SGLANG_USE_AITER=1 \ -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ - python3 run_suite.py --hw amd --suite nightly-amd-8-gpu-mi35x-minimax-m25 --nightly --timeout-per-file 3600 || TEST_EXIT_CODE=$? + python3 run_suite.py --hw amd --suite nightly-amd-8-gpu-mi35x-minimax-m25 --nightly --timeout-per-file 3600 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? echo "$(> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} @@ -1054,7 +1077,7 @@ jobs: > github_summary.md # Clear summary file bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \ -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \ - python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-mi35x-deepseek-v32-mtp --nightly --timeout-per-file 7200 || TEST_EXIT_CODE=$? + python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-mi35x-deepseek-v32-mtp --nightly --timeout-per-file 7200 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$? 
echo "$(> $GITHUB_STEP_SUMMARY || true exit ${TEST_EXIT_CODE:-0} diff --git a/.github/workflows/pr-test-amd-rocm720.yml b/.github/workflows/pr-test-amd-rocm720.yml index 1c9d6edf37ed..a4a2c9c8eb6d 100644 --- a/.github/workflows/pr-test-amd-rocm720.yml +++ b/.github/workflows/pr-test-amd-rocm720.yml @@ -37,6 +37,16 @@ on: required: false type: string default: "" + aiter_ref: + description: 'Override AITER commit (optional, leave empty to use Dockerfile default)' + required: false + type: string + default: '' + continue_on_error: + description: 'Continue on error (do not fail the workflow on test failures)' + required: false + type: boolean + default: true workflow_call: inputs: ref: @@ -49,6 +59,19 @@ on: required: false type: boolean default: false + aiter_ref: + description: 'Override AITER commit (optional, leave empty to use Dockerfile default)' + required: false + type: string + default: '' + continue_on_error: + description: 'Continue on error (do not fail the workflow on test failures)' + required: false + type: boolean + default: true + +env: + AITER_COMMIT_OVERRIDE: ${{ inputs.aiter_ref }} concurrency: # Include pr_head_sha in group for /rerun-stage dispatches to avoid collisions with main branch runs @@ -146,8 +169,7 @@ jobs: - name: Install dependencies run: | - bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-aiter-build - + bash scripts/ci/amd/amd_ci_install_dependency.sh - name: Run test timeout-minutes: 14 run: | @@ -193,8 +215,7 @@ jobs: - name: Install dependencies run: | - bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-aiter-build - + bash scripts/ci/amd/amd_ci_install_dependency.sh - name: Run test timeout-minutes: 20 run: | @@ -236,12 +257,11 @@ jobs: - name: Install dependencies run: | - bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-aiter-build - + bash scripts/ci/amd/amd_ci_install_dependency.sh - name: Run test timeout-minutes: 10 run: | - bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 
run_suite.py --hw amd --suite stage-a-test-1-amd --continue-on-error + bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-a-test-1-amd ${{ inputs.continue_on_error && '--continue-on-error' || '' }} jit-kernel-unit-test-amd: needs: [check-changes] @@ -275,8 +295,7 @@ jobs: - name: Install dependencies run: | - bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-aiter-build - + bash scripts/ci/amd/amd_ci_install_dependency.sh - name: Run JIT kernel unit tests timeout-minutes: 10 run: | @@ -315,12 +334,11 @@ jobs: GITHUB_WORKSPACE: ${{ github.workspace }} - name: Install dependencies - run: bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-aiter-build - + run: bash scripts/ci/amd/amd_ci_install_dependency.sh - name: Run test timeout-minutes: 30 run: | - bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-small-1-gpu-amd --auto-partition-id ${{ matrix.part }} --auto-partition-size 13 --timeout-per-file 1800 --continue-on-error + bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-small-1-gpu-amd --auto-partition-id ${{ matrix.part }} --auto-partition-size 13 --timeout-per-file 1800 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} stage-b-test-small-1-gpu-amd-nondeterministic: needs: [check-changes] @@ -354,12 +372,11 @@ jobs: GITHUB_WORKSPACE: ${{ github.workspace }} - name: Install dependencies - run: bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-aiter-build - + run: bash scripts/ci/amd/amd_ci_install_dependency.sh - name: Run test timeout-minutes: 30 run: | - bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-small-1-gpu-amd-nondeterministic --timeout-per-file 1800 --continue-on-error + bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite 
stage-b-test-small-1-gpu-amd-nondeterministic --timeout-per-file 1800 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} stage-b-test-small-1-gpu-amd-mi35x: needs: [check-changes] @@ -393,12 +410,11 @@ jobs: GITHUB_WORKSPACE: ${{ github.workspace }} - name: Install dependencies - run: bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-aiter-build - + run: bash scripts/ci/amd/amd_ci_install_dependency.sh - name: Run test timeout-minutes: 30 run: | - bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-small-1-gpu-amd-mi35x --continue-on-error + bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-small-1-gpu-amd-mi35x ${{ inputs.continue_on_error && '--continue-on-error' || '' }} stage-b-test-large-1-gpu-amd: needs: [check-changes] @@ -433,12 +449,11 @@ jobs: GITHUB_WORKSPACE: ${{ github.workspace }} - name: Install dependencies - run: bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-aiter-build - + run: bash scripts/ci/amd/amd_ci_install_dependency.sh - name: Run test timeout-minutes: 30 run: | - bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-large-1-gpu-amd --auto-partition-id ${{ matrix.part }} --auto-partition-size 2 --timeout-per-file 1800 --continue-on-error + bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-large-1-gpu-amd --auto-partition-id ${{ matrix.part }} --auto-partition-size 2 --timeout-per-file 1800 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} stage-b-test-large-2-gpu-amd: needs: [check-changes] @@ -473,12 +488,11 @@ jobs: GITHUB_WORKSPACE: ${{ github.workspace }} - name: Install dependencies - run: bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-aiter-build - + run: bash scripts/ci/amd/amd_ci_install_dependency.sh - name: Run test timeout-minutes: 30 run: 
| - bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-large-2-gpu-amd --auto-partition-id ${{ matrix.part }} --auto-partition-size 2 --timeout-per-file 1800 --continue-on-error + bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-large-2-gpu-amd --auto-partition-id ${{ matrix.part }} --auto-partition-size 2 --timeout-per-file 1800 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} multimodal-gen-test-1-gpu-amd: needs: [check-changes] @@ -523,7 +537,7 @@ jobs: - name: Install dependencies run: | - bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-aiter-build diffusion + bash scripts/ci/amd/amd_ci_install_dependency.sh diffusion docker exec ci_sglang pip install amdsmi - name: Setup kernel caches @@ -653,7 +667,7 @@ jobs: - name: Install dependencies run: | - bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-aiter-build diffusion + bash scripts/ci/amd/amd_ci_install_dependency.sh diffusion docker exec ci_sglang pip install amdsmi - name: Setup kernel caches @@ -774,8 +788,7 @@ jobs: GITHUB_WORKSPACE: ${{ github.workspace }} - name: Install dependencies - run: bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-aiter-build - + run: bash scripts/ci/amd/amd_ci_install_dependency.sh - name: Test RCCL multi-GPU communication timeout-minutes: 5 run: | @@ -785,7 +798,7 @@ jobs: - name: Run test timeout-minutes: 60 run: | - bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-c-test-large-8-gpu-amd --auto-partition-id ${{ matrix.part }} --auto-partition-size 3 --timeout-per-file 3600 --continue-on-error + bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-c-test-large-8-gpu-amd --auto-partition-id ${{ matrix.part }} --auto-partition-size 3 --timeout-per-file 3600 ${{ inputs.continue_on_error && '--continue-on-error' || 
'' }} stage-c-test-large-8-gpu-amd-mi35x: needs: [check-changes] @@ -820,12 +833,11 @@ jobs: GITHUB_WORKSPACE: ${{ github.workspace }} - name: Install dependencies - run: bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-aiter-build - + run: bash scripts/ci/amd/amd_ci_install_dependency.sh - name: Run test timeout-minutes: 60 run: | - bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-c-test-large-8-gpu-amd-mi35x --auto-partition-id ${{ matrix.part }} --auto-partition-size 1 --timeout-per-file 3600 --continue-on-error + bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-c-test-large-8-gpu-amd-mi35x --auto-partition-id ${{ matrix.part }} --auto-partition-size 1 --timeout-per-file 3600 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} pr-test-amd-finish: needs: diff --git a/.github/workflows/pr-test-amd.yml b/.github/workflows/pr-test-amd.yml index 28691f5f5fc8..df1bcdb64b89 100644 --- a/.github/workflows/pr-test-amd.yml +++ b/.github/workflows/pr-test-amd.yml @@ -34,6 +34,16 @@ on: required: false type: string default: "" + aiter_ref: + description: 'Override AITER commit (optional, leave empty to use Dockerfile default)' + required: false + type: string + default: '' + continue_on_error: + description: 'Continue on error (do not fail the workflow on test failures)' + required: false + type: boolean + default: false workflow_call: inputs: ref: @@ -46,6 +56,19 @@ on: required: false type: boolean default: false + aiter_ref: + description: 'Override AITER commit (optional, leave empty to use Dockerfile default)' + required: false + type: string + default: '' + continue_on_error: + description: 'Continue on error (do not fail the workflow on test failures)' + required: false + type: boolean + default: false + +env: + AITER_COMMIT_OVERRIDE: ${{ inputs.aiter_ref }} concurrency: # Include pr_head_sha in group for /rerun-stage dispatches to avoid 
collisions with main branch runs @@ -238,7 +261,7 @@ jobs: - name: Run test timeout-minutes: 10 run: | - bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-a-test-1-amd + bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-a-test-1-amd ${{ inputs.continue_on_error && '--continue-on-error' || '' }} jit-kernel-unit-test-amd: needs: [check-changes] @@ -317,7 +340,7 @@ jobs: - name: Run test timeout-minutes: 30 run: | - bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-small-1-gpu-amd --auto-partition-id ${{ matrix.part }} --auto-partition-size 14 --timeout-per-file 1800 + bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-small-1-gpu-amd --auto-partition-id ${{ matrix.part }} --auto-partition-size 14 --timeout-per-file 1800 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} stage-b-test-small-1-gpu-amd-nondeterministic: needs: [check-changes, stage-a-test-1-amd] @@ -356,7 +379,7 @@ jobs: - name: Run test timeout-minutes: 30 run: | - bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-small-1-gpu-amd-nondeterministic --timeout-per-file 1800 --continue-on-error + bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-small-1-gpu-amd-nondeterministic --timeout-per-file 1800 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} stage-b-test-small-1-gpu-amd-mi35x: needs: [check-changes, stage-a-test-1-amd] @@ -395,7 +418,7 @@ jobs: - name: Run test timeout-minutes: 30 run: | - bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-small-1-gpu-amd-mi35x + bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd 
--suite stage-b-test-small-1-gpu-amd-mi35x ${{ inputs.continue_on_error && '--continue-on-error' || '' }} stage-b-test-large-1-gpu-amd: needs: [check-changes, stage-a-test-1-amd] @@ -435,7 +458,7 @@ jobs: - name: Run test timeout-minutes: 30 run: | - bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-large-1-gpu-amd --auto-partition-id ${{ matrix.part }} --auto-partition-size 2 --timeout-per-file 1800 + bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-large-1-gpu-amd --auto-partition-id ${{ matrix.part }} --auto-partition-size 2 --timeout-per-file 1800 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} stage-b-test-large-2-gpu-amd: needs: [check-changes, stage-a-test-1-amd] @@ -475,7 +498,7 @@ jobs: - name: Run test timeout-minutes: 30 run: | - bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-large-2-gpu-amd --auto-partition-id ${{ matrix.part }} --auto-partition-size 2 --timeout-per-file 1800 + bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-large-2-gpu-amd --auto-partition-id ${{ matrix.part }} --auto-partition-size 2 --timeout-per-file 1800 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} multimodal-gen-test-1-gpu-amd: needs: [check-changes] @@ -762,7 +785,7 @@ jobs: - name: Run test timeout-minutes: 60 run: | - bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-c-test-large-8-gpu-amd --auto-partition-id ${{ matrix.part }} --auto-partition-size 3 --timeout-per-file 3600 + bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-c-test-large-8-gpu-amd --auto-partition-id ${{ matrix.part }} --auto-partition-size 3 --timeout-per-file 3600 ${{ inputs.continue_on_error && 
'--continue-on-error' || '' }} stage-c-test-large-8-gpu-amd-mi35x: needs: [check-changes, call-gate, stage-b-test-small-1-gpu-amd, stage-b-test-large-2-gpu-amd] @@ -802,7 +825,7 @@ jobs: - name: Run test timeout-minutes: 60 run: | - bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-c-test-large-8-gpu-amd-mi35x --auto-partition-id ${{ matrix.part }} --auto-partition-size 1 --timeout-per-file 3600 + bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-c-test-large-8-gpu-amd-mi35x --auto-partition-id ${{ matrix.part }} --auto-partition-size 1 --timeout-per-file 3600 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} # =============================================== Disaggregation ==================================================== stage-b-test-large-8-gpu-35x-disaggregation-amd: @@ -914,7 +937,7 @@ jobs: run: | bash scripts/ci/amd/amd_ci_exec.sh \ -e SGLANG_TEST_RDMA_DEVICE="${{ env.SGLANG_TEST_RDMA_DEVICE }}" \ - -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-large-8-gpu-35x-disaggregation-amd --timeout-per-file 1800 + -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-large-8-gpu-35x-disaggregation-amd --timeout-per-file 1800 ${{ inputs.continue_on_error && '--continue-on-error' || '' }} pr-test-amd-finish: needs: diff --git a/.github/workflows/scheduled-aiter-latest-full-test.yml b/.github/workflows/scheduled-aiter-latest-full-test.yml new file mode 100644 index 000000000000..f42ec1b0813b --- /dev/null +++ b/.github/workflows/scheduled-aiter-latest-full-test.yml @@ -0,0 +1,159 @@ +name: Scheduled Full AMD Test (AITER Latest) + +on: + schedule: + - cron: '0 20 * * 1' # Monday 20:00 UTC + - cron: '0 20 * * 4' # Thursday 20:00 UTC + workflow_dispatch: + inputs: + aiter_ref: + description: 'AITER git ref (branch, tag, or SHA). 
Default: main (latest commit)' + required: false + type: string + default: 'main' + job_filter: + description: 'Comma-separated workflows to run: nightly-amd, nightly-amd-rocm720, pr-test-amd, pr-test-amd-rocm720. Default: all' + required: false + type: string + default: 'all' + continue_on_error: + description: 'Continue running other workflows even if one fails' + required: false + type: boolean + default: true + +concurrency: + group: scheduled-aiter-latest-full-test-${{ github.run_id }} + cancel-in-progress: true + +jobs: + resolve-aiter: + runs-on: ubuntu-latest + outputs: + aiter_sha: ${{ steps.resolve.outputs.sha }} + run_nightly_amd: ${{ steps.parse.outputs.run_nightly_amd }} + run_nightly_amd_rocm720: ${{ steps.parse.outputs.run_nightly_amd_rocm720 }} + run_pr_test_amd: ${{ steps.parse.outputs.run_pr_test_amd }} + run_pr_test_amd_rocm720: ${{ steps.parse.outputs.run_pr_test_amd_rocm720 }} + steps: + - name: Resolve AITER commit + id: resolve + run: | + REF="${{ inputs.aiter_ref || 'main' }}" + echo "Resolving AITER ref: ${REF}" + + SHA=$(git ls-remote https://github.com/ROCm/aiter.git "${REF}" | head -1 | cut -f1) + if [ -z "$SHA" ]; then + SHA=$(git ls-remote https://github.com/ROCm/aiter.git "refs/heads/${REF}" | head -1 | cut -f1) + fi + if [ -z "$SHA" ]; then + SHA=$(git ls-remote https://github.com/ROCm/aiter.git "refs/tags/${REF}" | head -1 | cut -f1) + fi + if [ -z "$SHA" ]; then + SHA="${REF}" + fi + + echo "sha=${SHA}" >> $GITHUB_OUTPUT + echo "### AITER Ref Resolution" >> $GITHUB_STEP_SUMMARY + echo "- **Requested ref:** \`${REF}\`" >> $GITHUB_STEP_SUMMARY + echo "- **Resolved SHA:** \`${SHA}\`" >> $GITHUB_STEP_SUMMARY + echo "- **AITER commit:** https://github.com/ROCm/aiter/commit/${SHA}" >> $GITHUB_STEP_SUMMARY + + - name: Parse job filter + id: parse + run: | + FILTER="${{ inputs.job_filter || 'all' }}" + echo "Job filter: ${FILTER}" + + if [[ "$FILTER" == "all" ]]; then + echo "run_nightly_amd=true" >> $GITHUB_OUTPUT + echo
"run_nightly_amd_rocm720=true" >> $GITHUB_OUTPUT + echo "run_pr_test_amd=true" >> $GITHUB_OUTPUT + echo "run_pr_test_amd_rocm720=true" >> $GITHUB_OUTPUT + else + # Wrap with commas for exact substring matching (avoids "nightly-amd" matching "nightly-amd-rocm720") + PADDED=",${FILTER// /}," + echo "run_nightly_amd=$(echo "$PADDED" | grep -q ',nightly-amd,' && echo true || echo false)" >> $GITHUB_OUTPUT + echo "run_nightly_amd_rocm720=$(echo "$PADDED" | grep -q ',nightly-amd-rocm720,' && echo true || echo false)" >> $GITHUB_OUTPUT + echo "run_pr_test_amd=$(echo "$PADDED" | grep -q ',pr-test-amd,' && echo true || echo false)" >> $GITHUB_OUTPUT + echo "run_pr_test_amd_rocm720=$(echo "$PADDED" | grep -q ',pr-test-amd-rocm720,' && echo true || echo false)" >> $GITHUB_OUTPUT + fi + + echo "### Job Filter" >> $GITHUB_STEP_SUMMARY + echo "- **Filter:** \`${FILTER}\`" >> $GITHUB_STEP_SUMMARY + + call-nightly-amd: + if: needs.resolve-aiter.outputs.run_nightly_amd == 'true' + needs: resolve-aiter + uses: ./.github/workflows/nightly-test-amd.yml + secrets: inherit + with: + aiter_ref: ${{ needs.resolve-aiter.outputs.aiter_sha }} + job_filter: 'all' + continue_on_error: ${{ github.event_name != 'workflow_dispatch' || inputs.continue_on_error }}  # scheduled runs carry no inputs, so default to true; manual runs honor the chosen value + + call-nightly-amd-rocm720: + if: needs.resolve-aiter.outputs.run_nightly_amd_rocm720 == 'true' + needs: resolve-aiter + uses: ./.github/workflows/nightly-test-amd-rocm720.yml + secrets: inherit + with: + aiter_ref: ${{ needs.resolve-aiter.outputs.aiter_sha }} + job_filter: 'all' + continue_on_error: ${{ github.event_name != 'workflow_dispatch' || inputs.continue_on_error }}  # scheduled runs carry no inputs, so default to true; manual runs honor the chosen value + + call-pr-test-amd: + if: needs.resolve-aiter.outputs.run_pr_test_amd == 'true' + needs: resolve-aiter + uses: ./.github/workflows/pr-test-amd.yml + secrets: inherit + with: + run_all_tests: true + aiter_ref: ${{ needs.resolve-aiter.outputs.aiter_sha }} + continue_on_error: ${{ github.event_name != 'workflow_dispatch' || inputs.continue_on_error }}  # scheduled runs carry no inputs, so default to true; manual runs honor the chosen value + +
call-pr-test-amd-rocm720: + if: needs.resolve-aiter.outputs.run_pr_test_amd_rocm720 == 'true' + needs: resolve-aiter + uses: ./.github/workflows/pr-test-amd-rocm720.yml + secrets: inherit + with: + run_all_tests: true + aiter_ref: ${{ needs.resolve-aiter.outputs.aiter_sha }} + continue_on_error: ${{ github.event_name != 'workflow_dispatch' || inputs.continue_on_error }}  # scheduled runs carry no inputs, so default to true; manual runs honor the chosen value + + check-all-jobs: + if: always() + needs: + - resolve-aiter + - call-nightly-amd + - call-nightly-amd-rocm720 + - call-pr-test-amd + - call-pr-test-amd-rocm720 + runs-on: ubuntu-latest + steps: + - name: Summary + run: | + echo "## Scheduled Full AMD Test (AITER Latest) Results" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "- **AITER SHA:** \`${{ needs.resolve-aiter.outputs.aiter_sha }}\`" >> $GITHUB_STEP_SUMMARY + echo "- **AITER commit:** https://github.com/ROCm/aiter/commit/${{ needs.resolve-aiter.outputs.aiter_sha }}" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "| Workflow | Result |" >> $GITHUB_STEP_SUMMARY + echo "|----------|--------|" >> $GITHUB_STEP_SUMMARY + echo "| Nightly AMD (AITER Latest) | \`${{ needs.call-nightly-amd.result }}\` |" >> $GITHUB_STEP_SUMMARY + echo "| Nightly AMD ROCm 7.2 | \`${{ needs.call-nightly-amd-rocm720.result }}\` |" >> $GITHUB_STEP_SUMMARY + echo "| PR Test AMD (AITER Latest) | \`${{ needs.call-pr-test-amd.result }}\` |" >> $GITHUB_STEP_SUMMARY + echo "| PR Test AMD ROCm 7.2 | \`${{ needs.call-pr-test-amd-rocm720.result }}\` |" >> $GITHUB_STEP_SUMMARY + + - name: Check if any job failed + run: | + if [[ "${{ contains(needs.*.result, 'failure') }}" == "true" ]]; then + echo "One or more workflows failed" + exit 1 + fi + if [[ "${{ contains(needs.*.result, 'cancelled') }}" == "true" ]]; then + echo "One or more workflows were cancelled" + exit 1 + fi + echo "All workflows passed" diff --git a/scripts/ci/amd/amd_ci_install_dependency.sh b/scripts/ci/amd/amd_ci_install_dependency.sh index c4e948f35e32..fd06d9026ad9 100755
--- a/scripts/ci/amd/amd_ci_install_dependency.sh +++ b/scripts/ci/amd/amd_ci_install_dependency.sh @@ -230,7 +230,11 @@ echo "[CI-AITER-CHECK] AITER version inside CI image: ${IMAGE_AITER_VERSION}" ############################################# NEED_REBUILD="false" -if [[ "${IMAGE_AITER_VERSION}" == "vnone" || "${IMAGE_AITER_VERSION}" == "v" ]]; then +if [[ -n "${AITER_COMMIT_OVERRIDE:-}" ]]; then + echo "[CI-AITER-CHECK] AITER_COMMIT_OVERRIDE=${AITER_COMMIT_OVERRIDE} → forcing rebuild" + REPO_AITER_COMMIT="${AITER_COMMIT_OVERRIDE}" + NEED_REBUILD="true" +elif [[ "${IMAGE_AITER_VERSION}" == "vnone" || "${IMAGE_AITER_VERSION}" == "v" ]]; then echo "[CI-AITER-CHECK] No AITER found in image → rebuild needed" NEED_REBUILD="true" elif [[ "${IMAGE_AITER_VERSION}" == "${REPO_AITER_COMMIT}" ]]; then @@ -274,6 +278,24 @@ if [[ "${NEED_REBUILD}" == "true" ]]; then fi echo "[CI-AITER-CHECK] GPU_ARCH_LIST=${GPU_ARCH_LIST}" + # Re-apply Dockerfile hotpatches for ROCm 7.2 (the fresh clone lost them, can be removed after triton fixed this problem) + ROCM_VERSION=$(docker exec ci_sglang bash -c "cat /opt/rocm/.info/version 2>/dev/null || echo unknown") + if [[ "${ROCM_VERSION}" == 7.2* ]]; then + echo "[CI-AITER-CHECK] ROCm 7.2 detected (${ROCM_VERSION}), applying AITER hotpatches..." + docker exec ci_sglang bash -c " + cd /sgl-workspace/aiter && \ + TARGET_FILE='aiter/ops/triton/attention/pa_mqa_logits.py' && \ + if [ -f \"\${TARGET_FILE}\" ]; then \ + sed -i '459 s/if.*:/if False:/' \"\${TARGET_FILE}\" && \ + echo '[CI-AITER-CHECK] Hotpatch applied to pa_mqa_logits.py'; \ + else \ + echo '[CI-AITER-CHECK] pa_mqa_logits.py not found, skipping hotpatch'; \ + fi + " + else + echo "[CI-AITER-CHECK] ROCm version=${ROCM_VERSION}, no hotpatch needed" + fi + # build AITER docker exec ci_sglang bash -c " cd /sgl-workspace/aiter && \