From 05455c0ce6ad4b020508ae5aae1e773ac0052f8a Mon Sep 17 00:00:00 2001 From: Yong Wu Date: Sun, 8 Feb 2026 13:56:14 -0800 Subject: [PATCH] Revert "ci: refactor PR tests to hide failed spot jobs from PR status (#2500)" This reverts commit d5eaa429b1c2c3cc51fe078028551fef10ca9cc9. --- .github/workflows/pr-test-runner.yml | 674 -------------------------- .github/workflows/pr-test.yml | 685 ++++++++++++++++++++++----- 2 files changed, 556 insertions(+), 803 deletions(-) delete mode 100644 .github/workflows/pr-test-runner.yml diff --git a/.github/workflows/pr-test-runner.yml b/.github/workflows/pr-test-runner.yml deleted file mode 100644 index 125e9ebb5b..0000000000 --- a/.github/workflows/pr-test-runner.yml +++ /dev/null @@ -1,674 +0,0 @@ -# PR Test Runner - Runs tests and updates check runs. -# Triggered by pr-test.yml via workflow_dispatch. Not visible on PR status. - -name: PR Test Runner - -on: - workflow_dispatch: - inputs: - pr_head_sha: - description: 'PR head SHA for check run updates' - required: true - type: string - docker_tag: - description: 'Docker image tag' - required: true - type: string - aot_check_id: - description: 'AOT Build check run ID' - required: false - type: string - gpu_a10g_check_id: - description: 'GPU A10G check run ID' - required: false - type: string - gpu_t4_check_id: - description: 'GPU T4 check run ID' - required: false - type: string - summary_check_id: - description: 'Test Results Summary check run ID' - required: false - type: string - skip_aot: - description: 'Skip AOT build tests' - required: false - type: string - default: 'false' - skip_gpu: - description: 'Skip GPU tests' - required: false - type: string - default: 'false' - concurrency_key: - description: 'Concurrency group key for cancelling outdated runs' - required: false - type: string - -permissions: - contents: read - actions: read - -concurrency: - group: ${{ inputs.concurrency_key || github.run_id }} - cancel-in-progress: true - -env: - EXECUTOR_NUMBER: "0" - -jobs: - aot-build-import: - name: AOT Build Import (${{ matrix.arch }}, ${{ matrix.cuda }}) - if: inputs.skip_aot != 'true' - runs-on: - - self-hosted - - Linux - - ${{ matrix.arch }} - - cpu - - spot - timeout-minutes: 360 - strategy: - fail-fast: true - matrix: - arch: [X64, ARM64] - cuda: [cu126, cu128, cu129, cu130] - env: - DOCKER_IMAGE: flashinfer/flashinfer-ci-${{ matrix.cuda }}:${{ inputs.docker_tag }} - steps: - - name: Cleanup - run: | - docker stop $(docker ps -q) 2>/dev/null || true - docker rm $(docker ps -aq) 2>/dev/null || true - sudo rm -rf ${{ github.workspace }}/* || true - sudo rm -rf ${{ github.workspace }}/.[!.]* || true - rm -rf ~/.cache/flashinfer_jit || true - docker system prune -f || true - - - uses: actions/checkout@v4 - with: - ref: ${{ inputs.pr_head_sha }} - submodules: recursive - - - name: Start spot termination monitor - run: ./scripts/task_monitor_spot.sh & - - - name: Login to Docker Hub - uses: docker/login-action@v3 - with: - username: flashinfer - password: ${{ secrets.DOCKERHUB_TOKEN }} - continue-on-error: true - - - name: Show Node Info - run: ./scripts/task_show_node_info.sh - env: - NODE_NAME: ${{ runner.name }} - WORKSPACE: ${{ github.workspace }} - BUILD_NUMBER: ${{ github.run_number }} - - - name: Run Test - run: bash ci/bash.sh ${DOCKER_IMAGE} --no-gpu ./scripts/task_test_jit_cache_package_build_import.sh - - analyze-aot-failure: - name: Analyze AOT Failure - needs: aot-build-import - if: "!cancelled() && inputs.skip_aot != 'true' && (needs.aot-build-import.result == 'failure' || needs.aot-build-import.result == 'cancelled')" - runs-on: ubuntu-latest - outputs: - is_spot_termination: ${{ steps.analyze.outputs.is_spot_termination }} - rerun_matrix: ${{ steps.matrix.outputs.rerun_matrix }} - steps: - - name: Analyze failure from job logs - id: analyze - env: - GH_TOKEN: ${{ github.token }} - run: | - RUN_ID="${{ github.run_id }}" - SPOT_TERMINATION=false - FAILED_JOBS=$(gh api "/repos/${{ github.repository }}/actions/runs/${RUN_ID}/jobs?per_page=100" \ - --jq '.jobs[] | select(.name | startswith("AOT")) | select(.conclusion == "failure" or .conclusion == "cancelled") | .id') - if [ -z "$FAILED_JOBS" ]; then - echo "is_spot_termination=false" >> $GITHUB_OUTPUT - exit 0 - fi - for JOB_ID in $FAILED_JOBS; do - if ! gh api "/repos/${{ github.repository }}/actions/jobs/${JOB_ID}/logs" > job_log.zip 2>/dev/null; then - continue - fi - if file job_log.zip | grep -q "Zip archive"; then - unzip -p job_log.zip > job_log.txt 2>/dev/null || mv job_log.zip job_log.txt - else - mv job_log.zip job_log.txt - fi - if grep -q "FLASHINFER_SPOT_TERMINATION_DETECTED" job_log.txt; then - SPOT_TERMINATION=true - break - fi - if grep -qiE "connection reset by peer|context canceled|The operation was canceled|grpc.*closing|The self-hosted runner.*lost" job_log.txt; then - SPOT_TERMINATION=true - break - fi - done - echo "is_spot_termination=$SPOT_TERMINATION" >> $GITHUB_OUTPUT - - - name: Build rerun matrix - id: matrix - if: steps.analyze.outputs.is_spot_termination == 'true' - run: | - MATRIX='{"include":[' - for arch in X64 ARM64; do - for cuda in cu126 cu128 cu129 cu130; do - MATRIX+='{"arch":"'$arch'","cuda":"'$cuda'"},' - done - done - MATRIX="${MATRIX%,}]}" - echo "rerun_matrix=$MATRIX" >> $GITHUB_OUTPUT - - aot-build-import-rerun: - name: AOT Build Import Rerun (${{ matrix.arch }}, ${{ matrix.cuda }}) - needs: analyze-aot-failure - if: | - !cancelled() && - needs.analyze-aot-failure.outputs.is_spot_termination == 'true' && - needs.analyze-aot-failure.outputs.rerun_matrix != '' - runs-on: - - self-hosted - - Linux - - ${{ matrix.arch }} - - cpu - - on-demand - timeout-minutes: 360 - strategy: - fail-fast: true - matrix: ${{ fromJSON(needs.analyze-aot-failure.outputs.rerun_matrix) }} - env: - DOCKER_IMAGE: flashinfer/flashinfer-ci-${{ matrix.cuda }}:${{ inputs.docker_tag }} - steps: - - name: Cleanup - run: | - docker stop $(docker ps -q) 2>/dev/null || true - docker rm $(docker ps -aq) 2>/dev/null || true - sudo rm -rf ${{ github.workspace }}/* || true - sudo rm -rf ${{ github.workspace }}/.[!.]* || true - rm -rf ~/.cache/flashinfer_jit || true - docker system prune -f || true - - - uses: actions/checkout@v4 - with: - ref: ${{ inputs.pr_head_sha }} - submodules: recursive - - - name: Login to Docker Hub - uses: docker/login-action@v3 - with: - username: flashinfer - password: ${{ secrets.DOCKERHUB_TOKEN }} - continue-on-error: true - - - name: Show Node Info - run: ./scripts/task_show_node_info.sh - env: - NODE_NAME: ${{ runner.name }} - WORKSPACE: ${{ github.workspace }} - BUILD_NUMBER: ${{ github.run_number }} - - - name: Run Test - run: bash ci/bash.sh ${DOCKER_IMAGE} --no-gpu ./scripts/task_test_jit_cache_package_build_import.sh - - gpu-tests-a10g: - name: JIT Unittest ${{ matrix.shard }} (A10G) - if: inputs.skip_gpu != 'true' - runs-on: [self-hosted, Linux, X64, gpu, sm86, spot] - timeout-minutes: 360 - strategy: - fail-fast: true - matrix: - shard: [1, 2, 3, 4, 5] - env: - DOCKER_IMAGE: flashinfer/flashinfer-ci-cu129:${{ inputs.docker_tag }} - steps: - - name: Cleanup - run: | - docker stop $(docker ps -q) 2>/dev/null || true - docker rm $(docker ps -aq) 2>/dev/null || true - sudo rm -rf ${{ github.workspace }}/* || true - sudo rm -rf ${{ github.workspace }}/.[!.]* || true - rm -rf ~/.cache/flashinfer_jit || true - docker system prune -f || true - nvidia-smi || true - - - uses: actions/checkout@v4 - with: - ref: ${{ inputs.pr_head_sha }} - submodules: recursive - - - name: Start spot termination monitor - run: ./scripts/task_monitor_spot.sh & - - - name: Login to Docker Hub - uses: docker/login-action@v3 - with: - username: flashinfer - password: ${{ secrets.DOCKERHUB_TOKEN }} - continue-on-error: true - - - name: Show Node Info - run: ./scripts/task_show_node_info.sh - env: - NODE_NAME: ${{ runner.name }} - WORKSPACE: ${{ github.workspace }} - BUILD_NUMBER: ${{ github.run_number }} - - - name: Run JIT Unittest Part ${{ matrix.shard }} - run: bash ci/bash.sh ${DOCKER_IMAGE} ./scripts/task_jit_run_tests_part${{ matrix.shard }}.sh - - analyze-gpu-a10g-failure: - name: Analyze GPU A10G Failure - needs: gpu-tests-a10g - if: "!cancelled() && inputs.skip_gpu != 'true' && (needs.gpu-tests-a10g.result == 'failure' || needs.gpu-tests-a10g.result == 'cancelled')" - runs-on: ubuntu-latest - outputs: - is_spot_termination: ${{ steps.analyze.outputs.is_spot_termination }} - rerun_matrix: ${{ steps.matrix.outputs.rerun_matrix }} - steps: - - name: Analyze failure from job logs - id: analyze - env: - GH_TOKEN: ${{ github.token }} - run: | - RUN_ID="${{ github.run_id }}" - SPOT_TERMINATION=false - FAILED_JOBS=$(gh api "/repos/${{ github.repository }}/actions/runs/${RUN_ID}/jobs?per_page=100" \ - --jq '.jobs[] | select(.name | contains("A10G")) | select(.conclusion == "failure" or .conclusion == "cancelled") | .id') - if [ -z "$FAILED_JOBS" ]; then - echo "is_spot_termination=false" >> $GITHUB_OUTPUT - exit 0 - fi - for JOB_ID in $FAILED_JOBS; do - if ! gh api "/repos/${{ github.repository }}/actions/jobs/${JOB_ID}/logs" > job_log.zip 2>/dev/null; then - continue - fi - if file job_log.zip | grep -q "Zip archive"; then - unzip -p job_log.zip > job_log.txt 2>/dev/null || mv job_log.zip job_log.txt - else - mv job_log.zip job_log.txt - fi - if grep -q "FLASHINFER_SPOT_TERMINATION_DETECTED" job_log.txt; then - SPOT_TERMINATION=true - break - fi - if grep -qiE "connection reset by peer|context canceled|The operation was canceled|grpc.*closing|The self-hosted runner.*lost" job_log.txt; then - SPOT_TERMINATION=true - break - fi - done - echo "is_spot_termination=$SPOT_TERMINATION" >> $GITHUB_OUTPUT - - - name: Build rerun matrix - id: matrix - if: steps.analyze.outputs.is_spot_termination == 'true' - run: | - echo 'rerun_matrix={"include":[{"shard":1},{"shard":2},{"shard":3},{"shard":4},{"shard":5}]}' >> $GITHUB_OUTPUT - - gpu-tests-a10g-rerun: - name: JIT Rerun ${{ matrix.shard }} (A10G) - needs: analyze-gpu-a10g-failure - if: | - !cancelled() && - needs.analyze-gpu-a10g-failure.outputs.is_spot_termination == 'true' && - needs.analyze-gpu-a10g-failure.outputs.rerun_matrix != '' - runs-on: [self-hosted, Linux, X64, gpu, sm86, on-demand] - timeout-minutes: 360 - strategy: - fail-fast: true - matrix: ${{ fromJSON(needs.analyze-gpu-a10g-failure.outputs.rerun_matrix) }} - env: - DOCKER_IMAGE: flashinfer/flashinfer-ci-cu129:${{ inputs.docker_tag }} - steps: - - name: Cleanup - run: | - docker stop $(docker ps -q) 2>/dev/null || true - docker rm $(docker ps -aq) 2>/dev/null || true - sudo rm -rf ${{ github.workspace }}/* || true - sudo rm -rf ${{ github.workspace }}/.[!.]* || true - rm -rf ~/.cache/flashinfer_jit || true - docker system prune -f || true - nvidia-smi || true - - - uses: actions/checkout@v4 - with: - ref: ${{ inputs.pr_head_sha }} - submodules: recursive - - - name: Login to Docker Hub - uses: docker/login-action@v3 - with: - username: flashinfer - password: ${{ secrets.DOCKERHUB_TOKEN }} - continue-on-error: true - - - name: Show Node Info - run: ./scripts/task_show_node_info.sh - env: - NODE_NAME: ${{ runner.name }} - WORKSPACE: ${{ github.workspace }} - BUILD_NUMBER: ${{ github.run_number }} - - - name: Run JIT Unittest Part ${{ matrix.shard }} - run: bash ci/bash.sh ${DOCKER_IMAGE} ./scripts/task_jit_run_tests_part${{ matrix.shard }}.sh - - gpu-tests-t4: - name: JIT Unittest (T4) - if: inputs.skip_gpu != 'true' - runs-on: [self-hosted, Linux, X64, gpu, sm75, spot] - timeout-minutes: 360 - env: - DOCKER_IMAGE: flashinfer/flashinfer-ci-cu129:${{ inputs.docker_tag }} - steps: - - name: Cleanup - run: | - docker stop $(docker ps -q) 2>/dev/null || true - docker rm $(docker ps -aq) 2>/dev/null || true - sudo rm -rf ${{ github.workspace }}/* || true - sudo rm -rf ${{ github.workspace }}/.[!.]* || true - rm -rf ~/.cache/flashinfer_jit || true - docker system prune -f || true - nvidia-smi || true - - - uses: actions/checkout@v4 - with: - ref: ${{ inputs.pr_head_sha }} - submodules: recursive - - - name: Start spot termination monitor - run: ./scripts/task_monitor_spot.sh & - - - name: Login to Docker Hub - uses: docker/login-action@v3 - with: - username: flashinfer - password: ${{ secrets.DOCKERHUB_TOKEN }} - continue-on-error: true - - - name: Show Node Info - run: ./scripts/task_show_node_info.sh - env: - NODE_NAME: ${{ runner.name }} - WORKSPACE: ${{ github.workspace }} - BUILD_NUMBER: ${{ github.run_number }} - - - name: Run JIT Unittest Part 3 (T4) - run: bash ci/bash.sh ${DOCKER_IMAGE} ./scripts/task_jit_run_tests_part3.sh - - analyze-gpu-t4-failure: - name: Analyze GPU T4 Failure - needs: gpu-tests-t4 - if: "!cancelled() && inputs.skip_gpu != 'true' && (needs.gpu-tests-t4.result == 'failure' || needs.gpu-tests-t4.result == 'cancelled')" - runs-on: ubuntu-latest - outputs: - is_spot_termination: ${{ steps.analyze.outputs.is_spot_termination }} - steps: - - name: Analyze failure from job logs - id: analyze - env: - GH_TOKEN: ${{ github.token }} - run: | - RUN_ID="${{ github.run_id }}" - SPOT_TERMINATION=false - FAILED_JOBS=$(gh api "/repos/${{ github.repository }}/actions/runs/${RUN_ID}/jobs?per_page=100" \ - --jq '.jobs[] | select(.name | contains("T4")) | select(.conclusion == "failure" or .conclusion == "cancelled") | .id') - if [ -z "$FAILED_JOBS" ]; then - echo "is_spot_termination=false" >> $GITHUB_OUTPUT - exit 0 - fi - for JOB_ID in $FAILED_JOBS; do - if ! gh api "/repos/${{ github.repository }}/actions/jobs/${JOB_ID}/logs" > job_log.zip 2>/dev/null; then - continue - fi - if file job_log.zip | grep -q "Zip archive"; then - unzip -p job_log.zip > job_log.txt 2>/dev/null || mv job_log.zip job_log.txt - else - mv job_log.zip job_log.txt - fi - if grep -q "FLASHINFER_SPOT_TERMINATION_DETECTED" job_log.txt; then - SPOT_TERMINATION=true - break - fi - if grep -qiE "connection reset by peer|context canceled|The operation was canceled|grpc.*closing|The self-hosted runner.*lost" job_log.txt; then - SPOT_TERMINATION=true - break - fi - done - echo "is_spot_termination=$SPOT_TERMINATION" >> $GITHUB_OUTPUT - - gpu-tests-t4-rerun: - name: JIT Rerun (T4) - needs: analyze-gpu-t4-failure - if: | - !cancelled() && - needs.analyze-gpu-t4-failure.outputs.is_spot_termination == 'true' - runs-on: [self-hosted, Linux, X64, gpu, sm75, on-demand] - timeout-minutes: 360 - env: - DOCKER_IMAGE: flashinfer/flashinfer-ci-cu129:${{ inputs.docker_tag }} - steps: - - name: Cleanup - run: | - docker stop $(docker ps -q) 2>/dev/null || true - docker rm $(docker ps -aq) 2>/dev/null || true - sudo rm -rf ${{ github.workspace }}/* || true - sudo rm -rf ${{ github.workspace }}/.[!.]* || true - rm -rf ~/.cache/flashinfer_jit || true - docker system prune -f || true - nvidia-smi || true - - - uses: actions/checkout@v4 - with: - ref: ${{ inputs.pr_head_sha }} - submodules: recursive - - - name: Login to Docker Hub - uses: docker/login-action@v3 - with: - username: flashinfer - password: ${{ secrets.DOCKERHUB_TOKEN }} - continue-on-error: true - - - name: Show Node Info - run: ./scripts/task_show_node_info.sh - env: - NODE_NAME: ${{ runner.name }} - WORKSPACE: ${{ github.workspace }} - BUILD_NUMBER: ${{ github.run_number }} - - - name: Run JIT Unittest Part 3 (T4) - run: bash ci/bash.sh ${DOCKER_IMAGE} ./scripts/task_jit_run_tests_part3.sh - - update-check-runs: - name: Update Check Runs - if: always() - needs: - - aot-build-import - - analyze-aot-failure - - aot-build-import-rerun - - gpu-tests-a10g - - analyze-gpu-a10g-failure - - gpu-tests-a10g-rerun - - gpu-tests-t4 - - analyze-gpu-t4-failure - - gpu-tests-t4-rerun - runs-on: ubuntu-latest - steps: - - name: Generate GitHub App Token - id: app-token - uses: actions/create-github-app-token@v1 - with: - app-id: ${{ secrets.GH_APP_ID }} - private-key: ${{ secrets.GH_APP_KEY }} - owner: flashinfer-ai - repositories: flashinfer - - - name: Update AOT Check Run - if: inputs.aot_check_id != '' && inputs.skip_aot != 'true' - env: - GH_TOKEN: ${{ steps.app-token.outputs.token }} - REPO: ${{ github.repository }} - CHECK_ID: ${{ inputs.aot_check_id }} - AOT_SPOT: ${{ needs.aot-build-import.result }} - AOT_SPOT_TERM: ${{ needs.analyze-aot-failure.outputs.is_spot_termination }} - AOT_RERUN: ${{ needs.aot-build-import-rerun.result }} - run: | - if [ "$AOT_SPOT" == "success" ]; then - CONCLUSION="success" - TITLE="AOT Build Tests Passed" - SUMMARY="All AOT build tests passed on spot instances." - elif [ "$AOT_SPOT_TERM" == "true" ] && [ "$AOT_RERUN" == "success" ]; then - CONCLUSION="success" - TITLE="AOT Build Tests Passed (rerun)" - SUMMARY="Spot instance was terminated. Rerun on on-demand instances passed." - else - CONCLUSION="failure" - TITLE="AOT Build Tests Failed" - SUMMARY="AOT build tests failed. Check the workflow logs for details." - fi - - gh api -X PATCH "repos/${REPO}/check-runs/${CHECK_ID}" \ - -f status="completed" \ - -f conclusion="$CONCLUSION" \ - -F output[title]="$TITLE" \ - -F output[summary]="$SUMMARY" - - - name: Update GPU A10G Check Run - if: inputs.gpu_a10g_check_id != '' && inputs.skip_gpu != 'true' - env: - GH_TOKEN: ${{ steps.app-token.outputs.token }} - REPO: ${{ github.repository }} - CHECK_ID: ${{ inputs.gpu_a10g_check_id }} - GPU_SPOT: ${{ needs.gpu-tests-a10g.result }} - GPU_SPOT_TERM: ${{ needs.analyze-gpu-a10g-failure.outputs.is_spot_termination }} - GPU_RERUN: ${{ needs.gpu-tests-a10g-rerun.result }} - run: | - if [ "$GPU_SPOT" == "success" ]; then - CONCLUSION="success" - TITLE="JIT Unittest (A10G) Passed" - SUMMARY="All JIT unittest passed on A10G spot instances." - elif [ "$GPU_SPOT_TERM" == "true" ] && [ "$GPU_RERUN" == "success" ]; then - CONCLUSION="success" - TITLE="JIT Unittest (A10G) Passed (rerun)" - SUMMARY="Spot instance was terminated. Rerun on on-demand A10G instances passed." - else - CONCLUSION="failure" - TITLE="JIT Unittest (A10G) Failed" - SUMMARY="JIT unittest on A10G failed. Check the workflow logs for details." - fi - - gh api -X PATCH "repos/${REPO}/check-runs/${CHECK_ID}" \ - -f status="completed" \ - -f conclusion="$CONCLUSION" \ - -F output[title]="$TITLE" \ - -F output[summary]="$SUMMARY" - - - name: Update GPU T4 Check Run - if: inputs.gpu_t4_check_id != '' && inputs.skip_gpu != 'true' - env: - GH_TOKEN: ${{ steps.app-token.outputs.token }} - REPO: ${{ github.repository }} - CHECK_ID: ${{ inputs.gpu_t4_check_id }} - GPU_SPOT: ${{ needs.gpu-tests-t4.result }} - GPU_SPOT_TERM: ${{ needs.analyze-gpu-t4-failure.outputs.is_spot_termination }} - GPU_RERUN: ${{ needs.gpu-tests-t4-rerun.result }} - run: | - if [ "$GPU_SPOT" == "success" ]; then - CONCLUSION="success" - TITLE="JIT Unittest (T4) Passed" - SUMMARY="All JIT unittest passed on T4 spot instances." - elif [ "$GPU_SPOT_TERM" == "true" ] && [ "$GPU_RERUN" == "success" ]; then - CONCLUSION="success" - TITLE="JIT Unittest (T4) Passed (rerun)" - SUMMARY="Spot instance was terminated. Rerun on on-demand T4 instances passed." - else - CONCLUSION="failure" - TITLE="JIT Unittest (T4) Failed" - SUMMARY="JIT unittest on T4 failed. Check the workflow logs for details." - fi - - gh api -X PATCH "repos/${REPO}/check-runs/${CHECK_ID}" \ - -f status="completed" \ - -f conclusion="$CONCLUSION" \ - -F output[title]="$TITLE" \ - -F output[summary]="$SUMMARY" - - - name: Update Test Results Summary - if: inputs.summary_check_id != '' - env: - GH_TOKEN: ${{ steps.app-token.outputs.token }} - REPO: ${{ github.repository }} - CHECK_ID: ${{ inputs.summary_check_id }} - AOT_SPOT: ${{ needs.aot-build-import.result }} - AOT_RERUN: ${{ needs.aot-build-import-rerun.result }} - AOT_SPOT_TERM: ${{ needs.analyze-aot-failure.outputs.is_spot_termination }} - GPU_A10G_SPOT: ${{ needs.gpu-tests-a10g.result }} - GPU_A10G_RERUN: ${{ needs.gpu-tests-a10g-rerun.result }} - GPU_A10G_SPOT_TERM: ${{ needs.analyze-gpu-a10g-failure.outputs.is_spot_termination }} - GPU_T4_SPOT: ${{ needs.gpu-tests-t4.result }} - GPU_T4_RERUN: ${{ needs.gpu-tests-t4-rerun.result }} - GPU_T4_SPOT_TERM: ${{ needs.analyze-gpu-t4-failure.outputs.is_spot_termination }} - SKIP_AOT: ${{ inputs.skip_aot }} - SKIP_GPU: ${{ inputs.skip_gpu }} - run: | - ALL_PASSED=true - SUMMARY_LINES="" - - if [ "$SKIP_AOT" != "true" ]; then - if [ "$AOT_SPOT" == "success" ]; then - SUMMARY_LINES="${SUMMARY_LINES}- AOT Build Tests: Passed\n" - elif [ "$AOT_SPOT_TERM" == "true" ] && [ "$AOT_RERUN" == "success" ]; then - SUMMARY_LINES="${SUMMARY_LINES}- AOT Build Tests: Passed (rerun after spot termination)\n" - else - SUMMARY_LINES="${SUMMARY_LINES}- AOT Build Tests: Failed\n" - ALL_PASSED=false - fi - else - SUMMARY_LINES="${SUMMARY_LINES}- AOT Build Tests: Skipped\n" - fi - - if [ "$SKIP_GPU" != "true" ]; then - if [ "$GPU_A10G_SPOT" == "success" ]; then - SUMMARY_LINES="${SUMMARY_LINES}- JIT Unittest (A10G): Passed\n" - elif [ "$GPU_A10G_SPOT_TERM" == "true" ] && [ "$GPU_A10G_RERUN" == "success" ]; then - SUMMARY_LINES="${SUMMARY_LINES}- JIT Unittest (A10G): Passed (rerun after spot termination)\n" - else - SUMMARY_LINES="${SUMMARY_LINES}- JIT Unittest (A10G): Failed\n" - ALL_PASSED=false - fi - - if [ "$GPU_T4_SPOT" == "success" ]; then - SUMMARY_LINES="${SUMMARY_LINES}- JIT Unittest (T4): Passed\n" - elif [ "$GPU_T4_SPOT_TERM" == "true" ] && [ "$GPU_T4_RERUN" == "success" ]; then - SUMMARY_LINES="${SUMMARY_LINES}- JIT Unittest (T4): Passed (rerun after spot termination)\n" - else - SUMMARY_LINES="${SUMMARY_LINES}- JIT Unittest (T4): Failed\n" - ALL_PASSED=false - fi - else - SUMMARY_LINES="${SUMMARY_LINES}- JIT Unittest (A10G): Skipped\n" - SUMMARY_LINES="${SUMMARY_LINES}- JIT Unittest (T4): Skipped\n" - fi - - if [ "$ALL_PASSED" == "true" ]; then - CONCLUSION="success" - TITLE="All tests passed" - else - CONCLUSION="failure" - TITLE="Some tests failed" - fi - - SUMMARY=$(printf '%b' "$SUMMARY_LINES") - - gh api -X PATCH "repos/${REPO}/check-runs/${CHECK_ID}" \ - -f status="completed" \ - -f conclusion="$CONCLUSION" \ - -F output[title]="$TITLE" \ - -F output[summary]="$SUMMARY" - - echo "Updated Test Results Summary: $CONCLUSION" diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml index 29ac6a09a2..6e235d5e28 100644 --- a/.github/workflows/pr-test.yml +++ b/.github/workflows/pr-test.yml @@ -1,7 +1,20 @@ -# PR Test Gateway - Creates check runs and triggers test runner +# CI workflow using AWS self-hosted runners. +# Runs AOT build tests and GPU unit tests on push/PR to main. +# Uses ci/bash.sh for Docker execution (same as Jenkins). # -# Creates custom check runs via GitHub App, then dispatches tests to -# pr-test-runner.yml. Failed spot attempts are hidden from PR status. +# Permission Control: +# - Push to main: Always runs +# - PR from org members (ci-users team): Runs automatically +# - PR from external contributors: Requires 'run-ci' label +# (added via @flashinfer-bot run command from authorized user) +# +# Rerun Strategy: +# - Spot jobs run with fail-fast: true +# - Background monitor checks AWS metadata for spot termination notice +# - If termination detected, writes marker to log (captured by GitHub) +# - Analyze job checks logs for marker to decide if should rerun +# - Spot termination: rerun all failed/cancelled jobs on on-demand +# - Real failure: no rerun, workflow fails fast name: PR Test @@ -29,12 +42,15 @@ concurrency: permissions: contents: read pull-requests: write - actions: write + actions: read env: EXECUTOR_NUMBER: "0" jobs: + # --------------------------------------------------------------------------- + # Gate - Check if PR is authorized to run CI + # --------------------------------------------------------------------------- gate: name: Permission Check runs-on: ubuntu-latest @@ -101,6 +117,9 @@ jobs: echo "$AUTHOR is not a member of $TEAM, not authorized" fi + # --------------------------------------------------------------------------- + # Setup - Read docker tag and check if build should be skipped + # --------------------------------------------------------------------------- setup: name: Setup needs: gate @@ -109,21 +128,11 @@ jobs: outputs: docker_tag: ${{ steps.get-tag.outputs.tag }} skip_build: ${{ steps.check.outputs.skip }} - head_sha: ${{ steps.get-sha.outputs.sha }} steps: - uses: actions/checkout@v4 with: fetch-depth: 0 - - name: Get HEAD SHA - id: get-sha - run: | - if [ "${{ github.event_name }}" == "pull_request" ]; then - echo "sha=${{ github.event.pull_request.head.sha }}" >> $GITHUB_OUTPUT - else - echo "sha=${{ github.sha }}" >> $GITHUB_OUTPUT - fi - - name: Get Docker Tag id: get-tag run: | @@ -163,140 +172,558 @@ jobs: echo "::notice::Skipping build - only docs/config files changed" fi - orchestrator: - name: Orchestrate Tests + # --------------------------------------------------------------------------- + # AOT Build Import Tests (Spot + On-Demand Rerun) + # --------------------------------------------------------------------------- + aot-build-import: + name: AOT Build Import (${{ matrix.arch }}, ${{ matrix.cuda }}) needs: [gate, setup] if: | needs.gate.outputs.authorized == 'true' && - needs.setup.outputs.skip_build != 'true' - runs-on: ubuntu-latest + needs.setup.outputs.skip_build != 'true' && + github.event.inputs.skip_aot != 'true' + runs-on: + - self-hosted + - Linux + - ${{ matrix.arch }} + - cpu + - spot + timeout-minutes: 360 + strategy: + fail-fast: true + matrix: + arch: [X64, ARM64] + cuda: [cu126, cu128, cu129, cu130] + env: + DOCKER_IMAGE: flashinfer/flashinfer-ci-${{ matrix.cuda }}:${{ needs.setup.outputs.docker_tag }} steps: - - name: Generate Token (flashinfer) - id: flashinfer-token - uses: actions/create-github-app-token@v1 + - name: Cleanup + run: | + # Stop all Docker containers to free memory + docker stop $(docker ps -q) 2>/dev/null || true + docker rm $(docker ps -aq) 2>/dev/null || true + # Clean workspace and caches + sudo rm -rf ${{ github.workspace }}/* || true + sudo rm -rf ${{ github.workspace }}/.[!.]* || true + rm -rf ~/.cache/flashinfer_jit || true + docker system prune -f || true + + - uses: actions/checkout@v4 with: - app-id: ${{ secrets.GH_APP_ID }} - private-key: ${{ secrets.GH_APP_KEY }} - owner: flashinfer-ai - repositories: flashinfer - - - name: Generate Token (ci-infra) - id: ci-infra-token - uses: actions/create-github-app-token@v1 + submodules: recursive + + - name: Start spot termination monitor + run: ./scripts/task_monitor_spot.sh & + + - name: Login to Docker Hub + uses: docker/login-action@v3 with: - app-id: ${{ secrets.GH_APP_ID }} - private-key: ${{ secrets.GH_APP_KEY }} - owner: flashinfer-ai - repositories: ci-infra - - - name: Create Check Runs (PR only) - id: create-checks - if: github.event_name == 'pull_request' + username: flashinfer + password: ${{ secrets.DOCKERHUB_TOKEN }} + continue-on-error: true + + - name: Show Node Info + run: ./scripts/task_show_node_info.sh + env: + NODE_NAME: ${{ runner.name }} + WORKSPACE: ${{ github.workspace }} + BUILD_NUMBER: ${{ github.run_number }} + + - name: Run Test + run: bash ci/bash.sh ${DOCKER_IMAGE} --no-gpu ./scripts/task_test_jit_cache_package_build_import.sh + + analyze-aot-failure: + name: Analyze AOT Failure + needs: [setup, aot-build-import] + if: "!cancelled() && (needs.aot-build-import.result == 'failure' || needs.aot-build-import.result == 'cancelled')" + runs-on: ubuntu-latest + outputs: + is_spot_termination: ${{ steps.analyze.outputs.is_spot_termination }} + rerun_matrix: ${{ steps.matrix.outputs.rerun_matrix }} + steps: + - name: Analyze failure from job logs + id: analyze env: - GH_TOKEN: ${{ steps.flashinfer-token.outputs.token }} + GH_TOKEN: ${{ github.token }} run: | - SHA="${{ needs.setup.outputs.head_sha }}" - REPO="${{ github.repository }}" - RUNNER_URL="https://github.com/flashinfer-ai/ci-infra/actions/workflows/pr-test-runner.yml" - - if [ "${{ github.event.inputs.skip_aot }}" != "true" ]; then - AOT_CHECK=$(gh api repos/$REPO/check-runs \ - -f name="AOT Build Tests" \ - -f head_sha="$SHA" \ - -f status="in_progress" \ - -F output[title]="In progress" \ - -F output[summary]="Running AOT build tests: [view test runs]($RUNNER_URL)" \ - --jq '.id') - echo "aot_check_id=$AOT_CHECK" >> $GITHUB_OUTPUT + RUN_ID="${{ github.run_id }}" + SPOT_TERMINATION=false + # Include both failed and cancelled jobs (spot termination can cause either) + FAILED_JOBS=$(gh api "/repos/${{ github.repository }}/actions/runs/${RUN_ID}/jobs?per_page=100" \ + --jq '.jobs[] | select(.name | startswith("AOT")) | select(.conclusion == "failure" or .conclusion == "cancelled") | .id') + if [ -z "$FAILED_JOBS" ]; then + echo "is_spot_termination=false" >> $GITHUB_OUTPUT + exit 0 fi + for JOB_ID in $FAILED_JOBS; do + # Download logs (may be ZIP or plain text depending on GitHub API) + if ! gh api "/repos/${{ github.repository }}/actions/jobs/${JOB_ID}/logs" > job_log.zip 2>/dev/null; then + continue + fi + # Try to unzip if it's a ZIP file, otherwise use as-is + if file job_log.zip | grep -q "Zip archive"; then + unzip -p job_log.zip > job_log.txt 2>/dev/null || mv job_log.zip job_log.txt + else + mv job_log.zip job_log.txt + fi + if grep -q "FLASHINFER_SPOT_TERMINATION_DETECTED" job_log.txt; then + echo "Detected: AWS spot termination marker (job $JOB_ID)" + SPOT_TERMINATION=true + break + fi + if grep -qiE "connection reset by peer|context canceled|grpc.*closing|The self-hosted runner.*lost" job_log.txt; then + echo "Detected: infrastructure error pattern (job $JOB_ID)" + SPOT_TERMINATION=true + break + fi + done + echo "is_spot_termination=$SPOT_TERMINATION" + echo "is_spot_termination=$SPOT_TERMINATION" >> $GITHUB_OUTPUT - if [ "${{ github.event.inputs.skip_gpu }}" != "true" ]; then - A10G_CHECK=$(gh api repos/$REPO/check-runs \ - -f name="JIT Unittest (A10G)" \ - -f head_sha="$SHA" \ - -f status="in_progress" \ - -F output[title]="In progress" \ - -F output[summary]="Running JIT unittests on A10G instances: [view test runs]($RUNNER_URL)" \ - --jq '.id') - echo "gpu_a10g_check_id=$A10G_CHECK" >> $GITHUB_OUTPUT - - T4_CHECK=$(gh api repos/$REPO/check-runs \ - -f name="JIT Unittest (T4)" \ - -f head_sha="$SHA" \ - -f status="in_progress" \ - -F output[title]="In progress" \ - -F output[summary]="Running JIT unittests on T4 instances: [view test runs]($RUNNER_URL)" \ - --jq '.id') - echo "gpu_t4_check_id=$T4_CHECK" >> $GITHUB_OUTPUT - fi + - name: Build rerun matrix + id: matrix + if: steps.analyze.outputs.is_spot_termination == 'true' + run: | + MATRIX='{"include":[' + for arch in X64 ARM64; do + for cuda in cu126 cu128 cu129 cu130; do + MATRIX+='{"arch":"'$arch'","cuda":"'$cuda'"},' + done + done + MATRIX="${MATRIX%,}]}" + echo "rerun_matrix=$MATRIX" >> $GITHUB_OUTPUT + + aot-build-import-rerun: + name: AOT Build Import Rerun (${{ matrix.arch }}, ${{ matrix.cuda }}) + needs: [setup, analyze-aot-failure] + if: | + !cancelled() && + needs.analyze-aot-failure.outputs.is_spot_termination == 'true' && + needs.analyze-aot-failure.outputs.rerun_matrix != '' + runs-on: + - self-hosted + - Linux + - ${{ matrix.arch }} + - cpu + - on-demand + timeout-minutes: 360 + strategy: + fail-fast: true + matrix: ${{ fromJSON(needs.analyze-aot-failure.outputs.rerun_matrix) }} + env: + DOCKER_IMAGE: flashinfer/flashinfer-ci-${{ matrix.cuda }}:${{ needs.setup.outputs.docker_tag }} + steps: + - name: Cleanup + run: | + # Stop all Docker containers to free memory + docker stop $(docker ps -q) 2>/dev/null || true + docker rm $(docker ps -aq) 2>/dev/null || true + # Clean workspace and caches + sudo rm -rf ${{ github.workspace }}/* || true + sudo rm -rf ${{ github.workspace }}/.[!.]* || true + rm -rf ~/.cache/flashinfer_jit || true + docker system prune -f || true + + - uses: actions/checkout@v4 + with: + submodules: recursive - SUMMARY_CHECK=$(gh api repos/$REPO/check-runs \ - -f name="Test Results Summary" \ - -f head_sha="$SHA" \ - -f status="in_progress" \ - -F output[title]="In progress" \ - -F output[summary]="Waiting for test results: [view test runs]($RUNNER_URL)" \ - --jq '.id') - echo "summary_check_id=$SUMMARY_CHECK" >> $GITHUB_OUTPUT + - name: Login to Docker Hub + uses: docker/login-action@v3 + with: + username: flashinfer + password: ${{ secrets.DOCKERHUB_TOKEN }} + continue-on-error: true - - name: Trigger Test Runner + - name: Show Node Info + run: ./scripts/task_show_node_info.sh env: - GH_ACTION_TOKEN: ${{ github.token }} - CI_INFRA_TOKEN: ${{ steps.ci-infra-token.outputs.token }} - HEAD_SHA: ${{ needs.setup.outputs.head_sha }} - DOCKER_TAG: ${{ needs.setup.outputs.docker_tag }} - AOT_CHECK_ID: ${{ steps.create-checks.outputs.aot_check_id || '' }} - GPU_A10G_CHECK_ID: ${{ steps.create-checks.outputs.gpu_a10g_check_id || '' }} - GPU_T4_CHECK_ID: ${{ steps.create-checks.outputs.gpu_t4_check_id || '' }} - SUMMARY_CHECK_ID: ${{ steps.create-checks.outputs.summary_check_id || '' }} - SKIP_AOT: ${{ github.event.inputs.skip_aot || 'false' }} - SKIP_GPU: ${{ github.event.inputs.skip_gpu || 'false' }} - CONCURRENCY_KEY: pr-test-${{ github.ref }} - DISPATCH_REF: ${{ github.head_ref || github.ref_name }} + NODE_NAME: ${{ runner.name }} + WORKSPACE: ${{ github.workspace }} + BUILD_NUMBER: ${{ github.run_number }} + + - name: Run Test + run: bash ci/bash.sh ${DOCKER_IMAGE} --no-gpu ./scripts/task_test_jit_cache_package_build_import.sh + + # --------------------------------------------------------------------------- + # GPU JIT Tests - SM86 (A10G) - Spot + On-Demand Rerun + # --------------------------------------------------------------------------- + gpu-tests-a10g: + name: JIT Unittest ${{ matrix.shard }} (A10G) + needs: [gate, setup] + if: | + needs.gate.outputs.authorized == 'true' && + needs.setup.outputs.skip_build != 'true' && + github.event.inputs.skip_gpu != 'true' + runs-on: [self-hosted, Linux, X64, gpu, sm86, spot] + timeout-minutes: 360 + strategy: + fail-fast: true + matrix: + shard: [1, 2, 3, 4, 5] + env: + DOCKER_IMAGE: flashinfer/flashinfer-ci-cu129:${{ needs.setup.outputs.docker_tag }} + steps: + - name: Cleanup run: | - # Try workflow_dispatch first (works after pr-test-runner.yml is on main) - # Uses GITHUB_TOKEN (has actions:write) - App token doesn't have Actions permission - # --ref uses PR branch (to test PR changes to runner) or main (for push) - if GH_TOKEN="$GH_ACTION_TOKEN" gh workflow run pr-test-runner.yml \ - --repo "${{ github.repository }}" \ - --ref "$DISPATCH_REF" \ - -f pr_head_sha="$HEAD_SHA" \ - -f docker_tag="$DOCKER_TAG" \ - -f aot_check_id="$AOT_CHECK_ID" \ - -f gpu_a10g_check_id="$GPU_A10G_CHECK_ID" \ - -f gpu_t4_check_id="$GPU_T4_CHECK_ID" \ - -f summary_check_id="$SUMMARY_CHECK_ID" \ - -f skip_aot="$SKIP_AOT" \ - -f skip_gpu="$SKIP_GPU" \ - -f concurrency_key="$CONCURRENCY_KEY" 2>/dev/null; then - echo "Triggered via workflow_dispatch (flashinfer)" - else - # Fallback: repository_dispatch to ci-infra (bootstrap) - GH_TOKEN="$CI_INFRA_TOKEN" gh api repos/flashinfer-ai/ci-infra/dispatches \ - -f event_type="run-pr-test" \ - -f client_payload[pr_head_sha]="$HEAD_SHA" \ - -f client_payload[docker_tag]="$DOCKER_TAG" \ - -f client_payload[aot_check_id]="$AOT_CHECK_ID" \ - -f client_payload[gpu_a10g_check_id]="$GPU_A10G_CHECK_ID" \ - -f client_payload[gpu_t4_check_id]="$GPU_T4_CHECK_ID" \ - -f client_payload[summary_check_id]="$SUMMARY_CHECK_ID" \ - -f client_payload[skip_aot]="$SKIP_AOT" \ - -f client_payload[skip_gpu]="$SKIP_GPU" \ - -f client_payload[concurrency_key]="$CONCURRENCY_KEY" - echo "Triggered via repository_dispatch (ci-infra bootstrap)" + # Stop all Docker containers to free memory + docker stop $(docker ps -q) 2>/dev/null || true + docker rm $(docker ps -aq) 2>/dev/null || true + # Clean workspace and caches + sudo rm -rf ${{ github.workspace }}/* || true + sudo rm -rf ${{ github.workspace }}/.[!.]* || true + rm -rf ~/.cache/flashinfer_jit || true + docker system prune -f || true + nvidia-smi || true + + - uses: actions/checkout@v4 + with: + submodules: recursive + + - name: Start spot termination monitor + run: ./scripts/task_monitor_spot.sh & + + - name: Login to Docker Hub + uses: docker/login-action@v3 + with: + username: flashinfer + password: ${{ secrets.DOCKERHUB_TOKEN }} + continue-on-error: true + + - name: Show Node Info + run: ./scripts/task_show_node_info.sh + env: + NODE_NAME: ${{ runner.name }} + WORKSPACE: ${{ github.workspace }} + BUILD_NUMBER: ${{ github.run_number }} + + - name: Run JIT Unittest Part ${{ matrix.shard }} + run: bash ci/bash.sh ${DOCKER_IMAGE} ./scripts/task_jit_run_tests_part${{ matrix.shard }}.sh + + analyze-gpu-a10g-failure: + name: Analyze GPU A10G Failure + needs: [setup, gpu-tests-a10g] + if: "!cancelled() && (needs.gpu-tests-a10g.result == 'failure' || needs.gpu-tests-a10g.result == 'cancelled')" + runs-on: ubuntu-latest + outputs: + is_spot_termination: ${{ steps.analyze.outputs.is_spot_termination }} + rerun_matrix: ${{ steps.matrix.outputs.rerun_matrix }} + steps: + - name: Analyze failure from job logs + id: analyze + env: + GH_TOKEN: ${{ github.token }} + run: | + RUN_ID="${{ github.run_id }}" + SPOT_TERMINATION=false + # Include both failed and cancelled jobs (spot termination can cause either) + FAILED_JOBS=$(gh api "/repos/${{ github.repository }}/actions/runs/${RUN_ID}/jobs?per_page=100" \ + --jq '.jobs[] | select(.name | contains("A10G")) | select(.conclusion == "failure" or .conclusion == "cancelled") | .id') + if [ -z "$FAILED_JOBS" ]; then + echo "is_spot_termination=false" >> $GITHUB_OUTPUT + exit 0 fi + for JOB_ID in $FAILED_JOBS; do + # Download logs (may be ZIP or plain text depending on GitHub API) + if ! gh api "/repos/${{ github.repository }}/actions/jobs/${JOB_ID}/logs" > job_log.zip 2>/dev/null; then + continue + fi + # Try to unzip if it's a ZIP file, otherwise use as-is + if file job_log.zip | grep -q "Zip archive"; then + unzip -p job_log.zip > job_log.txt 2>/dev/null || mv job_log.zip job_log.txt + else + mv job_log.zip job_log.txt + fi + if grep -q "FLASHINFER_SPOT_TERMINATION_DETECTED" job_log.txt; then + echo "Detected: AWS spot termination marker (job $JOB_ID)" + SPOT_TERMINATION=true + break + fi + if grep -qiE "connection reset by peer|context canceled|grpc.*closing|The self-hosted runner.*lost" job_log.txt; then + echo "Detected: infrastructure error pattern (job $JOB_ID)" + SPOT_TERMINATION=true + break + fi + done + echo "is_spot_termination=$SPOT_TERMINATION" + echo "is_spot_termination=$SPOT_TERMINATION" >> $GITHUB_OUTPUT - report-unauthorized: - name: Report Unauthorized - needs: gate - if: github.event_name == 'pull_request' && needs.gate.outputs.authorized != 'true' + - name: Build rerun matrix + id: matrix + if: steps.analyze.outputs.is_spot_termination == 'true' + run: | + echo 'rerun_matrix={"include":[{"shard":1},{"shard":2},{"shard":3},{"shard":4},{"shard":5}]}' >> $GITHUB_OUTPUT + + gpu-tests-a10g-rerun: + name: JIT Rerun ${{ matrix.shard }} (A10G) + needs: [setup, analyze-gpu-a10g-failure] + if: | + !cancelled() && + needs.analyze-gpu-a10g-failure.outputs.is_spot_termination == 'true' && + needs.analyze-gpu-a10g-failure.outputs.rerun_matrix != '' + runs-on: [self-hosted, Linux, X64, gpu, sm86, on-demand] + timeout-minutes: 360 + strategy: + fail-fast: true + matrix: ${{ fromJSON(needs.analyze-gpu-a10g-failure.outputs.rerun_matrix) }} + env: + DOCKER_IMAGE: flashinfer/flashinfer-ci-cu129:${{ needs.setup.outputs.docker_tag }} + steps: + - name: Cleanup + run: | + # Stop all Docker containers to free memory + docker stop $(docker ps -q) 2>/dev/null || true + docker rm $(docker ps -aq) 2>/dev/null || true + # Clean workspace and caches + sudo rm -rf ${{ github.workspace }}/* || true + sudo rm -rf ${{ github.workspace }}/.[!.]* || true + rm -rf ~/.cache/flashinfer_jit || true + docker system prune -f || true + nvidia-smi || true + + - uses: actions/checkout@v4 + with: + submodules: recursive + + - name: Login to Docker Hub + uses: docker/login-action@v3 + with: + username: flashinfer + password: ${{ secrets.DOCKERHUB_TOKEN }} + continue-on-error: true + + - name: Show Node Info + run: ./scripts/task_show_node_info.sh + env: + NODE_NAME: ${{ runner.name }} + WORKSPACE: ${{ github.workspace }} + BUILD_NUMBER: ${{ github.run_number }} + + - name: Run JIT Unittest Part ${{ matrix.shard }} + run: bash ci/bash.sh ${DOCKER_IMAGE} ./scripts/task_jit_run_tests_part${{ matrix.shard }}.sh + + # --------------------------------------------------------------------------- + # GPU JIT Tests - SM75 (T4) - Spot + On-Demand Rerun + # --------------------------------------------------------------------------- + gpu-tests-t4: + name: JIT Unittest (T4) + needs: [gate, setup] + if: | + needs.gate.outputs.authorized == 'true' && + needs.setup.outputs.skip_build != 'true' && + github.event.inputs.skip_gpu != 'true' + runs-on: [self-hosted, Linux, X64, gpu, sm75, spot] + timeout-minutes: 360 + env: + DOCKER_IMAGE: flashinfer/flashinfer-ci-cu129:${{ needs.setup.outputs.docker_tag }} + steps: + - name: Cleanup + run: | + # Stop all Docker containers to free memory + docker stop $(docker ps -q) 2>/dev/null || true + docker rm $(docker ps -aq) 2>/dev/null || true + # Clean workspace and caches + sudo rm -rf ${{ github.workspace }}/* || true + sudo rm -rf ${{ github.workspace }}/.[!.]* || true + rm -rf ~/.cache/flashinfer_jit || true + docker system prune -f || true + nvidia-smi || true + + - uses: actions/checkout@v4 + with: + submodules: recursive + + - name: Start spot termination monitor + run: ./scripts/task_monitor_spot.sh & + + - name: Login to Docker Hub + uses: docker/login-action@v3 + with: + username: flashinfer + password: ${{ secrets.DOCKERHUB_TOKEN }} + continue-on-error: true + + - name: Show Node Info + run: ./scripts/task_show_node_info.sh + env: + NODE_NAME: ${{ runner.name }} + WORKSPACE: ${{ github.workspace }} + BUILD_NUMBER: ${{ github.run_number }} + + - name: Run JIT Unittest Part 3 (T4) + run: bash ci/bash.sh ${DOCKER_IMAGE} ./scripts/task_jit_run_tests_part3.sh + + analyze-gpu-t4-failure: + name: Analyze GPU T4 Failure + needs: [setup, gpu-tests-t4] + if: "!cancelled() && (needs.gpu-tests-t4.result == 'failure' || needs.gpu-tests-t4.result == 'cancelled')" runs-on: ubuntu-latest + outputs: + is_spot_termination: ${{ steps.analyze.outputs.is_spot_termination }} steps: - - name: Post Comment + - name: Analyze failure from job logs + id: analyze env: GH_TOKEN: ${{ github.token }} run: | - echo "## CI Authorization Required" >> $GITHUB_STEP_SUMMARY + RUN_ID="${{ github.run_id }}" + SPOT_TERMINATION=false + # Include both failed and cancelled jobs (spot termination can cause either) + FAILED_JOBS=$(gh api "/repos/${{ github.repository }}/actions/runs/${RUN_ID}/jobs?per_page=100" \ + --jq '.jobs[] | select(.name | contains("T4")) | select(.conclusion == "failure" or .conclusion == "cancelled") | .id') + if [ -z "$FAILED_JOBS" ]; then + echo "is_spot_termination=false" >> $GITHUB_OUTPUT + exit 0 + fi + for JOB_ID in $FAILED_JOBS; do + # Download logs (may be ZIP or plain text depending on GitHub API) + if ! gh api "/repos/${{ github.repository }}/actions/jobs/${JOB_ID}/logs" > job_log.zip 2>/dev/null; then + continue + fi + # Try to unzip if it's a ZIP file, otherwise use as-is + if file job_log.zip | grep -q "Zip archive"; then + unzip -p job_log.zip > job_log.txt 2>/dev/null || mv job_log.zip job_log.txt + else + mv job_log.zip job_log.txt + fi + if grep -q "FLASHINFER_SPOT_TERMINATION_DETECTED" job_log.txt; then + echo "Detected: AWS spot termination marker (job $JOB_ID)" + SPOT_TERMINATION=true + break + fi + if grep -qiE "connection reset by peer|context canceled|grpc.*closing|The self-hosted runner.*lost" job_log.txt; then + echo "Detected: infrastructure error pattern (job $JOB_ID)" + SPOT_TERMINATION=true + break + fi + done + echo "is_spot_termination=$SPOT_TERMINATION" + echo "is_spot_termination=$SPOT_TERMINATION" >> $GITHUB_OUTPUT + + gpu-tests-t4-rerun: + name: JIT Rerun (T4) + needs: [setup, analyze-gpu-t4-failure] + if: | + !cancelled() && + needs.analyze-gpu-t4-failure.outputs.is_spot_termination == 'true' + runs-on: [self-hosted, Linux, X64, gpu, sm75, on-demand] + timeout-minutes: 360 + env: + DOCKER_IMAGE: flashinfer/flashinfer-ci-cu129:${{ needs.setup.outputs.docker_tag }} + steps: + - name: Cleanup + run: | + # Stop all Docker containers to free memory + docker stop $(docker ps -q) 2>/dev/null || true + docker rm $(docker ps -aq) 2>/dev/null || true + # Clean workspace and caches + sudo rm -rf ${{ github.workspace }}/* || true + sudo rm -rf ${{ github.workspace }}/.[!.]* || true + rm -rf ~/.cache/flashinfer_jit || true + docker system prune -f || true + nvidia-smi || true + + - uses: actions/checkout@v4 + with: + submodules: recursive + + - name: Login to Docker Hub + uses: docker/login-action@v3 + with: + username: flashinfer + password: ${{ secrets.DOCKERHUB_TOKEN }} + continue-on-error: true + + - name: Show Node Info + run: ./scripts/task_show_node_info.sh + env: + NODE_NAME: ${{ runner.name }} + WORKSPACE: ${{ github.workspace }} + BUILD_NUMBER: ${{ github.run_number }} + + - name: Run JIT Unittest Part 3 (T4) + run: bash ci/bash.sh ${DOCKER_IMAGE} ./scripts/task_jit_run_tests_part3.sh + + # --------------------------------------------------------------------------- + # Test Results Summary + # --------------------------------------------------------------------------- + test-results-summary: + name: Test Results Summary + if: "!cancelled()" + needs: + - gate + - setup + - aot-build-import + - analyze-aot-failure + - aot-build-import-rerun + - gpu-tests-a10g + - analyze-gpu-a10g-failure + - gpu-tests-a10g-rerun + - gpu-tests-t4 + - analyze-gpu-t4-failure + - gpu-tests-t4-rerun + runs-on: ubuntu-latest + steps: + - name: Check Results + run: | + echo "## Test Results Summary" >> $GITHUB_STEP_SUMMARY + + # Check if CI was skipped due to permissions + if [ "${{ needs.gate.outputs.authorized }}" != "true" ]; then + echo "CI skipped (pending authorization)" >> $GITHUB_STEP_SUMMARY + echo "A contributor in @flashinfer-ai/ci-users can comment \`@flashinfer-bot run\` to approve." >> $GITHUB_STEP_SUMMARY + exit 0 + fi + # Helper function to check job status + check_status() { + local name=$1 skip=$2 spot=$3 spot_term=$4 rerun=$5 + echo "$name" >> $GITHUB_STEP_SUMMARY + if [ "$skip" == "true" ]; then + echo "- Status: Skipped" >> $GITHUB_STEP_SUMMARY + elif [ "$spot" == "success" ]; then + echo "- Status: Passed (spot)" >> $GITHUB_STEP_SUMMARY + elif [ "$spot_term" == "true" ] && [ "$rerun" == "success" ]; then + echo "- Status: Passed (on-demand rerun)" >> $GITHUB_STEP_SUMMARY + else + echo "- Status: Failed" >> $GITHUB_STEP_SUMMARY + return 1 + fi + return 0 + } + + echo "Test Results Summary" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + + if [ "${{ needs.setup.outputs.skip_build }}" == "true" ]; then + echo "Build skipped (docs/config only changes)" >> $GITHUB_STEP_SUMMARY + exit 0 + fi + + FAILED=false + + check_status "AOT Build Import Tests" \ + "${{ github.event.inputs.skip_aot }}" \ + "${{ needs.aot-build-import.result }}" \ + "${{ needs.analyze-aot-failure.outputs.is_spot_termination }}" \ + "${{ needs.aot-build-import-rerun.result }}" || FAILED=true + + echo "" >> $GITHUB_STEP_SUMMARY + check_status "GPU Tests (A10G)" \ + "${{ github.event.inputs.skip_gpu }}" \ + "${{ needs.gpu-tests-a10g.result }}" \ + "${{ needs.analyze-gpu-a10g-failure.outputs.is_spot_termination }}" \ + "${{ needs.gpu-tests-a10g-rerun.result }}" || FAILED=true + echo "" >> $GITHUB_STEP_SUMMARY - echo "This PR requires authorization to run CI." >> $GITHUB_STEP_SUMMARY - echo "A member of @flashinfer-ai/ci-users can comment \`@flashinfer-bot run\` to approve." >> $GITHUB_STEP_SUMMARY + check_status "GPU Tests (T4)" \ + "${{ github.event.inputs.skip_gpu }}" \ + "${{ needs.gpu-tests-t4.result }}" \ + "${{ needs.analyze-gpu-t4-failure.outputs.is_spot_termination }}" \ + "${{ needs.gpu-tests-t4-rerun.result }}" || FAILED=true + + echo "" >> $GITHUB_STEP_SUMMARY + if [ "$FAILED" == "true" ]; then + echo "Result: Tests Failed" >> $GITHUB_STEP_SUMMARY + exit 1 + fi + echo "Result: Tests Passed" >> $GITHUB_STEP_SUMMARY