diff --git a/.github/workflows/pr-test-runner.yml b/.github/workflows/pr-test-runner.yml
new file mode 100644
index 0000000000..125e9ebb5b
--- /dev/null
+++ b/.github/workflows/pr-test-runner.yml
@@ -0,0 +1,710 @@
+# PR Test Runner - Runs tests and updates check runs.
+# Triggered by pr-test.yml via workflow_dispatch. Not visible on PR status.
+#
+# Each test group runs on spot runners first. If a group fails or is cancelled,
+# an analyze job scans the job logs for spot-termination markers and, when one
+# is found, the whole group is rerun on on-demand runners. The last job reports
+# the outcome to the check runs whose IDs were passed in as inputs.
+
+name: PR Test Runner
+
+on:
+  workflow_dispatch:
+    inputs:
+      pr_head_sha:
+        description: 'PR head SHA for check run updates'
+        required: true
+        type: string
+      docker_tag:
+        description: 'Docker image tag'
+        required: true
+        type: string
+      aot_check_id:
+        description: 'AOT Build check run ID'
+        required: false
+        type: string
+      gpu_a10g_check_id:
+        description: 'GPU A10G check run ID'
+        required: false
+        type: string
+      gpu_t4_check_id:
+        description: 'GPU T4 check run ID'
+        required: false
+        type: string
+      summary_check_id:
+        description: 'Test Results Summary check run ID'
+        required: false
+        type: string
+      skip_aot:
+        description: 'Skip AOT build tests'
+        required: false
+        type: string
+        default: 'false'
+      skip_gpu:
+        description: 'Skip GPU tests'
+        required: false
+        type: string
+        default: 'false'
+      concurrency_key:
+        description: 'Concurrency group key for cancelling outdated runs'
+        required: false
+        type: string
+
+permissions:
+  contents: read
+  actions: read
+
+# Runs that share a concurrency_key cancel each other; without a key the run
+# ID is used, so the run is alone in its group.
+concurrency:
+  group: ${{ inputs.concurrency_key || github.run_id }}
+  cancel-in-progress: true
+
+env:
+  EXECUTOR_NUMBER: "0"
+
+jobs:
+  # AOT build/import tests on spot CPU runners, one job per (arch, cuda) pair.
+  aot-build-import:
+    name: AOT Build Import (${{ matrix.arch }}, ${{ matrix.cuda }})
+    if: inputs.skip_aot != 'true'
+    runs-on:
+      - self-hosted
+      - Linux
+      - ${{ matrix.arch }}
+      - cpu
+      - spot
+    timeout-minutes: 360
+    strategy:
+      fail-fast: true
+      matrix:
+        arch: [X64, ARM64]
+        cuda: [cu126, cu128, cu129, cu130]
+    env:
+      DOCKER_IMAGE: flashinfer/flashinfer-ci-${{ matrix.cuda }}:${{ inputs.docker_tag }}
+    steps:
+      - name: Cleanup
+        run: |
+          docker stop $(docker ps -q) 2>/dev/null || true
+          docker rm $(docker ps -aq) 2>/dev/null || true
+          # Quoted, and :? aborts rather than expanding to /* when the variable is empty.
+          sudo rm -rf "${GITHUB_WORKSPACE:?}"/* || true
+          sudo rm -rf "${GITHUB_WORKSPACE:?}"/.[!.]* || true
+          rm -rf ~/.cache/flashinfer_jit || true
+          docker system prune -f || true
+
+      - uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.pr_head_sha }}
+          submodules: recursive
+
+      - name: Start spot termination monitor
+        run: ./scripts/task_monitor_spot.sh &
+
+      - name: Login to Docker Hub
+        uses: docker/login-action@v3
+        with:
+          username: flashinfer
+          password: ${{ secrets.DOCKERHUB_TOKEN }}
+        continue-on-error: true
+
+      - name: Show Node Info
+        run: ./scripts/task_show_node_info.sh
+        env:
+          NODE_NAME: ${{ runner.name }}
+          WORKSPACE: ${{ github.workspace }}
+          BUILD_NUMBER: ${{ github.run_number }}
+
+      - name: Run Test
+        run: bash ci/bash.sh ${DOCKER_IMAGE} --no-gpu ./scripts/task_test_jit_cache_package_build_import.sh
+
+  # Decides whether the AOT failure looks like a spot termination and, if so,
+  # emits the matrix for the on-demand rerun.
+  analyze-aot-failure:
+    name: Analyze AOT Failure
+    needs: aot-build-import
+    if: "!cancelled() && inputs.skip_aot != 'true' && (needs.aot-build-import.result == 'failure' || needs.aot-build-import.result == 'cancelled')"
+    runs-on: ubuntu-latest
+    outputs:
+      is_spot_termination: ${{ steps.analyze.outputs.is_spot_termination }}
+      rerun_matrix: ${{ steps.matrix.outputs.rerun_matrix }}
+    steps:
+      - name: Analyze failure from job logs
+        id: analyze
+        env:
+          GH_TOKEN: ${{ github.token }}
+        run: |
+          RUN_ID="${GITHUB_RUN_ID}"
+          SPOT_TERMINATION=false
+          FAILED_JOBS=$(gh api "/repos/${GITHUB_REPOSITORY}/actions/runs/${RUN_ID}/jobs?per_page=100" \
+            --jq '.jobs[] | select(.name | startswith("AOT")) | select(.conclusion == "failure" or .conclusion == "cancelled") | .id')
+          if [ -z "$FAILED_JOBS" ]; then
+            echo "is_spot_termination=false" >> "$GITHUB_OUTPUT"
+            exit 0
+          fi
+          for JOB_ID in $FAILED_JOBS; do
+            if ! gh api "/repos/${GITHUB_REPOSITORY}/actions/jobs/${JOB_ID}/logs" > job_log.zip 2>/dev/null; then
+              continue
+            fi
+            if file job_log.zip | grep -q "Zip archive"; then
+              unzip -p job_log.zip > job_log.txt 2>/dev/null || mv job_log.zip job_log.txt
+            else
+              mv job_log.zip job_log.txt
+            fi
+            if grep -q "FLASHINFER_SPOT_TERMINATION_DETECTED" job_log.txt; then
+              SPOT_TERMINATION=true
+              break
+            fi
+            # NOTE(review): "The operation was canceled" may also match jobs cancelled
+            # for other reasons (e.g. fail-fast siblings) - confirm this is intended.
+            if grep -qiE "connection reset by peer|context canceled|The operation was canceled|grpc.*closing|The self-hosted runner.*lost" job_log.txt; then
+              SPOT_TERMINATION=true
+              break
+            fi
+          done
+          echo "is_spot_termination=$SPOT_TERMINATION" >> "$GITHUB_OUTPUT"
+
+      - name: Build rerun matrix
+        id: matrix
+        if: steps.analyze.outputs.is_spot_termination == 'true'
+        run: |
+          MATRIX='{"include":['
+          for arch in X64 ARM64; do
+            for cuda in cu126 cu128 cu129 cu130; do
+              MATRIX+='{"arch":"'$arch'","cuda":"'$cuda'"},'
+            done
+          done
+          MATRIX="${MATRIX%,}]}"
+          echo "rerun_matrix=$MATRIX" >> "$GITHUB_OUTPUT"
+
+  # On-demand rerun of the AOT matrix; only runs after a detected spot termination.
+  aot-build-import-rerun:
+    name: AOT Build Import Rerun (${{ matrix.arch }}, ${{ matrix.cuda }})
+    needs: analyze-aot-failure
+    if: |
+      !cancelled() &&
+      needs.analyze-aot-failure.outputs.is_spot_termination == 'true' &&
+      needs.analyze-aot-failure.outputs.rerun_matrix != ''
+    runs-on:
+      - self-hosted
+      - Linux
+      - ${{ matrix.arch }}
+      - cpu
+      - on-demand
+    timeout-minutes: 360
+    strategy:
+      fail-fast: true
+      matrix: ${{ fromJSON(needs.analyze-aot-failure.outputs.rerun_matrix) }}
+    env:
+      DOCKER_IMAGE: flashinfer/flashinfer-ci-${{ matrix.cuda }}:${{ inputs.docker_tag }}
+    steps:
+      - name: Cleanup
+        run: |
+          docker stop $(docker ps -q) 2>/dev/null || true
+          docker rm $(docker ps -aq) 2>/dev/null || true
+          # Quoted, and :? aborts rather than expanding to /* when the variable is empty.
+          sudo rm -rf "${GITHUB_WORKSPACE:?}"/* || true
+          sudo rm -rf "${GITHUB_WORKSPACE:?}"/.[!.]* || true
+          rm -rf ~/.cache/flashinfer_jit || true
+          docker system prune -f || true
+
+      - uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.pr_head_sha }}
+          submodules: recursive
+
+      - name: Login to Docker Hub
+        uses: docker/login-action@v3
+        with:
+          username: flashinfer
+          password: ${{ secrets.DOCKERHUB_TOKEN }}
+        continue-on-error: true
+
+      - name: Show Node Info
+        run: ./scripts/task_show_node_info.sh
+        env:
+          NODE_NAME: ${{ runner.name }}
+          WORKSPACE: ${{ github.workspace }}
+          BUILD_NUMBER: ${{ github.run_number }}
+
+      - name: Run Test
+        run: bash ci/bash.sh ${DOCKER_IMAGE} --no-gpu ./scripts/task_test_jit_cache_package_build_import.sh
+
+  # JIT unit tests on A10G (sm86) spot runners, one job per shard.
+  gpu-tests-a10g:
+    name: JIT Unittest ${{ matrix.shard }} (A10G)
+    if: inputs.skip_gpu != 'true'
+    runs-on: [self-hosted, Linux, X64, gpu, sm86, spot]
+    timeout-minutes: 360
+    strategy:
+      fail-fast: true
+      matrix:
+        shard: [1, 2, 3, 4, 5]
+    env:
+      DOCKER_IMAGE: flashinfer/flashinfer-ci-cu129:${{ inputs.docker_tag }}
+    steps:
+      - name: Cleanup
+        run: |
+          docker stop $(docker ps -q) 2>/dev/null || true
+          docker rm $(docker ps -aq) 2>/dev/null || true
+          # Quoted, and :? aborts rather than expanding to /* when the variable is empty.
+          sudo rm -rf "${GITHUB_WORKSPACE:?}"/* || true
+          sudo rm -rf "${GITHUB_WORKSPACE:?}"/.[!.]* || true
+          rm -rf ~/.cache/flashinfer_jit || true
+          docker system prune -f || true
+          nvidia-smi || true
+
+      - uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.pr_head_sha }}
+          submodules: recursive
+
+      - name: Start spot termination monitor
+        run: ./scripts/task_monitor_spot.sh &
+
+      - name: Login to Docker Hub
+        uses: docker/login-action@v3
+        with:
+          username: flashinfer
+          password: ${{ secrets.DOCKERHUB_TOKEN }}
+        continue-on-error: true
+
+      - name: Show Node Info
+        run: ./scripts/task_show_node_info.sh
+        env:
+          NODE_NAME: ${{ runner.name }}
+          WORKSPACE: ${{ github.workspace }}
+          BUILD_NUMBER: ${{ github.run_number }}
+
+      - name: Run JIT Unittest Part ${{ matrix.shard }}
+        run: bash ci/bash.sh ${DOCKER_IMAGE} ./scripts/task_jit_run_tests_part${{ matrix.shard }}.sh
+
+  # Decides whether the A10G failure looks like a spot termination and, if so,
+  # emits the matrix for the on-demand rerun.
+  analyze-gpu-a10g-failure:
+    name: Analyze GPU A10G Failure
+    needs: gpu-tests-a10g
+    if: "!cancelled() && inputs.skip_gpu != 'true' && (needs.gpu-tests-a10g.result == 'failure' || needs.gpu-tests-a10g.result == 'cancelled')"
+    runs-on: ubuntu-latest
+    outputs:
+      is_spot_termination: ${{ steps.analyze.outputs.is_spot_termination }}
+      rerun_matrix: ${{ steps.matrix.outputs.rerun_matrix }}
+    steps:
+      - name: Analyze failure from job logs
+        id: analyze
+        env:
+          GH_TOKEN: ${{ github.token }}
+        run: |
+          RUN_ID="${GITHUB_RUN_ID}"
+          SPOT_TERMINATION=false
+          FAILED_JOBS=$(gh api "/repos/${GITHUB_REPOSITORY}/actions/runs/${RUN_ID}/jobs?per_page=100" \
+            --jq '.jobs[] | select(.name | contains("A10G")) | select(.conclusion == "failure" or .conclusion == "cancelled") | .id')
+          if [ -z "$FAILED_JOBS" ]; then
+            echo "is_spot_termination=false" >> "$GITHUB_OUTPUT"
+            exit 0
+          fi
+          for JOB_ID in $FAILED_JOBS; do
+            if ! gh api "/repos/${GITHUB_REPOSITORY}/actions/jobs/${JOB_ID}/logs" > job_log.zip 2>/dev/null; then
+              continue
+            fi
+            if file job_log.zip | grep -q "Zip archive"; then
+              unzip -p job_log.zip > job_log.txt 2>/dev/null || mv job_log.zip job_log.txt
+            else
+              mv job_log.zip job_log.txt
+            fi
+            if grep -q "FLASHINFER_SPOT_TERMINATION_DETECTED" job_log.txt; then
+              SPOT_TERMINATION=true
+              break
+            fi
+            # NOTE(review): "The operation was canceled" may also match jobs cancelled
+            # for other reasons (e.g. fail-fast siblings) - confirm this is intended.
+            if grep -qiE "connection reset by peer|context canceled|The operation was canceled|grpc.*closing|The self-hosted runner.*lost" job_log.txt; then
+              SPOT_TERMINATION=true
+              break
+            fi
+          done
+          echo "is_spot_termination=$SPOT_TERMINATION" >> "$GITHUB_OUTPUT"
+
+      - name: Build rerun matrix
+        id: matrix
+        if: steps.analyze.outputs.is_spot_termination == 'true'
+        run: |
+          echo 'rerun_matrix={"include":[{"shard":1},{"shard":2},{"shard":3},{"shard":4},{"shard":5}]}' >> "$GITHUB_OUTPUT"
+
+  # On-demand rerun of every A10G shard; only runs after a detected spot termination.
+  gpu-tests-a10g-rerun:
+    name: JIT Rerun ${{ matrix.shard }} (A10G)
+    needs: analyze-gpu-a10g-failure
+    if: |
+      !cancelled() &&
+      needs.analyze-gpu-a10g-failure.outputs.is_spot_termination == 'true' &&
+      needs.analyze-gpu-a10g-failure.outputs.rerun_matrix != ''
+    runs-on: [self-hosted, Linux, X64, gpu, sm86, on-demand]
+    timeout-minutes: 360
+    strategy:
+      fail-fast: true
+      matrix: ${{ fromJSON(needs.analyze-gpu-a10g-failure.outputs.rerun_matrix) }}
+    env:
+      DOCKER_IMAGE: flashinfer/flashinfer-ci-cu129:${{ inputs.docker_tag }}
+    steps:
+      - name: Cleanup
+        run: |
+          docker stop $(docker ps -q) 2>/dev/null || true
+          docker rm $(docker ps -aq) 2>/dev/null || true
+          # Quoted, and :? aborts rather than expanding to /* when the variable is empty.
+          sudo rm -rf "${GITHUB_WORKSPACE:?}"/* || true
+          sudo rm -rf "${GITHUB_WORKSPACE:?}"/.[!.]* || true
+          rm -rf ~/.cache/flashinfer_jit || true
+          docker system prune -f || true
+          nvidia-smi || true
+
+      - uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.pr_head_sha }}
+          submodules: recursive
+
+      - name: Login to Docker Hub
+        uses: docker/login-action@v3
+        with:
+          username: flashinfer
+          password: ${{ secrets.DOCKERHUB_TOKEN }}
+        continue-on-error: true
+
+      - name: Show Node Info
+        run: ./scripts/task_show_node_info.sh
+        env:
+          NODE_NAME: ${{ runner.name }}
+          WORKSPACE: ${{ github.workspace }}
+          BUILD_NUMBER: ${{ github.run_number }}
+
+      - name: Run JIT Unittest Part ${{ matrix.shard }}
+        run: bash ci/bash.sh ${DOCKER_IMAGE} ./scripts/task_jit_run_tests_part${{ matrix.shard }}.sh
+
+  # JIT unit tests (part 3 script) on a T4 (sm75) spot runner.
+  gpu-tests-t4:
+    name: JIT Unittest (T4)
+    if: inputs.skip_gpu != 'true'
+    runs-on: [self-hosted, Linux, X64, gpu, sm75, spot]
+    timeout-minutes: 360
+    env:
+      DOCKER_IMAGE: flashinfer/flashinfer-ci-cu129:${{ inputs.docker_tag }}
+    steps:
+      - name: Cleanup
+        run: |
+          docker stop $(docker ps -q) 2>/dev/null || true
+          docker rm $(docker ps -aq) 2>/dev/null || true
+          # Quoted, and :? aborts rather than expanding to /* when the variable is empty.
+          sudo rm -rf "${GITHUB_WORKSPACE:?}"/* || true
+          sudo rm -rf "${GITHUB_WORKSPACE:?}"/.[!.]* || true
+          rm -rf ~/.cache/flashinfer_jit || true
+          docker system prune -f || true
+          nvidia-smi || true
+
+      - uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.pr_head_sha }}
+          submodules: recursive
+
+      - name: Start spot termination monitor
+        run: ./scripts/task_monitor_spot.sh &
+
+      - name: Login to Docker Hub
+        uses: docker/login-action@v3
+        with:
+          username: flashinfer
+          password: ${{ secrets.DOCKERHUB_TOKEN }}
+        continue-on-error: true
+
+      - name: Show Node Info
+        run: ./scripts/task_show_node_info.sh
+        env:
+          NODE_NAME: ${{ runner.name }}
+          WORKSPACE: ${{ github.workspace }}
+          BUILD_NUMBER: ${{ github.run_number }}
+
+      - name: Run JIT Unittest Part 3 (T4)
+        run: bash ci/bash.sh ${DOCKER_IMAGE} ./scripts/task_jit_run_tests_part3.sh
+
+  # Decides whether the T4 failure looks like a spot termination.
+  analyze-gpu-t4-failure:
+    name: Analyze GPU T4 Failure
+    needs: gpu-tests-t4
+    if: "!cancelled() && inputs.skip_gpu != 'true' && (needs.gpu-tests-t4.result == 'failure' || needs.gpu-tests-t4.result == 'cancelled')"
+    runs-on: ubuntu-latest
+    outputs:
+      is_spot_termination: ${{ steps.analyze.outputs.is_spot_termination }}
+    steps:
+      - name: Analyze failure from job logs
+        id: analyze
+        env:
+          GH_TOKEN: ${{ github.token }}
+        run: |
+          RUN_ID="${GITHUB_RUN_ID}"
+          SPOT_TERMINATION=false
+          FAILED_JOBS=$(gh api "/repos/${GITHUB_REPOSITORY}/actions/runs/${RUN_ID}/jobs?per_page=100" \
+            --jq '.jobs[] | select(.name | contains("T4")) | select(.conclusion == "failure" or .conclusion == "cancelled") | .id')
+          if [ -z "$FAILED_JOBS" ]; then
+            echo "is_spot_termination=false" >> "$GITHUB_OUTPUT"
+            exit 0
+          fi
+          for JOB_ID in $FAILED_JOBS; do
+            if ! gh api "/repos/${GITHUB_REPOSITORY}/actions/jobs/${JOB_ID}/logs" > job_log.zip 2>/dev/null; then
+              continue
+            fi
+            if file job_log.zip | grep -q "Zip archive"; then
+              unzip -p job_log.zip > job_log.txt 2>/dev/null || mv job_log.zip job_log.txt
+            else
+              mv job_log.zip job_log.txt
+            fi
+            if grep -q "FLASHINFER_SPOT_TERMINATION_DETECTED" job_log.txt; then
+              SPOT_TERMINATION=true
+              break
+            fi
+            # NOTE(review): "The operation was canceled" may also match jobs cancelled
+            # for other reasons (e.g. fail-fast siblings) - confirm this is intended.
+            if grep -qiE "connection reset by peer|context canceled|The operation was canceled|grpc.*closing|The self-hosted runner.*lost" job_log.txt; then
+              SPOT_TERMINATION=true
+              break
+            fi
+          done
+          echo "is_spot_termination=$SPOT_TERMINATION" >> "$GITHUB_OUTPUT"
+
+  # On-demand rerun of the T4 tests; only runs after a detected spot termination.
+  gpu-tests-t4-rerun:
+    name: JIT Rerun (T4)
+    needs: analyze-gpu-t4-failure
+    if: |
+      !cancelled() &&
+      needs.analyze-gpu-t4-failure.outputs.is_spot_termination == 'true'
+    runs-on: [self-hosted, Linux, X64, gpu, sm75, on-demand]
+    timeout-minutes: 360
+    env:
+      DOCKER_IMAGE: flashinfer/flashinfer-ci-cu129:${{ inputs.docker_tag }}
+    steps:
+      - name: Cleanup
+        run: |
+          docker stop $(docker ps -q) 2>/dev/null || true
+          docker rm $(docker ps -aq) 2>/dev/null || true
+          # Quoted, and :? aborts rather than expanding to /* when the variable is empty.
+          sudo rm -rf "${GITHUB_WORKSPACE:?}"/* || true
+          sudo rm -rf "${GITHUB_WORKSPACE:?}"/.[!.]* || true
+          rm -rf ~/.cache/flashinfer_jit || true
+          docker system prune -f || true
+          nvidia-smi || true
+
+      - uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.pr_head_sha }}
+          submodules: recursive
+
+      - name: Login to Docker Hub
+        uses: docker/login-action@v3
+        with:
+          username: flashinfer
+          password: ${{ secrets.DOCKERHUB_TOKEN }}
+        continue-on-error: true
+
+      - name: Show Node Info
+        run: ./scripts/task_show_node_info.sh
+        env:
+          NODE_NAME: ${{ runner.name }}
+          WORKSPACE: ${{ github.workspace }}
+          BUILD_NUMBER: ${{ github.run_number }}
+
+      - name: Run JIT Unittest Part 3 (T4)
+        run: bash ci/bash.sh ${DOCKER_IMAGE} ./scripts/task_jit_run_tests_part3.sh
+
+  # Reports results to the check runs given as inputs. Runs even when earlier
+  # jobs failed or were skipped (always()).
+  update-check-runs:
+    name: Update Check Runs
+    if: always()
+    needs:
+      - aot-build-import
+      - analyze-aot-failure
+      - aot-build-import-rerun
+      - gpu-tests-a10g
+      - analyze-gpu-a10g-failure
+      - gpu-tests-a10g-rerun
+      - gpu-tests-t4
+      - analyze-gpu-t4-failure
+      - gpu-tests-t4-rerun
+    runs-on: ubuntu-latest
+    steps:
+      - name: Generate GitHub App Token
+        id: app-token
+        uses: actions/create-github-app-token@v1
+        with:
+          app-id: ${{ secrets.GH_APP_ID }}
+          private-key: ${{ secrets.GH_APP_KEY }}
+          owner: flashinfer-ai
+          repositories: flashinfer
+
+      - name: Update AOT Check Run
+        if: inputs.aot_check_id != '' && inputs.skip_aot != 'true'
+        env:
+          GH_TOKEN: ${{ steps.app-token.outputs.token }}
+          REPO: ${{ github.repository }}
+          CHECK_ID: ${{ inputs.aot_check_id }}
+          AOT_SPOT: ${{ needs.aot-build-import.result }}
+          AOT_SPOT_TERM: ${{ needs.analyze-aot-failure.outputs.is_spot_termination }}
+          AOT_RERUN: ${{ needs.aot-build-import-rerun.result }}
+        run: |
+          if [ "$AOT_SPOT" == "success" ]; then
+            CONCLUSION="success"
+            TITLE="AOT Build Tests Passed"
+            SUMMARY="All AOT build tests passed on spot instances."
+          elif [ "$AOT_SPOT_TERM" == "true" ] && [ "$AOT_RERUN" == "success" ]; then
+            CONCLUSION="success"
+            TITLE="AOT Build Tests Passed (rerun)"
+            SUMMARY="Spot instance was terminated. Rerun on on-demand instances passed."
+          else
+            CONCLUSION="failure"
+            TITLE="AOT Build Tests Failed"
+            SUMMARY="AOT build tests failed. Check the workflow logs for details."
+          fi
+
+          # -f sends raw strings; the fields are quoted so the brackets are never globbed.
+          gh api -X PATCH "repos/${REPO}/check-runs/${CHECK_ID}" \
+            -f status="completed" \
+            -f conclusion="$CONCLUSION" \
+            -f "output[title]=$TITLE" \
+            -f "output[summary]=$SUMMARY"
+
+      - name: Update GPU A10G Check Run
+        if: inputs.gpu_a10g_check_id != '' && inputs.skip_gpu != 'true'
+        env:
+          GH_TOKEN: ${{ steps.app-token.outputs.token }}
+          REPO: ${{ github.repository }}
+          CHECK_ID: ${{ inputs.gpu_a10g_check_id }}
+          GPU_SPOT: ${{ needs.gpu-tests-a10g.result }}
+          GPU_SPOT_TERM: ${{ needs.analyze-gpu-a10g-failure.outputs.is_spot_termination }}
+          GPU_RERUN: ${{ needs.gpu-tests-a10g-rerun.result }}
+        run: |
+          if [ "$GPU_SPOT" == "success" ]; then
+            CONCLUSION="success"
+            TITLE="JIT Unittest (A10G) Passed"
+            SUMMARY="All JIT unittest passed on A10G spot instances."
+          elif [ "$GPU_SPOT_TERM" == "true" ] && [ "$GPU_RERUN" == "success" ]; then
+            CONCLUSION="success"
+            TITLE="JIT Unittest (A10G) Passed (rerun)"
+            SUMMARY="Spot instance was terminated. Rerun on on-demand A10G instances passed."
+          else
+            CONCLUSION="failure"
+            TITLE="JIT Unittest (A10G) Failed"
+            SUMMARY="JIT unittest on A10G failed. Check the workflow logs for details."
+          fi
+
+          # -f sends raw strings; the fields are quoted so the brackets are never globbed.
+          gh api -X PATCH "repos/${REPO}/check-runs/${CHECK_ID}" \
+            -f status="completed" \
+            -f conclusion="$CONCLUSION" \
+            -f "output[title]=$TITLE" \
+            -f "output[summary]=$SUMMARY"
+
+      - name: Update GPU T4 Check Run
+        if: inputs.gpu_t4_check_id != '' && inputs.skip_gpu != 'true'
+        env:
+          GH_TOKEN: ${{ steps.app-token.outputs.token }}
+          REPO: ${{ github.repository }}
+          CHECK_ID: ${{ inputs.gpu_t4_check_id }}
+          GPU_SPOT: ${{ needs.gpu-tests-t4.result }}
+          GPU_SPOT_TERM: ${{ needs.analyze-gpu-t4-failure.outputs.is_spot_termination }}
+          GPU_RERUN: ${{ needs.gpu-tests-t4-rerun.result }}
+        run: |
+          if [ "$GPU_SPOT" == "success" ]; then
+            CONCLUSION="success"
+            TITLE="JIT Unittest (T4) Passed"
+            SUMMARY="All JIT unittest passed on T4 spot instances."
+          elif [ "$GPU_SPOT_TERM" == "true" ] && [ "$GPU_RERUN" == "success" ]; then
+            CONCLUSION="success"
+            TITLE="JIT Unittest (T4) Passed (rerun)"
+            SUMMARY="Spot instance was terminated. Rerun on on-demand T4 instances passed."
+          else
+            CONCLUSION="failure"
+            TITLE="JIT Unittest (T4) Failed"
+            SUMMARY="JIT unittest on T4 failed. Check the workflow logs for details."
+          fi
+
+          # -f sends raw strings; the fields are quoted so the brackets are never globbed.
+          gh api -X PATCH "repos/${REPO}/check-runs/${CHECK_ID}" \
+            -f status="completed" \
+            -f conclusion="$CONCLUSION" \
+            -f "output[title]=$TITLE" \
+            -f "output[summary]=$SUMMARY"
+
+      - name: Update Test Results Summary
+        if: inputs.summary_check_id != ''
+        env:
+          GH_TOKEN: ${{ steps.app-token.outputs.token }}
+          REPO: ${{ github.repository }}
+          CHECK_ID: ${{ inputs.summary_check_id }}
+          AOT_SPOT: ${{ needs.aot-build-import.result }}
+          AOT_RERUN: ${{ needs.aot-build-import-rerun.result }}
+          AOT_SPOT_TERM: ${{ needs.analyze-aot-failure.outputs.is_spot_termination }}
+          GPU_A10G_SPOT: ${{ needs.gpu-tests-a10g.result }}
+          GPU_A10G_RERUN: ${{ needs.gpu-tests-a10g-rerun.result }}
+          GPU_A10G_SPOT_TERM: ${{ needs.analyze-gpu-a10g-failure.outputs.is_spot_termination }}
+          GPU_T4_SPOT: ${{ needs.gpu-tests-t4.result }}
+          GPU_T4_RERUN: ${{ needs.gpu-tests-t4-rerun.result }}
+          GPU_T4_SPOT_TERM: ${{ needs.analyze-gpu-t4-failure.outputs.is_spot_termination }}
+          SKIP_AOT: ${{ inputs.skip_aot }}
+          SKIP_GPU: ${{ inputs.skip_gpu }}
+        run: |
+          ALL_PASSED=true
+          SUMMARY_LINES=""
+
+          if [ "$SKIP_AOT" != "true" ]; then
+            if [ "$AOT_SPOT" == "success" ]; then
+              SUMMARY_LINES="${SUMMARY_LINES}- AOT Build Tests: Passed\n"
+            elif [ "$AOT_SPOT_TERM" == "true" ] && [ "$AOT_RERUN" == "success" ]; then
+              SUMMARY_LINES="${SUMMARY_LINES}- AOT Build Tests: Passed (rerun after spot termination)\n"
+            else
+              SUMMARY_LINES="${SUMMARY_LINES}- AOT Build Tests: Failed\n"
+              ALL_PASSED=false
+            fi
+          else
+            SUMMARY_LINES="${SUMMARY_LINES}- AOT Build Tests: Skipped\n"
+          fi
+
+          if [ "$SKIP_GPU" != "true" ]; then
+            if [ "$GPU_A10G_SPOT" == "success" ]; then
+              SUMMARY_LINES="${SUMMARY_LINES}- JIT Unittest (A10G): Passed\n"
+            elif [ "$GPU_A10G_SPOT_TERM" == "true" ] && [ "$GPU_A10G_RERUN" == "success" ]; then
+              SUMMARY_LINES="${SUMMARY_LINES}- JIT Unittest (A10G): Passed (rerun after spot termination)\n"
+            else
+              SUMMARY_LINES="${SUMMARY_LINES}- JIT Unittest (A10G): Failed\n"
+              ALL_PASSED=false
+            fi
+
+            if [ "$GPU_T4_SPOT" == "success" ]; then
+              SUMMARY_LINES="${SUMMARY_LINES}- JIT Unittest (T4): Passed\n"
+            elif [ "$GPU_T4_SPOT_TERM" == "true" ] && [ "$GPU_T4_RERUN" == "success" ]; then
+              SUMMARY_LINES="${SUMMARY_LINES}- JIT Unittest (T4): Passed (rerun after spot termination)\n"
+            else
+              SUMMARY_LINES="${SUMMARY_LINES}- JIT Unittest (T4): Failed\n"
+              ALL_PASSED=false
+            fi
+          else
+            SUMMARY_LINES="${SUMMARY_LINES}- JIT Unittest (A10G): Skipped\n"
+            SUMMARY_LINES="${SUMMARY_LINES}- JIT Unittest (T4): Skipped\n"
+          fi
+
+          if [ "$ALL_PASSED" == "true" ]; then
+            CONCLUSION="success"
+            TITLE="All tests passed"
+          else
+            CONCLUSION="failure"
+            TITLE="Some tests failed"
+          fi
+
+          SUMMARY=$(printf '%b' "$SUMMARY_LINES")
+
+          # -f sends raw strings; the fields are quoted so the brackets are never globbed.
+          gh api -X PATCH "repos/${REPO}/check-runs/${CHECK_ID}" \
+            -f status="completed" \
+            -f conclusion="$CONCLUSION" \
+            -f "output[title]=$TITLE" \
+            -f "output[summary]=$SUMMARY"
+
+          echo "Updated Test Results Summary: $CONCLUSION"
diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml
index 6e235d5e28..29ac6a09a2 100644
--- a/.github/workflows/pr-test.yml
+++ b/.github/workflows/pr-test.yml
@@ -1,20 +1,7 @@
-# CI workflow using AWS self-hosted runners.
-# Runs AOT build tests and GPU unit tests on push/PR to main.
-# Uses ci/bash.sh for Docker execution (same as Jenkins).
+# PR Test Gateway - Creates check runs and triggers test runner # -# Permission Control: -# - Push to main: Always runs -# - PR from org members (ci-users team): Runs automatically -# - PR from external contributors: Requires 'run-ci' label -# (added via @flashinfer-bot run command from authorized user) -# -# Rerun Strategy: -# - Spot jobs run with fail-fast: true -# - Background monitor checks AWS metadata for spot termination notice -# - If termination detected, writes marker to log (captured by GitHub) -# - Analyze job checks logs for marker to decide if should rerun -# - Spot termination: rerun all failed/cancelled jobs on on-demand -# - Real failure: no rerun, workflow fails fast +# Creates custom check runs via GitHub App, then dispatches tests to +# pr-test-runner.yml. Failed spot attempts are hidden from PR status. name: PR Test @@ -42,15 +29,12 @@ concurrency: permissions: contents: read pull-requests: write - actions: read + actions: write env: EXECUTOR_NUMBER: "0" jobs: - # --------------------------------------------------------------------------- - # Gate - Check if PR is authorized to run CI - # --------------------------------------------------------------------------- gate: name: Permission Check runs-on: ubuntu-latest @@ -117,9 +101,6 @@ jobs: echo "$AUTHOR is not a member of $TEAM, not authorized" fi - # --------------------------------------------------------------------------- - # Setup - Read docker tag and check if build should be skipped - # --------------------------------------------------------------------------- setup: name: Setup needs: gate @@ -128,11 +109,21 @@ jobs: outputs: docker_tag: ${{ steps.get-tag.outputs.tag }} skip_build: ${{ steps.check.outputs.skip }} + head_sha: ${{ steps.get-sha.outputs.sha }} steps: - uses: actions/checkout@v4 with: fetch-depth: 0 + - name: Get HEAD SHA + id: get-sha + run: | + if [ "${{ github.event_name }}" == "pull_request" ]; then + echo "sha=${{ github.event.pull_request.head.sha }}" >> 
$GITHUB_OUTPUT + else + echo "sha=${{ github.sha }}" >> $GITHUB_OUTPUT + fi + - name: Get Docker Tag id: get-tag run: | @@ -172,558 +163,140 @@ jobs: echo "::notice::Skipping build - only docs/config files changed" fi - # --------------------------------------------------------------------------- - # AOT Build Import Tests (Spot + On-Demand Rerun) - # --------------------------------------------------------------------------- - aot-build-import: - name: AOT Build Import (${{ matrix.arch }}, ${{ matrix.cuda }}) + orchestrator: + name: Orchestrate Tests needs: [gate, setup] if: | needs.gate.outputs.authorized == 'true' && - needs.setup.outputs.skip_build != 'true' && - github.event.inputs.skip_aot != 'true' - runs-on: - - self-hosted - - Linux - - ${{ matrix.arch }} - - cpu - - spot - timeout-minutes: 360 - strategy: - fail-fast: true - matrix: - arch: [X64, ARM64] - cuda: [cu126, cu128, cu129, cu130] - env: - DOCKER_IMAGE: flashinfer/flashinfer-ci-${{ matrix.cuda }}:${{ needs.setup.outputs.docker_tag }} - steps: - - name: Cleanup - run: | - # Stop all Docker containers to free memory - docker stop $(docker ps -q) 2>/dev/null || true - docker rm $(docker ps -aq) 2>/dev/null || true - # Clean workspace and caches - sudo rm -rf ${{ github.workspace }}/* || true - sudo rm -rf ${{ github.workspace }}/.[!.]* || true - rm -rf ~/.cache/flashinfer_jit || true - docker system prune -f || true - - - uses: actions/checkout@v4 - with: - submodules: recursive - - - name: Start spot termination monitor - run: ./scripts/task_monitor_spot.sh & - - - name: Login to Docker Hub - uses: docker/login-action@v3 - with: - username: flashinfer - password: ${{ secrets.DOCKERHUB_TOKEN }} - continue-on-error: true - - - name: Show Node Info - run: ./scripts/task_show_node_info.sh - env: - NODE_NAME: ${{ runner.name }} - WORKSPACE: ${{ github.workspace }} - BUILD_NUMBER: ${{ github.run_number }} - - - name: Run Test - run: bash ci/bash.sh ${DOCKER_IMAGE} --no-gpu 
./scripts/task_test_jit_cache_package_build_import.sh - - analyze-aot-failure: - name: Analyze AOT Failure - needs: [setup, aot-build-import] - if: "!cancelled() && (needs.aot-build-import.result == 'failure' || needs.aot-build-import.result == 'cancelled')" + needs.setup.outputs.skip_build != 'true' runs-on: ubuntu-latest - outputs: - is_spot_termination: ${{ steps.analyze.outputs.is_spot_termination }} - rerun_matrix: ${{ steps.matrix.outputs.rerun_matrix }} - steps: - - name: Analyze failure from job logs - id: analyze - env: - GH_TOKEN: ${{ github.token }} - run: | - RUN_ID="${{ github.run_id }}" - SPOT_TERMINATION=false - # Include both failed and cancelled jobs (spot termination can cause either) - FAILED_JOBS=$(gh api "/repos/${{ github.repository }}/actions/runs/${RUN_ID}/jobs?per_page=100" \ - --jq '.jobs[] | select(.name | startswith("AOT")) | select(.conclusion == "failure" or .conclusion == "cancelled") | .id') - if [ -z "$FAILED_JOBS" ]; then - echo "is_spot_termination=false" >> $GITHUB_OUTPUT - exit 0 - fi - for JOB_ID in $FAILED_JOBS; do - # Download logs (may be ZIP or plain text depending on GitHub API) - if ! 
gh api "/repos/${{ github.repository }}/actions/jobs/${JOB_ID}/logs" > job_log.zip 2>/dev/null; then - continue - fi - # Try to unzip if it's a ZIP file, otherwise use as-is - if file job_log.zip | grep -q "Zip archive"; then - unzip -p job_log.zip > job_log.txt 2>/dev/null || mv job_log.zip job_log.txt - else - mv job_log.zip job_log.txt - fi - if grep -q "FLASHINFER_SPOT_TERMINATION_DETECTED" job_log.txt; then - echo "Detected: AWS spot termination marker (job $JOB_ID)" - SPOT_TERMINATION=true - break - fi - if grep -qiE "connection reset by peer|context canceled|grpc.*closing|The self-hosted runner.*lost" job_log.txt; then - echo "Detected: infrastructure error pattern (job $JOB_ID)" - SPOT_TERMINATION=true - break - fi - done - echo "is_spot_termination=$SPOT_TERMINATION" - echo "is_spot_termination=$SPOT_TERMINATION" >> $GITHUB_OUTPUT - - - name: Build rerun matrix - id: matrix - if: steps.analyze.outputs.is_spot_termination == 'true' - run: | - MATRIX='{"include":[' - for arch in X64 ARM64; do - for cuda in cu126 cu128 cu129 cu130; do - MATRIX+='{"arch":"'$arch'","cuda":"'$cuda'"},' - done - done - MATRIX="${MATRIX%,}]}" - echo "rerun_matrix=$MATRIX" >> $GITHUB_OUTPUT - - aot-build-import-rerun: - name: AOT Build Import Rerun (${{ matrix.arch }}, ${{ matrix.cuda }}) - needs: [setup, analyze-aot-failure] - if: | - !cancelled() && - needs.analyze-aot-failure.outputs.is_spot_termination == 'true' && - needs.analyze-aot-failure.outputs.rerun_matrix != '' - runs-on: - - self-hosted - - Linux - - ${{ matrix.arch }} - - cpu - - on-demand - timeout-minutes: 360 - strategy: - fail-fast: true - matrix: ${{ fromJSON(needs.analyze-aot-failure.outputs.rerun_matrix) }} - env: - DOCKER_IMAGE: flashinfer/flashinfer-ci-${{ matrix.cuda }}:${{ needs.setup.outputs.docker_tag }} - steps: - - name: Cleanup - run: | - # Stop all Docker containers to free memory - docker stop $(docker ps -q) 2>/dev/null || true - docker rm $(docker ps -aq) 2>/dev/null || true - # Clean workspace and 
caches - sudo rm -rf ${{ github.workspace }}/* || true - sudo rm -rf ${{ github.workspace }}/.[!.]* || true - rm -rf ~/.cache/flashinfer_jit || true - docker system prune -f || true - - - uses: actions/checkout@v4 - with: - submodules: recursive - - - name: Login to Docker Hub - uses: docker/login-action@v3 - with: - username: flashinfer - password: ${{ secrets.DOCKERHUB_TOKEN }} - continue-on-error: true - - - name: Show Node Info - run: ./scripts/task_show_node_info.sh - env: - NODE_NAME: ${{ runner.name }} - WORKSPACE: ${{ github.workspace }} - BUILD_NUMBER: ${{ github.run_number }} - - - name: Run Test - run: bash ci/bash.sh ${DOCKER_IMAGE} --no-gpu ./scripts/task_test_jit_cache_package_build_import.sh - - # --------------------------------------------------------------------------- - # GPU JIT Tests - SM86 (A10G) - Spot + On-Demand Rerun - # --------------------------------------------------------------------------- - gpu-tests-a10g: - name: JIT Unittest ${{ matrix.shard }} (A10G) - needs: [gate, setup] - if: | - needs.gate.outputs.authorized == 'true' && - needs.setup.outputs.skip_build != 'true' && - github.event.inputs.skip_gpu != 'true' - runs-on: [self-hosted, Linux, X64, gpu, sm86, spot] - timeout-minutes: 360 - strategy: - fail-fast: true - matrix: - shard: [1, 2, 3, 4, 5] - env: - DOCKER_IMAGE: flashinfer/flashinfer-ci-cu129:${{ needs.setup.outputs.docker_tag }} steps: - - name: Cleanup - run: | - # Stop all Docker containers to free memory - docker stop $(docker ps -q) 2>/dev/null || true - docker rm $(docker ps -aq) 2>/dev/null || true - # Clean workspace and caches - sudo rm -rf ${{ github.workspace }}/* || true - sudo rm -rf ${{ github.workspace }}/.[!.]* || true - rm -rf ~/.cache/flashinfer_jit || true - docker system prune -f || true - nvidia-smi || true - - - uses: actions/checkout@v4 + - name: Generate Token (flashinfer) + id: flashinfer-token + uses: actions/create-github-app-token@v1 with: - submodules: recursive - - - name: Start spot 
termination monitor
-        run: ./scripts/task_monitor_spot.sh &
-
-      - name: Login to Docker Hub
-        uses: docker/login-action@v3
+          app-id: ${{ secrets.GH_APP_ID }}
+          private-key: ${{ secrets.GH_APP_KEY }}
+          owner: flashinfer-ai
+          repositories: flashinfer
+
+      # App token restricted to the flashinfer-ai/ci-infra repository; it is
+      # consumed by the repository_dispatch fallback in "Trigger Test Runner".
+      - name: Generate Token (ci-infra)
+        id: ci-infra-token
+        uses: actions/create-github-app-token@v1
         with:
-          username: flashinfer
-          password: ${{ secrets.DOCKERHUB_TOKEN }}
-        continue-on-error: true
-
-      - name: Show Node Info
-        run: ./scripts/task_show_node_info.sh
+          app-id: ${{ secrets.GH_APP_ID }}
+          private-key: ${{ secrets.GH_APP_KEY }}
+          owner: flashinfer-ai
+          repositories: ci-infra
+
+      # Creates the "in_progress" check runs on the PR head commit and exports
+      # each check run ID as a step output (aot_check_id, gpu_a10g_check_id,
+      # gpu_t4_check_id, summary_check_id) for the "Trigger Test Runner" step.
+      - name: Create Check Runs (PR only)
+        id: create-checks
+        if: github.event_name == 'pull_request'
         env:
-          NODE_NAME: ${{ runner.name }}
-          WORKSPACE: ${{ github.workspace }}
-          BUILD_NUMBER: ${{ github.run_number }}
-
-      - name: Run JIT Unittest Part ${{ matrix.shard }}
-        run: bash ci/bash.sh ${DOCKER_IMAGE} ./scripts/task_jit_run_tests_part${{ matrix.shard }}.sh
-
-  analyze-gpu-a10g-failure:
-    name: Analyze GPU A10G Failure
-    needs: [setup, gpu-tests-a10g]
-    if: "!cancelled() && (needs.gpu-tests-a10g.result == 'failure' || needs.gpu-tests-a10g.result == 'cancelled')"
-    runs-on: ubuntu-latest
-    outputs:
-      is_spot_termination: ${{ steps.analyze.outputs.is_spot_termination }}
-      rerun_matrix: ${{ steps.matrix.outputs.rerun_matrix }}
-    steps:
-      - name: Analyze failure from job logs
-        id: analyze
-        env:
-          GH_TOKEN: ${{ github.token }}
+          GH_TOKEN: ${{ steps.flashinfer-token.outputs.token }}
+          # Expression values are handed to the script through the environment
+          # (same pattern as the "Trigger Test Runner" step) instead of being
+          # expanded into the shell source, so their content is never parsed
+          # as shell code.
+          HEAD_SHA: ${{ needs.setup.outputs.head_sha }}
+          REPO: ${{ github.repository }}
+          SKIP_AOT: ${{ github.event.inputs.skip_aot || 'false' }}
+          SKIP_GPU: ${{ github.event.inputs.skip_gpu || 'false' }}
         run: |
-          RUN_ID="${{ github.run_id }}"
-          SPOT_TERMINATION=false
-          # Include both failed and cancelled jobs (spot termination can cause either)
-          FAILED_JOBS=$(gh api "/repos/${{ github.repository }}/actions/runs/${RUN_ID}/jobs?per_page=100" \
-            --jq '.jobs[] | select(.name | contains("A10G")) | select(.conclusion == "failure" or .conclusion == "cancelled") | .id')
-          if [ -z "$FAILED_JOBS" ]; then
-            echo "is_spot_termination=false" >> $GITHUB_OUTPUT
-            exit 0
+          # NOTE(review): this link targets the ci-infra copy of the runner
+          # workflow; "Trigger Test Runner" first tries to dispatch in this
+          # repository - confirm the link is right for both paths.
+          RUNNER_URL="https://github.com/flashinfer-ai/ci-infra/actions/workflows/pr-test-runner.yml"
+
+          # Field arguments containing [..] are quoted so the shell never
+          # treats them as glob patterns.
+          if [ "$SKIP_AOT" != "true" ]; then
+            AOT_CHECK=$(gh api "repos/$REPO/check-runs" \
+              -f name="AOT Build Tests" \
+              -f head_sha="$HEAD_SHA" \
+              -f status="in_progress" \
+              -F "output[title]=In progress" \
+              -F "output[summary]=Running AOT build tests: [view test runs]($RUNNER_URL)" \
+              --jq '.id')
+            echo "aot_check_id=$AOT_CHECK" >> "$GITHUB_OUTPUT"
           fi
-          for JOB_ID in $FAILED_JOBS; do
-            # Download logs (may be ZIP or plain text depending on GitHub API)
-            if ! gh api "/repos/${{ github.repository }}/actions/jobs/${JOB_ID}/logs" > job_log.zip 2>/dev/null; then
-              continue
-            fi
-            # Try to unzip if it's a ZIP file, otherwise use as-is
-            if file job_log.zip | grep -q "Zip archive"; then
-              unzip -p job_log.zip > job_log.txt 2>/dev/null || mv job_log.zip job_log.txt
-            else
-              mv job_log.zip job_log.txt
-            fi
-            if grep -q "FLASHINFER_SPOT_TERMINATION_DETECTED" job_log.txt; then
-              echo "Detected: AWS spot termination marker (job $JOB_ID)"
-              SPOT_TERMINATION=true
-              break
-            fi
-            if grep -qiE "connection reset by peer|context canceled|grpc.*closing|The self-hosted runner.*lost" job_log.txt; then
-              echo "Detected: infrastructure error pattern (job $JOB_ID)"
-              SPOT_TERMINATION=true
-              break
-            fi
-          done
-          echo "is_spot_termination=$SPOT_TERMINATION"
-          echo "is_spot_termination=$SPOT_TERMINATION" >> $GITHUB_OUTPUT
-
-      - name: Build rerun matrix
-        id: matrix
-        if: steps.analyze.outputs.is_spot_termination == 'true'
-        run: |
-          echo 'rerun_matrix={"include":[{"shard":1},{"shard":2},{"shard":3},{"shard":4},{"shard":5}]}' >> $GITHUB_OUTPUT

-  gpu-tests-a10g-rerun:
-    name: JIT Rerun ${{ matrix.shard }} (A10G)
-    needs: [setup, analyze-gpu-a10g-failure]
-    if: |
-      !cancelled() &&
-      needs.analyze-gpu-a10g-failure.outputs.is_spot_termination == 'true' &&
-      needs.analyze-gpu-a10g-failure.outputs.rerun_matrix != ''
-    runs-on: [self-hosted, Linux, X64, gpu, sm86, on-demand]
-    timeout-minutes: 360
-    strategy:
-      fail-fast: true
-      matrix: ${{ fromJSON(needs.analyze-gpu-a10g-failure.outputs.rerun_matrix) }}
-    env:
-      DOCKER_IMAGE: flashinfer/flashinfer-ci-cu129:${{ needs.setup.outputs.docker_tag }}
-    steps:
-      - name: Cleanup
-        run: |
-          # Stop all Docker containers to free memory
-          docker stop $(docker ps -q) 2>/dev/null || true
-          docker rm $(docker ps -aq) 2>/dev/null || true
-          # Clean workspace and caches
-          sudo rm -rf ${{ github.workspace }}/* || true
-          sudo rm -rf ${{ github.workspace }}/.[!.]* || true
-          rm -rf ~/.cache/flashinfer_jit || true
-          docker system prune -f || true
-          nvidia-smi || true
-
-      - uses: actions/checkout@v4
-        with:
-          submodules: recursive
+          if [ "$SKIP_GPU" != "true" ]; then
+            A10G_CHECK=$(gh api "repos/$REPO/check-runs" \
+              -f name="JIT Unittest (A10G)" \
+              -f head_sha="$HEAD_SHA" \
+              -f status="in_progress" \
+              -F "output[title]=In progress" \
+              -F "output[summary]=Running JIT unittests on A10G instances: [view test runs]($RUNNER_URL)" \
+              --jq '.id')
+            echo "gpu_a10g_check_id=$A10G_CHECK" >> "$GITHUB_OUTPUT"
+
+            T4_CHECK=$(gh api "repos/$REPO/check-runs" \
+              -f name="JIT Unittest (T4)" \
+              -f head_sha="$HEAD_SHA" \
+              -f status="in_progress" \
+              -F "output[title]=In progress" \
+              -F "output[summary]=Running JIT unittests on T4 instances: [view test runs]($RUNNER_URL)" \
+              --jq '.id')
+            echo "gpu_t4_check_id=$T4_CHECK" >> "$GITHUB_OUTPUT"
+          fi

-      - name: Login to Docker Hub
-        uses: docker/login-action@v3
-        with:
-          username: flashinfer
-          password: ${{ secrets.DOCKERHUB_TOKEN }}
-        continue-on-error: true
+          # The summary check run is created regardless of the skip flags.
+          SUMMARY_CHECK=$(gh api "repos/$REPO/check-runs" \
+            -f name="Test Results Summary" \
+            -f head_sha="$HEAD_SHA" \
+            -f status="in_progress" \
+            -F "output[title]=In progress" \
+            -F "output[summary]=Waiting for test results: [view test runs]($RUNNER_URL)" \
+            --jq '.id')
+          echo "summary_check_id=$SUMMARY_CHECK" >> "$GITHUB_OUTPUT"

-      - name: Show Node Info
-        run: ./scripts/task_show_node_info.sh
+      # Hands the run over to pr-test-runner.yml, forwarding the head SHA,
+      # docker tag, check run IDs, skip flags and concurrency key. All values
+      # reach the script through env rather than inline expression expansion.
+      - name: Trigger Test Runner
         env:
-          NODE_NAME: ${{ runner.name }}
-          WORKSPACE: ${{ github.workspace }}
-          BUILD_NUMBER: ${{ github.run_number }}
-
-      - name: Run JIT Unittest Part ${{ matrix.shard }}
-        run: bash ci/bash.sh ${DOCKER_IMAGE} ./scripts/task_jit_run_tests_part${{ matrix.shard }}.sh
-
-  # ---------------------------------------------------------------------------
-  # GPU JIT Tests - SM75 (T4) - Spot + On-Demand Rerun
-  # ---------------------------------------------------------------------------
-  gpu-tests-t4:
-    name: JIT Unittest (T4)
-    needs: [gate, setup]
-    if: |
-      needs.gate.outputs.authorized == 'true' &&
-      needs.setup.outputs.skip_build != 'true' &&
-      github.event.inputs.skip_gpu != 'true'
-    runs-on: [self-hosted, Linux, X64, gpu, sm75, spot]
-    timeout-minutes: 360
-    env:
-      DOCKER_IMAGE: flashinfer/flashinfer-ci-cu129:${{ needs.setup.outputs.docker_tag }}
-    steps:
-      - name: Cleanup
+          GH_ACTION_TOKEN: ${{ github.token }}
+          CI_INFRA_TOKEN: ${{ steps.ci-infra-token.outputs.token }}
+          HEAD_SHA: ${{ needs.setup.outputs.head_sha }}
+          DOCKER_TAG: ${{ needs.setup.outputs.docker_tag }}
+          # Check run IDs are empty when "Create Check Runs" did not run.
+          AOT_CHECK_ID: ${{ steps.create-checks.outputs.aot_check_id || '' }}
+          GPU_A10G_CHECK_ID: ${{ steps.create-checks.outputs.gpu_a10g_check_id || '' }}
+          GPU_T4_CHECK_ID: ${{ steps.create-checks.outputs.gpu_t4_check_id || '' }}
+          SUMMARY_CHECK_ID: ${{ steps.create-checks.outputs.summary_check_id || '' }}
+          SKIP_AOT: ${{ github.event.inputs.skip_aot || 'false' }}
+          SKIP_GPU: ${{ github.event.inputs.skip_gpu || 'false' }}
+          CONCURRENCY_KEY: pr-test-${{ github.ref }}
+          # NOTE(review): for a PR opened from a fork, head_ref names a branch
+          # that may not exist in this repository; the dispatch would then
+          # fail and the fallback below would be used - confirm this is intended.
+          DISPATCH_REF: ${{ github.head_ref || github.ref_name }}
         run: |
-          # Stop all Docker containers to free memory
-          docker stop $(docker ps -q) 2>/dev/null || true
-          docker rm $(docker ps -aq) 2>/dev/null || true
-          # Clean workspace and caches
-          sudo rm -rf ${{ github.workspace }}/* || true
-          sudo rm -rf ${{ github.workspace }}/.[!.]* || true
-          rm -rf ~/.cache/flashinfer_jit || true
-          docker system prune -f || true
-          nvidia-smi || true
-
-      - uses: actions/checkout@v4
-        with:
-          submodules: recursive
-
-      - name: Start spot termination monitor
-        run: ./scripts/task_monitor_spot.sh &
-
-      - name: Login to Docker Hub
-        uses: docker/login-action@v3
-        with:
-          username: flashinfer
-          password: ${{ secrets.DOCKERHUB_TOKEN }}
-        continue-on-error: true
-
-      - name: Show Node Info
-        run: ./scripts/task_show_node_info.sh
-        env:
-          NODE_NAME: ${{ runner.name }}
-          WORKSPACE: ${{ github.workspace }}
-          BUILD_NUMBER: ${{ github.run_number }}
-
-      - name: Run JIT Unittest Part 3 (T4)
-        run: bash ci/bash.sh ${DOCKER_IMAGE} ./scripts/task_jit_run_tests_part3.sh
+          # Try workflow_dispatch first (works after pr-test-runner.yml is on main)
+          # Uses GITHUB_TOKEN (has actions:write) - App token doesn't have Actions permission
+          # --ref uses PR branch (to test PR changes to runner) or main (for push)
+          # stderr goes to a temp file instead of /dev/null so that the reason
+          # for a failed dispatch is printed before the fallback runs.
+          DISPATCH_ERR="$(mktemp)"
+          if GH_TOKEN="$GH_ACTION_TOKEN" gh workflow run pr-test-runner.yml \
+            --repo "${{ github.repository }}" \
+            --ref "$DISPATCH_REF" \
+            -f pr_head_sha="$HEAD_SHA" \
+            -f docker_tag="$DOCKER_TAG" \
+            -f aot_check_id="$AOT_CHECK_ID" \
+            -f gpu_a10g_check_id="$GPU_A10G_CHECK_ID" \
+            -f gpu_t4_check_id="$GPU_T4_CHECK_ID" \
+            -f summary_check_id="$SUMMARY_CHECK_ID" \
+            -f skip_aot="$SKIP_AOT" \
+            -f skip_gpu="$SKIP_GPU" \
+            -f concurrency_key="$CONCURRENCY_KEY" 2>"$DISPATCH_ERR"; then
+            echo "Triggered via workflow_dispatch (flashinfer)"
+          else
+            echo "workflow_dispatch failed; falling back to repository_dispatch. gh reported:"
+            cat "$DISPATCH_ERR"
+            # Fallback: repository_dispatch to ci-infra (bootstrap)
+            # Bracketed field names are quoted so the shell never globs them.
+            GH_TOKEN="$CI_INFRA_TOKEN" gh api repos/flashinfer-ai/ci-infra/dispatches \
+              -f event_type="run-pr-test" \
+              -f "client_payload[pr_head_sha]=$HEAD_SHA" \
+              -f "client_payload[docker_tag]=$DOCKER_TAG" \
+              -f "client_payload[aot_check_id]=$AOT_CHECK_ID" \
+              -f "client_payload[gpu_a10g_check_id]=$GPU_A10G_CHECK_ID" \
+              -f "client_payload[gpu_t4_check_id]=$GPU_T4_CHECK_ID" \
+              -f "client_payload[summary_check_id]=$SUMMARY_CHECK_ID" \
+              -f "client_payload[skip_aot]=$SKIP_AOT" \
+              -f "client_payload[skip_gpu]=$SKIP_GPU" \
+              -f "client_payload[concurrency_key]=$CONCURRENCY_KEY"
+            echo "Triggered via repository_dispatch (ci-infra bootstrap)"
+          fi

-  analyze-gpu-t4-failure:
-    name: Analyze GPU T4 Failure
-    needs: [setup, gpu-tests-t4]
-    if: "!cancelled() && (needs.gpu-tests-t4.result == 'failure' || needs.gpu-tests-t4.result == 'cancelled')"
+  # Runs only for pull_request events whose gate job did not report
+  # authorized == 'true'; writes an authorization notice to the job summary.
+  report-unauthorized:
+    name: Report Unauthorized
+    needs: gate
+    if: github.event_name == 'pull_request' && needs.gate.outputs.authorized != 'true'
     runs-on: ubuntu-latest
-    outputs:
-      is_spot_termination: ${{ steps.analyze.outputs.is_spot_termination }}
     steps:
-      - name: Analyze failure from job logs
-        id: analyze
+      # NOTE(review): despite its name, this step only appends to
+      # $GITHUB_STEP_SUMMARY; nothing in the script posts a PR comment or
+      # reads GH_TOKEN - confirm whether a comment was intended.
+      - name: Post Comment
         env:
           GH_TOKEN: ${{ github.token }}
         run: |
-          RUN_ID="${{ github.run_id }}"
-          SPOT_TERMINATION=false
-          # Include both failed and cancelled jobs (spot termination can cause either)
-          FAILED_JOBS=$(gh api "/repos/${{ github.repository }}/actions/runs/${RUN_ID}/jobs?per_page=100" \
-            --jq '.jobs[] | select(.name | contains("T4")) | select(.conclusion == "failure" or .conclusion == "cancelled") | .id')
-          if [ -z "$FAILED_JOBS" ]; then
-            echo "is_spot_termination=false" >> $GITHUB_OUTPUT
-            exit 0
-          fi
-          for JOB_ID in $FAILED_JOBS; do
-            # Download logs (may be ZIP or plain text depending on GitHub API)
-            if ! gh api "/repos/${{ github.repository }}/actions/jobs/${JOB_ID}/logs" > job_log.zip 2>/dev/null; then
-              continue
-            fi
-            # Try to unzip if it's a ZIP file, otherwise use as-is
-            if file job_log.zip | grep -q "Zip archive"; then
-              unzip -p job_log.zip > job_log.txt 2>/dev/null || mv job_log.zip job_log.txt
-            else
-              mv job_log.zip job_log.txt
-            fi
-            if grep -q "FLASHINFER_SPOT_TERMINATION_DETECTED" job_log.txt; then
-              echo "Detected: AWS spot termination marker (job $JOB_ID)"
-              SPOT_TERMINATION=true
-              break
-            fi
-            if grep -qiE "connection reset by peer|context canceled|grpc.*closing|The self-hosted runner.*lost" job_log.txt; then
-              echo "Detected: infrastructure error pattern (job $JOB_ID)"
-              SPOT_TERMINATION=true
-              break
-            fi
-          done
-          echo "is_spot_termination=$SPOT_TERMINATION"
-          echo "is_spot_termination=$SPOT_TERMINATION" >> $GITHUB_OUTPUT
-
-  gpu-tests-t4-rerun:
-    name: JIT Rerun (T4)
-    needs: [setup, analyze-gpu-t4-failure]
-    if: |
-      !cancelled() &&
-      needs.analyze-gpu-t4-failure.outputs.is_spot_termination == 'true'
-    runs-on: [self-hosted, Linux, X64, gpu, sm75, on-demand]
-    timeout-minutes: 360
-    env:
-      DOCKER_IMAGE: flashinfer/flashinfer-ci-cu129:${{ needs.setup.outputs.docker_tag }}
-    steps:
-      - name: Cleanup
-        run: |
-          # Stop all Docker containers to free memory
-          docker stop $(docker ps -q) 2>/dev/null || true
-          docker rm $(docker ps -aq) 2>/dev/null || true
-          # Clean workspace and caches
-          sudo rm -rf ${{ github.workspace }}/* || true
-          sudo rm -rf ${{ github.workspace }}/.[!.]* || true
-          rm -rf ~/.cache/flashinfer_jit || true
-          docker system prune -f || true
-          nvidia-smi || true
-
-      - uses: actions/checkout@v4
-        with:
-          submodules: recursive
-
-      - name: Login to Docker Hub
-        uses: docker/login-action@v3
-        with:
-          username: flashinfer
-          password: ${{ secrets.DOCKERHUB_TOKEN }}
-        continue-on-error: true
-
-      - name: Show Node Info
-        run: ./scripts/task_show_node_info.sh
-        env:
-          NODE_NAME: ${{ runner.name }}
-          WORKSPACE: ${{ github.workspace }}
-          BUILD_NUMBER: ${{ github.run_number }}
-
-      - name: Run JIT Unittest Part 3 (T4)
-        run: bash ci/bash.sh ${DOCKER_IMAGE} ./scripts/task_jit_run_tests_part3.sh
-
-  # ---------------------------------------------------------------------------
-  # Test Results Summary
-  # ---------------------------------------------------------------------------
-  test-results-summary:
-    name: Test Results Summary
-    if: "!cancelled()"
-    needs:
-      - gate
-      - setup
-      - aot-build-import
-      - analyze-aot-failure
-      - aot-build-import-rerun
-      - gpu-tests-a10g
-      - analyze-gpu-a10g-failure
-      - gpu-tests-a10g-rerun
-      - gpu-tests-t4
-      - analyze-gpu-t4-failure
-      - gpu-tests-t4-rerun
-    runs-on: ubuntu-latest
-    steps:
-      - name: Check Results
-        run: |
-          echo "## Test Results Summary" >> $GITHUB_STEP_SUMMARY
-
-          # Check if CI was skipped due to permissions
-          if [ "${{ needs.gate.outputs.authorized }}" != "true" ]; then
-            echo "CI skipped (pending authorization)" >> $GITHUB_STEP_SUMMARY
-            echo "A contributor in @flashinfer-ai/ci-users can comment \`@flashinfer-bot run\` to approve." >> $GITHUB_STEP_SUMMARY
-            exit 0
-          fi
-          # Helper function to check job status
-          check_status() {
-            local name=$1 skip=$2 spot=$3 spot_term=$4 rerun=$5
-            echo "$name" >> $GITHUB_STEP_SUMMARY
-            if [ "$skip" == "true" ]; then
-              echo "- Status: Skipped" >> $GITHUB_STEP_SUMMARY
-            elif [ "$spot" == "success" ]; then
-              echo "- Status: Passed (spot)" >> $GITHUB_STEP_SUMMARY
-            elif [ "$spot_term" == "true" ] && [ "$rerun" == "success" ]; then
-              echo "- Status: Passed (on-demand rerun)" >> $GITHUB_STEP_SUMMARY
-            else
-              echo "- Status: Failed" >> $GITHUB_STEP_SUMMARY
-              return 1
-            fi
-            return 0
-          }
-
-          echo "Test Results Summary" >> $GITHUB_STEP_SUMMARY
-          echo "" >> $GITHUB_STEP_SUMMARY
-
-          if [ "${{ needs.setup.outputs.skip_build }}" == "true" ]; then
-            echo "Build skipped (docs/config only changes)" >> $GITHUB_STEP_SUMMARY
-            exit 0
-          fi
-
-          FAILED=false
-
-          check_status "AOT Build Import Tests" \
-            "${{ github.event.inputs.skip_aot }}" \
-            "${{ needs.aot-build-import.result }}" \
-            "${{ needs.analyze-aot-failure.outputs.is_spot_termination }}" \
-            "${{ needs.aot-build-import-rerun.result }}" || FAILED=true
-
-          echo "" >> $GITHUB_STEP_SUMMARY
-          check_status "GPU Tests (A10G)" \
-            "${{ github.event.inputs.skip_gpu }}" \
-            "${{ needs.gpu-tests-a10g.result }}" \
-            "${{ needs.analyze-gpu-a10g-failure.outputs.is_spot_termination }}" \
-            "${{ needs.gpu-tests-a10g-rerun.result }}" || FAILED=true
-
+          echo "## CI Authorization Required" >> $GITHUB_STEP_SUMMARY
           echo "" >> $GITHUB_STEP_SUMMARY
-          check_status "GPU Tests (T4)" \
-            "${{ github.event.inputs.skip_gpu }}" \
-            "${{ needs.gpu-tests-t4.result }}" \
-            "${{ needs.analyze-gpu-t4-failure.outputs.is_spot_termination }}" \
-            "${{ needs.gpu-tests-t4-rerun.result }}" || FAILED=true
-
-          echo "" >> $GITHUB_STEP_SUMMARY
-          if [ "$FAILED" == "true" ]; then
-            echo "Result: Tests Failed" >> $GITHUB_STEP_SUMMARY
-            exit 1
-          fi
-          echo "Result: Tests Passed" >> $GITHUB_STEP_SUMMARY
+          echo "This PR requires authorization to run CI." >> $GITHUB_STEP_SUMMARY
+          echo "A member of @flashinfer-ai/ci-users can comment \`@flashinfer-bot run\` to approve." >> $GITHUB_STEP_SUMMARY