From ce660623c84df7b7e27b6dcc1ba363a6e894c8cb Mon Sep 17 00:00:00 2001 From: Yong Wu Date: Fri, 23 Jan 2026 23:39:43 -0800 Subject: [PATCH 1/9] ci: Enable blackwell tests in public ci --- .github/workflows/pr-test.yml | 66 +++++++++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml index 6e235d5e28..7f8bc935c9 100644 --- a/.github/workflows/pr-test.yml +++ b/.github/workflows/pr-test.yml @@ -34,6 +34,10 @@ on: description: 'Skip GPU tests' type: boolean default: false + run_b200: + description: 'Run B200 tests' + type: boolean + default: false concurrency: group: pr-test-${{ github.ref }} @@ -644,6 +648,56 @@ jobs: - name: Run JIT Unittest Part 3 (T4) run: bash ci/bash.sh ${DOCKER_IMAGE} ./scripts/task_jit_run_tests_part3.sh + # --------------------------------------------------------------------------- + # GPU JIT Tests - B200 (Blackwell) - Capacity Block + # Triggers dynamic CB purchase via EventBridge → CB Manager Lambda + # Only runs when explicitly enabled via workflow_dispatch + # --------------------------------------------------------------------------- + gpu-tests-b200: + name: JIT Unittest (B200) + needs: setup + if: needs.setup.outputs.skip_build != 'true' && github.event.inputs.run_b200 == 'true' + runs-on: [self-hosted, Linux, X64, gpu, b200] + timeout-minutes: 360 + env: + DOCKER_IMAGE: flashinfer/flashinfer-ci-cu130:${{ needs.setup.outputs.docker_tag }} + steps: + - name: Cleanup + run: | + # Stop all Docker containers to free GPU memory + docker stop $(docker ps -q) 2>/dev/null || true + docker rm $(docker ps -aq) 2>/dev/null || true + # Clean workspace and caches + sudo rm -rf ${{ github.workspace }}/* || true + sudo rm -rf ${{ github.workspace }}/.[!.]* || true + rm -rf ~/.cache/flashinfer_jit || true + docker system prune -f || true + # Show GPU info (should show 1 GPU due to CUDA_VISIBLE_DEVICES) + echo "=== GPU Info ===" + nvidia-smi || true + echo 
"CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES" + + - uses: actions/checkout@v4 + with: + submodules: recursive + + - name: Login to Docker Hub + uses: docker/login-action@v3 + with: + username: flashinfer + password: ${{ secrets.DOCKERHUB_TOKEN }} + continue-on-error: true + + - name: Show Node Info + run: ./scripts/task_show_node_info.sh + env: + NODE_NAME: ${{ runner.name }} + WORKSPACE: ${{ github.workspace }} + BUILD_NUMBER: ${{ github.run_number }} + + - name: Run B200 Kernel Tests + run: bash ci/bash.sh ${DOCKER_IMAGE} ./scripts/task_test_blackwell_kernels.sh + # --------------------------------------------------------------------------- # Test Results Summary # --------------------------------------------------------------------------- @@ -662,6 +716,7 @@ jobs: - gpu-tests-t4 - analyze-gpu-t4-failure - gpu-tests-t4-rerun + - gpu-tests-b200 runs-on: ubuntu-latest steps: - name: Check Results @@ -721,6 +776,17 @@ jobs: "${{ needs.analyze-gpu-t4-failure.outputs.is_spot_termination }}" \ "${{ needs.gpu-tests-t4-rerun.result }}" || FAILED=true + # B200 tests (optional, no rerun logic yet) + echo "" >> $GITHUB_STEP_SUMMARY + B200="${{ needs.gpu-tests-b200.result }}" + RUN_B200="${{ github.event.inputs.run_b200 }}" + if [ "$RUN_B200" == "true" ]; then + echo "GPU Tests (B200): $B200" >> $GITHUB_STEP_SUMMARY + if [ "$B200" != "success" ] && [ "$B200" != "skipped" ]; then + FAILED=true + fi + fi + echo "" >> $GITHUB_STEP_SUMMARY if [ "$FAILED" == "true" ]; then echo "Result: Tests Failed" >> $GITHUB_STEP_SUMMARY From 6f31f15b0e4125500ea072c9abe85e4a86c66748 Mon Sep 17 00:00:00 2001 From: Yong Wu Date: Fri, 23 Jan 2026 23:45:59 -0800 Subject: [PATCH 2/9] remove skip --- .github/workflows/pr-test.yml | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml index 7f8bc935c9..6e92fa5b07 100644 --- a/.github/workflows/pr-test.yml +++ b/.github/workflows/pr-test.yml @@ -34,10 +34,6 @@ 
on: description: 'Skip GPU tests' type: boolean default: false - run_b200: - description: 'Run B200 tests' - type: boolean - default: false concurrency: group: pr-test-${{ github.ref }} @@ -656,7 +652,7 @@ jobs: gpu-tests-b200: name: JIT Unittest (B200) needs: setup - if: needs.setup.outputs.skip_build != 'true' && github.event.inputs.run_b200 == 'true' + if: needs.setup.outputs.skip_build != 'true' && github.event.inputs.skip_gpu != 'true' runs-on: [self-hosted, Linux, X64, gpu, b200] timeout-minutes: 360 env: @@ -776,15 +772,12 @@ jobs: "${{ needs.analyze-gpu-t4-failure.outputs.is_spot_termination }}" \ "${{ needs.gpu-tests-t4-rerun.result }}" || FAILED=true - # B200 tests (optional, no rerun logic yet) + # B200 tests (no rerun logic yet - CB instances don't get spot terminated) echo "" >> $GITHUB_STEP_SUMMARY B200="${{ needs.gpu-tests-b200.result }}" - RUN_B200="${{ github.event.inputs.run_b200 }}" - if [ "$RUN_B200" == "true" ]; then - echo "GPU Tests (B200): $B200" >> $GITHUB_STEP_SUMMARY - if [ "$B200" != "success" ] && [ "$B200" != "skipped" ]; then - FAILED=true - fi + echo "GPU Tests (B200): $B200" >> $GITHUB_STEP_SUMMARY + if [ "$B200" != "success" ] && [ "$B200" != "skipped" ] && [ "${{ github.event.inputs.skip_gpu }}" != "true" ]; then + FAILED=true fi echo "" >> $GITHUB_STEP_SUMMARY From f2db27ec92d06a1ec7d94ef921941168e4628870 Mon Sep 17 00:00:00 2001 From: Yong Wu Date: Sat, 24 Jan 2026 02:51:50 -0800 Subject: [PATCH 3/9] fix: make task_test_blackwell_kernels.sh executable --- scripts/task_test_blackwell_kernels.sh | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 scripts/task_test_blackwell_kernels.sh diff --git a/scripts/task_test_blackwell_kernels.sh b/scripts/task_test_blackwell_kernels.sh old mode 100644 new mode 100755 From c657a52c2701565d6a4e21e046548743d7bbfe8b Mon Sep 17 00:00:00 2001 From: Yong Wu Date: Sat, 24 Jan 2026 03:10:35 -0800 Subject: [PATCH 4/9] Enable H100 --- .github/workflows/pr-test.yml | 59 
++++++++++++++++++++++++++++++++++- 1 file changed, 58 insertions(+), 1 deletion(-) diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml index 6e92fa5b07..e4e619da4b 100644 --- a/.github/workflows/pr-test.yml +++ b/.github/workflows/pr-test.yml @@ -647,7 +647,6 @@ jobs: # --------------------------------------------------------------------------- # GPU JIT Tests - B200 (Blackwell) - Capacity Block # Triggers dynamic CB purchase via EventBridge → CB Manager Lambda - # Only runs when explicitly enabled via workflow_dispatch # --------------------------------------------------------------------------- gpu-tests-b200: name: JIT Unittest (B200) @@ -694,6 +693,55 @@ jobs: - name: Run B200 Kernel Tests run: bash ci/bash.sh ${DOCKER_IMAGE} ./scripts/task_test_blackwell_kernels.sh + # --------------------------------------------------------------------------- + # GPU JIT Tests - H100 (Hopper) - Capacity Block + # Triggers dynamic CB purchase via EventBridge → CB Manager Lambda + # --------------------------------------------------------------------------- + gpu-tests-h100: + name: JIT Unittest (H100) + needs: setup + if: needs.setup.outputs.skip_build != 'true' && github.event.inputs.skip_gpu != 'true' + runs-on: [self-hosted, Linux, X64, gpu, h100] + timeout-minutes: 360 + env: + DOCKER_IMAGE: flashinfer/flashinfer-ci-cu129:${{ needs.setup.outputs.docker_tag }} + steps: + - name: Cleanup + run: | + # Stop all Docker containers to free GPU memory + docker stop $(docker ps -q) 2>/dev/null || true + docker rm $(docker ps -aq) 2>/dev/null || true + # Clean workspace and caches + sudo rm -rf ${{ github.workspace }}/* || true + sudo rm -rf ${{ github.workspace }}/.[!.]* || true + rm -rf ~/.cache/flashinfer_jit || true + docker system prune -f || true + # Show GPU info (should show 1 GPU due to CUDA_VISIBLE_DEVICES) + echo "=== GPU Info ===" + nvidia-smi || true + echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES" + + - uses: actions/checkout@v4 + with: + 
submodules: recursive + + - name: Login to Docker Hub + uses: docker/login-action@v3 + with: + username: flashinfer + password: ${{ secrets.DOCKERHUB_TOKEN }} + continue-on-error: true + + - name: Show Node Info + run: ./scripts/task_show_node_info.sh + env: + NODE_NAME: ${{ runner.name }} + WORKSPACE: ${{ github.workspace }} + BUILD_NUMBER: ${{ github.run_number }} + + - name: Run H100 Kernel Tests + run: bash ci/bash.sh ${DOCKER_IMAGE} ./scripts/task_test_blackwell_kernels.sh + # --------------------------------------------------------------------------- # Test Results Summary # --------------------------------------------------------------------------- @@ -713,6 +761,7 @@ jobs: - analyze-gpu-t4-failure - gpu-tests-t4-rerun - gpu-tests-b200 + - gpu-tests-h100 runs-on: ubuntu-latest steps: - name: Check Results @@ -780,6 +829,14 @@ jobs: FAILED=true fi + # H100 tests (no rerun logic yet - CB instances don't get spot terminated) + echo "" >> $GITHUB_STEP_SUMMARY + H100="${{ needs.gpu-tests-h100.result }}" + echo "GPU Tests (H100): $H100" >> $GITHUB_STEP_SUMMARY + if [ "$H100" != "success" ] && [ "$H100" != "skipped" ] && [ "${{ github.event.inputs.skip_gpu }}" != "true" ]; then + FAILED=true + fi + echo "" >> $GITHUB_STEP_SUMMARY if [ "$FAILED" == "true" ]; then echo "Result: Tests Failed" >> $GITHUB_STEP_SUMMARY From 666a005d0b896808061f13f05457c89757959ee0 Mon Sep 17 00:00:00 2001 From: Yong Wu Date: Fri, 30 Jan 2026 13:35:16 -0800 Subject: [PATCH 5/9] ci: improve spot termination detection for automatic reruns - Check job metadata/annotations for operation was canceled errors - Treat failed log downloads as infrastructure failures - Fixes cases where spot termination happens too fast for monitor script --- .github/workflows/pr-test.yml | 45 ++++++++++++++++++++++++++++++----- 1 file changed, 39 insertions(+), 6 deletions(-) diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml index e4e619da4b..cc49ba1706 100644 --- 
a/.github/workflows/pr-test.yml +++ b/.github/workflows/pr-test.yml @@ -256,9 +256,20 @@ jobs: exit 0 fi for JOB_ID in $FAILED_JOBS; do - # Download logs (may be ZIP or plain text depending on GitHub API) + # First check job metadata for runner communication errors + # This catches "The operation was canceled" which appears in annotations, not logs + JOB_INFO=$(gh api "/repos/${{ github.repository }}/actions/jobs/${JOB_ID}" 2>/dev/null || true) + if echo "$JOB_INFO" | grep -qiE "operation was canceled|runner.*lost|lost communication"; then + echo "Detected: Runner lost communication or operation canceled (job $JOB_ID)" + SPOT_TERMINATION=true + break + fi + + # Try to download logs - if we can't, likely infrastructure failure if ! gh api "/repos/${{ github.repository }}/actions/jobs/${JOB_ID}/logs" > job_log.zip 2>/dev/null; then - continue + echo "Detected: Cannot download logs, likely infrastructure failure (job $JOB_ID)" + SPOT_TERMINATION=true + break fi # Try to unzip if it's a ZIP file, otherwise use as-is if file job_log.zip | grep -q "Zip archive"; then @@ -424,9 +435,20 @@ jobs: exit 0 fi for JOB_ID in $FAILED_JOBS; do - # Download logs (may be ZIP or plain text depending on GitHub API) + # First check job metadata for runner communication errors + # This catches "The operation was canceled" which appears in annotations, not logs + JOB_INFO=$(gh api "/repos/${{ github.repository }}/actions/jobs/${JOB_ID}" 2>/dev/null || true) + if echo "$JOB_INFO" | grep -qiE "operation was canceled|runner.*lost|lost communication"; then + echo "Detected: Runner lost communication or operation canceled (job $JOB_ID)" + SPOT_TERMINATION=true + break + fi + + # Try to download logs - if we can't, likely infrastructure failure if ! 
gh api "/repos/${{ github.repository }}/actions/jobs/${JOB_ID}/logs" > job_log.zip 2>/dev/null; then - continue + echo "Detected: Cannot download logs, likely infrastructure failure (job $JOB_ID)" + SPOT_TERMINATION=true + break fi # Try to unzip if it's a ZIP file, otherwise use as-is if file job_log.zip | grep -q "Zip archive"; then @@ -576,9 +598,20 @@ jobs: exit 0 fi for JOB_ID in $FAILED_JOBS; do - # Download logs (may be ZIP or plain text depending on GitHub API) + # First check job metadata for runner communication errors + # This catches "The operation was canceled" which appears in annotations, not logs + JOB_INFO=$(gh api "/repos/${{ github.repository }}/actions/jobs/${JOB_ID}" 2>/dev/null || true) + if echo "$JOB_INFO" | grep -qiE "operation was canceled|runner.*lost|lost communication"; then + echo "Detected: Runner lost communication or operation canceled (job $JOB_ID)" + SPOT_TERMINATION=true + break + fi + + # Try to download logs - if we can't, likely infrastructure failure if ! 
gh api "/repos/${{ github.repository }}/actions/jobs/${JOB_ID}/logs" > job_log.zip 2>/dev/null; then - continue + echo "Detected: Cannot download logs, likely infrastructure failure (job $JOB_ID)" + SPOT_TERMINATION=true + break fi # Try to unzip if it's a ZIP file, otherwise use as-is if file job_log.zip | grep -q "Zip archive"; then From ccad1756c4efc1e609d31d5b126af88bc0e6bd82 Mon Sep 17 00:00:00 2001 From: Yong Wu Date: Fri, 30 Jan 2026 13:48:49 -0800 Subject: [PATCH 6/9] ci: add gate dependency to B200/H100 jobs for authorization check --- .github/workflows/pr-test.yml | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml index cc49ba1706..0d6404df23 100644 --- a/.github/workflows/pr-test.yml +++ b/.github/workflows/pr-test.yml @@ -683,8 +683,11 @@ jobs: # --------------------------------------------------------------------------- gpu-tests-b200: name: JIT Unittest (B200) - needs: setup - if: needs.setup.outputs.skip_build != 'true' && github.event.inputs.skip_gpu != 'true' + needs: [gate, setup] + if: | + needs.gate.outputs.authorized == 'true' && + needs.setup.outputs.skip_build != 'true' && + github.event.inputs.skip_gpu != 'true' runs-on: [self-hosted, Linux, X64, gpu, b200] timeout-minutes: 360 env: @@ -732,8 +735,11 @@ jobs: # --------------------------------------------------------------------------- gpu-tests-h100: name: JIT Unittest (H100) - needs: setup - if: needs.setup.outputs.skip_build != 'true' && github.event.inputs.skip_gpu != 'true' + needs: [gate, setup] + if: | + needs.gate.outputs.authorized == 'true' && + needs.setup.outputs.skip_build != 'true' && + github.event.inputs.skip_gpu != 'true' runs-on: [self-hosted, Linux, X64, gpu, h100] timeout-minutes: 360 env: From 335e0937715a5fa53af5ae8c7dcc30d89b30f0d9 Mon Sep 17 00:00:00 2001 From: Yong Wu Date: Fri, 30 Jan 2026 22:14:55 -0800 Subject: [PATCH 7/9] update labels --- .github/workflows/pr-test.yml | 
24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml index 0d6404df23..9c9ac2a015 100644 --- a/.github/workflows/pr-test.yml +++ b/.github/workflows/pr-test.yml @@ -184,7 +184,7 @@ jobs: github.event.inputs.skip_aot != 'true' runs-on: - self-hosted - - Linux + - linux - ${{ matrix.arch }} - cpu - spot @@ -192,7 +192,7 @@ jobs: strategy: fail-fast: true matrix: - arch: [X64, ARM64] + arch: [x64, arm64] cuda: [cu126, cu128, cu129, cu130] env: DOCKER_IMAGE: flashinfer/flashinfer-ci-${{ matrix.cuda }}:${{ needs.setup.outputs.docker_tag }} @@ -296,7 +296,7 @@ jobs: if: steps.analyze.outputs.is_spot_termination == 'true' run: | MATRIX='{"include":[' - for arch in X64 ARM64; do + for arch in x64 arm64; do for cuda in cu126 cu128 cu129 cu130; do MATRIX+='{"arch":"'$arch'","cuda":"'$cuda'"},' done @@ -313,7 +313,7 @@ jobs: needs.analyze-aot-failure.outputs.rerun_matrix != '' runs-on: - self-hosted - - Linux + - linux - ${{ matrix.arch }} - cpu - on-demand @@ -366,7 +366,7 @@ jobs: needs.gate.outputs.authorized == 'true' && needs.setup.outputs.skip_build != 'true' && github.event.inputs.skip_gpu != 'true' - runs-on: [self-hosted, Linux, X64, gpu, sm86, spot] + runs-on: [self-hosted, linux, x64, gpu, sm86, spot] timeout-minutes: 360 strategy: fail-fast: true @@ -483,7 +483,7 @@ jobs: !cancelled() && needs.analyze-gpu-a10g-failure.outputs.is_spot_termination == 'true' && needs.analyze-gpu-a10g-failure.outputs.rerun_matrix != '' - runs-on: [self-hosted, Linux, X64, gpu, sm86, on-demand] + runs-on: [self-hosted, linux, x64, gpu, sm86, on-demand] timeout-minutes: 360 strategy: fail-fast: true @@ -534,7 +534,7 @@ jobs: needs.gate.outputs.authorized == 'true' && needs.setup.outputs.skip_build != 'true' && github.event.inputs.skip_gpu != 'true' - runs-on: [self-hosted, Linux, X64, gpu, sm75, spot] + runs-on: [self-hosted, linux, x64, gpu, sm75, spot] timeout-minutes: 360 env: 
DOCKER_IMAGE: flashinfer/flashinfer-ci-cu129:${{ needs.setup.outputs.docker_tag }} @@ -639,7 +639,7 @@ jobs: if: | !cancelled() && needs.analyze-gpu-t4-failure.outputs.is_spot_termination == 'true' - runs-on: [self-hosted, Linux, X64, gpu, sm75, on-demand] + runs-on: [self-hosted, linux, x64, gpu, sm75, on-demand] timeout-minutes: 360 env: DOCKER_IMAGE: flashinfer/flashinfer-ci-cu129:${{ needs.setup.outputs.docker_tag }} @@ -679,7 +679,7 @@ jobs: # --------------------------------------------------------------------------- # GPU JIT Tests - B200 (Blackwell) - Capacity Block - # Triggers dynamic CB purchase via EventBridge → CB Manager Lambda + # Requires manually purchased CB via AWS Console # --------------------------------------------------------------------------- gpu-tests-b200: name: JIT Unittest (B200) @@ -688,7 +688,7 @@ jobs: needs.gate.outputs.authorized == 'true' && needs.setup.outputs.skip_build != 'true' && github.event.inputs.skip_gpu != 'true' - runs-on: [self-hosted, Linux, X64, gpu, b200] + runs-on: [self-hosted, linux, x64, gpu, b200, 1gpu] timeout-minutes: 360 env: DOCKER_IMAGE: flashinfer/flashinfer-ci-cu130:${{ needs.setup.outputs.docker_tag }} @@ -731,7 +731,7 @@ jobs: # --------------------------------------------------------------------------- # GPU JIT Tests - H100 (Hopper) - Capacity Block - # Triggers dynamic CB purchase via EventBridge → CB Manager Lambda + # Requires manually purchased CB via AWS Console # --------------------------------------------------------------------------- gpu-tests-h100: name: JIT Unittest (H100) @@ -740,7 +740,7 @@ jobs: needs.gate.outputs.authorized == 'true' && needs.setup.outputs.skip_build != 'true' && github.event.inputs.skip_gpu != 'true' - runs-on: [self-hosted, Linux, X64, gpu, h100] + runs-on: [self-hosted, linux, x64, gpu, h100, 1gpu] timeout-minutes: 360 env: DOCKER_IMAGE: flashinfer/flashinfer-ci-cu129:${{ needs.setup.outputs.docker_tag }} From bb247c3c5dadba14e359d8f04ddc4b022abee7be Mon Sep 17 
00:00:00 2001 From: Yong Wu Date: Sat, 31 Jan 2026 07:29:53 -0800 Subject: [PATCH 8/9] Replace docker system prune with targeted cleanup to preserve cached images --- .github/workflows/pr-test.yml | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml index 9c9ac2a015..406153b6e8 100644 --- a/.github/workflows/pr-test.yml +++ b/.github/workflows/pr-test.yml @@ -206,7 +206,8 @@ jobs: sudo rm -rf ${{ github.workspace }}/* || true sudo rm -rf ${{ github.workspace }}/.[!.]* || true rm -rf ~/.cache/flashinfer_jit || true - docker system prune -f || true + docker image prune -f || true + docker builder prune -f --filter "until=24h" || true - uses: actions/checkout@v4 with: @@ -333,7 +334,8 @@ jobs: sudo rm -rf ${{ github.workspace }}/* || true sudo rm -rf ${{ github.workspace }}/.[!.]* || true rm -rf ~/.cache/flashinfer_jit || true - docker system prune -f || true + docker image prune -f || true + docker builder prune -f --filter "until=24h" || true - uses: actions/checkout@v4 with: @@ -384,7 +386,8 @@ jobs: sudo rm -rf ${{ github.workspace }}/* || true sudo rm -rf ${{ github.workspace }}/.[!.]* || true rm -rf ~/.cache/flashinfer_jit || true - docker system prune -f || true + docker image prune -f || true + docker builder prune -f --filter "until=24h" || true nvidia-smi || true - uses: actions/checkout@v4 @@ -500,7 +503,8 @@ jobs: sudo rm -rf ${{ github.workspace }}/* || true sudo rm -rf ${{ github.workspace }}/.[!.]* || true rm -rf ~/.cache/flashinfer_jit || true - docker system prune -f || true + docker image prune -f || true + docker builder prune -f --filter "until=24h" || true nvidia-smi || true - uses: actions/checkout@v4 @@ -548,7 +552,8 @@ jobs: sudo rm -rf ${{ github.workspace }}/* || true sudo rm -rf ${{ github.workspace }}/.[!.]* || true rm -rf ~/.cache/flashinfer_jit || true - docker system prune -f || true + docker image prune -f || true + docker builder prune 
-f --filter "until=24h" || true nvidia-smi || true - uses: actions/checkout@v4 @@ -653,7 +658,8 @@ jobs: sudo rm -rf ${{ github.workspace }}/* || true sudo rm -rf ${{ github.workspace }}/.[!.]* || true rm -rf ~/.cache/flashinfer_jit || true - docker system prune -f || true + docker image prune -f || true + docker builder prune -f --filter "until=24h" || true nvidia-smi || true - uses: actions/checkout@v4 @@ -702,7 +708,8 @@ jobs: sudo rm -rf ${{ github.workspace }}/* || true sudo rm -rf ${{ github.workspace }}/.[!.]* || true rm -rf ~/.cache/flashinfer_jit || true - docker system prune -f || true + docker image prune -f || true + docker builder prune -f --filter "until=24h" || true # Show GPU info (should show 1 GPU due to CUDA_VISIBLE_DEVICES) echo "=== GPU Info ===" nvidia-smi || true @@ -754,7 +761,8 @@ jobs: sudo rm -rf ${{ github.workspace }}/* || true sudo rm -rf ${{ github.workspace }}/.[!.]* || true rm -rf ~/.cache/flashinfer_jit || true - docker system prune -f || true + docker image prune -f || true + docker builder prune -f --filter "until=24h" || true # Show GPU info (should show 1 GPU due to CUDA_VISIBLE_DEVICES) echo "=== GPU Info ===" nvidia-smi || true From 3dd308ac91c3d7933c5c3f017c2f8a590415671d Mon Sep 17 00:00:00 2001 From: Yong Wu Date: Sun, 1 Feb 2026 16:56:03 -0800 Subject: [PATCH 9/9] remove Docker login, extract spot analysis to script --- .github/workflows/pr-test.yml | 204 ++++------------------------------ scripts/task_analyze_spot.sh | 85 ++++++++++++++ 2 files changed, 106 insertions(+), 183 deletions(-) create mode 100644 scripts/task_analyze_spot.sh diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml index 406153b6e8..605ec0a568 100644 --- a/.github/workflows/pr-test.yml +++ b/.github/workflows/pr-test.yml @@ -216,12 +216,6 @@ jobs: - name: Start spot termination monitor run: ./scripts/task_monitor_spot.sh & - - name: Login to Docker Hub - uses: docker/login-action@v3 - with: - username: flashinfer - password: 
${{ secrets.DOCKERHUB_TOKEN }} - continue-on-error: true - name: Show Node Info run: ./scripts/task_show_node_info.sh @@ -242,55 +236,17 @@ jobs: is_spot_termination: ${{ steps.analyze.outputs.is_spot_termination }} rerun_matrix: ${{ steps.matrix.outputs.rerun_matrix }} steps: + - name: Checkout scripts + uses: actions/checkout@v4 + with: + sparse-checkout: scripts + sparse-checkout-cone-mode: false + - name: Analyze failure from job logs id: analyze env: GH_TOKEN: ${{ github.token }} - run: | - RUN_ID="${{ github.run_id }}" - SPOT_TERMINATION=false - # Include both failed and cancelled jobs (spot termination can cause either) - FAILED_JOBS=$(gh api "/repos/${{ github.repository }}/actions/runs/${RUN_ID}/jobs?per_page=100" \ - --jq '.jobs[] | select(.name | startswith("AOT")) | select(.conclusion == "failure" or .conclusion == "cancelled") | .id') - if [ -z "$FAILED_JOBS" ]; then - echo "is_spot_termination=false" >> $GITHUB_OUTPUT - exit 0 - fi - for JOB_ID in $FAILED_JOBS; do - # First check job metadata for runner communication errors - # This catches "The operation was canceled" which appears in annotations, not logs - JOB_INFO=$(gh api "/repos/${{ github.repository }}/actions/jobs/${JOB_ID}" 2>/dev/null || true) - if echo "$JOB_INFO" | grep -qiE "operation was canceled|runner.*lost|lost communication"; then - echo "Detected: Runner lost communication or operation canceled (job $JOB_ID)" - SPOT_TERMINATION=true - break - fi - - # Try to download logs - if we can't, likely infrastructure failure - if ! 
gh api "/repos/${{ github.repository }}/actions/jobs/${JOB_ID}/logs" > job_log.zip 2>/dev/null; then - echo "Detected: Cannot download logs, likely infrastructure failure (job $JOB_ID)" - SPOT_TERMINATION=true - break - fi - # Try to unzip if it's a ZIP file, otherwise use as-is - if file job_log.zip | grep -q "Zip archive"; then - unzip -p job_log.zip > job_log.txt 2>/dev/null || mv job_log.zip job_log.txt - else - mv job_log.zip job_log.txt - fi - if grep -q "FLASHINFER_SPOT_TERMINATION_DETECTED" job_log.txt; then - echo "Detected: AWS spot termination marker (job $JOB_ID)" - SPOT_TERMINATION=true - break - fi - if grep -qiE "connection reset by peer|context canceled|grpc.*closing|The self-hosted runner.*lost" job_log.txt; then - echo "Detected: infrastructure error pattern (job $JOB_ID)" - SPOT_TERMINATION=true - break - fi - done - echo "is_spot_termination=$SPOT_TERMINATION" - echo "is_spot_termination=$SPOT_TERMINATION" >> $GITHUB_OUTPUT + run: ./scripts/task_analyze_spot.sh 'startswith("AOT")' '${{ github.repository }}' '${{ github.run_id }}' - name: Build rerun matrix id: matrix @@ -341,12 +297,6 @@ jobs: with: submodules: recursive - - name: Login to Docker Hub - uses: docker/login-action@v3 - with: - username: flashinfer - password: ${{ secrets.DOCKERHUB_TOKEN }} - continue-on-error: true - name: Show Node Info run: ./scripts/task_show_node_info.sh @@ -397,12 +347,6 @@ jobs: - name: Start spot termination monitor run: ./scripts/task_monitor_spot.sh & - - name: Login to Docker Hub - uses: docker/login-action@v3 - with: - username: flashinfer - password: ${{ secrets.DOCKERHUB_TOKEN }} - continue-on-error: true - name: Show Node Info run: ./scripts/task_show_node_info.sh @@ -423,55 +367,17 @@ jobs: is_spot_termination: ${{ steps.analyze.outputs.is_spot_termination }} rerun_matrix: ${{ steps.matrix.outputs.rerun_matrix }} steps: + - name: Checkout scripts + uses: actions/checkout@v4 + with: + sparse-checkout: scripts + sparse-checkout-cone-mode: false + - 
name: Analyze failure from job logs id: analyze env: GH_TOKEN: ${{ github.token }} - run: | - RUN_ID="${{ github.run_id }}" - SPOT_TERMINATION=false - # Include both failed and cancelled jobs (spot termination can cause either) - FAILED_JOBS=$(gh api "/repos/${{ github.repository }}/actions/runs/${RUN_ID}/jobs?per_page=100" \ - --jq '.jobs[] | select(.name | contains("A10G")) | select(.conclusion == "failure" or .conclusion == "cancelled") | .id') - if [ -z "$FAILED_JOBS" ]; then - echo "is_spot_termination=false" >> $GITHUB_OUTPUT - exit 0 - fi - for JOB_ID in $FAILED_JOBS; do - # First check job metadata for runner communication errors - # This catches "The operation was canceled" which appears in annotations, not logs - JOB_INFO=$(gh api "/repos/${{ github.repository }}/actions/jobs/${JOB_ID}" 2>/dev/null || true) - if echo "$JOB_INFO" | grep -qiE "operation was canceled|runner.*lost|lost communication"; then - echo "Detected: Runner lost communication or operation canceled (job $JOB_ID)" - SPOT_TERMINATION=true - break - fi - - # Try to download logs - if we can't, likely infrastructure failure - if ! 
gh api "/repos/${{ github.repository }}/actions/jobs/${JOB_ID}/logs" > job_log.zip 2>/dev/null; then - echo "Detected: Cannot download logs, likely infrastructure failure (job $JOB_ID)" - SPOT_TERMINATION=true - break - fi - # Try to unzip if it's a ZIP file, otherwise use as-is - if file job_log.zip | grep -q "Zip archive"; then - unzip -p job_log.zip > job_log.txt 2>/dev/null || mv job_log.zip job_log.txt - else - mv job_log.zip job_log.txt - fi - if grep -q "FLASHINFER_SPOT_TERMINATION_DETECTED" job_log.txt; then - echo "Detected: AWS spot termination marker (job $JOB_ID)" - SPOT_TERMINATION=true - break - fi - if grep -qiE "connection reset by peer|context canceled|grpc.*closing|The self-hosted runner.*lost" job_log.txt; then - echo "Detected: infrastructure error pattern (job $JOB_ID)" - SPOT_TERMINATION=true - break - fi - done - echo "is_spot_termination=$SPOT_TERMINATION" - echo "is_spot_termination=$SPOT_TERMINATION" >> $GITHUB_OUTPUT + run: ./scripts/task_analyze_spot.sh 'contains("A10G")' '${{ github.repository }}' '${{ github.run_id }}' - name: Build rerun matrix id: matrix @@ -511,12 +417,6 @@ jobs: with: submodules: recursive - - name: Login to Docker Hub - uses: docker/login-action@v3 - with: - username: flashinfer - password: ${{ secrets.DOCKERHUB_TOKEN }} - continue-on-error: true - name: Show Node Info run: ./scripts/task_show_node_info.sh @@ -563,12 +463,6 @@ jobs: - name: Start spot termination monitor run: ./scripts/task_monitor_spot.sh & - - name: Login to Docker Hub - uses: docker/login-action@v3 - with: - username: flashinfer - password: ${{ secrets.DOCKERHUB_TOKEN }} - continue-on-error: true - name: Show Node Info run: ./scripts/task_show_node_info.sh @@ -588,55 +482,17 @@ jobs: outputs: is_spot_termination: ${{ steps.analyze.outputs.is_spot_termination }} steps: + - name: Checkout scripts + uses: actions/checkout@v4 + with: + sparse-checkout: scripts + sparse-checkout-cone-mode: false + - name: Analyze failure from job logs id: analyze 
env: GH_TOKEN: ${{ github.token }} - run: | - RUN_ID="${{ github.run_id }}" - SPOT_TERMINATION=false - # Include both failed and cancelled jobs (spot termination can cause either) - FAILED_JOBS=$(gh api "/repos/${{ github.repository }}/actions/runs/${RUN_ID}/jobs?per_page=100" \ - --jq '.jobs[] | select(.name | contains("T4")) | select(.conclusion == "failure" or .conclusion == "cancelled") | .id') - if [ -z "$FAILED_JOBS" ]; then - echo "is_spot_termination=false" >> $GITHUB_OUTPUT - exit 0 - fi - for JOB_ID in $FAILED_JOBS; do - # First check job metadata for runner communication errors - # This catches "The operation was canceled" which appears in annotations, not logs - JOB_INFO=$(gh api "/repos/${{ github.repository }}/actions/jobs/${JOB_ID}" 2>/dev/null || true) - if echo "$JOB_INFO" | grep -qiE "operation was canceled|runner.*lost|lost communication"; then - echo "Detected: Runner lost communication or operation canceled (job $JOB_ID)" - SPOT_TERMINATION=true - break - fi - - # Try to download logs - if we can't, likely infrastructure failure - if ! 
gh api "/repos/${{ github.repository }}/actions/jobs/${JOB_ID}/logs" > job_log.zip 2>/dev/null; then - echo "Detected: Cannot download logs, likely infrastructure failure (job $JOB_ID)" - SPOT_TERMINATION=true - break - fi - # Try to unzip if it's a ZIP file, otherwise use as-is - if file job_log.zip | grep -q "Zip archive"; then - unzip -p job_log.zip > job_log.txt 2>/dev/null || mv job_log.zip job_log.txt - else - mv job_log.zip job_log.txt - fi - if grep -q "FLASHINFER_SPOT_TERMINATION_DETECTED" job_log.txt; then - echo "Detected: AWS spot termination marker (job $JOB_ID)" - SPOT_TERMINATION=true - break - fi - if grep -qiE "connection reset by peer|context canceled|grpc.*closing|The self-hosted runner.*lost" job_log.txt; then - echo "Detected: infrastructure error pattern (job $JOB_ID)" - SPOT_TERMINATION=true - break - fi - done - echo "is_spot_termination=$SPOT_TERMINATION" - echo "is_spot_termination=$SPOT_TERMINATION" >> $GITHUB_OUTPUT + run: ./scripts/task_analyze_spot.sh 'contains("T4")' '${{ github.repository }}' '${{ github.run_id }}' gpu-tests-t4-rerun: name: JIT Rerun (T4) @@ -666,12 +522,6 @@ jobs: with: submodules: recursive - - name: Login to Docker Hub - uses: docker/login-action@v3 - with: - username: flashinfer - password: ${{ secrets.DOCKERHUB_TOKEN }} - continue-on-error: true - name: Show Node Info run: ./scripts/task_show_node_info.sh @@ -719,12 +569,6 @@ jobs: with: submodules: recursive - - name: Login to Docker Hub - uses: docker/login-action@v3 - with: - username: flashinfer - password: ${{ secrets.DOCKERHUB_TOKEN }} - continue-on-error: true - name: Show Node Info run: ./scripts/task_show_node_info.sh @@ -772,12 +616,6 @@ jobs: with: submodules: recursive - - name: Login to Docker Hub - uses: docker/login-action@v3 - with: - username: flashinfer - password: ${{ secrets.DOCKERHUB_TOKEN }} - continue-on-error: true - name: Show Node Info run: ./scripts/task_show_node_info.sh diff --git a/scripts/task_analyze_spot.sh 
b/scripts/task_analyze_spot.sh
new file mode 100755
index 0000000000..5116104ca0
--- /dev/null
+++ b/scripts/task_analyze_spot.sh
@@ -0,0 +1,102 @@
+#!/bin/bash
+# Copyright (c) 2026 by FlashInfer team.
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Decide whether the failed/cancelled jobs of a workflow run died because of a
+# spot-instance termination or a similar infrastructure failure.
+#
+# Usage: task_analyze_spot.sh <job_filter> <repository> <run_id>
+#   job_filter  jq boolean expression applied to each job's .name,
+#               e.g. 'startswith("AOT")' or 'contains("T4")'
+#   repository  owner/name slug, e.g. flashinfer-ai/flashinfer
+#   run_id      id of the workflow run whose jobs are inspected
+#
+# Needs the gh CLI (the calling workflow step provides GH_TOKEN) and
+# appends is_spot_termination=true|false to the file named by GITHUB_OUTPUT.
+
+set -euo pipefail
+
+JOB_FILTER="${1:-}"
+REPOSITORY="${2:-}"
+RUN_ID="${3:-}"
+
+if [ -z "$JOB_FILTER" ] || [ -z "$REPOSITORY" ] || [ -z "$RUN_ID" ]; then
+    echo "Usage: $0 <job_filter> <repository> <run_id>" >&2
+    echo "Example: $0 'startswith(\"AOT\")' 'flashinfer-ai/flashinfer' '12345'" >&2
+    exit 1
+fi
+
+SPOT_TERMINATION=false
+
+# Private temp file for job logs (removed on exit). mktemp avoids a predictable
+# /tmp path and clashes with another analysis running on the same machine.
+LOG_FILE="$(mktemp "${TMPDIR:-/tmp}/job_log.XXXXXX")"
+cleanup() { rm -f "$LOG_FILE" "${LOG_FILE}.zip"; }
+trap cleanup EXIT
+
+# Include both failed and cancelled jobs (spot termination can cause either).
+# JOB_FILTER is spliced verbatim into the jq program, so it must come from
+# trusted workflow code, never from user-controlled input.
+FAILED_JOBS=$(gh api "/repos/${REPOSITORY}/actions/runs/${RUN_ID}/jobs?per_page=100" \
+    --jq ".jobs[] | select(.name | ${JOB_FILTER}) | select(.conclusion == \"failure\" or .conclusion == \"cancelled\") | .id")
+
+if [ -z "$FAILED_JOBS" ]; then
+    echo "No failed jobs matching filter: ${JOB_FILTER}"
+    echo "is_spot_termination=false" >> "$GITHUB_OUTPUT"
+    exit 0
+fi
+
+for JOB_ID in $FAILED_JOBS; do
+    # First check job metadata for runner communication errors
+    # This catches "The operation was canceled" which appears in annotations, not logs
+    JOB_INFO=$(gh api "/repos/${REPOSITORY}/actions/jobs/${JOB_ID}" 2>/dev/null || true)
+    # Here-string instead of `echo | grep -q`: under pipefail an early grep exit
+    # can SIGPIPE the writer and turn a real match into a false negative.
+    if grep -qiE "operation was canceled|runner.*lost|lost communication" <<< "$JOB_INFO"; then
+        echo "Detected: Runner lost communication or operation canceled (job $JOB_ID)"
+        SPOT_TERMINATION=true
+        break
+    fi
+
+    # Try to download job logs; a failed download is treated as an infrastructure failure
+    if ! gh api "/repos/${REPOSITORY}/actions/jobs/${JOB_ID}/logs" > "${LOG_FILE}.zip" 2>/dev/null; then
+        echo "Detected: Cannot download logs, likely infrastructure failure (job $JOB_ID)"
+        SPOT_TERMINATION=true
+        break
+    fi
+
+    # Handle both zip and plain text log formats
+    if file "${LOG_FILE}.zip" | grep -q "Zip archive"; then
+        unzip -p "${LOG_FILE}.zip" > "$LOG_FILE" 2>/dev/null || mv "${LOG_FILE}.zip" "$LOG_FILE"
+    else
+        mv "${LOG_FILE}.zip" "$LOG_FILE"
+    fi
+
+    # Check for spot termination marker from task_monitor_spot.sh
+    if grep -q "FLASHINFER_SPOT_TERMINATION_DETECTED" "$LOG_FILE"; then
+        echo "Detected: AWS spot termination marker (job $JOB_ID)"
+        SPOT_TERMINATION=true
+        break
+    fi
+
+    # Check for infrastructure error patterns
+    if grep -qiE "connection reset by peer|context canceled|grpc.*closing|The self-hosted runner.*lost" "$LOG_FILE"; then
+        echo "Detected: infrastructure error pattern (job $JOB_ID)"
+        SPOT_TERMINATION=true
+        break
+    fi
+done
+
+echo "is_spot_termination=$SPOT_TERMINATION"
+echo "is_spot_termination=$SPOT_TERMINATION" >> "$GITHUB_OUTPUT"