From ce660623c84df7b7e27b6dcc1ba363a6e894c8cb Mon Sep 17 00:00:00 2001
From: Yong Wu <yongcale@gmail.com>
Date: Fri, 23 Jan 2026 23:39:43 -0800
Subject: [PATCH 1/9] ci: Enable blackwell tests in public ci

---
 .github/workflows/pr-test.yml | 66 +++++++++++++++++++++++++++++++++++
 1 file changed, 66 insertions(+)

diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml
index 6e235d5e28..7f8bc935c9 100644
--- a/.github/workflows/pr-test.yml
+++ b/.github/workflows/pr-test.yml
@@ -34,6 +34,10 @@ on:
         description: 'Skip GPU tests'
         type: boolean
         default: false
+      run_b200:
+        description: 'Run B200 tests'
+        type: boolean
+        default: false
 
 concurrency:
   group: pr-test-${{ github.ref }}
@@ -644,6 +648,56 @@ jobs:
       - name: Run JIT Unittest Part 3 (T4)
         run: bash ci/bash.sh ${DOCKER_IMAGE} ./scripts/task_jit_run_tests_part3.sh
 
+  # ---------------------------------------------------------------------------
+  # GPU JIT Tests - B200 (Blackwell) - Capacity Block
+  # Triggers dynamic CB purchase via EventBridge → CB Manager Lambda
+  # Only runs when explicitly enabled via workflow_dispatch
+  # ---------------------------------------------------------------------------
+  gpu-tests-b200:
+    name: JIT Unittest (B200)
+    needs: setup
+    if: needs.setup.outputs.skip_build != 'true' && github.event.inputs.run_b200 == 'true'
+    runs-on: [self-hosted, Linux, X64, gpu, b200]
+    timeout-minutes: 360
+    env:
+      DOCKER_IMAGE: flashinfer/flashinfer-ci-cu130:${{ needs.setup.outputs.docker_tag }}
+    steps:
+      - name: Cleanup
+        run: |
+          # Stop all Docker containers to free GPU memory
+          docker stop $(docker ps -q) 2>/dev/null || true
+          docker rm $(docker ps -aq) 2>/dev/null || true
+          # Clean workspace and caches
+          sudo rm -rf ${{ github.workspace }}/* || true
+          sudo rm -rf ${{ github.workspace }}/.[!.]* || true
+          rm -rf ~/.cache/flashinfer_jit || true
+          docker system prune -f || true
+          # Show GPU info (should show 1 GPU due to CUDA_VISIBLE_DEVICES)
+          echo "=== GPU Info ==="
+          nvidia-smi || true
+          echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"
+
+      - uses: actions/checkout@v4
+        with:
+          submodules: recursive
+
+      - name: Login to Docker Hub
+        uses: docker/login-action@v3
+        with:
+          username: flashinfer
+          password: ${{ secrets.DOCKERHUB_TOKEN }}
+        continue-on-error: true
+
+      - name: Show Node Info
+        run: ./scripts/task_show_node_info.sh
+        env:
+          NODE_NAME: ${{ runner.name }}
+          WORKSPACE: ${{ github.workspace }}
+          BUILD_NUMBER: ${{ github.run_number }}
+
+      - name: Run B200 Kernel Tests
+        run: bash ci/bash.sh ${DOCKER_IMAGE} ./scripts/task_test_blackwell_kernels.sh
+
   # ---------------------------------------------------------------------------
   # Test Results Summary
   # ---------------------------------------------------------------------------
@@ -662,6 +716,7 @@ jobs:
       - gpu-tests-t4
       - analyze-gpu-t4-failure
       - gpu-tests-t4-rerun
+      - gpu-tests-b200
     runs-on: ubuntu-latest
     steps:
       - name: Check Results
@@ -721,6 +776,17 @@ jobs:
             "${{ needs.analyze-gpu-t4-failure.outputs.is_spot_termination }}" \
             "${{ needs.gpu-tests-t4-rerun.result }}" || FAILED=true
 
+          # B200 tests (optional, no rerun logic yet)
+          echo "" >> $GITHUB_STEP_SUMMARY
+          B200="${{ needs.gpu-tests-b200.result }}"
+          RUN_B200="${{ github.event.inputs.run_b200 }}"
+          if [ "$RUN_B200" == "true" ]; then
+            echo "GPU Tests (B200): $B200" >> $GITHUB_STEP_SUMMARY
+            if [ "$B200" != "success" ] && [ "$B200" != "skipped" ]; then
+              FAILED=true
+            fi
+          fi
+
           echo "" >> $GITHUB_STEP_SUMMARY
           if [ "$FAILED" == "true" ]; then
             echo "Result: Tests Failed" >> $GITHUB_STEP_SUMMARY

From 6f31f15b0e4125500ea072c9abe85e4a86c66748 Mon Sep 17 00:00:00 2001
From: Yong Wu <yongcale@gmail.com>
Date: Fri, 23 Jan 2026 23:45:59 -0800
Subject: [PATCH 2/9] ci: remove run_b200 input; gate B200 tests on skip_gpu

---
 .github/workflows/pr-test.yml | 17 +++++------------
 1 file changed, 5 insertions(+), 12 deletions(-)

diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml
index 7f8bc935c9..6e92fa5b07 100644
--- a/.github/workflows/pr-test.yml
+++ b/.github/workflows/pr-test.yml
@@ -34,10 +34,6 @@ on:
         description: 'Skip GPU tests'
         type: boolean
         default: false
-      run_b200:
-        description: 'Run B200 tests'
-        type: boolean
-        default: false
 
 concurrency:
   group: pr-test-${{ github.ref }}
@@ -656,7 +652,7 @@ jobs:
   gpu-tests-b200:
     name: JIT Unittest (B200)
     needs: setup
-    if: needs.setup.outputs.skip_build != 'true' && github.event.inputs.run_b200 == 'true'
+    if: needs.setup.outputs.skip_build != 'true' && github.event.inputs.skip_gpu != 'true'
     runs-on: [self-hosted, Linux, X64, gpu, b200]
     timeout-minutes: 360
     env:
@@ -776,15 +772,12 @@ jobs:
             "${{ needs.analyze-gpu-t4-failure.outputs.is_spot_termination }}" \
             "${{ needs.gpu-tests-t4-rerun.result }}" || FAILED=true
 
-          # B200 tests (optional, no rerun logic yet)
+          # B200 tests (no rerun logic yet - CB instances don't get spot terminated)
           echo "" >> $GITHUB_STEP_SUMMARY
           B200="${{ needs.gpu-tests-b200.result }}"
-          RUN_B200="${{ github.event.inputs.run_b200 }}"
-          if [ "$RUN_B200" == "true" ]; then
-            echo "GPU Tests (B200): $B200" >> $GITHUB_STEP_SUMMARY
-            if [ "$B200" != "success" ] && [ "$B200" != "skipped" ]; then
-              FAILED=true
-            fi
+          echo "GPU Tests (B200): $B200" >> $GITHUB_STEP_SUMMARY
+          if [ "$B200" != "success" ] && [ "$B200" != "skipped" ] && [ "${{ github.event.inputs.skip_gpu }}" != "true" ]; then
+            FAILED=true
           fi
 
           echo "" >> $GITHUB_STEP_SUMMARY

From f2db27ec92d06a1ec7d94ef921941168e4628870 Mon Sep 17 00:00:00 2001
From: Yong Wu <yongcale@gmail.com>
Date: Sat, 24 Jan 2026 02:51:50 -0800
Subject: [PATCH 3/9] fix: make task_test_blackwell_kernels.sh executable

---
 scripts/task_test_blackwell_kernels.sh | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 mode change 100644 => 100755 scripts/task_test_blackwell_kernels.sh

diff --git a/scripts/task_test_blackwell_kernels.sh b/scripts/task_test_blackwell_kernels.sh
old mode 100644
new mode 100755

From c657a52c2701565d6a4e21e046548743d7bbfe8b Mon Sep 17 00:00:00 2001
From: Yong Wu <yongcale@gmail.com>
Date: Sat, 24 Jan 2026 03:10:35 -0800
Subject: [PATCH 4/9] ci: Enable H100 tests in public ci

---
 .github/workflows/pr-test.yml | 59 ++++++++++++++++++++++++++++++++++-
 1 file changed, 58 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml
index 6e92fa5b07..e4e619da4b 100644
--- a/.github/workflows/pr-test.yml
+++ b/.github/workflows/pr-test.yml
@@ -647,7 +647,6 @@ jobs:
   # ---------------------------------------------------------------------------
   # GPU JIT Tests - B200 (Blackwell) - Capacity Block
   # Triggers dynamic CB purchase via EventBridge → CB Manager Lambda
-  # Only runs when explicitly enabled via workflow_dispatch
   # ---------------------------------------------------------------------------
   gpu-tests-b200:
     name: JIT Unittest (B200)
@@ -694,6 +693,55 @@ jobs:
       - name: Run B200 Kernel Tests
         run: bash ci/bash.sh ${DOCKER_IMAGE} ./scripts/task_test_blackwell_kernels.sh
 
+  # ---------------------------------------------------------------------------
+  # GPU JIT Tests - H100 (Hopper) - Capacity Block
+  # Triggers dynamic CB purchase via EventBridge → CB Manager Lambda
+  # ---------------------------------------------------------------------------
+  gpu-tests-h100:
+    name: JIT Unittest (H100)
+    needs: setup
+    if: needs.setup.outputs.skip_build != 'true' && github.event.inputs.skip_gpu != 'true'
+    runs-on: [self-hosted, Linux, X64, gpu, h100]
+    timeout-minutes: 360
+    env:
+      DOCKER_IMAGE: flashinfer/flashinfer-ci-cu129:${{ needs.setup.outputs.docker_tag }}
+    steps:
+      - name: Cleanup
+        run: |
+          # Stop all Docker containers to free GPU memory
+          docker stop $(docker ps -q) 2>/dev/null || true
+          docker rm $(docker ps -aq) 2>/dev/null || true
+          # Clean workspace and caches
+          sudo rm -rf ${{ github.workspace }}/* || true
+          sudo rm -rf ${{ github.workspace }}/.[!.]* || true
+          rm -rf ~/.cache/flashinfer_jit || true
+          docker system prune -f || true
+          # Show GPU info (should show 1 GPU due to CUDA_VISIBLE_DEVICES)
+          echo "=== GPU Info ==="
+          nvidia-smi || true
+          echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"
+
+      - uses: actions/checkout@v4
+        with:
+          submodules: recursive
+
+      - name: Login to Docker Hub
+        uses: docker/login-action@v3
+        with:
+          username: flashinfer
+          password: ${{ secrets.DOCKERHUB_TOKEN }}
+        continue-on-error: true
+
+      - name: Show Node Info
+        run: ./scripts/task_show_node_info.sh
+        env:
+          NODE_NAME: ${{ runner.name }}
+          WORKSPACE: ${{ github.workspace }}
+          BUILD_NUMBER: ${{ github.run_number }}
+
+      - name: Run H100 Kernel Tests
+        run: bash ci/bash.sh ${DOCKER_IMAGE} ./scripts/task_test_blackwell_kernels.sh
+
   # ---------------------------------------------------------------------------
   # Test Results Summary
   # ---------------------------------------------------------------------------
@@ -713,6 +761,7 @@ jobs:
       - analyze-gpu-t4-failure
       - gpu-tests-t4-rerun
       - gpu-tests-b200
+      - gpu-tests-h100
     runs-on: ubuntu-latest
     steps:
       - name: Check Results
@@ -780,6 +829,14 @@ jobs:
             FAILED=true
           fi
 
+          # H100 tests (no rerun logic yet - CB instances don't get spot terminated)
+          echo "" >> $GITHUB_STEP_SUMMARY
+          H100="${{ needs.gpu-tests-h100.result }}"
+          echo "GPU Tests (H100): $H100" >> $GITHUB_STEP_SUMMARY
+          if [ "$H100" != "success" ] && [ "$H100" != "skipped" ] && [ "${{ github.event.inputs.skip_gpu }}" != "true" ]; then
+            FAILED=true
+          fi
+
           echo "" >> $GITHUB_STEP_SUMMARY
           if [ "$FAILED" == "true" ]; then
             echo "Result: Tests Failed" >> $GITHUB_STEP_SUMMARY

From 666a005d0b896808061f13f05457c89757959ee0 Mon Sep 17 00:00:00 2001
From: Yong Wu <yongcale@gmail.com>
Date: Fri, 30 Jan 2026 13:35:16 -0800
Subject: [PATCH 5/9] ci: improve spot termination detection for automatic
 reruns

- Check job metadata/annotations for "operation was canceled" errors
- Treat failed log downloads as infrastructure failures
- Fixes cases where spot termination happens too fast for monitor script
---
 .github/workflows/pr-test.yml | 45 ++++++++++++++++++++++++++++++-----
 1 file changed, 39 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml
index e4e619da4b..cc49ba1706 100644
--- a/.github/workflows/pr-test.yml
+++ b/.github/workflows/pr-test.yml
@@ -256,9 +256,20 @@ jobs:
             exit 0
           fi
           for JOB_ID in $FAILED_JOBS; do
-            # Download logs (may be ZIP or plain text depending on GitHub API)
+            # First check job metadata for runner communication errors
+            # This catches "The operation was canceled" which appears in annotations, not logs
+            JOB_INFO=$(gh api "/repos/${{ github.repository }}/actions/jobs/${JOB_ID}" 2>/dev/null || true)
+            if echo "$JOB_INFO" | grep -qiE "operation was canceled|runner.*lost|lost communication"; then
+              echo "Detected: Runner lost communication or operation canceled (job $JOB_ID)"
+              SPOT_TERMINATION=true
+              break
+            fi
+
+            # Try to download logs - if we can't, likely infrastructure failure
             if ! gh api "/repos/${{ github.repository }}/actions/jobs/${JOB_ID}/logs" > job_log.zip 2>/dev/null; then
-              continue
+              echo "Detected: Cannot download logs, likely infrastructure failure (job $JOB_ID)"
+              SPOT_TERMINATION=true
+              break
             fi
             # Try to unzip if it's a ZIP file, otherwise use as-is
             if file job_log.zip | grep -q "Zip archive"; then
@@ -424,9 +435,20 @@ jobs:
             exit 0
           fi
           for JOB_ID in $FAILED_JOBS; do
-            # Download logs (may be ZIP or plain text depending on GitHub API)
+            # First check job metadata for runner communication errors
+            # This catches "The operation was canceled" which appears in annotations, not logs
+            JOB_INFO=$(gh api "/repos/${{ github.repository }}/actions/jobs/${JOB_ID}" 2>/dev/null || true)
+            if echo "$JOB_INFO" | grep -qiE "operation was canceled|runner.*lost|lost communication"; then
+              echo "Detected: Runner lost communication or operation canceled (job $JOB_ID)"
+              SPOT_TERMINATION=true
+              break
+            fi
+
+            # Try to download logs - if we can't, likely infrastructure failure
             if ! gh api "/repos/${{ github.repository }}/actions/jobs/${JOB_ID}/logs" > job_log.zip 2>/dev/null; then
-              continue
+              echo "Detected: Cannot download logs, likely infrastructure failure (job $JOB_ID)"
+              SPOT_TERMINATION=true
+              break
             fi
             # Try to unzip if it's a ZIP file, otherwise use as-is
             if file job_log.zip | grep -q "Zip archive"; then
@@ -576,9 +598,20 @@ jobs:
             exit 0
           fi
           for JOB_ID in $FAILED_JOBS; do
-            # Download logs (may be ZIP or plain text depending on GitHub API)
+            # First check job metadata for runner communication errors
+            # This catches "The operation was canceled" which appears in annotations, not logs
+            JOB_INFO=$(gh api "/repos/${{ github.repository }}/actions/jobs/${JOB_ID}" 2>/dev/null || true)
+            if echo "$JOB_INFO" | grep -qiE "operation was canceled|runner.*lost|lost communication"; then
+              echo "Detected: Runner lost communication or operation canceled (job $JOB_ID)"
+              SPOT_TERMINATION=true
+              break
+            fi
+
+            # Try to download logs - if we can't, likely infrastructure failure
             if ! gh api "/repos/${{ github.repository }}/actions/jobs/${JOB_ID}/logs" > job_log.zip 2>/dev/null; then
-              continue
+              echo "Detected: Cannot download logs, likely infrastructure failure (job $JOB_ID)"
+              SPOT_TERMINATION=true
+              break
             fi
             # Try to unzip if it's a ZIP file, otherwise use as-is
             if file job_log.zip | grep -q "Zip archive"; then

From ccad1756c4efc1e609d31d5b126af88bc0e6bd82 Mon Sep 17 00:00:00 2001
From: Yong Wu <yongcale@gmail.com>
Date: Fri, 30 Jan 2026 13:48:49 -0800
Subject: [PATCH 6/9] ci: add gate dependency to B200/H100 jobs for
 authorization check

---
 .github/workflows/pr-test.yml | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml
index cc49ba1706..0d6404df23 100644
--- a/.github/workflows/pr-test.yml
+++ b/.github/workflows/pr-test.yml
@@ -683,8 +683,11 @@ jobs:
   # ---------------------------------------------------------------------------
   gpu-tests-b200:
     name: JIT Unittest (B200)
-    needs: setup
-    if: needs.setup.outputs.skip_build != 'true' && github.event.inputs.skip_gpu != 'true'
+    needs: [gate, setup]
+    if: |
+      needs.gate.outputs.authorized == 'true' &&
+      needs.setup.outputs.skip_build != 'true' &&
+      github.event.inputs.skip_gpu != 'true'
     runs-on: [self-hosted, Linux, X64, gpu, b200]
     timeout-minutes: 360
     env:
@@ -732,8 +735,11 @@ jobs:
   # ---------------------------------------------------------------------------
   gpu-tests-h100:
     name: JIT Unittest (H100)
-    needs: setup
-    if: needs.setup.outputs.skip_build != 'true' && github.event.inputs.skip_gpu != 'true'
+    needs: [gate, setup]
+    if: |
+      needs.gate.outputs.authorized == 'true' &&
+      needs.setup.outputs.skip_build != 'true' &&
+      github.event.inputs.skip_gpu != 'true'
     runs-on: [self-hosted, Linux, X64, gpu, h100]
     timeout-minutes: 360
     env:

From 335e0937715a5fa53af5ae8c7dcc30d89b30f0d9 Mon Sep 17 00:00:00 2001
From: Yong Wu <yongcale@gmail.com>
Date: Fri, 30 Jan 2026 22:14:55 -0800
Subject: [PATCH 7/9] ci: lowercase runner labels; add 1gpu to B200/H100 jobs

---
 .github/workflows/pr-test.yml | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml
index 0d6404df23..9c9ac2a015 100644
--- a/.github/workflows/pr-test.yml
+++ b/.github/workflows/pr-test.yml
@@ -184,7 +184,7 @@ jobs:
       github.event.inputs.skip_aot != 'true'
     runs-on:
       - self-hosted
-      - Linux
+      - linux
       - ${{ matrix.arch }}
       - cpu
       - spot
@@ -192,7 +192,7 @@ jobs:
     strategy:
       fail-fast: true
       matrix:
-        arch: [X64, ARM64]
+        arch: [x64, arm64]
         cuda: [cu126, cu128, cu129, cu130]
     env:
       DOCKER_IMAGE: flashinfer/flashinfer-ci-${{ matrix.cuda }}:${{ needs.setup.outputs.docker_tag }}
@@ -296,7 +296,7 @@ jobs:
         if: steps.analyze.outputs.is_spot_termination == 'true'
         run: |
           MATRIX='{"include":['
-          for arch in X64 ARM64; do
+          for arch in x64 arm64; do
             for cuda in cu126 cu128 cu129 cu130; do
               MATRIX+='{"arch":"'$arch'","cuda":"'$cuda'"},'
             done
@@ -313,7 +313,7 @@ jobs:
       needs.analyze-aot-failure.outputs.rerun_matrix != ''
     runs-on:
       - self-hosted
-      - Linux
+      - linux
       - ${{ matrix.arch }}
       - cpu
       - on-demand
@@ -366,7 +366,7 @@ jobs:
       needs.gate.outputs.authorized == 'true' &&
       needs.setup.outputs.skip_build != 'true' &&
       github.event.inputs.skip_gpu != 'true'
-    runs-on: [self-hosted, Linux, X64, gpu, sm86, spot]
+    runs-on: [self-hosted, linux, x64, gpu, sm86, spot]
     timeout-minutes: 360
     strategy:
       fail-fast: true
@@ -483,7 +483,7 @@ jobs:
       !cancelled() &&
       needs.analyze-gpu-a10g-failure.outputs.is_spot_termination == 'true' &&
       needs.analyze-gpu-a10g-failure.outputs.rerun_matrix != ''
-    runs-on: [self-hosted, Linux, X64, gpu, sm86, on-demand]
+    runs-on: [self-hosted, linux, x64, gpu, sm86, on-demand]
     timeout-minutes: 360
     strategy:
       fail-fast: true
@@ -534,7 +534,7 @@ jobs:
       needs.gate.outputs.authorized == 'true' &&
       needs.setup.outputs.skip_build != 'true' &&
       github.event.inputs.skip_gpu != 'true'
-    runs-on: [self-hosted, Linux, X64, gpu, sm75, spot]
+    runs-on: [self-hosted, linux, x64, gpu, sm75, spot]
     timeout-minutes: 360
     env:
       DOCKER_IMAGE: flashinfer/flashinfer-ci-cu129:${{ needs.setup.outputs.docker_tag }}
@@ -639,7 +639,7 @@ jobs:
     if: |
       !cancelled() &&
       needs.analyze-gpu-t4-failure.outputs.is_spot_termination == 'true'
-    runs-on: [self-hosted, Linux, X64, gpu, sm75, on-demand]
+    runs-on: [self-hosted, linux, x64, gpu, sm75, on-demand]
     timeout-minutes: 360
     env:
       DOCKER_IMAGE: flashinfer/flashinfer-ci-cu129:${{ needs.setup.outputs.docker_tag }}
@@ -679,7 +679,7 @@ jobs:
 
   # ---------------------------------------------------------------------------
   # GPU JIT Tests - B200 (Blackwell) - Capacity Block
-  # Triggers dynamic CB purchase via EventBridge → CB Manager Lambda
+  # Requires manually purchased CB via AWS Console
   # ---------------------------------------------------------------------------
   gpu-tests-b200:
     name: JIT Unittest (B200)
@@ -688,7 +688,7 @@ jobs:
       needs.gate.outputs.authorized == 'true' &&
       needs.setup.outputs.skip_build != 'true' &&
       github.event.inputs.skip_gpu != 'true'
-    runs-on: [self-hosted, Linux, X64, gpu, b200]
+    runs-on: [self-hosted, linux, x64, gpu, b200, 1gpu]
     timeout-minutes: 360
     env:
       DOCKER_IMAGE: flashinfer/flashinfer-ci-cu130:${{ needs.setup.outputs.docker_tag }}
@@ -731,7 +731,7 @@ jobs:
 
   # ---------------------------------------------------------------------------
   # GPU JIT Tests - H100 (Hopper) - Capacity Block
-  # Triggers dynamic CB purchase via EventBridge → CB Manager Lambda
+  # Requires manually purchased CB via AWS Console
   # ---------------------------------------------------------------------------
   gpu-tests-h100:
     name: JIT Unittest (H100)
@@ -740,7 +740,7 @@ jobs:
       needs.gate.outputs.authorized == 'true' &&
       needs.setup.outputs.skip_build != 'true' &&
       github.event.inputs.skip_gpu != 'true'
-    runs-on: [self-hosted, Linux, X64, gpu, h100]
+    runs-on: [self-hosted, linux, x64, gpu, h100, 1gpu]
     timeout-minutes: 360
     env:
       DOCKER_IMAGE: flashinfer/flashinfer-ci-cu129:${{ needs.setup.outputs.docker_tag }}

From bb247c3c5dadba14e359d8f04ddc4b022abee7be Mon Sep 17 00:00:00 2001
From: Yong Wu <yongcale@gmail.com>
Date: Sat, 31 Jan 2026 07:29:53 -0800
Subject: [PATCH 8/9] Replace docker system prune with targeted cleanup to
 preserve cached images

---
 .github/workflows/pr-test.yml | 24 ++++++++++++++++--------
 1 file changed, 16 insertions(+), 8 deletions(-)

diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml
index 9c9ac2a015..406153b6e8 100644
--- a/.github/workflows/pr-test.yml
+++ b/.github/workflows/pr-test.yml
@@ -206,7 +206,8 @@ jobs:
           sudo rm -rf ${{ github.workspace }}/* || true
           sudo rm -rf ${{ github.workspace }}/.[!.]* || true
           rm -rf ~/.cache/flashinfer_jit || true
-          docker system prune -f || true
+          docker image prune -f || true
+          docker builder prune -f --filter "until=24h" || true
 
       - uses: actions/checkout@v4
         with:
@@ -333,7 +334,8 @@ jobs:
           sudo rm -rf ${{ github.workspace }}/* || true
           sudo rm -rf ${{ github.workspace }}/.[!.]* || true
           rm -rf ~/.cache/flashinfer_jit || true
-          docker system prune -f || true
+          docker image prune -f || true
+          docker builder prune -f --filter "until=24h" || true
 
       - uses: actions/checkout@v4
         with:
@@ -384,7 +386,8 @@ jobs:
           sudo rm -rf ${{ github.workspace }}/* || true
           sudo rm -rf ${{ github.workspace }}/.[!.]* || true
           rm -rf ~/.cache/flashinfer_jit || true
-          docker system prune -f || true
+          docker image prune -f || true
+          docker builder prune -f --filter "until=24h" || true
           nvidia-smi || true
 
       - uses: actions/checkout@v4
@@ -500,7 +503,8 @@ jobs:
           sudo rm -rf ${{ github.workspace }}/* || true
           sudo rm -rf ${{ github.workspace }}/.[!.]* || true
           rm -rf ~/.cache/flashinfer_jit || true
-          docker system prune -f || true
+          docker image prune -f || true
+          docker builder prune -f --filter "until=24h" || true
           nvidia-smi || true
 
       - uses: actions/checkout@v4
@@ -548,7 +552,8 @@ jobs:
           sudo rm -rf ${{ github.workspace }}/* || true
           sudo rm -rf ${{ github.workspace }}/.[!.]* || true
           rm -rf ~/.cache/flashinfer_jit || true
-          docker system prune -f || true
+          docker image prune -f || true
+          docker builder prune -f --filter "until=24h" || true
           nvidia-smi || true
 
       - uses: actions/checkout@v4
@@ -653,7 +658,8 @@ jobs:
           sudo rm -rf ${{ github.workspace }}/* || true
           sudo rm -rf ${{ github.workspace }}/.[!.]* || true
           rm -rf ~/.cache/flashinfer_jit || true
-          docker system prune -f || true
+          docker image prune -f || true
+          docker builder prune -f --filter "until=24h" || true
           nvidia-smi || true
 
       - uses: actions/checkout@v4
@@ -702,7 +708,8 @@ jobs:
           sudo rm -rf ${{ github.workspace }}/* || true
           sudo rm -rf ${{ github.workspace }}/.[!.]* || true
           rm -rf ~/.cache/flashinfer_jit || true
-          docker system prune -f || true
+          docker image prune -f || true
+          docker builder prune -f --filter "until=24h" || true
           # Show GPU info (should show 1 GPU due to CUDA_VISIBLE_DEVICES)
           echo "=== GPU Info ==="
           nvidia-smi || true
@@ -754,7 +761,8 @@ jobs:
           sudo rm -rf ${{ github.workspace }}/* || true
           sudo rm -rf ${{ github.workspace }}/.[!.]* || true
           rm -rf ~/.cache/flashinfer_jit || true
-          docker system prune -f || true
+          docker image prune -f || true
+          docker builder prune -f --filter "until=24h" || true
           # Show GPU info (should show 1 GPU due to CUDA_VISIBLE_DEVICES)
           echo "=== GPU Info ==="
           nvidia-smi || true

From 3dd308ac91c3d7933c5c3f017c2f8a590415671d Mon Sep 17 00:00:00 2001
From: Yong Wu <yongcale@gmail.com>
Date: Sun, 1 Feb 2026 16:56:03 -0800
Subject: [PATCH 9/9] remove Docker login, extract spot analysis to script

---
 .github/workflows/pr-test.yml | 204 ++++------------------------------
 scripts/task_analyze_spot.sh  |  85 ++++++++++++++
 2 files changed, 106 insertions(+), 183 deletions(-)
 create mode 100644 scripts/task_analyze_spot.sh

diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml
index 406153b6e8..605ec0a568 100644
--- a/.github/workflows/pr-test.yml
+++ b/.github/workflows/pr-test.yml
@@ -216,12 +216,6 @@ jobs:
       - name: Start spot termination monitor
         run: ./scripts/task_monitor_spot.sh &
 
-      - name: Login to Docker Hub
-        uses: docker/login-action@v3
-        with:
-          username: flashinfer
-          password: ${{ secrets.DOCKERHUB_TOKEN }}
-        continue-on-error: true
 
       - name: Show Node Info
         run: ./scripts/task_show_node_info.sh
@@ -242,55 +236,17 @@ jobs:
       is_spot_termination: ${{ steps.analyze.outputs.is_spot_termination }}
       rerun_matrix: ${{ steps.matrix.outputs.rerun_matrix }}
     steps:
+      - name: Checkout scripts
+        uses: actions/checkout@v4
+        with:
+          sparse-checkout: scripts
+          sparse-checkout-cone-mode: false
+
       - name: Analyze failure from job logs
         id: analyze
         env:
           GH_TOKEN: ${{ github.token }}
-        run: |
-          RUN_ID="${{ github.run_id }}"
-          SPOT_TERMINATION=false
-          # Include both failed and cancelled jobs (spot termination can cause either)
-          FAILED_JOBS=$(gh api "/repos/${{ github.repository }}/actions/runs/${RUN_ID}/jobs?per_page=100" \
-            --jq '.jobs[] | select(.name | startswith("AOT")) | select(.conclusion == "failure" or .conclusion == "cancelled") | .id')
-          if [ -z "$FAILED_JOBS" ]; then
-            echo "is_spot_termination=false" >> $GITHUB_OUTPUT
-            exit 0
-          fi
-          for JOB_ID in $FAILED_JOBS; do
-            # First check job metadata for runner communication errors
-            # This catches "The operation was canceled" which appears in annotations, not logs
-            JOB_INFO=$(gh api "/repos/${{ github.repository }}/actions/jobs/${JOB_ID}" 2>/dev/null || true)
-            if echo "$JOB_INFO" | grep -qiE "operation was canceled|runner.*lost|lost communication"; then
-              echo "Detected: Runner lost communication or operation canceled (job $JOB_ID)"
-              SPOT_TERMINATION=true
-              break
-            fi
-
-            # Try to download logs - if we can't, likely infrastructure failure
-            if ! gh api "/repos/${{ github.repository }}/actions/jobs/${JOB_ID}/logs" > job_log.zip 2>/dev/null; then
-              echo "Detected: Cannot download logs, likely infrastructure failure (job $JOB_ID)"
-              SPOT_TERMINATION=true
-              break
-            fi
-            # Try to unzip if it's a ZIP file, otherwise use as-is
-            if file job_log.zip | grep -q "Zip archive"; then
-              unzip -p job_log.zip > job_log.txt 2>/dev/null || mv job_log.zip job_log.txt
-            else
-              mv job_log.zip job_log.txt
-            fi
-            if grep -q "FLASHINFER_SPOT_TERMINATION_DETECTED" job_log.txt; then
-              echo "Detected: AWS spot termination marker (job $JOB_ID)"
-              SPOT_TERMINATION=true
-              break
-            fi
-            if grep -qiE "connection reset by peer|context canceled|grpc.*closing|The self-hosted runner.*lost" job_log.txt; then
-              echo "Detected: infrastructure error pattern (job $JOB_ID)"
-              SPOT_TERMINATION=true
-              break
-            fi
-          done
-          echo "is_spot_termination=$SPOT_TERMINATION"
-          echo "is_spot_termination=$SPOT_TERMINATION" >> $GITHUB_OUTPUT
+        run: bash ./scripts/task_analyze_spot.sh 'startswith("AOT")' '${{ github.repository }}' '${{ github.run_id }}'  # run via bash: this patch adds the script with mode 100644 (not executable)
 
       - name: Build rerun matrix
         id: matrix
@@ -341,12 +297,6 @@ jobs:
         with:
           submodules: recursive
 
-      - name: Login to Docker Hub
-        uses: docker/login-action@v3
-        with:
-          username: flashinfer
-          password: ${{ secrets.DOCKERHUB_TOKEN }}
-        continue-on-error: true
 
       - name: Show Node Info
         run: ./scripts/task_show_node_info.sh
@@ -397,12 +347,6 @@ jobs:
       - name: Start spot termination monitor
         run: ./scripts/task_monitor_spot.sh &
 
-      - name: Login to Docker Hub
-        uses: docker/login-action@v3
-        with:
-          username: flashinfer
-          password: ${{ secrets.DOCKERHUB_TOKEN }}
-        continue-on-error: true
 
       - name: Show Node Info
         run: ./scripts/task_show_node_info.sh
@@ -423,55 +367,17 @@ jobs:
       is_spot_termination: ${{ steps.analyze.outputs.is_spot_termination }}
       rerun_matrix: ${{ steps.matrix.outputs.rerun_matrix }}
     steps:
+      - name: Checkout scripts
+        uses: actions/checkout@v4
+        with:
+          sparse-checkout: scripts
+          sparse-checkout-cone-mode: false
+
       - name: Analyze failure from job logs
         id: analyze
         env:
           GH_TOKEN: ${{ github.token }}
-        run: |
-          RUN_ID="${{ github.run_id }}"
-          SPOT_TERMINATION=false
-          # Include both failed and cancelled jobs (spot termination can cause either)
-          FAILED_JOBS=$(gh api "/repos/${{ github.repository }}/actions/runs/${RUN_ID}/jobs?per_page=100" \
-            --jq '.jobs[] | select(.name | contains("A10G")) | select(.conclusion == "failure" or .conclusion == "cancelled") | .id')
-          if [ -z "$FAILED_JOBS" ]; then
-            echo "is_spot_termination=false" >> $GITHUB_OUTPUT
-            exit 0
-          fi
-          for JOB_ID in $FAILED_JOBS; do
-            # First check job metadata for runner communication errors
-            # This catches "The operation was canceled" which appears in annotations, not logs
-            JOB_INFO=$(gh api "/repos/${{ github.repository }}/actions/jobs/${JOB_ID}" 2>/dev/null || true)
-            if echo "$JOB_INFO" | grep -qiE "operation was canceled|runner.*lost|lost communication"; then
-              echo "Detected: Runner lost communication or operation canceled (job $JOB_ID)"
-              SPOT_TERMINATION=true
-              break
-            fi
-
-            # Try to download logs - if we can't, likely infrastructure failure
-            if ! gh api "/repos/${{ github.repository }}/actions/jobs/${JOB_ID}/logs" > job_log.zip 2>/dev/null; then
-              echo "Detected: Cannot download logs, likely infrastructure failure (job $JOB_ID)"
-              SPOT_TERMINATION=true
-              break
-            fi
-            # Try to unzip if it's a ZIP file, otherwise use as-is
-            if file job_log.zip | grep -q "Zip archive"; then
-              unzip -p job_log.zip > job_log.txt 2>/dev/null || mv job_log.zip job_log.txt
-            else
-              mv job_log.zip job_log.txt
-            fi
-            if grep -q "FLASHINFER_SPOT_TERMINATION_DETECTED" job_log.txt; then
-              echo "Detected: AWS spot termination marker (job $JOB_ID)"
-              SPOT_TERMINATION=true
-              break
-            fi
-            if grep -qiE "connection reset by peer|context canceled|grpc.*closing|The self-hosted runner.*lost" job_log.txt; then
-              echo "Detected: infrastructure error pattern (job $JOB_ID)"
-              SPOT_TERMINATION=true
-              break
-            fi
-          done
-          echo "is_spot_termination=$SPOT_TERMINATION"
-          echo "is_spot_termination=$SPOT_TERMINATION" >> $GITHUB_OUTPUT
+        run: ./scripts/task_analyze_spot.sh 'contains("A10G")' '${{ github.repository }}' '${{ github.run_id }}'
 
       - name: Build rerun matrix
         id: matrix
@@ -511,12 +417,6 @@ jobs:
         with:
           submodules: recursive
 
-      - name: Login to Docker Hub
-        uses: docker/login-action@v3
-        with:
-          username: flashinfer
-          password: ${{ secrets.DOCKERHUB_TOKEN }}
-        continue-on-error: true
 
       - name: Show Node Info
         run: ./scripts/task_show_node_info.sh
@@ -563,12 +463,6 @@ jobs:
       - name: Start spot termination monitor
         run: ./scripts/task_monitor_spot.sh &
 
-      - name: Login to Docker Hub
-        uses: docker/login-action@v3
-        with:
-          username: flashinfer
-          password: ${{ secrets.DOCKERHUB_TOKEN }}
-        continue-on-error: true
 
       - name: Show Node Info
         run: ./scripts/task_show_node_info.sh
@@ -588,55 +482,17 @@ jobs:
     outputs:
       is_spot_termination: ${{ steps.analyze.outputs.is_spot_termination }}
     steps:
+      - name: Checkout scripts
+        uses: actions/checkout@v4
+        with:
+          sparse-checkout: scripts
+          sparse-checkout-cone-mode: false
+
       - name: Analyze failure from job logs
         id: analyze
         env:
           GH_TOKEN: ${{ github.token }}
-        run: |
-          RUN_ID="${{ github.run_id }}"
-          SPOT_TERMINATION=false
-          # Include both failed and cancelled jobs (spot termination can cause either)
-          FAILED_JOBS=$(gh api "/repos/${{ github.repository }}/actions/runs/${RUN_ID}/jobs?per_page=100" \
-            --jq '.jobs[] | select(.name | contains("T4")) | select(.conclusion == "failure" or .conclusion == "cancelled") | .id')
-          if [ -z "$FAILED_JOBS" ]; then
-            echo "is_spot_termination=false" >> $GITHUB_OUTPUT
-            exit 0
-          fi
-          for JOB_ID in $FAILED_JOBS; do
-            # First check job metadata for runner communication errors
-            # This catches "The operation was canceled" which appears in annotations, not logs
-            JOB_INFO=$(gh api "/repos/${{ github.repository }}/actions/jobs/${JOB_ID}" 2>/dev/null || true)
-            if echo "$JOB_INFO" | grep -qiE "operation was canceled|runner.*lost|lost communication"; then
-              echo "Detected: Runner lost communication or operation canceled (job $JOB_ID)"
-              SPOT_TERMINATION=true
-              break
-            fi
-
-            # Try to download logs - if we can't, likely infrastructure failure
-            if ! gh api "/repos/${{ github.repository }}/actions/jobs/${JOB_ID}/logs" > job_log.zip 2>/dev/null; then
-              echo "Detected: Cannot download logs, likely infrastructure failure (job $JOB_ID)"
-              SPOT_TERMINATION=true
-              break
-            fi
-            # Try to unzip if it's a ZIP file, otherwise use as-is
-            if file job_log.zip | grep -q "Zip archive"; then
-              unzip -p job_log.zip > job_log.txt 2>/dev/null || mv job_log.zip job_log.txt
-            else
-              mv job_log.zip job_log.txt
-            fi
-            if grep -q "FLASHINFER_SPOT_TERMINATION_DETECTED" job_log.txt; then
-              echo "Detected: AWS spot termination marker (job $JOB_ID)"
-              SPOT_TERMINATION=true
-              break
-            fi
-            if grep -qiE "connection reset by peer|context canceled|grpc.*closing|The self-hosted runner.*lost" job_log.txt; then
-              echo "Detected: infrastructure error pattern (job $JOB_ID)"
-              SPOT_TERMINATION=true
-              break
-            fi
-          done
-          echo "is_spot_termination=$SPOT_TERMINATION"
-          echo "is_spot_termination=$SPOT_TERMINATION" >> $GITHUB_OUTPUT
+        run: ./scripts/task_analyze_spot.sh 'contains("T4")' '${{ github.repository }}' '${{ github.run_id }}'
 
   gpu-tests-t4-rerun:
     name: JIT Rerun (T4)
@@ -666,12 +522,6 @@ jobs:
         with:
           submodules: recursive
 
-      - name: Login to Docker Hub
-        uses: docker/login-action@v3
-        with:
-          username: flashinfer
-          password: ${{ secrets.DOCKERHUB_TOKEN }}
-        continue-on-error: true
 
       - name: Show Node Info
         run: ./scripts/task_show_node_info.sh
@@ -719,12 +569,6 @@ jobs:
         with:
           submodules: recursive
 
-      - name: Login to Docker Hub
-        uses: docker/login-action@v3
-        with:
-          username: flashinfer
-          password: ${{ secrets.DOCKERHUB_TOKEN }}
-        continue-on-error: true
 
       - name: Show Node Info
         run: ./scripts/task_show_node_info.sh
@@ -772,12 +616,6 @@ jobs:
         with:
           submodules: recursive
 
-      - name: Login to Docker Hub
-        uses: docker/login-action@v3
-        with:
-          username: flashinfer
-          password: ${{ secrets.DOCKERHUB_TOKEN }}
-        continue-on-error: true
 
       - name: Show Node Info
         run: ./scripts/task_show_node_info.sh
diff --git a/scripts/task_analyze_spot.sh b/scripts/task_analyze_spot.sh
new file mode 100644
index 0000000000..5116104ca0
--- /dev/null
+++ b/scripts/task_analyze_spot.sh
@@ -0,0 +1,108 @@
+#!/bin/bash
+# Copyright (c) 2026 by FlashInfer team.
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Decide whether failed or cancelled jobs of a workflow run look like a spot
+# termination / infrastructure failure rather than a genuine test failure.
+#
+# Usage: task_analyze_spot.sh <job_filter> <repository> <run_id>
+#   job_filter  jq expression applied to each job name, e.g. 'contains("T4")'
+#   repository  owner/name of the repository that owns the run
+#   run_id      ID of the workflow run whose jobs are inspected
+#
+# Needs an authenticated gh CLI. Prints is_spot_termination=<true|false> and
+# appends the same line to $GITHUB_OUTPUT when that variable is set.
+
+set -euo pipefail
+
+JOB_FILTER="${1:-}"
+REPOSITORY="${2:-}"
+RUN_ID="${3:-}"
+
+if [ -z "$JOB_FILTER" ] || [ -z "$REPOSITORY" ] || [ -z "$RUN_ID" ]; then
+  echo "Usage: $0 <job_filter> <repository> <run_id>" >&2
+  echo "Example: $0 'startswith(\"AOT\")' 'flashinfer-ai/flashinfer' '12345'" >&2
+  exit 1
+fi
+
+# Fall back to /dev/null so the script also runs outside GitHub Actions, where
+# GITHUB_OUTPUT is unset and "set -u" would otherwise abort on the last line.
+OUTPUT_FILE="${GITHUB_OUTPUT:-/dev/null}"
+
+SPOT_TERMINATION=false
+
+# Private scratch directory for job logs (removed on exit). mktemp avoids the
+# clashes a fixed /tmp path could cause if two runs share the same host.
+WORK_DIR="$(mktemp -d)"
+LOG_FILE="${WORK_DIR}/job_log.txt"
+LOG_ARCHIVE="${WORK_DIR}/job_log.zip"
+cleanup() { rm -rf "$WORK_DIR"; }
+trap cleanup EXIT
+
+# Include both failed and cancelled jobs (spot termination can cause either).
+# --paginate follows every result page, so runs with more than 100 jobs are
+# covered as well.
+FAILED_JOBS=$(gh api --paginate "/repos/${REPOSITORY}/actions/runs/${RUN_ID}/jobs?per_page=100" \
+  --jq ".jobs[] | select(.name | ${JOB_FILTER}) | select(.conclusion == \"failure\" or .conclusion == \"cancelled\") | .id")
+
+if [ -z "$FAILED_JOBS" ]; then
+  echo "No failed jobs matching filter: ${JOB_FILTER}"
+  echo "is_spot_termination=false" >> "$OUTPUT_FILE"
+  exit 0
+fi
+
+for JOB_ID in $FAILED_JOBS; do
+  # First check job metadata for runner communication errors
+  # This catches "The operation was canceled" which appears in annotations, not logs
+  JOB_INFO=$(gh api "/repos/${REPOSITORY}/actions/jobs/${JOB_ID}" 2>/dev/null || true)
+  # Feed grep through a here-string: with pipefail, "echo ... | grep -q" can
+  # report failure on a match when grep exits early and echo gets SIGPIPE.
+  if grep -qiE "operation was canceled|runner.*lost|lost communication" <<< "$JOB_INFO"; then
+    echo "Detected: Runner lost communication or operation canceled (job $JOB_ID)"
+    SPOT_TERMINATION=true
+    break
+  fi
+
+  # Try to download the job log; a failed download is treated as an
+  # infrastructure failure
+  if ! gh api "/repos/${REPOSITORY}/actions/jobs/${JOB_ID}/logs" > "$LOG_ARCHIVE" 2>/dev/null; then
+    echo "Detected: Cannot download logs, likely infrastructure failure (job $JOB_ID)"
+    SPOT_TERMINATION=true
+    break
+  fi
+
+  # Handle both zip and plain text log formats
+  if file "$LOG_ARCHIVE" | grep -q "Zip archive"; then
+    unzip -p "$LOG_ARCHIVE" > "$LOG_FILE" 2>/dev/null || mv "$LOG_ARCHIVE" "$LOG_FILE"
+  else
+    mv "$LOG_ARCHIVE" "$LOG_FILE"
+  fi
+
+  # Check for spot termination marker from task_monitor_spot.sh
+  if grep -q "FLASHINFER_SPOT_TERMINATION_DETECTED" "$LOG_FILE"; then
+    echo "Detected: AWS spot termination marker (job $JOB_ID)"
+    SPOT_TERMINATION=true
+    break
+  fi
+
+  # Check for infrastructure error patterns
+  if grep -qiE "connection reset by peer|context canceled|grpc.*closing|The self-hosted runner.*lost" "$LOG_FILE"; then
+    echo "Detected: infrastructure error pattern (job $JOB_ID)"
+    SPOT_TERMINATION=true
+    break
+  fi
+done
+
+echo "is_spot_termination=$SPOT_TERMINATION"
+echo "is_spot_termination=$SPOT_TERMINATION" >> "$OUTPUT_FILE"