From 05455c0ce6ad4b020508ae5aae1e773ac0052f8a Mon Sep 17 00:00:00 2001
From: Yong Wu <yongcale@gmail.com>
Date: Sun, 8 Feb 2026 13:56:14 -0800
Subject: [PATCH] Revert "ci: refactor PR tests to hide failed spot jobs from
 PR status (#2500)"

This reverts commit d5eaa429b1c2c3cc51fe078028551fef10ca9cc9.
---
 .github/workflows/pr-test-runner.yml | 674 --------------------------
 .github/workflows/pr-test.yml        | 685 ++++++++++++++++++++++-----
 2 files changed, 556 insertions(+), 803 deletions(-)
 delete mode 100644 .github/workflows/pr-test-runner.yml

diff --git a/.github/workflows/pr-test-runner.yml b/.github/workflows/pr-test-runner.yml
deleted file mode 100644
index 125e9ebb5b..0000000000
--- a/.github/workflows/pr-test-runner.yml
+++ /dev/null
@@ -1,674 +0,0 @@
-# PR Test Runner - Runs tests and updates check runs.
-# Triggered by pr-test.yml via workflow_dispatch. Not visible on PR status.
-
-name: PR Test Runner
-
-on:
-  workflow_dispatch:
-    inputs:
-      pr_head_sha:
-        description: 'PR head SHA for check run updates'
-        required: true
-        type: string
-      docker_tag:
-        description: 'Docker image tag'
-        required: true
-        type: string
-      aot_check_id:
-        description: 'AOT Build check run ID'
-        required: false
-        type: string
-      gpu_a10g_check_id:
-        description: 'GPU A10G check run ID'
-        required: false
-        type: string
-      gpu_t4_check_id:
-        description: 'GPU T4 check run ID'
-        required: false
-        type: string
-      summary_check_id:
-        description: 'Test Results Summary check run ID'
-        required: false
-        type: string
-      skip_aot:
-        description: 'Skip AOT build tests'
-        required: false
-        type: string
-        default: 'false'
-      skip_gpu:
-        description: 'Skip GPU tests'
-        required: false
-        type: string
-        default: 'false'
-      concurrency_key:
-        description: 'Concurrency group key for cancelling outdated runs'
-        required: false
-        type: string
-
-permissions:
-  contents: read
-  actions: read
-
-concurrency:
-  group: ${{ inputs.concurrency_key || github.run_id }}
-  cancel-in-progress: true
-
-env:
-  EXECUTOR_NUMBER: "0"
-
-jobs:
-  aot-build-import:
-    name: AOT Build Import (${{ matrix.arch }}, ${{ matrix.cuda }})
-    if: inputs.skip_aot != 'true'
-    runs-on:
-      - self-hosted
-      - Linux
-      - ${{ matrix.arch }}
-      - cpu
-      - spot
-    timeout-minutes: 360
-    strategy:
-      fail-fast: true
-      matrix:
-        arch: [X64, ARM64]
-        cuda: [cu126, cu128, cu129, cu130]
-    env:
-      DOCKER_IMAGE: flashinfer/flashinfer-ci-${{ matrix.cuda }}:${{ inputs.docker_tag }}
-    steps:
-      - name: Cleanup
-        run: |
-          docker stop $(docker ps -q) 2>/dev/null || true
-          docker rm $(docker ps -aq) 2>/dev/null || true
-          sudo rm -rf ${{ github.workspace }}/* || true
-          sudo rm -rf ${{ github.workspace }}/.[!.]* || true
-          rm -rf ~/.cache/flashinfer_jit || true
-          docker system prune -f || true
-
-      - uses: actions/checkout@v4
-        with:
-          ref: ${{ inputs.pr_head_sha }}
-          submodules: recursive
-
-      - name: Start spot termination monitor
-        run: ./scripts/task_monitor_spot.sh &
-
-      - name: Login to Docker Hub
-        uses: docker/login-action@v3
-        with:
-          username: flashinfer
-          password: ${{ secrets.DOCKERHUB_TOKEN }}
-        continue-on-error: true
-
-      - name: Show Node Info
-        run: ./scripts/task_show_node_info.sh
-        env:
-          NODE_NAME: ${{ runner.name }}
-          WORKSPACE: ${{ github.workspace }}
-          BUILD_NUMBER: ${{ github.run_number }}
-
-      - name: Run Test
-        run: bash ci/bash.sh ${DOCKER_IMAGE} --no-gpu ./scripts/task_test_jit_cache_package_build_import.sh
-
-  analyze-aot-failure:
-    name: Analyze AOT Failure
-    needs: aot-build-import
-    if: "!cancelled() && inputs.skip_aot != 'true' && (needs.aot-build-import.result == 'failure' || needs.aot-build-import.result == 'cancelled')"
-    runs-on: ubuntu-latest
-    outputs:
-      is_spot_termination: ${{ steps.analyze.outputs.is_spot_termination }}
-      rerun_matrix: ${{ steps.matrix.outputs.rerun_matrix }}
-    steps:
-      - name: Analyze failure from job logs
-        id: analyze
-        env:
-          GH_TOKEN: ${{ github.token }}
-        run: |
-          RUN_ID="${{ github.run_id }}"
-          SPOT_TERMINATION=false
-          FAILED_JOBS=$(gh api "/repos/${{ github.repository }}/actions/runs/${RUN_ID}/jobs?per_page=100" \
-            --jq '.jobs[] | select(.name | startswith("AOT")) | select(.conclusion == "failure" or .conclusion == "cancelled") | .id')
-          if [ -z "$FAILED_JOBS" ]; then
-            echo "is_spot_termination=false" >> $GITHUB_OUTPUT
-            exit 0
-          fi
-          for JOB_ID in $FAILED_JOBS; do
-            if ! gh api "/repos/${{ github.repository }}/actions/jobs/${JOB_ID}/logs" > job_log.zip 2>/dev/null; then
-              continue
-            fi
-            if file job_log.zip | grep -q "Zip archive"; then
-              unzip -p job_log.zip > job_log.txt 2>/dev/null || mv job_log.zip job_log.txt
-            else
-              mv job_log.zip job_log.txt
-            fi
-            if grep -q "FLASHINFER_SPOT_TERMINATION_DETECTED" job_log.txt; then
-              SPOT_TERMINATION=true
-              break
-            fi
-            if grep -qiE "connection reset by peer|context canceled|The operation was canceled|grpc.*closing|The self-hosted runner.*lost" job_log.txt; then
-              SPOT_TERMINATION=true
-              break
-            fi
-          done
-          echo "is_spot_termination=$SPOT_TERMINATION" >> $GITHUB_OUTPUT
-
-      - name: Build rerun matrix
-        id: matrix
-        if: steps.analyze.outputs.is_spot_termination == 'true'
-        run: |
-          MATRIX='{"include":['
-          for arch in X64 ARM64; do
-            for cuda in cu126 cu128 cu129 cu130; do
-              MATRIX+='{"arch":"'$arch'","cuda":"'$cuda'"},'
-            done
-          done
-          MATRIX="${MATRIX%,}]}"
-          echo "rerun_matrix=$MATRIX" >> $GITHUB_OUTPUT
-
-  aot-build-import-rerun:
-    name: AOT Build Import Rerun (${{ matrix.arch }}, ${{ matrix.cuda }})
-    needs: analyze-aot-failure
-    if: |
-      !cancelled() &&
-      needs.analyze-aot-failure.outputs.is_spot_termination == 'true' &&
-      needs.analyze-aot-failure.outputs.rerun_matrix != ''
-    runs-on:
-      - self-hosted
-      - Linux
-      - ${{ matrix.arch }}
-      - cpu
-      - on-demand
-    timeout-minutes: 360
-    strategy:
-      fail-fast: true
-      matrix: ${{ fromJSON(needs.analyze-aot-failure.outputs.rerun_matrix) }}
-    env:
-      DOCKER_IMAGE: flashinfer/flashinfer-ci-${{ matrix.cuda }}:${{ inputs.docker_tag }}
-    steps:
-      - name: Cleanup
-        run: |
-          docker stop $(docker ps -q) 2>/dev/null || true
-          docker rm $(docker ps -aq) 2>/dev/null || true
-          sudo rm -rf ${{ github.workspace }}/* || true
-          sudo rm -rf ${{ github.workspace }}/.[!.]* || true
-          rm -rf ~/.cache/flashinfer_jit || true
-          docker system prune -f || true
-
-      - uses: actions/checkout@v4
-        with:
-          ref: ${{ inputs.pr_head_sha }}
-          submodules: recursive
-
-      - name: Login to Docker Hub
-        uses: docker/login-action@v3
-        with:
-          username: flashinfer
-          password: ${{ secrets.DOCKERHUB_TOKEN }}
-        continue-on-error: true
-
-      - name: Show Node Info
-        run: ./scripts/task_show_node_info.sh
-        env:
-          NODE_NAME: ${{ runner.name }}
-          WORKSPACE: ${{ github.workspace }}
-          BUILD_NUMBER: ${{ github.run_number }}
-
-      - name: Run Test
-        run: bash ci/bash.sh ${DOCKER_IMAGE} --no-gpu ./scripts/task_test_jit_cache_package_build_import.sh
-
-  gpu-tests-a10g:
-    name: JIT Unittest ${{ matrix.shard }} (A10G)
-    if: inputs.skip_gpu != 'true'
-    runs-on: [self-hosted, Linux, X64, gpu, sm86, spot]
-    timeout-minutes: 360
-    strategy:
-      fail-fast: true
-      matrix:
-        shard: [1, 2, 3, 4, 5]
-    env:
-      DOCKER_IMAGE: flashinfer/flashinfer-ci-cu129:${{ inputs.docker_tag }}
-    steps:
-      - name: Cleanup
-        run: |
-          docker stop $(docker ps -q) 2>/dev/null || true
-          docker rm $(docker ps -aq) 2>/dev/null || true
-          sudo rm -rf ${{ github.workspace }}/* || true
-          sudo rm -rf ${{ github.workspace }}/.[!.]* || true
-          rm -rf ~/.cache/flashinfer_jit || true
-          docker system prune -f || true
-          nvidia-smi || true
-
-      - uses: actions/checkout@v4
-        with:
-          ref: ${{ inputs.pr_head_sha }}
-          submodules: recursive
-
-      - name: Start spot termination monitor
-        run: ./scripts/task_monitor_spot.sh &
-
-      - name: Login to Docker Hub
-        uses: docker/login-action@v3
-        with:
-          username: flashinfer
-          password: ${{ secrets.DOCKERHUB_TOKEN }}
-        continue-on-error: true
-
-      - name: Show Node Info
-        run: ./scripts/task_show_node_info.sh
-        env:
-          NODE_NAME: ${{ runner.name }}
-          WORKSPACE: ${{ github.workspace }}
-          BUILD_NUMBER: ${{ github.run_number }}
-
-      - name: Run JIT Unittest Part ${{ matrix.shard }}
-        run: bash ci/bash.sh ${DOCKER_IMAGE} ./scripts/task_jit_run_tests_part${{ matrix.shard }}.sh
-
-  analyze-gpu-a10g-failure:
-    name: Analyze GPU A10G Failure
-    needs: gpu-tests-a10g
-    if: "!cancelled() && inputs.skip_gpu != 'true' && (needs.gpu-tests-a10g.result == 'failure' || needs.gpu-tests-a10g.result == 'cancelled')"
-    runs-on: ubuntu-latest
-    outputs:
-      is_spot_termination: ${{ steps.analyze.outputs.is_spot_termination }}
-      rerun_matrix: ${{ steps.matrix.outputs.rerun_matrix }}
-    steps:
-      - name: Analyze failure from job logs
-        id: analyze
-        env:
-          GH_TOKEN: ${{ github.token }}
-        run: |
-          RUN_ID="${{ github.run_id }}"
-          SPOT_TERMINATION=false
-          FAILED_JOBS=$(gh api "/repos/${{ github.repository }}/actions/runs/${RUN_ID}/jobs?per_page=100" \
-            --jq '.jobs[] | select(.name | contains("A10G")) | select(.conclusion == "failure" or .conclusion == "cancelled") | .id')
-          if [ -z "$FAILED_JOBS" ]; then
-            echo "is_spot_termination=false" >> $GITHUB_OUTPUT
-            exit 0
-          fi
-          for JOB_ID in $FAILED_JOBS; do
-            if ! gh api "/repos/${{ github.repository }}/actions/jobs/${JOB_ID}/logs" > job_log.zip 2>/dev/null; then
-              continue
-            fi
-            if file job_log.zip | grep -q "Zip archive"; then
-              unzip -p job_log.zip > job_log.txt 2>/dev/null || mv job_log.zip job_log.txt
-            else
-              mv job_log.zip job_log.txt
-            fi
-            if grep -q "FLASHINFER_SPOT_TERMINATION_DETECTED" job_log.txt; then
-              SPOT_TERMINATION=true
-              break
-            fi
-            if grep -qiE "connection reset by peer|context canceled|The operation was canceled|grpc.*closing|The self-hosted runner.*lost" job_log.txt; then
-              SPOT_TERMINATION=true
-              break
-            fi
-          done
-          echo "is_spot_termination=$SPOT_TERMINATION" >> $GITHUB_OUTPUT
-
-      - name: Build rerun matrix
-        id: matrix
-        if: steps.analyze.outputs.is_spot_termination == 'true'
-        run: |
-          echo 'rerun_matrix={"include":[{"shard":1},{"shard":2},{"shard":3},{"shard":4},{"shard":5}]}' >> $GITHUB_OUTPUT
-
-  gpu-tests-a10g-rerun:
-    name: JIT Rerun ${{ matrix.shard }} (A10G)
-    needs: analyze-gpu-a10g-failure
-    if: |
-      !cancelled() &&
-      needs.analyze-gpu-a10g-failure.outputs.is_spot_termination == 'true' &&
-      needs.analyze-gpu-a10g-failure.outputs.rerun_matrix != ''
-    runs-on: [self-hosted, Linux, X64, gpu, sm86, on-demand]
-    timeout-minutes: 360
-    strategy:
-      fail-fast: true
-      matrix: ${{ fromJSON(needs.analyze-gpu-a10g-failure.outputs.rerun_matrix) }}
-    env:
-      DOCKER_IMAGE: flashinfer/flashinfer-ci-cu129:${{ inputs.docker_tag }}
-    steps:
-      - name: Cleanup
-        run: |
-          docker stop $(docker ps -q) 2>/dev/null || true
-          docker rm $(docker ps -aq) 2>/dev/null || true
-          sudo rm -rf ${{ github.workspace }}/* || true
-          sudo rm -rf ${{ github.workspace }}/.[!.]* || true
-          rm -rf ~/.cache/flashinfer_jit || true
-          docker system prune -f || true
-          nvidia-smi || true
-
-      - uses: actions/checkout@v4
-        with:
-          ref: ${{ inputs.pr_head_sha }}
-          submodules: recursive
-
-      - name: Login to Docker Hub
-        uses: docker/login-action@v3
-        with:
-          username: flashinfer
-          password: ${{ secrets.DOCKERHUB_TOKEN }}
-        continue-on-error: true
-
-      - name: Show Node Info
-        run: ./scripts/task_show_node_info.sh
-        env:
-          NODE_NAME: ${{ runner.name }}
-          WORKSPACE: ${{ github.workspace }}
-          BUILD_NUMBER: ${{ github.run_number }}
-
-      - name: Run JIT Unittest Part ${{ matrix.shard }}
-        run: bash ci/bash.sh ${DOCKER_IMAGE} ./scripts/task_jit_run_tests_part${{ matrix.shard }}.sh
-
-  gpu-tests-t4:
-    name: JIT Unittest (T4)
-    if: inputs.skip_gpu != 'true'
-    runs-on: [self-hosted, Linux, X64, gpu, sm75, spot]
-    timeout-minutes: 360
-    env:
-      DOCKER_IMAGE: flashinfer/flashinfer-ci-cu129:${{ inputs.docker_tag }}
-    steps:
-      - name: Cleanup
-        run: |
-          docker stop $(docker ps -q) 2>/dev/null || true
-          docker rm $(docker ps -aq) 2>/dev/null || true
-          sudo rm -rf ${{ github.workspace }}/* || true
-          sudo rm -rf ${{ github.workspace }}/.[!.]* || true
-          rm -rf ~/.cache/flashinfer_jit || true
-          docker system prune -f || true
-          nvidia-smi || true
-
-      - uses: actions/checkout@v4
-        with:
-          ref: ${{ inputs.pr_head_sha }}
-          submodules: recursive
-
-      - name: Start spot termination monitor
-        run: ./scripts/task_monitor_spot.sh &
-
-      - name: Login to Docker Hub
-        uses: docker/login-action@v3
-        with:
-          username: flashinfer
-          password: ${{ secrets.DOCKERHUB_TOKEN }}
-        continue-on-error: true
-
-      - name: Show Node Info
-        run: ./scripts/task_show_node_info.sh
-        env:
-          NODE_NAME: ${{ runner.name }}
-          WORKSPACE: ${{ github.workspace }}
-          BUILD_NUMBER: ${{ github.run_number }}
-
-      - name: Run JIT Unittest Part 3 (T4)
-        run: bash ci/bash.sh ${DOCKER_IMAGE} ./scripts/task_jit_run_tests_part3.sh
-
-  analyze-gpu-t4-failure:
-    name: Analyze GPU T4 Failure
-    needs: gpu-tests-t4
-    if: "!cancelled() && inputs.skip_gpu != 'true' && (needs.gpu-tests-t4.result == 'failure' || needs.gpu-tests-t4.result == 'cancelled')"
-    runs-on: ubuntu-latest
-    outputs:
-      is_spot_termination: ${{ steps.analyze.outputs.is_spot_termination }}
-    steps:
-      - name: Analyze failure from job logs
-        id: analyze
-        env:
-          GH_TOKEN: ${{ github.token }}
-        run: |
-          RUN_ID="${{ github.run_id }}"
-          SPOT_TERMINATION=false
-          FAILED_JOBS=$(gh api "/repos/${{ github.repository }}/actions/runs/${RUN_ID}/jobs?per_page=100" \
-            --jq '.jobs[] | select(.name | contains("T4")) | select(.conclusion == "failure" or .conclusion == "cancelled") | .id')
-          if [ -z "$FAILED_JOBS" ]; then
-            echo "is_spot_termination=false" >> $GITHUB_OUTPUT
-            exit 0
-          fi
-          for JOB_ID in $FAILED_JOBS; do
-            if ! gh api "/repos/${{ github.repository }}/actions/jobs/${JOB_ID}/logs" > job_log.zip 2>/dev/null; then
-              continue
-            fi
-            if file job_log.zip | grep -q "Zip archive"; then
-              unzip -p job_log.zip > job_log.txt 2>/dev/null || mv job_log.zip job_log.txt
-            else
-              mv job_log.zip job_log.txt
-            fi
-            if grep -q "FLASHINFER_SPOT_TERMINATION_DETECTED" job_log.txt; then
-              SPOT_TERMINATION=true
-              break
-            fi
-            if grep -qiE "connection reset by peer|context canceled|The operation was canceled|grpc.*closing|The self-hosted runner.*lost" job_log.txt; then
-              SPOT_TERMINATION=true
-              break
-            fi
-          done
-          echo "is_spot_termination=$SPOT_TERMINATION" >> $GITHUB_OUTPUT
-
-  gpu-tests-t4-rerun:
-    name: JIT Rerun (T4)
-    needs: analyze-gpu-t4-failure
-    if: |
-      !cancelled() &&
-      needs.analyze-gpu-t4-failure.outputs.is_spot_termination == 'true'
-    runs-on: [self-hosted, Linux, X64, gpu, sm75, on-demand]
-    timeout-minutes: 360
-    env:
-      DOCKER_IMAGE: flashinfer/flashinfer-ci-cu129:${{ inputs.docker_tag }}
-    steps:
-      - name: Cleanup
-        run: |
-          docker stop $(docker ps -q) 2>/dev/null || true
-          docker rm $(docker ps -aq) 2>/dev/null || true
-          sudo rm -rf ${{ github.workspace }}/* || true
-          sudo rm -rf ${{ github.workspace }}/.[!.]* || true
-          rm -rf ~/.cache/flashinfer_jit || true
-          docker system prune -f || true
-          nvidia-smi || true
-
-      - uses: actions/checkout@v4
-        with:
-          ref: ${{ inputs.pr_head_sha }}
-          submodules: recursive
-
-      - name: Login to Docker Hub
-        uses: docker/login-action@v3
-        with:
-          username: flashinfer
-          password: ${{ secrets.DOCKERHUB_TOKEN }}
-        continue-on-error: true
-
-      - name: Show Node Info
-        run: ./scripts/task_show_node_info.sh
-        env:
-          NODE_NAME: ${{ runner.name }}
-          WORKSPACE: ${{ github.workspace }}
-          BUILD_NUMBER: ${{ github.run_number }}
-
-      - name: Run JIT Unittest Part 3 (T4)
-        run: bash ci/bash.sh ${DOCKER_IMAGE} ./scripts/task_jit_run_tests_part3.sh
-
-  update-check-runs:
-    name: Update Check Runs
-    if: always()
-    needs:
-      - aot-build-import
-      - analyze-aot-failure
-      - aot-build-import-rerun
-      - gpu-tests-a10g
-      - analyze-gpu-a10g-failure
-      - gpu-tests-a10g-rerun
-      - gpu-tests-t4
-      - analyze-gpu-t4-failure
-      - gpu-tests-t4-rerun
-    runs-on: ubuntu-latest
-    steps:
-      - name: Generate GitHub App Token
-        id: app-token
-        uses: actions/create-github-app-token@v1
-        with:
-          app-id: ${{ secrets.GH_APP_ID }}
-          private-key: ${{ secrets.GH_APP_KEY }}
-          owner: flashinfer-ai
-          repositories: flashinfer
-
-      - name: Update AOT Check Run
-        if: inputs.aot_check_id != '' && inputs.skip_aot != 'true'
-        env:
-          GH_TOKEN: ${{ steps.app-token.outputs.token }}
-          REPO: ${{ github.repository }}
-          CHECK_ID: ${{ inputs.aot_check_id }}
-          AOT_SPOT: ${{ needs.aot-build-import.result }}
-          AOT_SPOT_TERM: ${{ needs.analyze-aot-failure.outputs.is_spot_termination }}
-          AOT_RERUN: ${{ needs.aot-build-import-rerun.result }}
-        run: |
-          if [ "$AOT_SPOT" == "success" ]; then
-            CONCLUSION="success"
-            TITLE="AOT Build Tests Passed"
-            SUMMARY="All AOT build tests passed on spot instances."
-          elif [ "$AOT_SPOT_TERM" == "true" ] && [ "$AOT_RERUN" == "success" ]; then
-            CONCLUSION="success"
-            TITLE="AOT Build Tests Passed (rerun)"
-            SUMMARY="Spot instance was terminated. Rerun on on-demand instances passed."
-          else
-            CONCLUSION="failure"
-            TITLE="AOT Build Tests Failed"
-            SUMMARY="AOT build tests failed. Check the workflow logs for details."
-          fi
-
-          gh api -X PATCH "repos/${REPO}/check-runs/${CHECK_ID}" \
-            -f status="completed" \
-            -f conclusion="$CONCLUSION" \
-            -F output[title]="$TITLE" \
-            -F output[summary]="$SUMMARY"
-
-      - name: Update GPU A10G Check Run
-        if: inputs.gpu_a10g_check_id != '' && inputs.skip_gpu != 'true'
-        env:
-          GH_TOKEN: ${{ steps.app-token.outputs.token }}
-          REPO: ${{ github.repository }}
-          CHECK_ID: ${{ inputs.gpu_a10g_check_id }}
-          GPU_SPOT: ${{ needs.gpu-tests-a10g.result }}
-          GPU_SPOT_TERM: ${{ needs.analyze-gpu-a10g-failure.outputs.is_spot_termination }}
-          GPU_RERUN: ${{ needs.gpu-tests-a10g-rerun.result }}
-        run: |
-          if [ "$GPU_SPOT" == "success" ]; then
-            CONCLUSION="success"
-            TITLE="JIT Unittest (A10G) Passed"
-            SUMMARY="All JIT unittest passed on A10G spot instances."
-          elif [ "$GPU_SPOT_TERM" == "true" ] && [ "$GPU_RERUN" == "success" ]; then
-            CONCLUSION="success"
-            TITLE="JIT Unittest (A10G) Passed (rerun)"
-            SUMMARY="Spot instance was terminated. Rerun on on-demand A10G instances passed."
-          else
-            CONCLUSION="failure"
-            TITLE="JIT Unittest (A10G) Failed"
-            SUMMARY="JIT unittest on A10G failed. Check the workflow logs for details."
-          fi
-
-          gh api -X PATCH "repos/${REPO}/check-runs/${CHECK_ID}" \
-            -f status="completed" \
-            -f conclusion="$CONCLUSION" \
-            -F output[title]="$TITLE" \
-            -F output[summary]="$SUMMARY"
-
-      - name: Update GPU T4 Check Run
-        if: inputs.gpu_t4_check_id != '' && inputs.skip_gpu != 'true'
-        env:
-          GH_TOKEN: ${{ steps.app-token.outputs.token }}
-          REPO: ${{ github.repository }}
-          CHECK_ID: ${{ inputs.gpu_t4_check_id }}
-          GPU_SPOT: ${{ needs.gpu-tests-t4.result }}
-          GPU_SPOT_TERM: ${{ needs.analyze-gpu-t4-failure.outputs.is_spot_termination }}
-          GPU_RERUN: ${{ needs.gpu-tests-t4-rerun.result }}
-        run: |
-          if [ "$GPU_SPOT" == "success" ]; then
-            CONCLUSION="success"
-            TITLE="JIT Unittest (T4) Passed"
-            SUMMARY="All JIT unittest passed on T4 spot instances."
-          elif [ "$GPU_SPOT_TERM" == "true" ] && [ "$GPU_RERUN" == "success" ]; then
-            CONCLUSION="success"
-            TITLE="JIT Unittest (T4) Passed (rerun)"
-            SUMMARY="Spot instance was terminated. Rerun on on-demand T4 instances passed."
-          else
-            CONCLUSION="failure"
-            TITLE="JIT Unittest (T4) Failed"
-            SUMMARY="JIT unittest on T4 failed. Check the workflow logs for details."
-          fi
-
-          gh api -X PATCH "repos/${REPO}/check-runs/${CHECK_ID}" \
-            -f status="completed" \
-            -f conclusion="$CONCLUSION" \
-            -F output[title]="$TITLE" \
-            -F output[summary]="$SUMMARY"
-
-      - name: Update Test Results Summary
-        if: inputs.summary_check_id != ''
-        env:
-          GH_TOKEN: ${{ steps.app-token.outputs.token }}
-          REPO: ${{ github.repository }}
-          CHECK_ID: ${{ inputs.summary_check_id }}
-          AOT_SPOT: ${{ needs.aot-build-import.result }}
-          AOT_RERUN: ${{ needs.aot-build-import-rerun.result }}
-          AOT_SPOT_TERM: ${{ needs.analyze-aot-failure.outputs.is_spot_termination }}
-          GPU_A10G_SPOT: ${{ needs.gpu-tests-a10g.result }}
-          GPU_A10G_RERUN: ${{ needs.gpu-tests-a10g-rerun.result }}
-          GPU_A10G_SPOT_TERM: ${{ needs.analyze-gpu-a10g-failure.outputs.is_spot_termination }}
-          GPU_T4_SPOT: ${{ needs.gpu-tests-t4.result }}
-          GPU_T4_RERUN: ${{ needs.gpu-tests-t4-rerun.result }}
-          GPU_T4_SPOT_TERM: ${{ needs.analyze-gpu-t4-failure.outputs.is_spot_termination }}
-          SKIP_AOT: ${{ inputs.skip_aot }}
-          SKIP_GPU: ${{ inputs.skip_gpu }}
-        run: |
-          ALL_PASSED=true
-          SUMMARY_LINES=""
-
-          if [ "$SKIP_AOT" != "true" ]; then
-            if [ "$AOT_SPOT" == "success" ]; then
-              SUMMARY_LINES="${SUMMARY_LINES}- AOT Build Tests: Passed\n"
-            elif [ "$AOT_SPOT_TERM" == "true" ] && [ "$AOT_RERUN" == "success" ]; then
-              SUMMARY_LINES="${SUMMARY_LINES}- AOT Build Tests: Passed (rerun after spot termination)\n"
-            else
-              SUMMARY_LINES="${SUMMARY_LINES}- AOT Build Tests: Failed\n"
-              ALL_PASSED=false
-            fi
-          else
-            SUMMARY_LINES="${SUMMARY_LINES}- AOT Build Tests: Skipped\n"
-          fi
-
-          if [ "$SKIP_GPU" != "true" ]; then
-            if [ "$GPU_A10G_SPOT" == "success" ]; then
-              SUMMARY_LINES="${SUMMARY_LINES}- JIT Unittest (A10G): Passed\n"
-            elif [ "$GPU_A10G_SPOT_TERM" == "true" ] && [ "$GPU_A10G_RERUN" == "success" ]; then
-              SUMMARY_LINES="${SUMMARY_LINES}- JIT Unittest (A10G): Passed (rerun after spot termination)\n"
-            else
-              SUMMARY_LINES="${SUMMARY_LINES}- JIT Unittest (A10G): Failed\n"
-              ALL_PASSED=false
-            fi
-
-            if [ "$GPU_T4_SPOT" == "success" ]; then
-              SUMMARY_LINES="${SUMMARY_LINES}- JIT Unittest (T4): Passed\n"
-            elif [ "$GPU_T4_SPOT_TERM" == "true" ] && [ "$GPU_T4_RERUN" == "success" ]; then
-              SUMMARY_LINES="${SUMMARY_LINES}- JIT Unittest (T4): Passed (rerun after spot termination)\n"
-            else
-              SUMMARY_LINES="${SUMMARY_LINES}- JIT Unittest (T4): Failed\n"
-              ALL_PASSED=false
-            fi
-          else
-            SUMMARY_LINES="${SUMMARY_LINES}- JIT Unittest (A10G): Skipped\n"
-            SUMMARY_LINES="${SUMMARY_LINES}- JIT Unittest (T4): Skipped\n"
-          fi
-
-          if [ "$ALL_PASSED" == "true" ]; then
-            CONCLUSION="success"
-            TITLE="All tests passed"
-          else
-            CONCLUSION="failure"
-            TITLE="Some tests failed"
-          fi
-
-          SUMMARY=$(printf '%b' "$SUMMARY_LINES")
-
-          gh api -X PATCH "repos/${REPO}/check-runs/${CHECK_ID}" \
-            -f status="completed" \
-            -f conclusion="$CONCLUSION" \
-            -F output[title]="$TITLE" \
-            -F output[summary]="$SUMMARY"
-
-          echo "Updated Test Results Summary: $CONCLUSION"
diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml
index 29ac6a09a2..6e235d5e28 100644
--- a/.github/workflows/pr-test.yml
+++ b/.github/workflows/pr-test.yml
@@ -1,7 +1,20 @@
-# PR Test Gateway - Creates check runs and triggers test runner
+# CI workflow using AWS self-hosted runners.
+# Runs AOT build tests and GPU unit tests on push/PR to main.
+# Uses ci/bash.sh for Docker execution (same as Jenkins).
 #
-# Creates custom check runs via GitHub App, then dispatches tests to
-# pr-test-runner.yml. Failed spot attempts are hidden from PR status.
+# Permission Control:
+# - Push to main: Always runs
+# - PR from org members (ci-users team): Runs automatically
+# - PR from external contributors: Requires 'run-ci' label
+#   (added via @flashinfer-bot run command from authorized user)
+#
+# Rerun Strategy:
+# - Spot jobs run with fail-fast: true
+# - A background monitor checks AWS metadata for a spot termination notice
+# - If termination is detected, it writes a marker to the job log (captured by GitHub)
+# - The analyze job checks the logs for that marker to decide whether to rerun
+# - Spot termination: rerun all failed/cancelled jobs on on-demand runners
+# - Real failure: no rerun, workflow fails fast
 
 name: PR Test
 
@@ -29,12 +42,15 @@ concurrency:
 permissions:
   contents: read
   pull-requests: write
-  actions: write
+  actions: read
 
 env:
   EXECUTOR_NUMBER: "0"
 
 jobs:
+  # ---------------------------------------------------------------------------
+  # Gate - Check if PR is authorized to run CI
+  # ---------------------------------------------------------------------------
   gate:
     name: Permission Check
     runs-on: ubuntu-latest
@@ -101,6 +117,9 @@ jobs:
             echo "$AUTHOR is not a member of $TEAM, not authorized"
           fi
 
+  # ---------------------------------------------------------------------------
+  # Setup - Read docker tag and check if build should be skipped
+  # ---------------------------------------------------------------------------
   setup:
     name: Setup
     needs: gate
@@ -109,21 +128,11 @@ jobs:
     outputs:
       docker_tag: ${{ steps.get-tag.outputs.tag }}
       skip_build: ${{ steps.check.outputs.skip }}
-      head_sha: ${{ steps.get-sha.outputs.sha }}
     steps:
       - uses: actions/checkout@v4
         with:
           fetch-depth: 0
 
-      - name: Get HEAD SHA
-        id: get-sha
-        run: |
-          if [ "${{ github.event_name }}" == "pull_request" ]; then
-            echo "sha=${{ github.event.pull_request.head.sha }}" >> $GITHUB_OUTPUT
-          else
-            echo "sha=${{ github.sha }}" >> $GITHUB_OUTPUT
-          fi
-
       - name: Get Docker Tag
         id: get-tag
         run: |
@@ -163,140 +172,558 @@ jobs:
             echo "::notice::Skipping build - only docs/config files changed"
           fi
 
-  orchestrator:
-    name: Orchestrate Tests
+  # ---------------------------------------------------------------------------
+  # AOT Build Import Tests - arch x CUDA matrix on spot CPU runners (on-demand rerun below)
+  # ---------------------------------------------------------------------------
+  aot-build-import:
+    name: AOT Build Import (${{ matrix.arch }}, ${{ matrix.cuda }})
     needs: [gate, setup]
     if: |
       needs.gate.outputs.authorized == 'true' &&
-      needs.setup.outputs.skip_build != 'true'
-    runs-on: ubuntu-latest
+      needs.setup.outputs.skip_build != 'true' &&
+      github.event.inputs.skip_aot != 'true'
+    runs-on:
+      - self-hosted
+      - Linux
+      - ${{ matrix.arch }}
+      - cpu
+      - spot  # first attempt uses spot runners; aot-build-import-rerun targets on-demand
+    timeout-minutes: 360  # 6 hours
+    strategy:
+      fail-fast: true  # one failing matrix job cancels the remaining ones
+      matrix:
+        arch: [X64, ARM64]  # keep in sync with the rerun matrix built in analyze-aot-failure
+        cuda: [cu126, cu128, cu129, cu130]  # keep in sync with the rerun matrix built in analyze-aot-failure
+    env:
+      DOCKER_IMAGE: flashinfer/flashinfer-ci-${{ matrix.cuda }}:${{ needs.setup.outputs.docker_tag }}
     steps:
-      - name: Generate Token (flashinfer)
-        id: flashinfer-token
-        uses: actions/create-github-app-token@v1
+      - name: Cleanup
+        run: |
+          # Stop all Docker containers to free memory
+          docker stop $(docker ps -q) 2>/dev/null || true
+          docker rm $(docker ps -aq) 2>/dev/null || true
+          # Clean workspace and caches
+          sudo rm -rf ${{ github.workspace }}/* || true
+          sudo rm -rf ${{ github.workspace }}/.[!.]* || true
+          rm -rf ~/.cache/flashinfer_jit || true
+          docker system prune -f || true
+
+      - uses: actions/checkout@v4
         with:
-          app-id: ${{ secrets.GH_APP_ID }}
-          private-key: ${{ secrets.GH_APP_KEY }}
-          owner: flashinfer-ai
-          repositories: flashinfer
-
-      - name: Generate Token (ci-infra)
-        id: ci-infra-token
-        uses: actions/create-github-app-token@v1
+          submodules: recursive
+
+      - name: Start spot termination monitor  # launched with '&' so it keeps running in the background
+        run: ./scripts/task_monitor_spot.sh &
+
+      - name: Login to Docker Hub
+        uses: docker/login-action@v3
         with:
-          app-id: ${{ secrets.GH_APP_ID }}
-          private-key: ${{ secrets.GH_APP_KEY }}
-          owner: flashinfer-ai
-          repositories: ci-infra
-
-      - name: Create Check Runs (PR only)
-        id: create-checks
-        if: github.event_name == 'pull_request'
+          username: flashinfer
+          password: ${{ secrets.DOCKERHUB_TOKEN }}
+        continue-on-error: true  # a failed Docker Hub login does not fail the job
+
+      - name: Show Node Info
+        run: ./scripts/task_show_node_info.sh
+        env:
+          NODE_NAME: ${{ runner.name }}
+          WORKSPACE: ${{ github.workspace }}
+          BUILD_NUMBER: ${{ github.run_number }}
+
+      - name: Run Test  # --no-gpu: this job runs on CPU-only runners
+        run: bash ci/bash.sh ${DOCKER_IMAGE} --no-gpu ./scripts/task_test_jit_cache_package_build_import.sh
+
+  analyze-aot-failure:  # decides whether the failed/cancelled AOT spot jobs should be rerun on on-demand runners
+    name: Analyze AOT Failure
+    needs: [setup, aot-build-import]
+    if: "!cancelled() && (needs.aot-build-import.result == 'failure' || needs.aot-build-import.result == 'cancelled')"
+    runs-on: ubuntu-latest
+    outputs:
+      is_spot_termination: ${{ steps.analyze.outputs.is_spot_termination }}  # 'true' when a log shows the spot marker or an infrastructure error pattern
+      rerun_matrix: ${{ steps.matrix.outputs.rerun_matrix }}  # JSON matrix; empty unless is_spot_termination is 'true'
+    steps:
+      - name: Analyze failure from job logs  # scans logs of this run's failed/cancelled jobs whose name starts with "AOT"
+        id: analyze
         env:
-          GH_TOKEN: ${{ steps.flashinfer-token.outputs.token }}
+          GH_TOKEN: ${{ github.token }}
         run: |
-          SHA="${{ needs.setup.outputs.head_sha }}"
-          REPO="${{ github.repository }}"
-          RUNNER_URL="https://github.com/flashinfer-ai/ci-infra/actions/workflows/pr-test-runner.yml"
-
-          if [ "${{ github.event.inputs.skip_aot }}" != "true" ]; then
-            AOT_CHECK=$(gh api repos/$REPO/check-runs \
-              -f name="AOT Build Tests" \
-              -f head_sha="$SHA" \
-              -f status="in_progress" \
-              -F output[title]="In progress" \
-              -F output[summary]="Running AOT build tests: [view test runs]($RUNNER_URL)" \
-              --jq '.id')
-            echo "aot_check_id=$AOT_CHECK" >> $GITHUB_OUTPUT
+          RUN_ID="${{ github.run_id }}"
+          SPOT_TERMINATION=false
+          # Include both failed and cancelled jobs (spot termination can cause either)
+          FAILED_JOBS=$(gh api "/repos/${{ github.repository }}/actions/runs/${RUN_ID}/jobs?per_page=100" \
+            --jq '.jobs[] | select(.name | startswith("AOT")) | select(.conclusion == "failure" or .conclusion == "cancelled") | .id')
+          if [ -z "$FAILED_JOBS" ]; then
+            echo "is_spot_termination=false" >> $GITHUB_OUTPUT
+            exit 0
           fi
+          for JOB_ID in $FAILED_JOBS; do
+            # Download logs (may be ZIP or plain text depending on GitHub API)
+            if ! gh api "/repos/${{ github.repository }}/actions/jobs/${JOB_ID}/logs" > job_log.zip 2>/dev/null; then
+              continue
+            fi
+            # Try to unzip if it's a ZIP file, otherwise use as-is
+            if file job_log.zip | grep -q "Zip archive"; then
+              unzip -p job_log.zip > job_log.txt 2>/dev/null || mv job_log.zip job_log.txt
+            else
+              mv job_log.zip job_log.txt
+            fi
+            if grep -q "FLASHINFER_SPOT_TERMINATION_DETECTED" job_log.txt; then
+              echo "Detected: AWS spot termination marker (job $JOB_ID)"
+              SPOT_TERMINATION=true
+              break
+            fi
+            if grep -qiE "connection reset by peer|context canceled|grpc.*closing|The self-hosted runner.*lost" job_log.txt; then
+              echo "Detected: infrastructure error pattern (job $JOB_ID)"
+              SPOT_TERMINATION=true
+              break
+            fi
+          done
+          echo "is_spot_termination=$SPOT_TERMINATION"
+          echo "is_spot_termination=$SPOT_TERMINATION" >> $GITHUB_OUTPUT
 
-          if [ "${{ github.event.inputs.skip_gpu }}" != "true" ]; then
-            A10G_CHECK=$(gh api repos/$REPO/check-runs \
-              -f name="JIT Unittest (A10G)" \
-              -f head_sha="$SHA" \
-              -f status="in_progress" \
-              -F output[title]="In progress" \
-              -F output[summary]="Running JIT unittests on A10G instances: [view test runs]($RUNNER_URL)" \
-              --jq '.id')
-            echo "gpu_a10g_check_id=$A10G_CHECK" >> $GITHUB_OUTPUT
-
-            T4_CHECK=$(gh api repos/$REPO/check-runs \
-              -f name="JIT Unittest (T4)" \
-              -f head_sha="$SHA" \
-              -f status="in_progress" \
-              -F output[title]="In progress" \
-              -F output[summary]="Running JIT unittests on T4 instances: [view test runs]($RUNNER_URL)" \
-              --jq '.id')
-            echo "gpu_t4_check_id=$T4_CHECK" >> $GITHUB_OUTPUT
-          fi
+      - name: Build rerun matrix  # emits every arch x cuda combination, not only the jobs that failed
+        id: matrix
+        if: steps.analyze.outputs.is_spot_termination == 'true'
+        run: |
+          MATRIX='{"include":['
+          for arch in X64 ARM64; do
+            for cuda in cu126 cu128 cu129 cu130; do
+              MATRIX+='{"arch":"'$arch'","cuda":"'$cuda'"},'
+            done
+          done
+          MATRIX="${MATRIX%,}]}"
+          echo "rerun_matrix=$MATRIX" >> $GITHUB_OUTPUT
+
+  aot-build-import-rerun:  # same steps as aot-build-import minus the spot monitor, on on-demand runners
+    name: AOT Build Import Rerun (${{ matrix.arch }}, ${{ matrix.cuda }})
+    needs: [setup, analyze-aot-failure]
+    if: |
+      !cancelled() &&
+      needs.analyze-aot-failure.outputs.is_spot_termination == 'true' &&
+      needs.analyze-aot-failure.outputs.rerun_matrix != ''
+    runs-on:
+      - self-hosted
+      - Linux
+      - ${{ matrix.arch }}
+      - cpu
+      - on-demand  # rerun avoids spot capacity
+    timeout-minutes: 360  # 6 hours
+    strategy:
+      fail-fast: true  # one failing matrix job cancels the remaining ones
+      matrix: ${{ fromJSON(needs.analyze-aot-failure.outputs.rerun_matrix) }}  # parsed from the analyze job's JSON output
+    env:
+      DOCKER_IMAGE: flashinfer/flashinfer-ci-${{ matrix.cuda }}:${{ needs.setup.outputs.docker_tag }}
+    steps:
+      - name: Cleanup
+        run: |
+          # Stop all Docker containers to free memory
+          docker stop $(docker ps -q) 2>/dev/null || true
+          docker rm $(docker ps -aq) 2>/dev/null || true
+          # Clean workspace and caches
+          sudo rm -rf ${{ github.workspace }}/* || true
+          sudo rm -rf ${{ github.workspace }}/.[!.]* || true
+          rm -rf ~/.cache/flashinfer_jit || true
+          docker system prune -f || true
+
+      - uses: actions/checkout@v4
+        with:
+          submodules: recursive
 
-          SUMMARY_CHECK=$(gh api repos/$REPO/check-runs \
-            -f name="Test Results Summary" \
-            -f head_sha="$SHA" \
-            -f status="in_progress" \
-            -F output[title]="In progress" \
-            -F output[summary]="Waiting for test results: [view test runs]($RUNNER_URL)" \
-            --jq '.id')
-          echo "summary_check_id=$SUMMARY_CHECK" >> $GITHUB_OUTPUT
+      - name: Login to Docker Hub
+        uses: docker/login-action@v3
+        with:
+          username: flashinfer
+          password: ${{ secrets.DOCKERHUB_TOKEN }}
+        continue-on-error: true  # a failed Docker Hub login does not fail the job
 
-      - name: Trigger Test Runner
+      - name: Show Node Info
+        run: ./scripts/task_show_node_info.sh
         env:
-          GH_ACTION_TOKEN: ${{ github.token }}
-          CI_INFRA_TOKEN: ${{ steps.ci-infra-token.outputs.token }}
-          HEAD_SHA: ${{ needs.setup.outputs.head_sha }}
-          DOCKER_TAG: ${{ needs.setup.outputs.docker_tag }}
-          AOT_CHECK_ID: ${{ steps.create-checks.outputs.aot_check_id || '' }}
-          GPU_A10G_CHECK_ID: ${{ steps.create-checks.outputs.gpu_a10g_check_id || '' }}
-          GPU_T4_CHECK_ID: ${{ steps.create-checks.outputs.gpu_t4_check_id || '' }}
-          SUMMARY_CHECK_ID: ${{ steps.create-checks.outputs.summary_check_id || '' }}
-          SKIP_AOT: ${{ github.event.inputs.skip_aot || 'false' }}
-          SKIP_GPU: ${{ github.event.inputs.skip_gpu || 'false' }}
-          CONCURRENCY_KEY: pr-test-${{ github.ref }}
-          DISPATCH_REF: ${{ github.head_ref || github.ref_name }}
+          NODE_NAME: ${{ runner.name }}
+          WORKSPACE: ${{ github.workspace }}
+          BUILD_NUMBER: ${{ github.run_number }}
+
+      - name: Run Test  # --no-gpu: this job runs on CPU-only runners
+        run: bash ci/bash.sh ${DOCKER_IMAGE} --no-gpu ./scripts/task_test_jit_cache_package_build_import.sh
+
+  # ---------------------------------------------------------------------------
+  # GPU JIT Tests - SM86 (A10G) - five test shards on spot runners (on-demand rerun below)
+  # ---------------------------------------------------------------------------
+  gpu-tests-a10g:
+    name: JIT Unittest ${{ matrix.shard }} (A10G)
+    needs: [gate, setup]
+    if: |
+      needs.gate.outputs.authorized == 'true' &&
+      needs.setup.outputs.skip_build != 'true' &&
+      github.event.inputs.skip_gpu != 'true'
+    runs-on: [self-hosted, Linux, X64, gpu, sm86, spot]  # first attempt uses spot runners
+    timeout-minutes: 360  # 6 hours
+    strategy:
+      fail-fast: true  # one failing shard cancels the remaining ones
+      matrix:
+        shard: [1, 2, 3, 4, 5]  # keep in sync with the rerun matrix in analyze-gpu-a10g-failure
+    env:
+      DOCKER_IMAGE: flashinfer/flashinfer-ci-cu129:${{ needs.setup.outputs.docker_tag }}
+    steps:
+      - name: Cleanup
         run: |
-          # Try workflow_dispatch first (works after pr-test-runner.yml is on main)
-          # Uses GITHUB_TOKEN (has actions:write) - App token doesn't have Actions permission
-          # --ref uses PR branch (to test PR changes to runner) or main (for push)
-          if GH_TOKEN="$GH_ACTION_TOKEN" gh workflow run pr-test-runner.yml \
-            --repo "${{ github.repository }}" \
-            --ref "$DISPATCH_REF" \
-            -f pr_head_sha="$HEAD_SHA" \
-            -f docker_tag="$DOCKER_TAG" \
-            -f aot_check_id="$AOT_CHECK_ID" \
-            -f gpu_a10g_check_id="$GPU_A10G_CHECK_ID" \
-            -f gpu_t4_check_id="$GPU_T4_CHECK_ID" \
-            -f summary_check_id="$SUMMARY_CHECK_ID" \
-            -f skip_aot="$SKIP_AOT" \
-            -f skip_gpu="$SKIP_GPU" \
-            -f concurrency_key="$CONCURRENCY_KEY" 2>/dev/null; then
-            echo "Triggered via workflow_dispatch (flashinfer)"
-          else
-            # Fallback: repository_dispatch to ci-infra (bootstrap)
-            GH_TOKEN="$CI_INFRA_TOKEN" gh api repos/flashinfer-ai/ci-infra/dispatches \
-              -f event_type="run-pr-test" \
-              -f client_payload[pr_head_sha]="$HEAD_SHA" \
-              -f client_payload[docker_tag]="$DOCKER_TAG" \
-              -f client_payload[aot_check_id]="$AOT_CHECK_ID" \
-              -f client_payload[gpu_a10g_check_id]="$GPU_A10G_CHECK_ID" \
-              -f client_payload[gpu_t4_check_id]="$GPU_T4_CHECK_ID" \
-              -f client_payload[summary_check_id]="$SUMMARY_CHECK_ID" \
-              -f client_payload[skip_aot]="$SKIP_AOT" \
-              -f client_payload[skip_gpu]="$SKIP_GPU" \
-              -f client_payload[concurrency_key]="$CONCURRENCY_KEY"
-            echo "Triggered via repository_dispatch (ci-infra bootstrap)"
+          # Stop all Docker containers to free memory
+          docker stop $(docker ps -q) 2>/dev/null || true
+          docker rm $(docker ps -aq) 2>/dev/null || true
+          # Clean workspace and caches
+          sudo rm -rf ${{ github.workspace }}/* || true
+          sudo rm -rf ${{ github.workspace }}/.[!.]* || true
+          rm -rf ~/.cache/flashinfer_jit || true
+          docker system prune -f || true
+          nvidia-smi || true
+
+      - uses: actions/checkout@v4
+        with:
+          submodules: recursive
+
+      - name: Start spot termination monitor  # launched with '&' so it keeps running in the background
+        run: ./scripts/task_monitor_spot.sh &
+
+      - name: Login to Docker Hub
+        uses: docker/login-action@v3
+        with:
+          username: flashinfer
+          password: ${{ secrets.DOCKERHUB_TOKEN }}
+        continue-on-error: true  # a failed Docker Hub login does not fail the job
+
+      - name: Show Node Info
+        run: ./scripts/task_show_node_info.sh
+        env:
+          NODE_NAME: ${{ runner.name }}
+          WORKSPACE: ${{ github.workspace }}
+          BUILD_NUMBER: ${{ github.run_number }}
+
+      - name: Run JIT Unittest Part ${{ matrix.shard }}  # each shard runs its own task_jit_run_tests_partN.sh
+        run: bash ci/bash.sh ${DOCKER_IMAGE} ./scripts/task_jit_run_tests_part${{ matrix.shard }}.sh
+
+  analyze-gpu-a10g-failure:  # decides whether the failed/cancelled A10G spot jobs should be rerun on on-demand runners
+    name: Analyze GPU A10G Failure
+    needs: [setup, gpu-tests-a10g]
+    if: "!cancelled() && (needs.gpu-tests-a10g.result == 'failure' || needs.gpu-tests-a10g.result == 'cancelled')"
+    runs-on: ubuntu-latest
+    outputs:
+      is_spot_termination: ${{ steps.analyze.outputs.is_spot_termination }}  # 'true' when a log shows the spot marker or an infrastructure error pattern
+      rerun_matrix: ${{ steps.matrix.outputs.rerun_matrix }}  # JSON matrix; empty unless is_spot_termination is 'true'
+    steps:
+      - name: Analyze failure from job logs  # scans logs of this run's failed/cancelled jobs whose name contains "A10G"
+        id: analyze
+        env:
+          GH_TOKEN: ${{ github.token }}
+        run: |
+          RUN_ID="${{ github.run_id }}"
+          SPOT_TERMINATION=false
+          # Include both failed and cancelled jobs (spot termination can cause either)
+          FAILED_JOBS=$(gh api "/repos/${{ github.repository }}/actions/runs/${RUN_ID}/jobs?per_page=100" \
+            --jq '.jobs[] | select(.name | contains("A10G")) | select(.conclusion == "failure" or .conclusion == "cancelled") | .id')
+          if [ -z "$FAILED_JOBS" ]; then
+            echo "is_spot_termination=false" >> $GITHUB_OUTPUT
+            exit 0
           fi
+          for JOB_ID in $FAILED_JOBS; do
+            # Download logs (may be ZIP or plain text depending on GitHub API)
+            if ! gh api "/repos/${{ github.repository }}/actions/jobs/${JOB_ID}/logs" > job_log.zip 2>/dev/null; then
+              continue
+            fi
+            # Try to unzip if it's a ZIP file, otherwise use as-is
+            if file job_log.zip | grep -q "Zip archive"; then
+              unzip -p job_log.zip > job_log.txt 2>/dev/null || mv job_log.zip job_log.txt
+            else
+              mv job_log.zip job_log.txt
+            fi
+            if grep -q "FLASHINFER_SPOT_TERMINATION_DETECTED" job_log.txt; then
+              echo "Detected: AWS spot termination marker (job $JOB_ID)"
+              SPOT_TERMINATION=true
+              break
+            fi
+            if grep -qiE "connection reset by peer|context canceled|grpc.*closing|The self-hosted runner.*lost" job_log.txt; then
+              echo "Detected: infrastructure error pattern (job $JOB_ID)"
+              SPOT_TERMINATION=true
+              break
+            fi
+          done
+          echo "is_spot_termination=$SPOT_TERMINATION"
+          echo "is_spot_termination=$SPOT_TERMINATION" >> $GITHUB_OUTPUT
 
-  report-unauthorized:
-    name: Report Unauthorized
-    needs: gate
-    if: github.event_name == 'pull_request' && needs.gate.outputs.authorized != 'true'
+      - name: Build rerun matrix  # emits all five shards, not only the ones that failed
+        id: matrix
+        if: steps.analyze.outputs.is_spot_termination == 'true'
+        run: |
+          echo 'rerun_matrix={"include":[{"shard":1},{"shard":2},{"shard":3},{"shard":4},{"shard":5}]}' >> $GITHUB_OUTPUT
+
+  gpu-tests-a10g-rerun:  # same steps as gpu-tests-a10g minus the spot monitor, on on-demand runners
+    name: JIT Rerun ${{ matrix.shard }} (A10G)
+    needs: [setup, analyze-gpu-a10g-failure]
+    if: |
+      !cancelled() &&
+      needs.analyze-gpu-a10g-failure.outputs.is_spot_termination == 'true' &&
+      needs.analyze-gpu-a10g-failure.outputs.rerun_matrix != ''
+    runs-on: [self-hosted, Linux, X64, gpu, sm86, on-demand]  # rerun avoids spot capacity
+    timeout-minutes: 360  # 6 hours
+    strategy:
+      fail-fast: true  # one failing shard cancels the remaining ones
+      matrix: ${{ fromJSON(needs.analyze-gpu-a10g-failure.outputs.rerun_matrix) }}  # parsed from the analyze job's JSON output
+    env:
+      DOCKER_IMAGE: flashinfer/flashinfer-ci-cu129:${{ needs.setup.outputs.docker_tag }}
+    steps:
+      - name: Cleanup
+        run: |
+          # Stop all Docker containers to free memory
+          docker stop $(docker ps -q) 2>/dev/null || true
+          docker rm $(docker ps -aq) 2>/dev/null || true
+          # Clean workspace and caches
+          sudo rm -rf ${{ github.workspace }}/* || true
+          sudo rm -rf ${{ github.workspace }}/.[!.]* || true
+          rm -rf ~/.cache/flashinfer_jit || true
+          docker system prune -f || true
+          nvidia-smi || true
+
+      - uses: actions/checkout@v4
+        with:
+          submodules: recursive
+
+      - name: Login to Docker Hub
+        uses: docker/login-action@v3
+        with:
+          username: flashinfer
+          password: ${{ secrets.DOCKERHUB_TOKEN }}
+        continue-on-error: true  # a failed Docker Hub login does not fail the job
+
+      - name: Show Node Info
+        run: ./scripts/task_show_node_info.sh
+        env:
+          NODE_NAME: ${{ runner.name }}
+          WORKSPACE: ${{ github.workspace }}
+          BUILD_NUMBER: ${{ github.run_number }}
+
+      - name: Run JIT Unittest Part ${{ matrix.shard }}  # each shard runs its own task_jit_run_tests_partN.sh
+        run: bash ci/bash.sh ${DOCKER_IMAGE} ./scripts/task_jit_run_tests_part${{ matrix.shard }}.sh
+
+  # ---------------------------------------------------------------------------
+  # GPU JIT Tests - SM75 (T4) - single job on a spot runner (on-demand rerun below)
+  # ---------------------------------------------------------------------------
+  gpu-tests-t4:
+    name: JIT Unittest (T4)
+    needs: [gate, setup]
+    if: |
+      needs.gate.outputs.authorized == 'true' &&
+      needs.setup.outputs.skip_build != 'true' &&
+      github.event.inputs.skip_gpu != 'true'
+    runs-on: [self-hosted, Linux, X64, gpu, sm75, spot]  # first attempt uses a spot runner
+    timeout-minutes: 360  # 6 hours
+    env:
+      DOCKER_IMAGE: flashinfer/flashinfer-ci-cu129:${{ needs.setup.outputs.docker_tag }}
+    steps:
+      - name: Cleanup
+        run: |
+          # Stop all Docker containers to free memory
+          docker stop $(docker ps -q) 2>/dev/null || true
+          docker rm $(docker ps -aq) 2>/dev/null || true
+          # Clean workspace and caches
+          sudo rm -rf ${{ github.workspace }}/* || true
+          sudo rm -rf ${{ github.workspace }}/.[!.]* || true
+          rm -rf ~/.cache/flashinfer_jit || true
+          docker system prune -f || true
+          nvidia-smi || true
+
+      - uses: actions/checkout@v4
+        with:
+          submodules: recursive
+
+      - name: Start spot termination monitor  # launched with '&' so it keeps running in the background
+        run: ./scripts/task_monitor_spot.sh &
+
+      - name: Login to Docker Hub
+        uses: docker/login-action@v3
+        with:
+          username: flashinfer
+          password: ${{ secrets.DOCKERHUB_TOKEN }}
+        continue-on-error: true  # a failed Docker Hub login does not fail the job
+
+      - name: Show Node Info
+        run: ./scripts/task_show_node_info.sh
+        env:
+          NODE_NAME: ${{ runner.name }}
+          WORKSPACE: ${{ github.workspace }}
+          BUILD_NUMBER: ${{ github.run_number }}
+
+      - name: Run JIT Unittest Part 3 (T4)  # only test part 3 is run on T4
+        run: bash ci/bash.sh ${DOCKER_IMAGE} ./scripts/task_jit_run_tests_part3.sh
+
+  analyze-gpu-t4-failure:  # decides whether the failed/cancelled T4 spot job should be rerun on an on-demand runner
+    name: Analyze GPU T4 Failure
+    needs: [setup, gpu-tests-t4]
+    if: "!cancelled() && (needs.gpu-tests-t4.result == 'failure' || needs.gpu-tests-t4.result == 'cancelled')"
     runs-on: ubuntu-latest
+    outputs:
+      is_spot_termination: ${{ steps.analyze.outputs.is_spot_termination }}  # 'true' when a log shows the spot marker or an infrastructure error pattern
     steps:
-      - name: Post Comment
+      - name: Analyze failure from job logs  # scans logs of this run's failed/cancelled jobs whose name contains "T4"
+        id: analyze
         env:
           GH_TOKEN: ${{ github.token }}
         run: |
-          echo "## CI Authorization Required" >> $GITHUB_STEP_SUMMARY
+          RUN_ID="${{ github.run_id }}"
+          SPOT_TERMINATION=false
+          # Include both failed and cancelled jobs (spot termination can cause either)
+          FAILED_JOBS=$(gh api "/repos/${{ github.repository }}/actions/runs/${RUN_ID}/jobs?per_page=100" \
+            --jq '.jobs[] | select(.name | contains("T4")) | select(.conclusion == "failure" or .conclusion == "cancelled") | .id')
+          if [ -z "$FAILED_JOBS" ]; then
+            echo "is_spot_termination=false" >> $GITHUB_OUTPUT
+            exit 0
+          fi
+          for JOB_ID in $FAILED_JOBS; do
+            # Download logs (may be ZIP or plain text depending on GitHub API)
+            if ! gh api "/repos/${{ github.repository }}/actions/jobs/${JOB_ID}/logs" > job_log.zip 2>/dev/null; then
+              continue
+            fi
+            # Try to unzip if it's a ZIP file, otherwise use as-is
+            if file job_log.zip | grep -q "Zip archive"; then
+              unzip -p job_log.zip > job_log.txt 2>/dev/null || mv job_log.zip job_log.txt
+            else
+              mv job_log.zip job_log.txt
+            fi
+            if grep -q "FLASHINFER_SPOT_TERMINATION_DETECTED" job_log.txt; then
+              echo "Detected: AWS spot termination marker (job $JOB_ID)"
+              SPOT_TERMINATION=true
+              break
+            fi
+            if grep -qiE "connection reset by peer|context canceled|grpc.*closing|The self-hosted runner.*lost" job_log.txt; then
+              echo "Detected: infrastructure error pattern (job $JOB_ID)"
+              SPOT_TERMINATION=true
+              break
+            fi
+          done
+          echo "is_spot_termination=$SPOT_TERMINATION"
+          echo "is_spot_termination=$SPOT_TERMINATION" >> $GITHUB_OUTPUT
+
+  gpu-tests-t4-rerun:  # same steps as gpu-tests-t4 minus the spot monitor, on an on-demand runner
+    name: JIT Rerun (T4)
+    needs: [setup, analyze-gpu-t4-failure]
+    if: |
+      !cancelled() &&
+      needs.analyze-gpu-t4-failure.outputs.is_spot_termination == 'true'
+    runs-on: [self-hosted, Linux, X64, gpu, sm75, on-demand]  # rerun avoids spot capacity
+    timeout-minutes: 360  # 6 hours
+    env:
+      DOCKER_IMAGE: flashinfer/flashinfer-ci-cu129:${{ needs.setup.outputs.docker_tag }}
+    steps:
+      - name: Cleanup
+        run: |
+          # Stop all Docker containers to free memory
+          docker stop $(docker ps -q) 2>/dev/null || true
+          docker rm $(docker ps -aq) 2>/dev/null || true
+          # Clean workspace and caches
+          sudo rm -rf ${{ github.workspace }}/* || true
+          sudo rm -rf ${{ github.workspace }}/.[!.]* || true
+          rm -rf ~/.cache/flashinfer_jit || true
+          docker system prune -f || true
+          nvidia-smi || true
+
+      - uses: actions/checkout@v4
+        with:
+          submodules: recursive
+
+      - name: Login to Docker Hub
+        uses: docker/login-action@v3
+        with:
+          username: flashinfer
+          password: ${{ secrets.DOCKERHUB_TOKEN }}
+        continue-on-error: true  # a failed Docker Hub login does not fail the job
+
+      - name: Show Node Info
+        run: ./scripts/task_show_node_info.sh
+        env:
+          NODE_NAME: ${{ runner.name }}
+          WORKSPACE: ${{ github.workspace }}
+          BUILD_NUMBER: ${{ github.run_number }}
+
+      - name: Run JIT Unittest Part 3 (T4)  # only test part 3 is run on T4
+        run: bash ci/bash.sh ${DOCKER_IMAGE} ./scripts/task_jit_run_tests_part3.sh
+
+  # ---------------------------------------------------------------------------
+  # Test Results Summary - folds spot and rerun results into one pass/fail status
+  # ---------------------------------------------------------------------------
+  test-results-summary:
+    name: Test Results Summary
+    if: "!cancelled()"  # still runs when upstream jobs failed or were skipped
+    needs:
+      - gate
+      - setup
+      - aot-build-import
+      - analyze-aot-failure
+      - aot-build-import-rerun
+      - gpu-tests-a10g
+      - analyze-gpu-a10g-failure
+      - gpu-tests-a10g-rerun
+      - gpu-tests-t4
+      - analyze-gpu-t4-failure
+      - gpu-tests-t4-rerun
+    runs-on: ubuntu-latest
+    steps:
+      - name: Check Results  # exits 1 when any test group neither passed on spot nor on its on-demand rerun
+        run: |
+          echo "## Test Results Summary" >> $GITHUB_STEP_SUMMARY
+
+          # Unauthorized PRs: report that CI was skipped and exit successfully
+          if [ "${{ needs.gate.outputs.authorized }}" != "true" ]; then
+            echo "CI skipped (pending authorization)" >> $GITHUB_STEP_SUMMARY
+            echo "A contributor in @flashinfer-ai/ci-users can comment \`@flashinfer-bot run\` to approve." >> $GITHUB_STEP_SUMMARY
+            exit 0
+          fi
+          # check_status NAME SKIP SPOT_RESULT IS_SPOT_TERMINATION RERUN_RESULT -> returns 1 if the group failed
+          check_status() {
+            local name=$1 skip=$2 spot=$3 spot_term=$4 rerun=$5
+            echo "$name" >> $GITHUB_STEP_SUMMARY
+            if [ "$skip" == "true" ]; then
+              echo "- Status: Skipped" >> $GITHUB_STEP_SUMMARY
+            elif [ "$spot" == "success" ]; then
+              echo "- Status: Passed (spot)" >> $GITHUB_STEP_SUMMARY
+            elif [ "$spot_term" == "true" ] && [ "$rerun" == "success" ]; then
+              echo "- Status: Passed (on-demand rerun)" >> $GITHUB_STEP_SUMMARY
+            else
+              echo "- Status: Failed" >> $GITHUB_STEP_SUMMARY
+              return 1
+            fi
+            return 0
+          }
+
+          # The "## Test Results Summary" heading was already written above; do not repeat it.
+          echo "" >> $GITHUB_STEP_SUMMARY
+
+          if [ "${{ needs.setup.outputs.skip_build }}" == "true" ]; then
+            echo "Build skipped (docs/config only changes)" >> $GITHUB_STEP_SUMMARY
+            exit 0
+          fi
+
+          FAILED=false
+
+          check_status "AOT Build Import Tests" \
+            "${{ github.event.inputs.skip_aot }}" \
+            "${{ needs.aot-build-import.result }}" \
+            "${{ needs.analyze-aot-failure.outputs.is_spot_termination }}" \
+            "${{ needs.aot-build-import-rerun.result }}" || FAILED=true
+
+          echo "" >> $GITHUB_STEP_SUMMARY
+          check_status "GPU Tests (A10G)" \
+            "${{ github.event.inputs.skip_gpu }}" \
+            "${{ needs.gpu-tests-a10g.result }}" \
+            "${{ needs.analyze-gpu-a10g-failure.outputs.is_spot_termination }}" \
+            "${{ needs.gpu-tests-a10g-rerun.result }}" || FAILED=true
+
           echo "" >> $GITHUB_STEP_SUMMARY
-          echo "This PR requires authorization to run CI." >> $GITHUB_STEP_SUMMARY
-          echo "A member of @flashinfer-ai/ci-users can comment \`@flashinfer-bot run\` to approve." >> $GITHUB_STEP_SUMMARY
+          check_status "GPU Tests (T4)" \
+            "${{ github.event.inputs.skip_gpu }}" \
+            "${{ needs.gpu-tests-t4.result }}" \
+            "${{ needs.analyze-gpu-t4-failure.outputs.is_spot_termination }}" \
+            "${{ needs.gpu-tests-t4-rerun.result }}" || FAILED=true
+
+          echo "" >> $GITHUB_STEP_SUMMARY
+          if [ "$FAILED" == "true" ]; then
+            echo "Result: Tests Failed" >> $GITHUB_STEP_SUMMARY
+            exit 1
+          fi
+          echo "Result: Tests Passed" >> $GITHUB_STEP_SUMMARY