diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml
index ee04965583..6e235d5e28 100644
--- a/.github/workflows/pr-test.yml
+++ b/.github/workflows/pr-test.yml
@@ -7,6 +7,14 @@
 # - PR from org members (ci-users team): Runs automatically
 # - PR from external contributors: Requires 'run-ci' label
 #   (added via @flashinfer-bot run command from authorized user)
+#
+# Rerun Strategy:
+# - Spot jobs run with fail-fast: true
+# - A background monitor polls AWS instance metadata for a spot termination notice
+# - If termination is detected, it writes a marker to the job log (captured by GitHub)
+# - An analyze job scans failed/cancelled job logs for the marker (or known infrastructure-error patterns) to decide whether to rerun
+# - Spot termination: rerun the whole job (its full matrix) on on-demand runners
+# - Real failure: no rerun, workflow fails fast
 
 name: PR Test
 
@@ -34,6 +42,7 @@ concurrency:
 permissions:
   contents: read
   pull-requests: write
+  actions: read
 
 env:
   EXECUTOR_NUMBER: "0"
@@ -164,8 +173,7 @@ jobs:
           fi
 
   # ---------------------------------------------------------------------------
-  # AOT Build Import Tests - x86_64 and aarch64 (multiple CUDA versions)
-  # Uses ci/bash.sh with --no-gpu (same as Jenkins)
+  # AOT Build Import Tests (Spot + On-Demand Rerun)
   # ---------------------------------------------------------------------------
   aot-build-import:
     name: AOT Build Import (${{ matrix.arch }}, ${{ matrix.cuda }})
@@ -179,9 +187,10 @@ jobs:
       - Linux
       - ${{ matrix.arch }}
       - cpu
+      - spot
     timeout-minutes: 360
     strategy:
-      fail-fast: false
+      fail-fast: true
       matrix:
         arch: [X64, ARM64]
         cuda: [cu126, cu128, cu129, cu130]
@@ -203,12 +212,128 @@ jobs:
         with:
           submodules: recursive
 
+      - name: Start spot termination monitor
+        run: ./scripts/task_monitor_spot.sh &
+
+      - name: Login to Docker Hub
+        uses: docker/login-action@v3
+        with:
+          username: flashinfer
+          password: ${{ secrets.DOCKERHUB_TOKEN }}
+        continue-on-error: true
+
+      - name: Show Node Info
+        run: ./scripts/task_show_node_info.sh
+        env:
+          NODE_NAME: ${{ runner.name }}
+          WORKSPACE: ${{ github.workspace }}
+          BUILD_NUMBER: ${{ github.run_number }}
+
+      - name: Run Test
+        run: bash ci/bash.sh ${DOCKER_IMAGE} --no-gpu ./scripts/task_test_jit_cache_package_build_import.sh
+
+  analyze-aot-failure:  # Decides whether a failed/cancelled AOT run should be rerun on on-demand runners
+    name: Analyze AOT Failure
+    needs: [setup, aot-build-import]
+    if: "!cancelled() && (needs.aot-build-import.result == 'failure' || needs.aot-build-import.result == 'cancelled')"
+    runs-on: ubuntu-latest
+    outputs:
+      is_spot_termination: ${{ steps.analyze.outputs.is_spot_termination }}  # 'true' / 'false' as written by the analyze step
+      rerun_matrix: ${{ steps.matrix.outputs.rerun_matrix }}  # JSON matrix; only set when the matrix step runs
+    steps:
+      - name: Analyze failure from job logs
+        id: analyze
+        env:
+          GH_TOKEN: ${{ github.token }}  # token for the gh CLI calls below
+        run: |
+          RUN_ID="${{ github.run_id }}"
+          SPOT_TERMINATION=false  # set to true by the first job log that matches
+          # List jobs of this run (first page, up to 100) whose name starts with "AOT" and that ended as failure or cancelled (spot termination can cause either)
+          FAILED_JOBS=$(gh api "/repos/${{ github.repository }}/actions/runs/${RUN_ID}/jobs?per_page=100" \
+            --jq '.jobs[] | select(.name | startswith("AOT")) | select(.conclusion == "failure" or .conclusion == "cancelled") | .id')
+          if [ -z "$FAILED_JOBS" ]; then  # nothing to inspect: report false and stop
+            echo "is_spot_termination=false" >> $GITHUB_OUTPUT
+            exit 0
+          fi
+          for JOB_ID in $FAILED_JOBS; do  # the loop ends at the first job whose log matches
+            # Download the job log (may be ZIP or plain text depending on GitHub API); a failed download skips the job
+            if ! gh api "/repos/${{ github.repository }}/actions/jobs/${JOB_ID}/logs" > job_log.zip 2>/dev/null; then
+              continue
+            fi
+            # Normalize to job_log.txt: unzip if it's a ZIP file (falling back to the raw file if unzip fails), otherwise use as-is
+            if file job_log.zip | grep -q "Zip archive"; then
+              unzip -p job_log.zip > job_log.txt 2>/dev/null || mv job_log.zip job_log.txt
+            else
+              mv job_log.zip job_log.txt
+            fi
+            if grep -q "FLASHINFER_SPOT_TERMINATION_DETECTED" job_log.txt; then  # marker printed by scripts/task_monitor_spot.sh
+              echo "Detected: AWS spot termination marker (job $JOB_ID)"
+              SPOT_TERMINATION=true
+              break
+            fi
+            if grep -qiE "connection reset by peer|context canceled|grpc.*closing|The self-hosted runner.*lost" job_log.txt; then  # NOTE(review): heuristic; confirm logs of jobs cancelled by fail-fast cannot match
+              echo "Detected: infrastructure error pattern (job $JOB_ID)"
+              SPOT_TERMINATION=true
+              break
+            fi
+          done
+          echo "is_spot_termination=$SPOT_TERMINATION"  # also print the verdict to the step log
+          echo "is_spot_termination=$SPOT_TERMINATION" >> $GITHUB_OUTPUT
+
+      - name: Build rerun matrix
+        id: matrix
+        if: steps.analyze.outputs.is_spot_termination == 'true'
+        run: |
+          MATRIX='{"include":['  # JSON with one entry per arch/cuda pair (same values as the aot-build-import matrix)
+          for arch in X64 ARM64; do
+            for cuda in cu126 cu128 cu129 cu130; do
+              MATRIX+='{"arch":"'$arch'","cuda":"'$cuda'"},'
+            done
+          done
+          MATRIX="${MATRIX%,}]}"  # drop the trailing comma, then close the array and the object
+          echo "rerun_matrix=$MATRIX" >> $GITHUB_OUTPUT
+
+  aot-build-import-rerun:
+    name: AOT Build Import Rerun (${{ matrix.arch }}, ${{ matrix.cuda }})
+    needs: [setup, analyze-aot-failure]
+    if: |
+      !cancelled() &&
+      needs.analyze-aot-failure.outputs.is_spot_termination == 'true' &&
+      needs.analyze-aot-failure.outputs.rerun_matrix != ''
+    runs-on:
+      - self-hosted
+      - Linux
+      - ${{ matrix.arch }}
+      - cpu
+      - on-demand
+    timeout-minutes: 360
+    strategy:
+      fail-fast: true
+      matrix: ${{ fromJSON(needs.analyze-aot-failure.outputs.rerun_matrix) }}
+    env:
+      DOCKER_IMAGE: flashinfer/flashinfer-ci-${{ matrix.cuda }}:${{ needs.setup.outputs.docker_tag }}
+    steps:
+      - name: Cleanup
+        run: |
+          # Stop all Docker containers to free memory
+          docker stop $(docker ps -q) 2>/dev/null || true
+          docker rm $(docker ps -aq) 2>/dev/null || true
+          # Clean workspace and caches
+          sudo rm -rf ${{ github.workspace }}/* || true
+          sudo rm -rf ${{ github.workspace }}/.[!.]* || true
+          rm -rf ~/.cache/flashinfer_jit || true
+          docker system prune -f || true
+
+      - uses: actions/checkout@v4
+        with:
+          submodules: recursive
+
       - name: Login to Docker Hub
         uses: docker/login-action@v3
         with:
           username: flashinfer
           password: ${{ secrets.DOCKERHUB_TOKEN }}
-        continue-on-error: true  # Don't fail if secret is unavailable (e.g., fork PRs)
+        continue-on-error: true
 
       - name: Show Node Info
         run: ./scripts/task_show_node_info.sh
@@ -217,12 +342,11 @@ jobs:
           WORKSPACE: ${{ github.workspace }}
           BUILD_NUMBER: ${{ github.run_number }}
 
-      - name: Test JIT Cache Package Build and Import
+      - name: Run Test
         run: bash ci/bash.sh ${DOCKER_IMAGE} --no-gpu ./scripts/task_test_jit_cache_package_build_import.sh
 
   # ---------------------------------------------------------------------------
-  # GPU JIT Tests - SM86 (A10G) - 5 Shards
-  # Uses ci/bash.sh with GPU (same as Jenkins)
+  # GPU JIT Tests - SM86 (A10G) - Spot + On-Demand Rerun
   # ---------------------------------------------------------------------------
   gpu-tests-a10g:
     name: JIT Unittest ${{ matrix.shard }} (A10G)
@@ -231,10 +355,10 @@ jobs:
       needs.gate.outputs.authorized == 'true' &&
       needs.setup.outputs.skip_build != 'true' &&
       github.event.inputs.skip_gpu != 'true'
-    runs-on: [self-hosted, Linux, X64, gpu, sm86]
+    runs-on: [self-hosted, Linux, X64, gpu, sm86, spot]
     timeout-minutes: 360
     strategy:
-      fail-fast: false
+      fail-fast: true
       matrix:
         shard: [1, 2, 3, 4, 5]
     env:
@@ -242,7 +366,7 @@ jobs:
     steps:
       - name: Cleanup
         run: |
-          # Stop all Docker containers to free GPU memory
+          # Stop all Docker containers to free memory
           docker stop $(docker ps -q) 2>/dev/null || true
           docker rm $(docker ps -aq) 2>/dev/null || true
           # Clean workspace and caches
@@ -256,12 +380,117 @@ jobs:
         with:
           submodules: recursive
 
+      - name: Start spot termination monitor
+        run: ./scripts/task_monitor_spot.sh &
+
       - name: Login to Docker Hub
         uses: docker/login-action@v3
         with:
           username: flashinfer
           password: ${{ secrets.DOCKERHUB_TOKEN }}
-        continue-on-error: true  # Don't fail if secret is unavailable (e.g., fork PRs)
+        continue-on-error: true
+
+      - name: Show Node Info
+        run: ./scripts/task_show_node_info.sh
+        env:
+          NODE_NAME: ${{ runner.name }}
+          WORKSPACE: ${{ github.workspace }}
+          BUILD_NUMBER: ${{ github.run_number }}
+
+      - name: Run JIT Unittest Part ${{ matrix.shard }}
+        run: bash ci/bash.sh ${DOCKER_IMAGE} ./scripts/task_jit_run_tests_part${{ matrix.shard }}.sh
+
+  analyze-gpu-a10g-failure:  # Decides whether a failed/cancelled A10G run should be rerun on on-demand runners
+    name: Analyze GPU A10G Failure
+    needs: [setup, gpu-tests-a10g]
+    if: "!cancelled() && (needs.gpu-tests-a10g.result == 'failure' || needs.gpu-tests-a10g.result == 'cancelled')"
+    runs-on: ubuntu-latest
+    outputs:
+      is_spot_termination: ${{ steps.analyze.outputs.is_spot_termination }}  # 'true' / 'false' as written by the analyze step
+      rerun_matrix: ${{ steps.matrix.outputs.rerun_matrix }}  # JSON matrix; only set when the matrix step runs
+    steps:
+      - name: Analyze failure from job logs
+        id: analyze
+        env:
+          GH_TOKEN: ${{ github.token }}  # token for the gh CLI calls below
+        run: |
+          RUN_ID="${{ github.run_id }}"
+          SPOT_TERMINATION=false  # set to true by the first job log that matches
+          # List jobs of this run (first page, up to 100) whose name contains "A10G" and that ended as failure or cancelled (spot termination can cause either)
+          FAILED_JOBS=$(gh api "/repos/${{ github.repository }}/actions/runs/${RUN_ID}/jobs?per_page=100" \
+            --jq '.jobs[] | select(.name | contains("A10G")) | select(.conclusion == "failure" or .conclusion == "cancelled") | .id')
+          if [ -z "$FAILED_JOBS" ]; then  # nothing to inspect: report false and stop
+            echo "is_spot_termination=false" >> $GITHUB_OUTPUT
+            exit 0
+          fi
+          for JOB_ID in $FAILED_JOBS; do  # the loop ends at the first job whose log matches
+            # Download the job log (may be ZIP or plain text depending on GitHub API); a failed download skips the job
+            if ! gh api "/repos/${{ github.repository }}/actions/jobs/${JOB_ID}/logs" > job_log.zip 2>/dev/null; then
+              continue
+            fi
+            # Normalize to job_log.txt: unzip if it's a ZIP file (falling back to the raw file if unzip fails), otherwise use as-is
+            if file job_log.zip | grep -q "Zip archive"; then
+              unzip -p job_log.zip > job_log.txt 2>/dev/null || mv job_log.zip job_log.txt
+            else
+              mv job_log.zip job_log.txt
+            fi
+            if grep -q "FLASHINFER_SPOT_TERMINATION_DETECTED" job_log.txt; then  # marker printed by scripts/task_monitor_spot.sh
+              echo "Detected: AWS spot termination marker (job $JOB_ID)"
+              SPOT_TERMINATION=true
+              break
+            fi
+            if grep -qiE "connection reset by peer|context canceled|grpc.*closing|The self-hosted runner.*lost" job_log.txt; then  # NOTE(review): heuristic; confirm logs of jobs cancelled by fail-fast cannot match
+              echo "Detected: infrastructure error pattern (job $JOB_ID)"
+              SPOT_TERMINATION=true
+              break
+            fi
+          done
+          echo "is_spot_termination=$SPOT_TERMINATION"  # also print the verdict to the step log
+          echo "is_spot_termination=$SPOT_TERMINATION" >> $GITHUB_OUTPUT
+
+      - name: Build rerun matrix
+        id: matrix
+        if: steps.analyze.outputs.is_spot_termination == 'true'
+        run: |
+          echo 'rerun_matrix={"include":[{"shard":1},{"shard":2},{"shard":3},{"shard":4},{"shard":5}]}' >> $GITHUB_OUTPUT  # fixed list: shards 1-5
+
+  gpu-tests-a10g-rerun:
+    name: JIT Rerun ${{ matrix.shard }} (A10G)
+    needs: [setup, analyze-gpu-a10g-failure]
+    if: |
+      !cancelled() &&
+      needs.analyze-gpu-a10g-failure.outputs.is_spot_termination == 'true' &&
+      needs.analyze-gpu-a10g-failure.outputs.rerun_matrix != ''
+    runs-on: [self-hosted, Linux, X64, gpu, sm86, on-demand]
+    timeout-minutes: 360
+    strategy:
+      fail-fast: true
+      matrix: ${{ fromJSON(needs.analyze-gpu-a10g-failure.outputs.rerun_matrix) }}
+    env:
+      DOCKER_IMAGE: flashinfer/flashinfer-ci-cu129:${{ needs.setup.outputs.docker_tag }}
+    steps:
+      - name: Cleanup
+        run: |
+          # Stop all Docker containers to free memory
+          docker stop $(docker ps -q) 2>/dev/null || true
+          docker rm $(docker ps -aq) 2>/dev/null || true
+          # Clean workspace and caches
+          sudo rm -rf ${{ github.workspace }}/* || true
+          sudo rm -rf ${{ github.workspace }}/.[!.]* || true
+          rm -rf ~/.cache/flashinfer_jit || true
+          docker system prune -f || true
+          nvidia-smi || true
+
+      - uses: actions/checkout@v4
+        with:
+          submodules: recursive
+
+      - name: Login to Docker Hub
+        uses: docker/login-action@v3
+        with:
+          username: flashinfer
+          password: ${{ secrets.DOCKERHUB_TOKEN }}
+        continue-on-error: true
 
       - name: Show Node Info
         run: ./scripts/task_show_node_info.sh
@@ -274,8 +503,7 @@ jobs:
         run: bash ci/bash.sh ${DOCKER_IMAGE} ./scripts/task_jit_run_tests_part${{ matrix.shard }}.sh
 
   # ---------------------------------------------------------------------------
-  # GPU JIT Tests - SM75 (T4) - sampling tests only
-  # Uses ci/bash.sh with GPU (same as Jenkins)
+  # GPU JIT Tests - SM75 (T4) - Spot + On-Demand Rerun
   # ---------------------------------------------------------------------------
   gpu-tests-t4:
     name: JIT Unittest (T4)
@@ -284,14 +512,14 @@ jobs:
       needs.gate.outputs.authorized == 'true' &&
       needs.setup.outputs.skip_build != 'true' &&
       github.event.inputs.skip_gpu != 'true'
-    runs-on: [self-hosted, Linux, X64, gpu, sm75]
+    runs-on: [self-hosted, Linux, X64, gpu, sm75, spot]
     timeout-minutes: 360
     env:
       DOCKER_IMAGE: flashinfer/flashinfer-ci-cu129:${{ needs.setup.outputs.docker_tag }}
     steps:
       - name: Cleanup
         run: |
-          # Stop all Docker containers to free GPU memory
+          # Stop all Docker containers to free memory
           docker stop $(docker ps -q) 2>/dev/null || true
           docker rm $(docker ps -aq) 2>/dev/null || true
           # Clean workspace and caches
@@ -305,12 +533,106 @@ jobs:
         with:
           submodules: recursive
 
+      - name: Start spot termination monitor
+        run: ./scripts/task_monitor_spot.sh &
+
       - name: Login to Docker Hub
         uses: docker/login-action@v3
         with:
           username: flashinfer
           password: ${{ secrets.DOCKERHUB_TOKEN }}
-        continue-on-error: true  # Don't fail if secret is unavailable (e.g., fork PRs)
+        continue-on-error: true
+
+      - name: Show Node Info
+        run: ./scripts/task_show_node_info.sh
+        env:
+          NODE_NAME: ${{ runner.name }}
+          WORKSPACE: ${{ github.workspace }}
+          BUILD_NUMBER: ${{ github.run_number }}
+
+      - name: Run JIT Unittest Part 3 (T4)
+        run: bash ci/bash.sh ${DOCKER_IMAGE} ./scripts/task_jit_run_tests_part3.sh
+
+  analyze-gpu-t4-failure:  # Decides whether a failed/cancelled T4 run should be rerun on an on-demand runner
+    name: Analyze GPU T4 Failure
+    needs: [setup, gpu-tests-t4]
+    if: "!cancelled() && (needs.gpu-tests-t4.result == 'failure' || needs.gpu-tests-t4.result == 'cancelled')"
+    runs-on: ubuntu-latest
+    outputs:
+      is_spot_termination: ${{ steps.analyze.outputs.is_spot_termination }}  # 'true' / 'false' as written by the analyze step
+    steps:
+      - name: Analyze failure from job logs
+        id: analyze
+        env:
+          GH_TOKEN: ${{ github.token }}  # token for the gh CLI calls below
+        run: |
+          RUN_ID="${{ github.run_id }}"
+          SPOT_TERMINATION=false  # set to true by the first job log that matches
+          # List jobs of this run (first page, up to 100) whose name contains "T4" and that ended as failure or cancelled (spot termination can cause either)
+          FAILED_JOBS=$(gh api "/repos/${{ github.repository }}/actions/runs/${RUN_ID}/jobs?per_page=100" \
+            --jq '.jobs[] | select(.name | contains("T4")) | select(.conclusion == "failure" or .conclusion == "cancelled") | .id')
+          if [ -z "$FAILED_JOBS" ]; then  # nothing to inspect: report false and stop
+            echo "is_spot_termination=false" >> $GITHUB_OUTPUT
+            exit 0
+          fi
+          for JOB_ID in $FAILED_JOBS; do  # the loop ends at the first job whose log matches
+            # Download the job log (may be ZIP or plain text depending on GitHub API); a failed download skips the job
+            if ! gh api "/repos/${{ github.repository }}/actions/jobs/${JOB_ID}/logs" > job_log.zip 2>/dev/null; then
+              continue
+            fi
+            # Normalize to job_log.txt: unzip if it's a ZIP file (falling back to the raw file if unzip fails), otherwise use as-is
+            if file job_log.zip | grep -q "Zip archive"; then
+              unzip -p job_log.zip > job_log.txt 2>/dev/null || mv job_log.zip job_log.txt
+            else
+              mv job_log.zip job_log.txt
+            fi
+            if grep -q "FLASHINFER_SPOT_TERMINATION_DETECTED" job_log.txt; then  # marker printed by scripts/task_monitor_spot.sh
+              echo "Detected: AWS spot termination marker (job $JOB_ID)"
+              SPOT_TERMINATION=true
+              break
+            fi
+            if grep -qiE "connection reset by peer|context canceled|grpc.*closing|The self-hosted runner.*lost" job_log.txt; then  # NOTE(review): heuristic; confirm these patterns cannot appear in an ordinary test-failure log
+              echo "Detected: infrastructure error pattern (job $JOB_ID)"
+              SPOT_TERMINATION=true
+              break
+            fi
+          done
+          echo "is_spot_termination=$SPOT_TERMINATION"  # also print the verdict to the step log
+          echo "is_spot_termination=$SPOT_TERMINATION" >> $GITHUB_OUTPUT
+
+  gpu-tests-t4-rerun:
+    name: JIT Rerun (T4)
+    needs: [setup, analyze-gpu-t4-failure]
+    if: |
+      !cancelled() &&
+      needs.analyze-gpu-t4-failure.outputs.is_spot_termination == 'true'
+    runs-on: [self-hosted, Linux, X64, gpu, sm75, on-demand]
+    timeout-minutes: 360
+    env:
+      DOCKER_IMAGE: flashinfer/flashinfer-ci-cu129:${{ needs.setup.outputs.docker_tag }}
+    steps:
+      - name: Cleanup
+        run: |
+          # Stop all Docker containers to free memory
+          docker stop $(docker ps -q) 2>/dev/null || true
+          docker rm $(docker ps -aq) 2>/dev/null || true
+          # Clean workspace and caches
+          sudo rm -rf ${{ github.workspace }}/* || true
+          sudo rm -rf ${{ github.workspace }}/.[!.]* || true
+          rm -rf ~/.cache/flashinfer_jit || true
+          docker system prune -f || true
+          nvidia-smi || true
+
+      - uses: actions/checkout@v4
+        with:
+          submodules: recursive
+
+      - name: Login to Docker Hub
+        uses: docker/login-action@v3
+        with:
+          username: flashinfer
+          password: ${{ secrets.DOCKERHUB_TOKEN }}
+        continue-on-error: true
 
       - name: Show Node Info
         run: ./scripts/task_show_node_info.sh
@@ -327,8 +649,19 @@ jobs:
   # ---------------------------------------------------------------------------
   test-results-summary:
     name: Test Results Summary
-    if: always()
-    needs: [gate, setup, aot-build-import, gpu-tests-a10g, gpu-tests-t4]
+    if: "!cancelled()"
+    needs:
+      - gate
+      - setup
+      - aot-build-import
+      - analyze-aot-failure
+      - aot-build-import-rerun
+      - gpu-tests-a10g
+      - analyze-gpu-a10g-failure
+      - gpu-tests-a10g-rerun
+      - gpu-tests-t4
+      - analyze-gpu-t4-failure
+      - gpu-tests-t4-rerun
     runs-on: ubuntu-latest
     steps:
       - name: Check Results
@@ -341,28 +674,56 @@ jobs:
             echo "A contributor in @flashinfer-ai/ci-users can comment \`@flashinfer-bot run\` to approve." >> $GITHUB_STEP_SUMMARY
             exit 0
           fi
+          # check_status TITLE SKIP SPOT_RESULT IS_SPOT_TERMINATION RERUN_RESULT: append one test group's status to the step summary; returns 1 if the group failed, 0 otherwise
+          check_status() {
+            local name=$1 skip=$2 spot=$3 spot_term=$4 rerun=$5
+            echo "$name" >> $GITHUB_STEP_SUMMARY
+            if [ "$skip" == "true" ]; then  # skipped groups never count as a failure
+              echo "- Status: Skipped" >> $GITHUB_STEP_SUMMARY
+            elif [ "$spot" == "success" ]; then  # the first (spot) run passed
+              echo "- Status: Passed (spot)" >> $GITHUB_STEP_SUMMARY
+            elif [ "$spot_term" == "true" ] && [ "$rerun" == "success" ]; then  # spot run judged a spot termination and the rerun passed
+              echo "- Status: Passed (on-demand rerun)" >> $GITHUB_STEP_SUMMARY
+            else  # any other combination is reported as a failure
+              echo "- Status: Failed" >> $GITHUB_STEP_SUMMARY
+              return 1
+            fi
+            return 0
+          }
+
+          echo "Test Results Summary" >> $GITHUB_STEP_SUMMARY
+          echo "" >> $GITHUB_STEP_SUMMARY
 
           if [ "${{ needs.setup.outputs.skip_build }}" == "true" ]; then
             echo "Build skipped (docs/config only changes)" >> $GITHUB_STEP_SUMMARY
             exit 0
           fi
 
-          AOT="${{ needs.aot-build-import.result }}"
-          A10G="${{ needs.gpu-tests-a10g.result }}"
-          T4="${{ needs.gpu-tests-t4.result }}"
-          SKIP_AOT="${{ github.event.inputs.skip_aot }}"
-          SKIP_GPU="${{ github.event.inputs.skip_gpu }}"
-
-          echo "AOT Build Import: $AOT" >> $GITHUB_STEP_SUMMARY
-          echo "GPU Tests (A10G): $A10G" >> $GITHUB_STEP_SUMMARY
-          echo "GPU Tests (T4): $T4" >> $GITHUB_STEP_SUMMARY
-
-          # Fail if any required job is not success (unless explicitly skipped)
-          if { [ "$AOT" != "success" ] && [ "$SKIP_AOT" != "true" ]; } || \
-             { [ "$A10G" != "success" ] && [ "$SKIP_GPU" != "true" ]; } || \
-             { [ "$T4" != "success" ] && [ "$SKIP_GPU" != "true" ]; }; then
-            echo "**Tests Failed**" >> $GITHUB_STEP_SUMMARY
+          FAILED=false
+
+          check_status "AOT Build Import Tests" \
+            "${{ github.event.inputs.skip_aot }}" \
+            "${{ needs.aot-build-import.result }}" \
+            "${{ needs.analyze-aot-failure.outputs.is_spot_termination }}" \
+            "${{ needs.aot-build-import-rerun.result }}" || FAILED=true
+
+          echo "" >> $GITHUB_STEP_SUMMARY
+          check_status "GPU Tests (A10G)" \
+            "${{ github.event.inputs.skip_gpu }}" \
+            "${{ needs.gpu-tests-a10g.result }}" \
+            "${{ needs.analyze-gpu-a10g-failure.outputs.is_spot_termination }}" \
+            "${{ needs.gpu-tests-a10g-rerun.result }}" || FAILED=true
+
+          echo "" >> $GITHUB_STEP_SUMMARY
+          check_status "GPU Tests (T4)" \
+            "${{ github.event.inputs.skip_gpu }}" \
+            "${{ needs.gpu-tests-t4.result }}" \
+            "${{ needs.analyze-gpu-t4-failure.outputs.is_spot_termination }}" \
+            "${{ needs.gpu-tests-t4-rerun.result }}" || FAILED=true
+
+          echo "" >> $GITHUB_STEP_SUMMARY
+          if [ "$FAILED" == "true" ]; then
+            echo "Result: Tests Failed" >> $GITHUB_STEP_SUMMARY
             exit 1
           fi
-
-          echo "**Tests Passed**" >> $GITHUB_STEP_SUMMARY
+          echo "Result: Tests Passed" >> $GITHUB_STEP_SUMMARY
diff --git a/scripts/task_monitor_spot.sh b/scripts/task_monitor_spot.sh
new file mode 100755
index 0000000000..0ea52828b6
--- /dev/null
+++ b/scripts/task_monitor_spot.sh
@@ -0,0 +1,60 @@
+#!/bin/bash
+# Copyright (c) 2026 by FlashInfer team.
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Spot Termination Monitor for AWS EC2 Spot Instances
+#
+# Polls the EC2 instance metadata service every CHECK_INTERVAL seconds for a
+# spot instance-action notice. On the first notice it prints the
+# FLASHINFER_SPOT_TERMINATION_DETECTED marker to stdout and exits 0; otherwise
+# it keeps looping until the process is killed.
+#
+# NOTE(review): the marker only helps if this process's stdout reaches the job
+# log; confirm that still holds once the step that backgrounded it has ended.
+#
+# Usage: ./scripts/task_monitor_spot.sh &
+
+set -euo pipefail
+
+IMDS_URL="http://169.254.169.254/latest/meta-data/spot/instance-action"
+TOKEN_URL="http://169.254.169.254/latest/api/token"
+CHECK_INTERVAL=5
+
+while true; do
+  # Try IMDSv2 first (token-based). -f makes curl fail on an HTTP error so an
+  # error body is never stored as a token; on any failure TOKEN stays empty.
+  TOKEN=$(curl -sf --max-time 2 -X PUT "$TOKEN_URL" \
+    -H "X-aws-ec2-metadata-token-ttl-seconds: 60" 2>/dev/null || true)
+
+  if [ -n "$TOKEN" ]; then
+    # IMDSv2: Use token in header
+    META=$(curl -sf --max-time 2 -H "X-aws-ec2-metadata-token: $TOKEN" "$IMDS_URL" 2>/dev/null || true)
+  else
+    # IMDSv1: Direct access (fallback)
+    META=$(curl -sf --max-time 2 "$IMDS_URL" 2>/dev/null || true)
+  fi
+
+  # META is empty whenever the request fails (curl -f). A spot notice can carry
+  # the action terminate, stop or hibernate; any of them interrupts the job.
+  # The here-string keeps the check out of a pipeline while pipefail is set.
+  if grep -qE 'terminate|stop|hibernate' <<<"$META"; then
+    # Output GitHub Actions error annotation for visibility
+    echo "::error::FLASHINFER_SPOT_TERMINATION_DETECTED"
+    echo "AWS Spot Termination Notice received at $(date)"
+    echo "Instance will be terminated soon. Job will be rerun on on-demand instance."
+    exit 0
+  fi
+
+  sleep "$CHECK_INTERVAL"
+done