diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml
index ee04965583..6e235d5e28 100644
--- a/.github/workflows/pr-test.yml
+++ b/.github/workflows/pr-test.yml
@@ -7,6 +7,14 @@
 #   - PR from org members (ci-users team): Runs automatically
 #   - PR from external contributors: Requires 'run-ci' label
 #     (added via @flashinfer-bot run command from authorized user)
+#
+# Rerun Strategy:
+#   - Spot jobs run with fail-fast: true
+#   - Background monitor checks AWS metadata for spot termination notice
+#   - If termination detected, writes marker to log (captured by GitHub)
+#   - Analyze job checks logs for the marker to decide whether to rerun
+#   - Spot termination: rerun every job of the affected group on on-demand runners
+#   - Real failure: no rerun, workflow fails fast
 
 name: PR Test
 
@@ -34,6 +42,7 @@ concurrency:
 permissions:
   contents: read
   pull-requests: write
+  actions: read  # used by the analyze jobs' calls to the Actions jobs/logs API
 
 env:
   EXECUTOR_NUMBER: "0"
@@ -164,8 +173,7 @@ jobs:
           fi
 
   # ---------------------------------------------------------------------------
-  # AOT Build Import Tests - x86_64 and aarch64 (multiple CUDA versions)
-  # Uses ci/bash.sh with --no-gpu (same as Jenkins)
+  # AOT Build Import Tests (Spot + On-Demand Rerun)
   # ---------------------------------------------------------------------------
   aot-build-import:
     name: AOT Build Import (${{ matrix.arch }}, ${{ matrix.cuda }})
@@ -179,9 +187,10 @@ jobs:
       - Linux
       - ${{ matrix.arch }}
       - cpu
+      - spot  # first attempt runs on spot-labelled runners
     timeout-minutes: 360
     strategy:
-      fail-fast: false
+      fail-fast: true  # one failing matrix job cancels its siblings
       matrix:
         arch: [X64, ARM64]
         cuda: [cu126, cu128, cu129, cu130]
@@ -203,12 +212,128 @@ jobs:
         with:
           submodules: recursive
 
+      - name: Start spot termination monitor
+        run: ./scripts/task_monitor_spot.sh &  # '&' leaves the poller running in the background
+
+      - name: Login to Docker Hub
+        uses: docker/login-action@v3
+        with:
+          username: flashinfer
+          password: ${{ secrets.DOCKERHUB_TOKEN }}
+        continue-on-error: true  # best-effort login (e.g. secret unavailable on fork PRs)
+
+      - name: Show Node Info
+        run: ./scripts/task_show_node_info.sh
+        env:
+          NODE_NAME: ${{ runner.name }}
+          WORKSPACE: ${{ github.workspace }}
+          BUILD_NUMBER: ${{ github.run_number }}
+
+      - name: Run Test
+        run: bash ci/bash.sh ${DOCKER_IMAGE} --no-gpu ./scripts/task_test_jit_cache_package_build_import.sh
+
+  analyze-aot-failure:
+    name: Analyze AOT Failure
+    needs: [setup, aot-build-import]
+    if: "!cancelled() && (needs.aot-build-import.result == 'failure' || needs.aot-build-import.result == 'cancelled')"  # only after the spot job failed or was cancelled
+    runs-on: ubuntu-latest
+    outputs:
+      is_spot_termination: ${{ steps.analyze.outputs.is_spot_termination }}
+      rerun_matrix: ${{ steps.matrix.outputs.rerun_matrix }}
+    steps:
+      - name: Analyze failure from job logs
+        id: analyze
+        env:
+          GH_TOKEN: ${{ github.token }}
+        run: |
+          RUN_ID="${{ github.run_id }}"
+          SPOT_TERMINATION=false  # becomes true on the first log that matches
+          # Include both failed and cancelled jobs (spot termination can cause either)
+          FAILED_JOBS=$(gh api "/repos/${{ github.repository }}/actions/runs/${RUN_ID}/jobs?per_page=100" \
+            --jq '.jobs[] | select(.name | startswith("AOT")) | select(.conclusion == "failure" or .conclusion == "cancelled") | .id')
+          if [ -z "$FAILED_JOBS" ]; then
+            echo "is_spot_termination=false" >> $GITHUB_OUTPUT
+            exit 0  # no failed or cancelled AOT job found: no rerun
+          fi
+          for JOB_ID in $FAILED_JOBS; do
+            # Download logs (may be ZIP or plain text depending on GitHub API)
+            if ! gh api "/repos/${{ github.repository }}/actions/jobs/${JOB_ID}/logs" > job_log.zip 2>/dev/null; then
+              continue  # log could not be downloaded; check the next job
+            fi
+            # Try to unzip if it's a ZIP file, otherwise use as-is
+            if file job_log.zip | grep -q "Zip archive"; then
+              unzip -p job_log.zip > job_log.txt 2>/dev/null || mv job_log.zip job_log.txt
+            else
+              mv job_log.zip job_log.txt
+            fi
+            if grep -q "FLASHINFER_SPOT_TERMINATION_DETECTED" job_log.txt; then  # marker echoed by scripts/task_monitor_spot.sh
+              echo "Detected: AWS spot termination marker (job $JOB_ID)"
+              SPOT_TERMINATION=true
+              break  # one matching job is enough
+            fi
+            if grep -qiE "connection reset by peer|context canceled|grpc.*closing|The self-hosted runner.*lost" job_log.txt; then  # these patterns are also treated as a spot loss
+              echo "Detected: infrastructure error pattern (job $JOB_ID)"
+              SPOT_TERMINATION=true
+              break
+            fi
+          done
+          echo "is_spot_termination=$SPOT_TERMINATION"
+          echo "is_spot_termination=$SPOT_TERMINATION" >> $GITHUB_OUTPUT
+
+      - name: Build rerun matrix
+        id: matrix
+        if: steps.analyze.outputs.is_spot_termination == 'true'
+        run: |
+          MATRIX='{"include":['  # every arch/cuda pair is listed, not only the failed ones
+          for arch in X64 ARM64; do
+            for cuda in cu126 cu128 cu129 cu130; do
+              MATRIX+='{"arch":"'$arch'","cuda":"'$cuda'"},'
+            done
+          done
+          MATRIX="${MATRIX%,}]}"  # strip the trailing comma, then close the JSON
+          echo "rerun_matrix=$MATRIX" >> $GITHUB_OUTPUT
+
+  aot-build-import-rerun:
+    name: AOT Build Import Rerun (${{ matrix.arch }}, ${{ matrix.cuda }})
+    needs: [setup, analyze-aot-failure]
+    if: |
+      !cancelled() &&
+      needs.analyze-aot-failure.outputs.is_spot_termination == 'true' &&
+      needs.analyze-aot-failure.outputs.rerun_matrix != ''
+    runs-on:
+      - self-hosted
+      - Linux
+      - ${{ matrix.arch }}
+      - cpu
+      - on-demand  # rerun uses on-demand-labelled runners instead of spot
+    timeout-minutes: 360
+    strategy:
+      fail-fast: true
+      matrix: ${{ fromJSON(needs.analyze-aot-failure.outputs.rerun_matrix) }}
+    env:
+      DOCKER_IMAGE: flashinfer/flashinfer-ci-${{ matrix.cuda }}:${{ needs.setup.outputs.docker_tag }}
+    steps:
+      - name: Cleanup
+        run: |
+          # Stop all Docker containers to free memory
+          docker stop $(docker ps -q) 2>/dev/null || true
+          docker rm $(docker ps -aq) 2>/dev/null || true
+          # Clean workspace and caches
+          sudo rm -rf ${{ github.workspace }}/* || true
+          sudo rm -rf ${{ github.workspace }}/.[!.]* || true
+          rm -rf ~/.cache/flashinfer_jit || true
+          docker system prune -f || true
+
+      - uses: actions/checkout@v4
+        with:
+          submodules: recursive
+
       - name: Login to Docker Hub
         uses: docker/login-action@v3
         with:
           username: flashinfer
           password: ${{ secrets.DOCKERHUB_TOKEN }}
-        continue-on-error: true  # Don't fail if secret is unavailable (e.g., fork PRs)
+        continue-on-error: true
 
       - name: Show Node Info
         run: ./scripts/task_show_node_info.sh
@@ -217,12 +342,11 @@ jobs:
           WORKSPACE: ${{ github.workspace }}
           BUILD_NUMBER: ${{ github.run_number }}
 
-      - name: Test JIT Cache Package Build and Import
+      - name: Run Test
         run: bash ci/bash.sh ${DOCKER_IMAGE} --no-gpu ./scripts/task_test_jit_cache_package_build_import.sh
 
   # ---------------------------------------------------------------------------
-  # GPU JIT Tests - SM86 (A10G) - 5 Shards
-  # Uses ci/bash.sh with GPU (same as Jenkins)
+  # GPU JIT Tests - SM86 (A10G) - Spot + On-Demand Rerun
   # ---------------------------------------------------------------------------
   gpu-tests-a10g:
     name: JIT Unittest ${{ matrix.shard }} (A10G)
@@ -231,10 +355,10 @@ jobs:
       needs.gate.outputs.authorized == 'true' &&
       needs.setup.outputs.skip_build != 'true' &&
       github.event.inputs.skip_gpu != 'true'
-    runs-on: [self-hosted, Linux, X64, gpu, sm86]
+    runs-on: [self-hosted, Linux, X64, gpu, sm86, spot]
     timeout-minutes: 360
     strategy:
-      fail-fast: false
+      fail-fast: true  # one failing shard cancels the other shards
       matrix:
         shard: [1, 2, 3, 4, 5]
     env:
@@ -242,7 +366,7 @@ jobs:
     steps:
       - name: Cleanup
         run: |
-          # Stop all Docker containers to free GPU memory
+          # Stop all Docker containers to free memory
           docker stop $(docker ps -q) 2>/dev/null || true
           docker rm $(docker ps -aq) 2>/dev/null || true
           # Clean workspace and caches
@@ -256,12 +380,117 @@ jobs:
         with:
           submodules: recursive
 
+      - name: Start spot termination monitor
+        run: ./scripts/task_monitor_spot.sh &  # '&' leaves the poller running in the background
+
       - name: Login to Docker Hub
         uses: docker/login-action@v3
         with:
           username: flashinfer
           password: ${{ secrets.DOCKERHUB_TOKEN }}
-        continue-on-error: true  # Don't fail if secret is unavailable (e.g., fork PRs)
+        continue-on-error: true
+
+      - name: Show Node Info
+        run: ./scripts/task_show_node_info.sh
+        env:
+          NODE_NAME: ${{ runner.name }}
+          WORKSPACE: ${{ github.workspace }}
+          BUILD_NUMBER: ${{ github.run_number }}
+
+      - name: Run JIT Unittest Part ${{ matrix.shard }}
+        run: bash ci/bash.sh ${DOCKER_IMAGE} ./scripts/task_jit_run_tests_part${{ matrix.shard }}.sh
+
+  analyze-gpu-a10g-failure:
+    name: Analyze GPU A10G Failure
+    needs: [setup, gpu-tests-a10g]
+    if: "!cancelled() && (needs.gpu-tests-a10g.result == 'failure' || needs.gpu-tests-a10g.result == 'cancelled')"  # only after the spot job failed or was cancelled
+    runs-on: ubuntu-latest
+    outputs:
+      is_spot_termination: ${{ steps.analyze.outputs.is_spot_termination }}
+      rerun_matrix: ${{ steps.matrix.outputs.rerun_matrix }}
+    steps:
+      - name: Analyze failure from job logs
+        id: analyze
+        env:
+          GH_TOKEN: ${{ github.token }}
+        run: |
+          RUN_ID="${{ github.run_id }}"
+          SPOT_TERMINATION=false  # becomes true on the first log that matches
+          # Include both failed and cancelled jobs (spot termination can cause either)
+          FAILED_JOBS=$(gh api "/repos/${{ github.repository }}/actions/runs/${RUN_ID}/jobs?per_page=100" \
+            --jq '.jobs[] | select(.name | contains("A10G")) | select(.conclusion == "failure" or .conclusion == "cancelled") | .id')
+          if [ -z "$FAILED_JOBS" ]; then
+            echo "is_spot_termination=false" >> $GITHUB_OUTPUT
+            exit 0  # no failed or cancelled A10G job found: no rerun
+          fi
+          for JOB_ID in $FAILED_JOBS; do
+            # Download logs (may be ZIP or plain text depending on GitHub API)
+            if ! gh api "/repos/${{ github.repository }}/actions/jobs/${JOB_ID}/logs" > job_log.zip 2>/dev/null; then
+              continue  # log could not be downloaded; check the next job
+            fi
+            # Try to unzip if it's a ZIP file, otherwise use as-is
+            if file job_log.zip | grep -q "Zip archive"; then
+              unzip -p job_log.zip > job_log.txt 2>/dev/null || mv job_log.zip job_log.txt
+            else
+              mv job_log.zip job_log.txt
+            fi
+            if grep -q "FLASHINFER_SPOT_TERMINATION_DETECTED" job_log.txt; then  # marker echoed by scripts/task_monitor_spot.sh
+              echo "Detected: AWS spot termination marker (job $JOB_ID)"
+              SPOT_TERMINATION=true
+              break  # one matching job is enough
+            fi
+            if grep -qiE "connection reset by peer|context canceled|grpc.*closing|The self-hosted runner.*lost" job_log.txt; then  # these patterns are also treated as a spot loss
+              echo "Detected: infrastructure error pattern (job $JOB_ID)"
+              SPOT_TERMINATION=true
+              break
+            fi
+          done
+          echo "is_spot_termination=$SPOT_TERMINATION"
+          echo "is_spot_termination=$SPOT_TERMINATION" >> $GITHUB_OUTPUT
+
+      - name: Build rerun matrix
+        id: matrix
+        if: steps.analyze.outputs.is_spot_termination == 'true'
+        run: |
+          echo 'rerun_matrix={"include":[{"shard":1},{"shard":2},{"shard":3},{"shard":4},{"shard":5}]}' >> $GITHUB_OUTPUT  # all five shards, not only the failed ones
+
+  gpu-tests-a10g-rerun:
+    name: JIT Rerun ${{ matrix.shard }} (A10G)
+    needs: [setup, analyze-gpu-a10g-failure]
+    if: |
+      !cancelled() &&
+      needs.analyze-gpu-a10g-failure.outputs.is_spot_termination == 'true' &&
+      needs.analyze-gpu-a10g-failure.outputs.rerun_matrix != ''
+    runs-on: [self-hosted, Linux, X64, gpu, sm86, on-demand]
+    timeout-minutes: 360
+    strategy:
+      fail-fast: true
+      matrix: ${{ fromJSON(needs.analyze-gpu-a10g-failure.outputs.rerun_matrix) }}
+    env:
+      DOCKER_IMAGE: flashinfer/flashinfer-ci-cu129:${{ needs.setup.outputs.docker_tag }}
+    steps:
+      - name: Cleanup
+        run: |
+          # Stop all Docker containers to free memory
+          docker stop $(docker ps -q) 2>/dev/null || true
+          docker rm $(docker ps -aq) 2>/dev/null || true
+          # Clean workspace and caches
+          sudo rm -rf ${{ github.workspace }}/* || true
+          sudo rm -rf ${{ github.workspace }}/.[!.]* || true
+          rm -rf ~/.cache/flashinfer_jit || true
+          docker system prune -f || true
+          nvidia-smi || true
+
+      - uses: actions/checkout@v4
+        with:
+          submodules: recursive
+
+      - name: Login to Docker Hub
+        uses: docker/login-action@v3
+        with:
+          username: flashinfer
+          password: ${{ secrets.DOCKERHUB_TOKEN }}
+        continue-on-error: true  # best-effort login (e.g. secret unavailable on fork PRs)
 
       - name: Show Node Info
         run: ./scripts/task_show_node_info.sh
@@ -274,8 +503,7 @@ jobs:
         run: bash ci/bash.sh ${DOCKER_IMAGE} ./scripts/task_jit_run_tests_part${{ matrix.shard }}.sh
 
   # ---------------------------------------------------------------------------
-  # GPU JIT Tests - SM75 (T4) - sampling tests only
-  # Uses ci/bash.sh with GPU (same as Jenkins)
+  # GPU JIT Tests - SM75 (T4) - Spot + On-Demand Rerun
   # ---------------------------------------------------------------------------
   gpu-tests-t4:
     name: JIT Unittest (T4)
@@ -284,14 +512,14 @@ jobs:
       needs.gate.outputs.authorized == 'true' &&
       needs.setup.outputs.skip_build != 'true' &&
       github.event.inputs.skip_gpu != 'true'
-    runs-on: [self-hosted, Linux, X64, gpu, sm75]
+    runs-on: [self-hosted, Linux, X64, gpu, sm75, spot]
     timeout-minutes: 360
     env:
       DOCKER_IMAGE: flashinfer/flashinfer-ci-cu129:${{ needs.setup.outputs.docker_tag }}
     steps:
       - name: Cleanup
         run: |
-          # Stop all Docker containers to free GPU memory
+          # Stop all Docker containers to free memory
           docker stop $(docker ps -q) 2>/dev/null || true
           docker rm $(docker ps -aq) 2>/dev/null || true
           # Clean workspace and caches
@@ -305,12 +533,106 @@ jobs:
         with:
           submodules: recursive
 
+      - name: Start spot termination monitor
+        run: ./scripts/task_monitor_spot.sh &  # '&' leaves the poller running in the background
+
       - name: Login to Docker Hub
         uses: docker/login-action@v3
         with:
           username: flashinfer
           password: ${{ secrets.DOCKERHUB_TOKEN }}
-        continue-on-error: true  # Don't fail if secret is unavailable (e.g., fork PRs)
+        continue-on-error: true
+
+      - name: Show Node Info
+        run: ./scripts/task_show_node_info.sh
+        env:
+          NODE_NAME: ${{ runner.name }}
+          WORKSPACE: ${{ github.workspace }}
+          BUILD_NUMBER: ${{ github.run_number }}
+
+      - name: Run JIT Unittest Part 3 (T4)
+        run: bash ci/bash.sh ${DOCKER_IMAGE} ./scripts/task_jit_run_tests_part3.sh
+
+  analyze-gpu-t4-failure:
+    name: Analyze GPU T4 Failure
+    needs: [setup, gpu-tests-t4]
+    if: "!cancelled() && (needs.gpu-tests-t4.result == 'failure' || needs.gpu-tests-t4.result == 'cancelled')"  # only after the spot job failed or was cancelled
+    runs-on: ubuntu-latest
+    outputs:
+      is_spot_termination: ${{ steps.analyze.outputs.is_spot_termination }}
+    steps:
+      - name: Analyze failure from job logs
+        id: analyze
+        env:
+          GH_TOKEN: ${{ github.token }}
+        run: |
+          RUN_ID="${{ github.run_id }}"
+          SPOT_TERMINATION=false  # becomes true on the first log that matches
+          # Include both failed and cancelled jobs (spot termination can cause either)
+          FAILED_JOBS=$(gh api "/repos/${{ github.repository }}/actions/runs/${RUN_ID}/jobs?per_page=100" \
+            --jq '.jobs[] | select(.name | contains("T4")) | select(.conclusion == "failure" or .conclusion == "cancelled") | .id')
+          if [ -z "$FAILED_JOBS" ]; then
+            echo "is_spot_termination=false" >> $GITHUB_OUTPUT
+            exit 0  # no failed or cancelled T4 job found: no rerun
+          fi
+          for JOB_ID in $FAILED_JOBS; do
+            # Download logs (may be ZIP or plain text depending on GitHub API)
+            if ! gh api "/repos/${{ github.repository }}/actions/jobs/${JOB_ID}/logs" > job_log.zip 2>/dev/null; then
+              continue  # log could not be downloaded; check the next job
+            fi
+            # Try to unzip if it's a ZIP file, otherwise use as-is
+            if file job_log.zip | grep -q "Zip archive"; then
+              unzip -p job_log.zip > job_log.txt 2>/dev/null || mv job_log.zip job_log.txt
+            else
+              mv job_log.zip job_log.txt
+            fi
+            if grep -q "FLASHINFER_SPOT_TERMINATION_DETECTED" job_log.txt; then  # marker echoed by scripts/task_monitor_spot.sh
+              echo "Detected: AWS spot termination marker (job $JOB_ID)"
+              SPOT_TERMINATION=true
+              break  # one matching job is enough
+            fi
+            if grep -qiE "connection reset by peer|context canceled|grpc.*closing|The self-hosted runner.*lost" job_log.txt; then  # these patterns are also treated as a spot loss
+              echo "Detected: infrastructure error pattern (job $JOB_ID)"
+              SPOT_TERMINATION=true
+              break
+            fi
+          done
+          echo "is_spot_termination=$SPOT_TERMINATION"
+          echo "is_spot_termination=$SPOT_TERMINATION" >> $GITHUB_OUTPUT
+
+  gpu-tests-t4-rerun:
+    name: JIT Rerun (T4)
+    needs: [setup, analyze-gpu-t4-failure]
+    if: |
+      !cancelled() &&
+      needs.analyze-gpu-t4-failure.outputs.is_spot_termination == 'true'
+    runs-on: [self-hosted, Linux, X64, gpu, sm75, on-demand]
+    timeout-minutes: 360
+    env:
+      DOCKER_IMAGE: flashinfer/flashinfer-ci-cu129:${{ needs.setup.outputs.docker_tag }}
+    steps:
+      - name: Cleanup
+        run: |
+          # Stop all Docker containers to free memory
+          docker stop $(docker ps -q) 2>/dev/null || true
+          docker rm $(docker ps -aq) 2>/dev/null || true
+          # Clean workspace and caches
+          sudo rm -rf ${{ github.workspace }}/* || true
+          sudo rm -rf ${{ github.workspace }}/.[!.]* || true
+          rm -rf ~/.cache/flashinfer_jit || true
+          docker system prune -f || true
+          nvidia-smi || true
+
+      - uses: actions/checkout@v4
+        with:
+          submodules: recursive
+
+      - name: Login to Docker Hub
+        uses: docker/login-action@v3
+        with:
+          username: flashinfer
+          password: ${{ secrets.DOCKERHUB_TOKEN }}
+        continue-on-error: true  # best-effort login (e.g. secret unavailable on fork PRs)
 
       - name: Show Node Info
         run: ./scripts/task_show_node_info.sh
@@ -327,8 +649,19 @@ jobs:
   # ---------------------------------------------------------------------------
   test-results-summary:
     name: Test Results Summary
-    if: always()
-    needs: [gate, setup, aot-build-import, gpu-tests-a10g, gpu-tests-t4]
+    if: "!cancelled()"  # still evaluated when upstream jobs failed or were skipped
+    needs:
+      - gate
+      - setup
+      - aot-build-import
+      - analyze-aot-failure
+      - aot-build-import-rerun
+      - gpu-tests-a10g
+      - analyze-gpu-a10g-failure
+      - gpu-tests-a10g-rerun
+      - gpu-tests-t4
+      - analyze-gpu-t4-failure
+      - gpu-tests-t4-rerun
     runs-on: ubuntu-latest
     steps:
       - name: Check Results
@@ -341,28 +674,56 @@ jobs:
             echo "A contributor in @flashinfer-ai/ci-users can comment \`@flashinfer-bot run\` to approve." >> $GITHUB_STEP_SUMMARY
             exit 0
           fi
 
+          # Helper function to check job status
+          check_status() {
+            local name=$1 skip=$2 spot=$3 spot_term=$4 rerun=$5  # label, skip input, spot result, is_spot_termination, rerun result
+            echo "$name" >> $GITHUB_STEP_SUMMARY
+            if [ "$skip" == "true" ]; then
+              echo "- Status: Skipped" >> $GITHUB_STEP_SUMMARY
+            elif [ "$spot" == "success" ]; then
+              echo "- Status: Passed (spot)" >> $GITHUB_STEP_SUMMARY
+            elif [ "$spot_term" == "true" ] && [ "$rerun" == "success" ]; then
+              echo "- Status: Passed (on-demand rerun)" >> $GITHUB_STEP_SUMMARY
+            else
+              echo "- Status: Failed" >> $GITHUB_STEP_SUMMARY
+              return 1  # callers turn a non-zero return into FAILED=true
+            fi
+            return 0
+          }
+
+          echo "Test Results Summary" >> $GITHUB_STEP_SUMMARY
+          echo "" >> $GITHUB_STEP_SUMMARY
           if [ "${{ needs.setup.outputs.skip_build }}" == "true" ]; then
             echo "Build skipped (docs/config only changes)" >> $GITHUB_STEP_SUMMARY
             exit 0
           fi
 
-          AOT="${{ needs.aot-build-import.result }}"
-          A10G="${{ needs.gpu-tests-a10g.result }}"
-          T4="${{ needs.gpu-tests-t4.result }}"
-          SKIP_AOT="${{ github.event.inputs.skip_aot }}"
-          SKIP_GPU="${{ github.event.inputs.skip_gpu }}"
-
-          echo "AOT Build Import: $AOT" >> $GITHUB_STEP_SUMMARY
-          echo "GPU Tests (A10G): $A10G" >> $GITHUB_STEP_SUMMARY
-          echo "GPU Tests (T4): $T4" >> $GITHUB_STEP_SUMMARY
-
-          # Fail if any required job is not success (unless explicitly skipped)
-          if { [ "$AOT" != "success" ] && [ "$SKIP_AOT" != "true" ]; } || \
-             { [ "$A10G" != "success" ] && [ "$SKIP_GPU" != "true" ]; } || \
-             { [ "$T4" != "success" ] && [ "$SKIP_GPU" != "true" ]; }; then
-            echo "**Tests Failed**" >> $GITHUB_STEP_SUMMARY
+          FAILED=false  # set by any check_status call that returns non-zero
+
+          check_status "AOT Build Import Tests" \
+            "${{ github.event.inputs.skip_aot }}" \
+            "${{ needs.aot-build-import.result }}" \
+            "${{ needs.analyze-aot-failure.outputs.is_spot_termination }}" \
+            "${{ needs.aot-build-import-rerun.result }}" || FAILED=true
+
+          echo "" >> $GITHUB_STEP_SUMMARY
+          check_status "GPU Tests (A10G)" \
+            "${{ github.event.inputs.skip_gpu }}" \
+            "${{ needs.gpu-tests-a10g.result }}" \
+            "${{ needs.analyze-gpu-a10g-failure.outputs.is_spot_termination }}" \
+            "${{ needs.gpu-tests-a10g-rerun.result }}" || FAILED=true
+
+          echo "" >> $GITHUB_STEP_SUMMARY
+          check_status "GPU Tests (T4)" \
+            "${{ github.event.inputs.skip_gpu }}" \
+            "${{ needs.gpu-tests-t4.result }}" \
+            "${{ needs.analyze-gpu-t4-failure.outputs.is_spot_termination }}" \
+            "${{ needs.gpu-tests-t4-rerun.result }}" || FAILED=true
+
+          echo "" >> $GITHUB_STEP_SUMMARY
+          if [ "$FAILED" == "true" ]; then
+            echo "Result: Tests Failed" >> $GITHUB_STEP_SUMMARY
             exit 1
           fi
-
-          echo "**Tests Passed**" >> $GITHUB_STEP_SUMMARY
+          echo "Result: Tests Passed" >> $GITHUB_STEP_SUMMARY
diff --git a/scripts/task_monitor_spot.sh b/scripts/task_monitor_spot.sh
new file mode 100755
index 0000000000..0ea52828b6
--- /dev/null
+++ b/scripts/task_monitor_spot.sh
@@ -0,0 +1,47 @@
+#!/bin/bash
+# Copyright (c) 2026 by FlashInfer team.
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Spot interruption monitor for AWS EC2 Spot Instances
+# Usage: ./scripts/task_monitor_spot.sh &
+# Polls IMDS every CHECK_INTERVAL seconds; on a notice, prints the marker and exits 0.
+set -euo pipefail
+
+IMDS_URL="http://169.254.169.254/latest/meta-data/spot/instance-action"
+TOKEN_URL="http://169.254.169.254/latest/api/token"
+CHECK_INTERVAL=5
+
+while true; do
+  # IMDSv2 first; -f makes an HTTP error yield an empty token, not an error page
+  TOKEN=$(curl -sf --max-time 2 -X PUT "$TOKEN_URL" \
+    -H "X-aws-ec2-metadata-token-ttl-seconds: 60" 2>/dev/null || true)
+
+  if [ -n "$TOKEN" ]; then
+    # IMDSv2: Use token in header
+    META=$(curl -sf --max-time 2 -H "X-aws-ec2-metadata-token: $TOKEN" "$IMDS_URL" 2>/dev/null || true)
+  else
+    # IMDSv1: Direct access (fallback)
+    META=$(curl -sf --max-time 2 "$IMDS_URL" 2>/dev/null || true)
+  fi
+  # instance-action can announce terminate, stop or hibernate; treat all as an interruption
+  if echo "$META" | grep -qE "terminate|stop|hibernate"; then
+    # Output GitHub Actions error annotation for visibility
+    echo "::error::FLASHINFER_SPOT_TERMINATION_DETECTED"
+    echo "AWS Spot Termination Notice received at $(date)"
+    echo "Instance will be terminated soon. Job will be rerun on on-demand instance."
+    exit 0
+  fi
+
+  sleep "$CHECK_INTERVAL"
+done