From 31e7d81c628739258e7bbfa6799a350ff2bc53d8 Mon Sep 17 00:00:00 2001 From: Charlie Truong Date: Sun, 31 Aug 2025 18:00:49 -0500 Subject: [PATCH 01/21] ci: Add runner healthcheck Signed-off-by: Charlie Truong --- .github/workflows/_healthcheck_vm.yml | 102 +++++++++++++++++++++++ .github/workflows/healthcheck_vms.yml | 112 ++++++++++++++++++++++++++ 2 files changed, 214 insertions(+) create mode 100644 .github/workflows/_healthcheck_vm.yml create mode 100644 .github/workflows/healthcheck_vms.yml diff --git a/.github/workflows/_healthcheck_vm.yml b/.github/workflows/_healthcheck_vm.yml new file mode 100644 index 0000000000..f4c6a55298 --- /dev/null +++ b/.github/workflows/_healthcheck_vm.yml @@ -0,0 +1,102 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+name: ~monitor a single VM + +on: + workflow_call: + inputs: + vm: + type: string + description: Name of VM + required: true + n_gpus: + type: string + description: Number of GPUs this VM has + required: true + is_recheck: + type: boolean + description: Whether this is a recheck after reboot + required: false + default: false + secrets: + SLACK_WEBHOOK_ADMIN: + description: Slack webhook admin identifier + required: true + SLACK_RELEASE_ENDPOINT: + description: Slack webhook URL for notifications + required: true + VM_KEY: + description: VM user credentials + required: true + PAT: + description: GitHub Personal Access Token + required: true + +jobs: + check-status-and-maybe-shutdown: + environment: main + runs-on: ${{ inputs.vm }} + outputs: + status: ${{ steps.status.outputs.main }} + reboot_needed: ${{ steps.status.outputs.reboot_needed }} + steps: + - name: Check status + id: status + run: | + echo "🔍 Running health check on VM ${{ inputs.vm }}" + + docker run --rm --runtime=nvidia --gpus ${{ inputs.n_gpus }} ubuntu nvidia-smi + + NUM_GPUS=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l) + + if [[ $NUM_GPUS -ne ${{ inputs.n_gpus }} ]]; then + echo "Issues with GPU detected" + echo "main=degraded" >> "$GITHUB_OUTPUT" + echo "reboot_needed=true" >> "$GITHUB_OUTPUT" + else + echo "✅ VM ${{ inputs.vm }} is healthy - found $NUM_GPUS/${{ inputs.n_gpus }} GPUs" + echo "main=healthy" >> "$GITHUB_OUTPUT" + echo "reboot_needed=false" >> "$GITHUB_OUTPUT" + fi + + - name: Take Action on Issues + if: ${{ (steps.status.outputs.main == 'degraded' || failure()) && inputs.is_recheck != true }} + continue-on-error: true + run: | + if [[ "${{ steps.status.outputs.reboot_needed }}" == "true" ]]; then + echo "Rebooting VM..." 
+ echo ${{ secrets.VM_KEY }} | sudo -S reboot -h now + fi + + - name: Send Slack Alert & Stop Service for Persistent Issues + if: ${{ (steps.status.outputs.main == 'degraded' || failure()) && inputs.is_recheck == true }} + continue-on-error: true + run: | + MESSAGE='{ + "blocks": [ + { + "type": "section", + "text": { + "type": "mrkdwn", + "text": ":alert: VM bot 🤖: Hey : VM `${{ inputs.vm }}` still has issues after reboot - stopping service and needs manual intervention." + } + } + ] + }' + + curl -X POST -H "Content-type: application/json" --data "$MESSAGE" ${{ secrets.SLACK_RELEASE_ENDPOINT }} + + echo "Recheck detected persistent issues - stopping runner service to take VM offline" + cd /home/azureuser/actions-runner + echo ${{ secrets.VM_KEY }} | sudo -S ./svc.sh stop diff --git a/.github/workflows/healthcheck_vms.yml b/.github/workflows/healthcheck_vms.yml new file mode 100644 index 0000000000..0a0b47bef6 --- /dev/null +++ b/.github/workflows/healthcheck_vms.yml @@ -0,0 +1,112 @@ +# Regularly updates the CI container +name: Reboots VMs in a controlled way +on: + push: + +jobs: + pre-flight: + runs-on: ubuntu-latest + outputs: + list-of-vms: ${{ steps.main.outputs.main }} + environment: main + steps: + - name: Get list of VMs + id: main + env: + GITHUB_TOKEN: ${{ secrets.PAT }} + run: | + RUNNERS=$(curl -L \ + -H "Accept: application/vnd.github+json" \ + -H "Authorization: Bearer $GITHUB_TOKEN" \ + -H "X-GitHub-Api-Version: 2022-11-28" \ + ${{ github.api_url }}/repos/${{ github.repository }}/actions/runners) + + MATRIX=$(echo $RUNNERS \ + | jq -c '[ + .runners[] + | select(.status == "online") + | select(.name | contains("cpu") | not) + | { + "vm": .name, + "n_gpus": [ + .labels[] + | select(.name | endswith("gpu")) | .name + ][0][:1] + } + ] + ' + ) + echo main=$MATRIX | tee -a "$GITHUB_OUTPUT" + + healthcheck: + needs: pre-flight + strategy: + fail-fast: false + matrix: + include: ${{ fromJSON(needs.pre-flight.outputs.list-of-vms )}} + uses: 
./.github/workflows/_healthcheck_vm.yml + with: + vm: ${{ matrix.vm }} + n_gpus: ${{ matrix.n_gpus }} + secrets: + SLACK_WEBHOOK_ADMIN: ${{ secrets.SLACK_WEBHOOK_ADMIN }} + SLACK_RELEASE_ENDPOINT: ${{ secrets.SLACK_RELEASE_ENDPOINT }} + VM_KEY: ${{ secrets.VM_KEY }} + PAT: ${{ secrets.PAT }} + + check-offline-runners: + needs: healthcheck + if: ${{ always() }} + runs-on: ubuntu-latest + outputs: + has_offline: ${{ steps.check.outputs.has_offline }} + steps: + - name: Check if any runners are offline + id: check + env: + GITHUB_TOKEN: ${{ secrets.PAT }} + run: | + RUNNERS=$(curl -L \ + -H "Accept: application/vnd.github+json" \ + -H "Authorization: Bearer $GITHUB_TOKEN" \ + -H "X-GitHub-Api-Version: 2022-11-28" \ + ${{ github.api_url }}/repos/${{ github.repository }}/actions/runners) + + OFFLINE_COUNT=$(echo $RUNNERS | jq '[.runners[] | select(.status == "offline")] | length') + + if [[ $OFFLINE_COUNT -gt 0 ]]; then + echo "Found $OFFLINE_COUNT offline runners" + echo "has_offline=true" >> "$GITHUB_OUTPUT" + else + echo "All runners are online" + echo "has_offline=false" >> "$GITHUB_OUTPUT" + fi + + wait-for-reboot: + needs: check-offline-runners + if: ${{ needs.check-offline-runners.outputs.has_offline == 'true' }} + runs-on: ubuntu-latest + steps: + - name: Wait for VMs to come back online + run: | + WAIT_MINUTES=5 + echo "Waiting ${WAIT_MINUTES} minutes for rebooted VMs to come back online..." 
+ sleep $((WAIT_MINUTES * 60)) + + recheck: + needs: wait-for-reboot + if: ${{ always() && needs.wait-for-reboot.result == 'success' }} + strategy: + fail-fast: false + matrix: + include: ${{ fromJSON(needs.pre-flight.outputs.list-of-vms )}} + uses: ./.github/workflows/_healthcheck_vm.yml + with: + vm: ${{ matrix.vm }} + n_gpus: ${{ matrix.n_gpus }} + is_recheck: true + secrets: + SLACK_WEBHOOK_ADMIN: ${{ secrets.SLACK_WEBHOOK_ADMIN }} + SLACK_RELEASE_ENDPOINT: ${{ secrets.SLACK_RELEASE_ENDPOINT }} + VM_KEY: ${{ secrets.VM_KEY }} + PAT: ${{ secrets.PAT }} From 378cbcf1176bc8bf26dcc4898ec2aa303990253a Mon Sep 17 00:00:00 2001 From: Charlie Truong Date: Sun, 31 Aug 2025 18:03:43 -0500 Subject: [PATCH 02/21] Hardcode n_gpus Signed-off-by: Charlie Truong --- .github/workflows/healthcheck_vms.yml | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/.github/workflows/healthcheck_vms.yml b/.github/workflows/healthcheck_vms.yml index 0a0b47bef6..53fc0e42df 100644 --- a/.github/workflows/healthcheck_vms.yml +++ b/.github/workflows/healthcheck_vms.yml @@ -27,11 +27,7 @@ jobs: | select(.status == "online") | select(.name | contains("cpu") | not) | { - "vm": .name, - "n_gpus": [ - .labels[] - | select(.name | endswith("gpu")) | .name - ][0][:1] + "vm": .name } ] ' @@ -47,7 +43,7 @@ jobs: uses: ./.github/workflows/_healthcheck_vm.yml with: vm: ${{ matrix.vm }} - n_gpus: ${{ matrix.n_gpus }} + n_gpus: "2" secrets: SLACK_WEBHOOK_ADMIN: ${{ secrets.SLACK_WEBHOOK_ADMIN }} SLACK_RELEASE_ENDPOINT: ${{ secrets.SLACK_RELEASE_ENDPOINT }} @@ -94,7 +90,7 @@ jobs: sleep $((WAIT_MINUTES * 60)) recheck: - needs: wait-for-reboot + needs: [pre-flight, wait-for-reboot] if: ${{ always() && needs.wait-for-reboot.result == 'success' }} strategy: fail-fast: false From d5d32ea03d0cf1474c7ca95a59c17f45590285ff Mon Sep 17 00:00:00 2001 From: Charlie Truong Date: Sun, 31 Aug 2025 18:04:13 -0500 Subject: [PATCH 03/21] Test runner healthcheck Signed-off-by: Charlie Truong --- 
.github/workflows/healthcheck_vms.yml | 20 +++----------------- 1 file changed, 3 insertions(+), 17 deletions(-) diff --git a/.github/workflows/healthcheck_vms.yml b/.github/workflows/healthcheck_vms.yml index 53fc0e42df..2f05e5666a 100644 --- a/.github/workflows/healthcheck_vms.yml +++ b/.github/workflows/healthcheck_vms.yml @@ -15,23 +15,9 @@ jobs: env: GITHUB_TOKEN: ${{ secrets.PAT }} run: | - RUNNERS=$(curl -L \ - -H "Accept: application/vnd.github+json" \ - -H "Authorization: Bearer $GITHUB_TOKEN" \ - -H "X-GitHub-Api-Version: 2022-11-28" \ - ${{ github.api_url }}/repos/${{ github.repository }}/actions/runners) - - MATRIX=$(echo $RUNNERS \ - | jq -c '[ - .runners[] - | select(.status == "online") - | select(.name | contains("cpu") | not) - | { - "vm": .name - } - ] - ' - ) + # Hardcoded for testing purposes + MATRIX='[{"vm": "azure-gpu-vm-runner2"}]' + echo "Using hardcoded test runner: $MATRIX" echo main=$MATRIX | tee -a "$GITHUB_OUTPUT" healthcheck: From 6336f1415d11ac28076ebe28f14fea1c178cd767 Mon Sep 17 00:00:00 2001 From: Charlie Truong Date: Sun, 31 Aug 2025 18:04:26 -0500 Subject: [PATCH 04/21] Fix n_gpus Signed-off-by: Charlie Truong --- .github/workflows/healthcheck_vms.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/healthcheck_vms.yml b/.github/workflows/healthcheck_vms.yml index 2f05e5666a..580143566e 100644 --- a/.github/workflows/healthcheck_vms.yml +++ b/.github/workflows/healthcheck_vms.yml @@ -85,7 +85,7 @@ jobs: uses: ./.github/workflows/_healthcheck_vm.yml with: vm: ${{ matrix.vm }} - n_gpus: ${{ matrix.n_gpus }} + n_gpus: "2" is_recheck: true secrets: SLACK_WEBHOOK_ADMIN: ${{ secrets.SLACK_WEBHOOK_ADMIN }} From d1a1dabf06b8fcf287821642cace73498edfa97d Mon Sep 17 00:00:00 2001 From: Charlie Truong Date: Sun, 31 Aug 2025 18:40:38 -0500 Subject: [PATCH 05/21] Debug runner check Signed-off-by: Charlie Truong --- .github/workflows/healthcheck_vms.yml | 24 ++++++++++++++++++++++-- 1 file changed, 22 
insertions(+), 2 deletions(-) diff --git a/.github/workflows/healthcheck_vms.yml b/.github/workflows/healthcheck_vms.yml index 580143566e..17bff6ffdd 100644 --- a/.github/workflows/healthcheck_vms.yml +++ b/.github/workflows/healthcheck_vms.yml @@ -48,19 +48,39 @@ jobs: env: GITHUB_TOKEN: ${{ secrets.PAT }} run: | + # Debug: Check if token is available + if [[ -z "$GITHUB_TOKEN" ]]; then + echo "ERROR: GITHUB_TOKEN is empty - PAT secret may not be configured" + exit 1 + fi + + echo "Fetching runners from GitHub API..." RUNNERS=$(curl -L \ -H "Accept: application/vnd.github+json" \ -H "Authorization: Bearer $GITHUB_TOKEN" \ -H "X-GitHub-Api-Version: 2022-11-28" \ ${{ github.api_url }}/repos/${{ github.repository }}/actions/runners) - OFFLINE_COUNT=$(echo $RUNNERS | jq '[.runners[] | select(.status == "offline")] | length') + # Debug: Check API response + if [[ -z "$RUNNERS" || "$RUNNERS" == "null" ]]; then + echo "ERROR: API response is empty or null" + echo "Response: $RUNNERS" + exit 1 + fi + + echo "API response received, checking for offline runners..." + OFFLINE_COUNT=$(echo "$RUNNERS" | jq -r '[.runners[]? 
| select(.status == "offline")] | length') + + if [[ "$OFFLINE_COUNT" == "null" || -z "$OFFLINE_COUNT" ]]; then + echo "WARNING: Could not parse offline count, assuming no offline runners" + OFFLINE_COUNT=0 + fi if [[ $OFFLINE_COUNT -gt 0 ]]; then echo "Found $OFFLINE_COUNT offline runners" echo "has_offline=true" >> "$GITHUB_OUTPUT" else - echo "All runners are online" + echo "All runners are online (checked $OFFLINE_COUNT offline)" echo "has_offline=false" >> "$GITHUB_OUTPUT" fi From 6ecae80191bc58cf1b3468efd77e9e3c8b080a55 Mon Sep 17 00:00:00 2001 From: Charlie Truong Date: Sun, 31 Aug 2025 18:42:57 -0500 Subject: [PATCH 06/21] Ensure the check-offline-runners job runs on main Signed-off-by: Charlie Truong --- .github/workflows/healthcheck_vms.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/healthcheck_vms.yml b/.github/workflows/healthcheck_vms.yml index 17bff6ffdd..259bb4427f 100644 --- a/.github/workflows/healthcheck_vms.yml +++ b/.github/workflows/healthcheck_vms.yml @@ -40,6 +40,7 @@ jobs: needs: healthcheck if: ${{ always() }} runs-on: ubuntu-latest + environment: main outputs: has_offline: ${{ steps.check.outputs.has_offline }} steps: From baea0fe56ccbe366f665dbe0ffbb79ed408ff27a Mon Sep 17 00:00:00 2001 From: Charlie Truong Date: Sun, 31 Aug 2025 18:47:10 -0500 Subject: [PATCH 07/21] Test reboot Signed-off-by: Charlie Truong --- .github/workflows/_healthcheck_vm.yml | 4 ++-- .github/workflows/healthcheck_vms.yml | 22 +--------------------- 2 files changed, 3 insertions(+), 23 deletions(-) diff --git a/.github/workflows/_healthcheck_vm.yml b/.github/workflows/_healthcheck_vm.yml index f4c6a55298..6437beb271 100644 --- a/.github/workflows/_healthcheck_vm.yml +++ b/.github/workflows/_healthcheck_vm.yml @@ -66,8 +66,8 @@ jobs: echo "reboot_needed=true" >> "$GITHUB_OUTPUT" else echo "✅ VM ${{ inputs.vm }} is healthy - found $NUM_GPUS/${{ inputs.n_gpus }} GPUs" - echo "main=healthy" >> "$GITHUB_OUTPUT" - echo "reboot_needed=false" >> 
"$GITHUB_OUTPUT" + echo "main=degraded" >> "$GITHUB_OUTPUT" + echo "reboot_needed=true" >> "$GITHUB_OUTPUT" fi - name: Take Action on Issues diff --git a/.github/workflows/healthcheck_vms.yml b/.github/workflows/healthcheck_vms.yml index 259bb4427f..ba4839cdda 100644 --- a/.github/workflows/healthcheck_vms.yml +++ b/.github/workflows/healthcheck_vms.yml @@ -49,12 +49,6 @@ jobs: env: GITHUB_TOKEN: ${{ secrets.PAT }} run: | - # Debug: Check if token is available - if [[ -z "$GITHUB_TOKEN" ]]; then - echo "ERROR: GITHUB_TOKEN is empty - PAT secret may not be configured" - exit 1 - fi - echo "Fetching runners from GitHub API..." RUNNERS=$(curl -L \ -H "Accept: application/vnd.github+json" \ @@ -62,21 +56,7 @@ jobs: -H "X-GitHub-Api-Version: 2022-11-28" \ ${{ github.api_url }}/repos/${{ github.repository }}/actions/runners) - # Debug: Check API response - if [[ -z "$RUNNERS" || "$RUNNERS" == "null" ]]; then - echo "ERROR: API response is empty or null" - echo "Response: $RUNNERS" - exit 1 - fi - - echo "API response received, checking for offline runners..." OFFLINE_COUNT=$(echo "$RUNNERS" | jq -r '[.runners[]? 
| select(.status == "offline")] | length') - - if [[ "$OFFLINE_COUNT" == "null" || -z "$OFFLINE_COUNT" ]]; then - echo "WARNING: Could not parse offline count, assuming no offline runners" - OFFLINE_COUNT=0 - fi - if [[ $OFFLINE_COUNT -gt 0 ]]; then echo "Found $OFFLINE_COUNT offline runners" echo "has_offline=true" >> "$GITHUB_OUTPUT" @@ -97,7 +77,7 @@ jobs: sleep $((WAIT_MINUTES * 60)) recheck: - needs: [pre-flight, wait-for-reboot] + needs: wait-for-reboot if: ${{ always() && needs.wait-for-reboot.result == 'success' }} strategy: fail-fast: false From fc80d4d50c2e1566da49ea162f145fb9cec04009 Mon Sep 17 00:00:00 2001 From: Charlie Truong Date: Sun, 31 Aug 2025 19:04:59 -0500 Subject: [PATCH 08/21] Debug reboot Signed-off-by: Charlie Truong --- .github/workflows/_healthcheck_vm.yml | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/.github/workflows/_healthcheck_vm.yml b/.github/workflows/_healthcheck_vm.yml index 6437beb271..92812e83d9 100644 --- a/.github/workflows/_healthcheck_vm.yml +++ b/.github/workflows/_healthcheck_vm.yml @@ -76,7 +76,14 @@ jobs: run: | if [[ "${{ steps.status.outputs.reboot_needed }}" == "true" ]]; then echo "Rebooting VM..." - echo ${{ secrets.VM_KEY }} | sudo -S reboot -h now + # Add some debug info + if [[ -z "${{ secrets.VM_KEY }}" ]]; then + echo "ERROR: VM_KEY secret is empty" + exit 1 + fi + echo "Scheduling reboot in 30 seconds to allow workflow to complete..." + echo '${{ secrets.VM_KEY }}' | sudo -S bash -c 'nohup bash -c "sleep 30 && reboot" > /dev/null 2>&1 &' + echo "Reboot scheduled, workflow will continue..." 
fi - name: Send Slack Alert & Stop Service for Persistent Issues @@ -98,5 +105,4 @@ jobs: curl -X POST -H "Content-type: application/json" --data "$MESSAGE" ${{ secrets.SLACK_RELEASE_ENDPOINT }} echo "Recheck detected persistent issues - stopping runner service to take VM offline" - cd /home/azureuser/actions-runner - echo ${{ secrets.VM_KEY }} | sudo -S ./svc.sh stop + echo '${{ secrets.VM_KEY }}' | sudo -S bash -c 'cd /home/azureuser/actions-runner && ./svc.sh stop' From 692d602144d3769b6e264b93e65347762c1ad596 Mon Sep 17 00:00:00 2001 From: Charlie Truong Date: Sun, 31 Aug 2025 19:11:53 -0500 Subject: [PATCH 09/21] Disconnect runner before rebooting Signed-off-by: Charlie Truong --- .github/workflows/_healthcheck_vm.yml | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/.github/workflows/_healthcheck_vm.yml b/.github/workflows/_healthcheck_vm.yml index 92812e83d9..ef9b9ed88f 100644 --- a/.github/workflows/_healthcheck_vm.yml +++ b/.github/workflows/_healthcheck_vm.yml @@ -76,11 +76,8 @@ jobs: run: | if [[ "${{ steps.status.outputs.reboot_needed }}" == "true" ]]; then echo "Rebooting VM..." - # Add some debug info - if [[ -z "${{ secrets.VM_KEY }}" ]]; then - echo "ERROR: VM_KEY secret is empty" - exit 1 - fi + echo "Disconnecting runner from GitHub Actions..." + echo '${{ secrets.VM_KEY }}' | sudo -S bash -c 'cd /home/azureuser/actions-runner && ./svc.sh stop' echo "Scheduling reboot in 30 seconds to allow workflow to complete..." echo '${{ secrets.VM_KEY }}' | sudo -S bash -c 'nohup bash -c "sleep 30 && reboot" > /dev/null 2>&1 &' echo "Reboot scheduled, workflow will continue..." 
From beb44fc8b565827536a5523886fc8a95e7e1908b Mon Sep 17 00:00:00 2001 From: Charlie Truong Date: Sun, 31 Aug 2025 19:18:19 -0500 Subject: [PATCH 10/21] Check waiting on reboot Signed-off-by: Charlie Truong --- .github/workflows/_healthcheck_vm.yml | 4 +--- .github/workflows/healthcheck_vms.yml | 2 ++ 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/_healthcheck_vm.yml b/.github/workflows/_healthcheck_vm.yml index ef9b9ed88f..b8e0115a4f 100644 --- a/.github/workflows/_healthcheck_vm.yml +++ b/.github/workflows/_healthcheck_vm.yml @@ -76,10 +76,8 @@ jobs: run: | if [[ "${{ steps.status.outputs.reboot_needed }}" == "true" ]]; then echo "Rebooting VM..." - echo "Disconnecting runner from GitHub Actions..." - echo '${{ secrets.VM_KEY }}' | sudo -S bash -c 'cd /home/azureuser/actions-runner && ./svc.sh stop' echo "Scheduling reboot in 30 seconds to allow workflow to complete..." - echo '${{ secrets.VM_KEY }}' | sudo -S bash -c 'nohup bash -c "sleep 30 && reboot" > /dev/null 2>&1 &' + echo '${{ secrets.VM_KEY }}' | sudo -S bash -c 'nohup bash -c "sleep 30 && cd /home/azureuser/actions-runner && ./svc.sh stop && reboot" > /dev/null 2>&1 &' echo "Reboot scheduled, workflow will continue..." fi diff --git a/.github/workflows/healthcheck_vms.yml b/.github/workflows/healthcheck_vms.yml index ba4839cdda..2f6117a646 100644 --- a/.github/workflows/healthcheck_vms.yml +++ b/.github/workflows/healthcheck_vms.yml @@ -49,6 +49,8 @@ jobs: env: GITHUB_TOKEN: ${{ secrets.PAT }} run: | + echo "Waiting 30 seconds for runners to reboot if necessary..." + sleep 30 echo "Fetching runners from GitHub API..." 
RUNNERS=$(curl -L \ -H "Accept: application/vnd.github+json" \ From 27c2a6315e46acd7025a6f8597f1f6ad9777ab29 Mon Sep 17 00:00:00 2001 From: Charlie Truong Date: Sun, 31 Aug 2025 19:29:49 -0500 Subject: [PATCH 11/21] Adding sleep between stop and reboot Signed-off-by: Charlie Truong --- .github/workflows/_healthcheck_vm.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/_healthcheck_vm.yml b/.github/workflows/_healthcheck_vm.yml index b8e0115a4f..8745997343 100644 --- a/.github/workflows/_healthcheck_vm.yml +++ b/.github/workflows/_healthcheck_vm.yml @@ -77,7 +77,7 @@ jobs: if [[ "${{ steps.status.outputs.reboot_needed }}" == "true" ]]; then echo "Rebooting VM..." echo "Scheduling reboot in 30 seconds to allow workflow to complete..." - echo '${{ secrets.VM_KEY }}' | sudo -S bash -c 'nohup bash -c "sleep 30 && cd /home/azureuser/actions-runner && ./svc.sh stop && reboot" > /dev/null 2>&1 &' + echo '${{ secrets.VM_KEY }}' | sudo -S bash -c 'nohup bash -c "sleep 30 && cd /home/azureuser/actions-runner && ./svc.sh stop && sleep 30 && reboot" > /dev/null 2>&1 &' echo "Reboot scheduled, workflow will continue..." 
fi @@ -100,4 +100,4 @@ jobs: curl -X POST -H "Content-type: application/json" --data "$MESSAGE" ${{ secrets.SLACK_RELEASE_ENDPOINT }} echo "Recheck detected persistent issues - stopping runner service to take VM offline" - echo '${{ secrets.VM_KEY }}' | sudo -S bash -c 'cd /home/azureuser/actions-runner && ./svc.sh stop' + echo '${{ secrets.VM_KEY }}' | sudo -S bash -c 'sleep 30 && cd /home/azureuser/actions-runner && ./svc.sh stop' From eeba58b4e8ea5594fa1a219747385c29b1fcbad6 Mon Sep 17 00:00:00 2001 From: Charlie Truong Date: Sun, 31 Aug 2025 19:33:22 -0500 Subject: [PATCH 12/21] Wait to see if runners are offline Signed-off-by: Charlie Truong --- .github/workflows/healthcheck_vms.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/healthcheck_vms.yml b/.github/workflows/healthcheck_vms.yml index 2f6117a646..9b3b8d6c76 100644 --- a/.github/workflows/healthcheck_vms.yml +++ b/.github/workflows/healthcheck_vms.yml @@ -49,8 +49,8 @@ jobs: env: GITHUB_TOKEN: ${{ secrets.PAT }} run: | - echo "Waiting 30 seconds for runners to reboot if necessary..." - sleep 30 + echo "Waiting 60 seconds for runners to reboot if necessary..." + sleep 60 echo "Fetching runners from GitHub API..."
RUNNERS=$(curl -L \ -H "Accept: application/vnd.github+json" \ From 27447a5d0120b9a06b4a6506de277c887af5b345 Mon Sep 17 00:00:00 2001 From: Charlie Truong Date: Sun, 31 Aug 2025 20:01:46 -0500 Subject: [PATCH 13/21] Pass preflight to recheck Signed-off-by: Charlie Truong --- .github/workflows/healthcheck_vms.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/healthcheck_vms.yml b/.github/workflows/healthcheck_vms.yml index 9b3b8d6c76..5b887db05a 100644 --- a/.github/workflows/healthcheck_vms.yml +++ b/.github/workflows/healthcheck_vms.yml @@ -79,7 +79,7 @@ jobs: sleep $((WAIT_MINUTES * 60)) recheck: - needs: wait-for-reboot + needs: [preflight, wait-for-reboot] if: ${{ always() && needs.wait-for-reboot.result == 'success' }} strategy: fail-fast: false From 5e8fb4c7e2ded1b24c90a70a0ff93cafe4e14025 Mon Sep 17 00:00:00 2001 From: Charlie Truong Date: Sun, 31 Aug 2025 20:02:52 -0500 Subject: [PATCH 14/21] Fix recheck Signed-off-by: Charlie Truong --- .github/workflows/_healthcheck_vm.yml | 13 +++++++++++++ .github/workflows/healthcheck_vms.yml | 2 +- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/.github/workflows/_healthcheck_vm.yml b/.github/workflows/_healthcheck_vm.yml index 8745997343..c8a9e5f807 100644 --- a/.github/workflows/_healthcheck_vm.yml +++ b/.github/workflows/_healthcheck_vm.yml @@ -70,6 +70,19 @@ jobs: echo "reboot_needed=true" >> "$GITHUB_OUTPUT" fi + - name: Save reboot status to artifact + run: | + mkdir -p /tmp/healthcheck-results + echo "${{ steps.status.outputs.reboot_needed }}" > /tmp/healthcheck-results/${{ inputs.vm }}-reboot-needed.txt + echo "${{ steps.status.outputs.main }}" > /tmp/healthcheck-results/${{ inputs.vm }}-status.txt + + - name: Upload healthcheck results + uses: actions/upload-artifact@v4 + with: + name: healthcheck-${{ inputs.vm }} + path: /tmp/healthcheck-results/ + retention-days: 1 + - name: Take Action on Issues if: ${{ (steps.status.outputs.main == 'degraded' || 
failure()) && inputs.is_recheck != true }} continue-on-error: true diff --git a/.github/workflows/healthcheck_vms.yml b/.github/workflows/healthcheck_vms.yml index 5b887db05a..814aa1176e 100644 --- a/.github/workflows/healthcheck_vms.yml +++ b/.github/workflows/healthcheck_vms.yml @@ -79,7 +79,7 @@ jobs: sleep $((WAIT_MINUTES * 60)) recheck: - needs: [preflight, wait-for-reboot] + needs: [pre-flight, wait-for-reboot] if: ${{ always() && needs.wait-for-reboot.result == 'success' }} strategy: fail-fast: false From 4e9dfc456aa9d92250d1ac75596458aad2e48e2d Mon Sep 17 00:00:00 2001 From: Charlie Truong Date: Sun, 31 Aug 2025 20:08:50 -0500 Subject: [PATCH 15/21] Use artifact to check for reboot Signed-off-by: Charlie Truong --- .github/workflows/_healthcheck_vm.yml | 2 + .github/workflows/healthcheck_vms.yml | 68 +++++++++++++++++---------- 2 files changed, 46 insertions(+), 24 deletions(-) diff --git a/.github/workflows/_healthcheck_vm.yml b/.github/workflows/_healthcheck_vm.yml index c8a9e5f807..02b45c8197 100644 --- a/.github/workflows/_healthcheck_vm.yml +++ b/.github/workflows/_healthcheck_vm.yml @@ -71,12 +71,14 @@ jobs: fi - name: Save reboot status to artifact + if: ${{ always() }} run: | mkdir -p /tmp/healthcheck-results echo "${{ steps.status.outputs.reboot_needed }}" > /tmp/healthcheck-results/${{ inputs.vm }}-reboot-needed.txt echo "${{ steps.status.outputs.main }}" > /tmp/healthcheck-results/${{ inputs.vm }}-status.txt - name: Upload healthcheck results + if: ${{ always() }} uses: actions/upload-artifact@v4 with: name: healthcheck-${{ inputs.vm }} diff --git a/.github/workflows/healthcheck_vms.yml b/.github/workflows/healthcheck_vms.yml index 814aa1176e..b4e356b410 100644 --- a/.github/workflows/healthcheck_vms.yml +++ b/.github/workflows/healthcheck_vms.yml @@ -36,45 +36,65 @@ jobs: VM_KEY: ${{ secrets.VM_KEY }} PAT: ${{ secrets.PAT }} - check-offline-runners: - needs: healthcheck + check-reboots-needed: + needs: [pre-flight, healthcheck] if: ${{ always() 
}} runs-on: ubuntu-latest - environment: main outputs: - has_offline: ${{ steps.check.outputs.has_offline }} + has_reboots: ${{ steps.check-artifacts.outputs.has_reboots }} steps: - - name: Check if any runners are offline - id: check + - name: Download all healthcheck artifacts + uses: actions/download-artifact@v4 + with: + pattern: healthcheck-* + path: ./healthcheck-results/ + merge-multiple: true + + - name: Check if any VMs needed reboots + id: check-artifacts env: - GITHUB_TOKEN: ${{ secrets.PAT }} + VM_LIST: ${{ needs.pre-flight.outputs.list-of-vms }} run: | - echo "Waiting 60 seconds for runners to reboot if necessary..." - sleep 60 - echo "Fetching runners from GitHub API..." - RUNNERS=$(curl -L \ - -H "Accept: application/vnd.github+json" \ - -H "Authorization: Bearer $GITHUB_TOKEN" \ - -H "X-GitHub-Api-Version: 2022-11-28" \ - ${{ github.api_url }}/repos/${{ github.repository }}/actions/runners) + echo "Checking healthcheck artifacts for reboot status..." + HAS_REBOOTS=false + + # Create a list of VMs to check + VM_NAMES=$(echo "$VM_LIST" | jq -r '.[] | .vm') + + # Check each VM's artifact + for VM in $VM_NAMES; do + echo "Checking reboot status for VM: $VM" + + REBOOT_FILE="./healthcheck-results/${VM}-reboot-needed.txt" + if [[ -f "$REBOOT_FILE" ]]; then + REBOOT_NEEDED=$(cat "$REBOOT_FILE") + echo "VM $VM reboot needed: $REBOOT_NEEDED" + + if [[ "$REBOOT_NEEDED" == "true" ]]; then + echo "VM $VM needs/needed a reboot" + HAS_REBOOTS=true + fi + else + echo "WARNING: No artifact found for VM $VM" + fi + done - OFFLINE_COUNT=$(echo "$RUNNERS" | jq -r '[.runners[]? 
| select(.status == "offline")] | length') - if [[ $OFFLINE_COUNT -gt 0 ]]; then - echo "Found $OFFLINE_COUNT offline runners" - echo "has_offline=true" >> "$GITHUB_OUTPUT" + if [[ "$HAS_REBOOTS" == "true" ]]; then + echo "At least one VM was rebooted" + echo "has_reboots=true" >> "$GITHUB_OUTPUT" else - echo "All runners are online (checked $OFFLINE_COUNT offline)" - echo "has_offline=false" >> "$GITHUB_OUTPUT" + echo "No VMs were rebooted" + echo "has_reboots=false" >> "$GITHUB_OUTPUT" fi wait-for-reboot: - needs: check-offline-runners - if: ${{ needs.check-offline-runners.outputs.has_offline == 'true' }} + needs: check-reboots-needed + if: ${{ needs.check-reboots-needed.outputs.has_reboots == 'true' }} runs-on: ubuntu-latest steps: - name: Wait for VMs to come back online run: | - WAIT_MINUTES=5 + WAIT_MINUTES=3 echo "Waiting ${WAIT_MINUTES} minutes for rebooted VMs to come back online..." sleep $((WAIT_MINUTES * 60)) From f039f4b29b9bf3c2ab53d636ebae056ce2dc2a86 Mon Sep 17 00:00:00 2001 From: Charlie Truong Date: Sun, 31 Aug 2025 20:22:00 -0500 Subject: [PATCH 16/21] Fix recheck Signed-off-by: Charlie Truong --- .github/workflows/_healthcheck_vm.yml | 8 ++++---- .github/workflows/healthcheck_vms.yml | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/_healthcheck_vm.yml b/.github/workflows/_healthcheck_vm.yml index 02b45c8197..70b7551db3 100644 --- a/.github/workflows/_healthcheck_vm.yml +++ b/.github/workflows/_healthcheck_vm.yml @@ -33,7 +33,7 @@ on: SLACK_WEBHOOK_ADMIN: description: Slack webhook admin identifier required: true - SLACK_RELEASE_ENDPOINT: + SLACK_GITHUB_CI_WEBHOOK: description: Slack webhook URL for notifications required: true VM_KEY: @@ -71,14 +71,14 @@ jobs: fi - name: Save reboot status to artifact - if: ${{ always() }} + if: ${{ inputs.is_recheck != true }} run: | mkdir -p /tmp/healthcheck-results echo "${{ steps.status.outputs.reboot_needed }}" > /tmp/healthcheck-results/${{ inputs.vm 
}}-reboot-needed.txt echo "${{ steps.status.outputs.main }}" > /tmp/healthcheck-results/${{ inputs.vm }}-status.txt - name: Upload healthcheck results - if: ${{ always() }} + if: ${{ inputs.is_recheck != true }} uses: actions/upload-artifact@v4 with: name: healthcheck-${{ inputs.vm }} @@ -112,7 +112,7 @@ jobs: ] }' - curl -X POST -H "Content-type: application/json" --data "$MESSAGE" ${{ secrets.SLACK_RELEASE_ENDPOINT }} + curl -X POST -H "Content-type: application/json" --data "$MESSAGE" ${{ secrets.SLACK_GITHUB_CI_WEBHOOK }} echo "Recheck detected persistent issues - stopping runner service to take VM offline" echo '${{ secrets.VM_KEY }}' | sudo -S bash -c 'sleep 30 && cd /home/azureuser/actions-runner && ./svc.sh stop' diff --git a/.github/workflows/healthcheck_vms.yml b/.github/workflows/healthcheck_vms.yml index b4e356b410..2f4522e61f 100644 --- a/.github/workflows/healthcheck_vms.yml +++ b/.github/workflows/healthcheck_vms.yml @@ -32,7 +32,7 @@ jobs: n_gpus: "2" secrets: SLACK_WEBHOOK_ADMIN: ${{ secrets.SLACK_WEBHOOK_ADMIN }} - SLACK_RELEASE_ENDPOINT: ${{ secrets.SLACK_RELEASE_ENDPOINT }} + SLACK_GITHUB_CI_WEBHOOK: ${{ secrets.SLACK_GITHUB_CI_WEBHOOK }} VM_KEY: ${{ secrets.VM_KEY }} PAT: ${{ secrets.PAT }} @@ -112,6 +112,6 @@ jobs: is_recheck: true secrets: SLACK_WEBHOOK_ADMIN: ${{ secrets.SLACK_WEBHOOK_ADMIN }} - SLACK_RELEASE_ENDPOINT: ${{ secrets.SLACK_RELEASE_ENDPOINT }} + SLACK_GITHUB_CI_WEBHOOK: ${{ secrets.SLACK_GITHUB_CI_WEBHOOK }} VM_KEY: ${{ secrets.VM_KEY }} PAT: ${{ secrets.PAT }} From 6ba86518fe09a94bb24f97af4ef9bb52daff6f8f Mon Sep 17 00:00:00 2001 From: Charlie Truong Date: Sun, 31 Aug 2025 20:26:18 -0500 Subject: [PATCH 17/21] Revert runner matrix check Signed-off-by: Charlie Truong --- .github/workflows/healthcheck_vms.yml | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/.github/workflows/healthcheck_vms.yml b/.github/workflows/healthcheck_vms.yml index 2f4522e61f..75b99a1dd2 100644 --- 
a/.github/workflows/healthcheck_vms.yml +++ b/.github/workflows/healthcheck_vms.yml @@ -15,9 +15,23 @@ jobs: env: GITHUB_TOKEN: ${{ secrets.PAT }} run: | - # Hardcoded for testing purposes - MATRIX='[{"vm": "azure-gpu-vm-runner2"}]' - echo "Using hardcoded test runner: $MATRIX" + RUNNERS=$(curl -L \ + -H "Accept: application/vnd.github+json" \ + -H "Authorization: Bearer $GITHUB_TOKEN" \ + -H "X-GitHub-Api-Version: 2022-11-28" \ + ${{ github.api_url }}/repos/${{ github.repository }}/actions/runners) + + MATRIX=$(echo $RUNNERS \ + | jq -c '[ + .runners[] + | select(.status == "online") + | select(.name | contains("cpu") | not) + | { + "vm": .name + } + ] + ' + ) echo main=$MATRIX | tee -a "$GITHUB_OUTPUT" healthcheck: From 501ef3f3643c13d7fbf9677c38c423edd9c3b0a6 Mon Sep 17 00:00:00 2001 From: Charlie Truong Date: Sun, 31 Aug 2025 20:30:02 -0500 Subject: [PATCH 18/21] Try again Signed-off-by: Charlie Truong --- .github/workflows/_healthcheck_vm.yml | 2 +- .github/workflows/healthcheck_vms.yml | 20 +++----------------- 2 files changed, 4 insertions(+), 18 deletions(-) diff --git a/.github/workflows/_healthcheck_vm.yml b/.github/workflows/_healthcheck_vm.yml index 70b7551db3..59979362ad 100644 --- a/.github/workflows/_healthcheck_vm.yml +++ b/.github/workflows/_healthcheck_vm.yml @@ -115,4 +115,4 @@ jobs: curl -X POST -H "Content-type: application/json" --data "$MESSAGE" ${{ secrets.SLACK_GITHUB_CI_WEBHOOK }} echo "Recheck detected persistent issues - stopping runner service to take VM offline" - echo '${{ secrets.VM_KEY }}' | sudo -S bash -c 'sleep 30 && cd /home/azureuser/actions-runner && ./svc.sh stop' + echo '${{ secrets.VM_KEY }}' | sudo -S bash -c 'nohup bash -c "sleep 30 && cd /home/azureuser/actions-runner && ./svc.sh stop" > /dev/null 2>&1 &' diff --git a/.github/workflows/healthcheck_vms.yml b/.github/workflows/healthcheck_vms.yml index 75b99a1dd2..2f4522e61f 100644 --- a/.github/workflows/healthcheck_vms.yml +++ b/.github/workflows/healthcheck_vms.yml @@ 
-15,23 +15,9 @@ jobs: env: GITHUB_TOKEN: ${{ secrets.PAT }} run: | - RUNNERS=$(curl -L \ - -H "Accept: application/vnd.github+json" \ - -H "Authorization: Bearer $GITHUB_TOKEN" \ - -H "X-GitHub-Api-Version: 2022-11-28" \ - ${{ github.api_url }}/repos/${{ github.repository }}/actions/runners) - - MATRIX=$(echo $RUNNERS \ - | jq -c '[ - .runners[] - | select(.status == "online") - | select(.name | contains("cpu") | not) - | { - "vm": .name - } - ] - ' - ) + # Hardcoded for testing purposes + MATRIX='[{"vm": "azure-gpu-vm-runner2"}]' + echo "Using hardcoded test runner: $MATRIX" echo main=$MATRIX | tee -a "$GITHUB_OUTPUT" healthcheck: From 35ed28d6c86b13645a79aca652905bf042cf7b7c Mon Sep 17 00:00:00 2001 From: Charlie Truong Date: Sun, 31 Aug 2025 20:31:03 -0500 Subject: [PATCH 19/21] Revert matrix Signed-off-by: Charlie Truong --- .github/workflows/healthcheck_vms.yml | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/.github/workflows/healthcheck_vms.yml b/.github/workflows/healthcheck_vms.yml index 2f4522e61f..75b99a1dd2 100644 --- a/.github/workflows/healthcheck_vms.yml +++ b/.github/workflows/healthcheck_vms.yml @@ -15,9 +15,23 @@ jobs: env: GITHUB_TOKEN: ${{ secrets.PAT }} run: | - # Hardcoded for testing purposes - MATRIX='[{"vm": "azure-gpu-vm-runner2"}]' - echo "Using hardcoded test runner: $MATRIX" + RUNNERS=$(curl -L \ + -H "Accept: application/vnd.github+json" \ + -H "Authorization: Bearer $GITHUB_TOKEN" \ + -H "X-GitHub-Api-Version: 2022-11-28" \ + ${{ github.api_url }}/repos/${{ github.repository }}/actions/runners) + + MATRIX=$(echo $RUNNERS \ + | jq -c '[ + .runners[] + | select(.status == "online") + | select(.name | contains("cpu") | not) + | { + "vm": .name + } + ] + ' + ) echo main=$MATRIX | tee -a "$GITHUB_OUTPUT" healthcheck: From 02aec65a2e03e56f16455de55449549f3765f5c9 Mon Sep 17 00:00:00 2001 From: Charlie Truong Date: Sun, 31 Aug 2025 20:32:15 -0500 Subject: [PATCH 20/21] Reset healthcheck Signed-off-by: 
Charlie Truong --- .github/workflows/_healthcheck_vm.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/_healthcheck_vm.yml b/.github/workflows/_healthcheck_vm.yml index 59979362ad..4c46c5ee90 100644 --- a/.github/workflows/_healthcheck_vm.yml +++ b/.github/workflows/_healthcheck_vm.yml @@ -66,8 +66,8 @@ jobs: echo "reboot_needed=true" >> "$GITHUB_OUTPUT" else echo "✅ VM ${{ inputs.vm }} is healthy - found $NUM_GPUS/${{ inputs.n_gpus }} GPUs" - echo "main=degraded" >> "$GITHUB_OUTPUT" - echo "reboot_needed=true" >> "$GITHUB_OUTPUT" + echo "main=healthy" >> "$GITHUB_OUTPUT" + echo "reboot_needed=false" >> "$GITHUB_OUTPUT" fi - name: Save reboot status to artifact From 74e83d83a461391ae197723a79178858fa651527 Mon Sep 17 00:00:00 2001 From: Charlie Truong Date: Sun, 31 Aug 2025 20:38:31 -0500 Subject: [PATCH 21/21] Update VM health check to run on a schedule Signed-off-by: Charlie Truong --- .github/workflows/healthcheck_vms.yml | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/.github/workflows/healthcheck_vms.yml b/.github/workflows/healthcheck_vms.yml index 75b99a1dd2..40a5bc2d19 100644 --- a/.github/workflows/healthcheck_vms.yml +++ b/.github/workflows/healthcheck_vms.yml @@ -1,7 +1,21 @@ -# Regularly updates the CI container -name: Reboots VMs in a controlled way +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+name: VM Health Check and Reboot on: - push: + schedule: + - cron: '0 7 * * *' + workflow_dispatch: jobs: pre-flight: