Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
118 changes: 118 additions & 0 deletions .github/workflows/_healthcheck_vm.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Reusable workflow: health-checks one self-hosted VM runner.
# It is invoked via `workflow_call` only; the caller supplies the VM name,
# the expected GPU count and the secrets listed below.
name: '~monitor a single VM'

on:
  workflow_call:
    inputs:
      # Used both as the `runs-on` label and in result/artifact names.
      vm:
        description: Name of VM
        type: string
        required: true
      # Passed as a string; the job compares it numerically against the
      # number of GPUs reported by nvidia-smi.
      n_gpus:
        description: Number of GPUs this VM has
        type: string
        required: true
      # false (default): first pass, a degraded VM gets rebooted.
      # true: second pass, a degraded VM triggers a Slack alert and the
      # runner service is stopped.
      is_recheck:
        description: Whether this is a recheck after reboot
        type: boolean
        required: false
        default: false
    secrets:
      # Interpolated into the Slack `<!subteam^...>` mention.
      SLACK_WEBHOOK_ADMIN:
        description: Slack webhook admin identifier
        required: true
      SLACK_GITHUB_CI_WEBHOOK:
        description: Slack webhook URL for notifications
        required: true
      # Piped to `sudo -S` on the VM.
      VM_KEY:
        description: VM user credentials
        required: true
      PAT:
        description: GitHub Personal Access Token
        required: true

jobs:
  # Runs on the VM under test. It checks GPU visibility, records the verdict
  # as step outputs and (on the first pass) as an artifact, then either
  # schedules a reboot (first pass) or alerts Slack and stops the runner
  # service (recheck pass).
  check-status-and-maybe-shutdown:
    environment: main
    runs-on: ${{ inputs.vm }}
    outputs:
      status: ${{ steps.status.outputs.main }}
      reboot_needed: ${{ steps.status.outputs.reboot_needed }}
    steps:
      - name: Check status
        id: status
        # Inputs are handed over as environment variables instead of being
        # pasted into the script text, so their content is never parsed as
        # shell code.
        env:
          VM: ${{ inputs.vm }}
          EXPECTED_GPUS: ${{ inputs.n_gpus }}
        run: |
          echo "🔍 Running health check on VM $VM"

          # A failing container run is itself a symptom of a broken GPU stack.
          # Record it instead of letting `bash -e` abort the step: an aborted
          # step writes no outputs, which would skip the artifact upload and
          # leave the reboot logic below with nothing to act on.
          CONTAINER_OK=true
          if ! docker run --rm --runtime=nvidia --gpus "$EXPECTED_GPUS" ubuntu nvidia-smi; then
            echo "nvidia-smi failed inside the container"
            CONTAINER_OK=false
          fi

          # One output line per GPU visible on the host.
          NUM_GPUS=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)

          if [[ "$CONTAINER_OK" != "true" || "$NUM_GPUS" -ne "$EXPECTED_GPUS" ]]; then
            echo "Issues with GPU detected"
            echo "main=degraded" >> "$GITHUB_OUTPUT"
            echo "reboot_needed=true" >> "$GITHUB_OUTPUT"
          else
            echo "✅ VM $VM is healthy - found $NUM_GPUS/$EXPECTED_GPUS GPUs"
            echo "main=healthy" >> "$GITHUB_OUTPUT"
            echo "reboot_needed=false" >> "$GITHUB_OUTPUT"
          fi

      # First pass only: persist the verdict as files that are uploaded as an
      # artifact named after the VM in the next step.
      - name: Save reboot status to artifact
        if: ${{ inputs.is_recheck != true }}
        env:
          VM: ${{ inputs.vm }}
          STATUS: ${{ steps.status.outputs.main }}
          REBOOT_NEEDED: ${{ steps.status.outputs.reboot_needed }}
        run: |
          mkdir -p /tmp/healthcheck-results
          echo "$REBOOT_NEEDED" > "/tmp/healthcheck-results/${VM}-reboot-needed.txt"
          echo "$STATUS" > "/tmp/healthcheck-results/${VM}-status.txt"

      - name: Upload healthcheck results
        if: ${{ inputs.is_recheck != true }}
        uses: actions/upload-artifact@v4
        with:
          name: healthcheck-${{ inputs.vm }}
          path: /tmp/healthcheck-results/
          retention-days: 1

      # First pass only: reboot a degraded VM. The reboot is detached and
      # delayed so this job can finish before the runner service goes down.
      - name: Take Action on Issues
        if: ${{ (steps.status.outputs.main == 'degraded' || failure()) && inputs.is_recheck != true }}
        continue-on-error: true
        env:
          VM_KEY: ${{ secrets.VM_KEY }}
          REBOOT_NEEDED: ${{ steps.status.outputs.reboot_needed }}
        run: |
          if [[ "$REBOOT_NEEDED" == "true" ]]; then
            echo "Rebooting VM..."
            echo "Scheduling reboot in 30 seconds to allow workflow to complete..."
            # The credential is read from the environment and fed to sudo on
            # stdin; it is never embedded in the script text, so quotes in the
            # secret cannot break the command.
            printf '%s\n' "$VM_KEY" | sudo -S bash -c 'nohup bash -c "sleep 30 && cd /home/azureuser/actions-runner && ./svc.sh stop && sleep 30 && reboot" > /dev/null 2>&1 &'
            echo "Reboot scheduled, workflow will continue..."
          fi

      # Recheck pass only: the VM is still unhealthy after the reboot, so
      # notify Slack and stop the runner service.
      - name: Send Slack Alert & Stop Service for Persistent Issues
        if: ${{ (steps.status.outputs.main == 'degraded' || failure()) && inputs.is_recheck == true }}
        continue-on-error: true
        env:
          VM: ${{ inputs.vm }}
          VM_KEY: ${{ secrets.VM_KEY }}
          SLACK_WEBHOOK_ADMIN: ${{ secrets.SLACK_WEBHOOK_ADMIN }}
          SLACK_GITHUB_CI_WEBHOOK: ${{ secrets.SLACK_GITHUB_CI_WEBHOOK }}
        run: |
          # Single-quoted printf format: the backticks around the VM name stay
          # literal and the two %s placeholders are filled from the environment.
          MESSAGE=$(printf '{
            "blocks": [
              {
                "type": "section",
                "text": {
                  "type": "mrkdwn",
                  "text": ":alert: VM bot 🤖: Hey <!subteam^%s>: VM `%s` still has issues after reboot - stopping service and needs manual intervention."
                }
              }
            ]
          }' "$SLACK_WEBHOOK_ADMIN" "$VM")

          # A failed notification must not prevent taking the VM offline, so
          # the error is reported and the script carries on.
          curl -X POST -H "Content-type: application/json" --data "$MESSAGE" "$SLACK_GITHUB_CI_WEBHOOK" \
            || echo "WARNING: failed to send Slack alert"

          echo "Recheck detected persistent issues - stopping runner service to take VM offline"
          printf '%s\n' "$VM_KEY" | sudo -S bash -c 'nohup bash -c "sleep 30 && cd /home/azureuser/actions-runner && ./svc.sh stop" > /dev/null 2>&1 &'
145 changes: 145 additions & 0 deletions .github/workflows/healthcheck_vms.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Orchestrator workflow: discovers the online runners, health-checks each
# one through the reusable `_healthcheck_vm.yml` workflow, and rechecks
# after a reboot was needed.
name: VM Health Check and Reboot

on:
  # Scheduled run once a day at 07:00.
  schedule:
    - cron: '0 7 * * *'
  # Also allow manual runs.
  workflow_dispatch:

jobs:
  # Builds the matrix of VMs to check: every online self-hosted runner of
  # this repository whose name does not contain "cpu".
  pre-flight:
    runs-on: ubuntu-latest
    outputs:
      list-of-vms: ${{ steps.main.outputs.main }}
    environment: main
    steps:
      - name: Get list of VMs
        id: main
        env:
          GITHUB_TOKEN: ${{ secrets.PAT }}
        run: |
          # --fail turns an HTTP error into a non-zero exit so a bad token
          # stops the step here. per_page=100 (the API maximum) avoids being
          # silently truncated at the default page size of 30; more than 100
          # runners would still need pagination.
          RUNNERS=$(curl -sSL --fail \
            -H "Accept: application/vnd.github+json" \
            -H "Authorization: Bearer $GITHUB_TOKEN" \
            -H "X-GitHub-Api-Version: 2022-11-28" \
            "$GITHUB_API_URL/repos/$GITHUB_REPOSITORY/actions/runners?per_page=100")

          # Quoted so the JSON is passed through verbatim (no word splitting
          # or glob expansion).
          MATRIX=$(echo "$RUNNERS" \
            | jq -c '[
                .runners[]
                | select(.status == "online")
                | select(.name | contains("cpu") | not)
                | {
                    "vm": .name
                  }
              ]
            '
          )
          echo "main=$MATRIX" | tee -a "$GITHUB_OUTPUT"

  # First pass: one reusable-workflow call per VM.
  healthcheck:
    needs: pre-flight
    # An empty `include` list is a matrix error; skip the job instead when
    # no matching runner is online.
    if: ${{ needs.pre-flight.outputs.list-of-vms != '[]' }}
    strategy:
      fail-fast: false
      matrix:
        include: ${{ fromJSON(needs.pre-flight.outputs.list-of-vms )}}
    uses: ./.github/workflows/_healthcheck_vm.yml
    with:
      vm: ${{ matrix.vm }}
      n_gpus: "2"
    secrets:
      SLACK_WEBHOOK_ADMIN: ${{ secrets.SLACK_WEBHOOK_ADMIN }}
      SLACK_GITHUB_CI_WEBHOOK: ${{ secrets.SLACK_GITHUB_CI_WEBHOOK }}
      VM_KEY: ${{ secrets.VM_KEY }}
      PAT: ${{ secrets.PAT }}

  # Reads the per-VM artifacts written by the first pass and reports whether
  # any VM flagged `reboot_needed=true`. Runs even if some healthchecks failed.
  check-reboots-needed:
    needs: [pre-flight, healthcheck]
    if: ${{ always() }}
    runs-on: ubuntu-latest
    outputs:
      has_reboots: ${{ steps.check-artifacts.outputs.has_reboots }}
    steps:
      - name: Download all healthcheck artifacts
        uses: actions/download-artifact@v4
        with:
          pattern: healthcheck-*
          path: ./healthcheck-results/
          merge-multiple: true

      - name: Check if any VMs needed reboots
        id: check-artifacts
        env:
          VM_LIST: ${{ needs.pre-flight.outputs.list-of-vms }}
        run: |
          echo "Checking healthcheck artifacts for reboot status..."
          HAS_REBOOTS=false

          # One VM name per line
          VM_NAMES=$(echo "$VM_LIST" | jq -r '.[] | .vm')

          # Check each VM's artifact; read line-wise so names are never
          # word-split or glob-expanded.
          while IFS= read -r VM; do
            # An empty list yields a single empty line - nothing to check.
            [[ -n "$VM" ]] || continue

            echo "Checking reboot status for VM: $VM"

            REBOOT_FILE="./healthcheck-results/${VM}-reboot-needed.txt"
            if [[ -f "$REBOOT_FILE" ]]; then
              REBOOT_NEEDED=$(cat "$REBOOT_FILE")
              echo "VM $VM reboot needed: $REBOOT_NEEDED"

              if [[ "$REBOOT_NEEDED" == "true" ]]; then
                echo "VM $VM needs/needed a reboot"
                HAS_REBOOTS=true
              fi
            else
              echo "WARNING: No artifact found for VM $VM"
            fi
          done <<< "$VM_NAMES"

          if [[ "$HAS_REBOOTS" == "true" ]]; then
            echo "At least one VM was rebooted"
            echo "has_reboots=true" >> "$GITHUB_OUTPUT"
          else
            echo "No VMs were rebooted"
            echo "has_reboots=false" >> "$GITHUB_OUTPUT"
          fi

  # Fixed grace period before the second pass; only runs when at least one
  # VM reported that it needed a reboot.
  wait-for-reboot:
    needs: check-reboots-needed
    if: ${{ needs.check-reboots-needed.outputs.has_reboots == 'true' }}
    runs-on: ubuntu-latest
    steps:
      - name: Wait for VMs to come back online
        run: |
          WAIT_MINUTES=3
          echo "Waiting ${WAIT_MINUTES} minutes for rebooted VMs to come back online..."
          sleep $((WAIT_MINUTES * 60))

  # Second pass with `is_recheck: true`. It runs on every VM from the
  # pre-flight list, not only on the ones that were rebooted.
  recheck:
    needs: [pre-flight, wait-for-reboot]
    if: ${{ always() && needs.wait-for-reboot.result == 'success' }}
    strategy:
      fail-fast: false
      matrix:
        include: ${{ fromJSON(needs.pre-flight.outputs.list-of-vms )}}
    uses: ./.github/workflows/_healthcheck_vm.yml
    with:
      vm: ${{ matrix.vm }}
      n_gpus: "2"
      is_recheck: true
    secrets:
      SLACK_WEBHOOK_ADMIN: ${{ secrets.SLACK_WEBHOOK_ADMIN }}
      SLACK_GITHUB_CI_WEBHOOK: ${{ secrets.SLACK_GITHUB_CI_WEBHOOK }}
      VM_KEY: ${{ secrets.VM_KEY }}
      PAT: ${{ secrets.PAT }}