diff --git a/.github/workflows/_healthcheck_vm.yml b/.github/workflows/_healthcheck_vm.yml
new file mode 100644
index 0000000000..4c46c5ee90
--- /dev/null
+++ b/.github/workflows/_healthcheck_vm.yml
@@ -0,0 +1,163 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+name: ~monitor a single VM
+
+# Reusable workflow that runs on the self-hosted runner named by `vm`, compares
+# the GPUs it can see against `n_gpus`, and then either schedules a reboot
+# (first pass) or alerts Slack and stops the runner service (recheck).
+on:
+  workflow_call:
+    inputs:
+      vm:
+        type: string
+        description: Name of VM
+        required: true
+      n_gpus:
+        type: string
+        description: Number of GPUs this VM has
+        required: true
+      is_recheck:
+        type: boolean
+        description: Whether this is a recheck after reboot
+        required: false
+        default: false
+    secrets:
+      SLACK_WEBHOOK_ADMIN:
+        description: Slack webhook admin identifier
+        required: true
+      SLACK_GITHUB_CI_WEBHOOK:
+        description: Slack webhook URL for notifications
+        required: true
+      VM_KEY:
+        description: VM user credentials
+        required: true
+      PAT:
+        description: GitHub Personal Access Token
+        required: true
+
+jobs:
+  check-status-and-maybe-shutdown:
+    environment: main
+    runs-on: ${{ inputs.vm }}
+    outputs:
+      status: ${{ steps.status.outputs.main }}
+      reboot_needed: ${{ steps.status.outputs.reboot_needed }}
+    steps:
+      - name: Check status
+        id: status
+        env:
+          VM_NAME: ${{ inputs.vm }}
+          EXPECTED_GPUS: ${{ inputs.n_gpus }}
+        run: |
+          echo "🔍 Running health check on VM $VM_NAME"
+
+          STATUS=healthy
+
+          # A failing probe must mark the VM as degraded rather than abort the
+          # step: an aborted step writes no outputs, so no reboot would be
+          # scheduled and no artifact would be uploaded for the caller.
+          if ! docker run --rm --runtime=nvidia --gpus "$EXPECTED_GPUS" ubuntu nvidia-smi; then
+            echo "Containerized nvidia-smi failed"
+            STATUS=degraded
+          fi
+
+          # Count GPUs only when the query itself succeeds, so an error message
+          # printed by nvidia-smi is never mistaken for GPU rows.
+          if GPU_NAMES=$(nvidia-smi --query-gpu=name --format=csv,noheader); then
+            NUM_GPUS=$(grep -c . <<< "$GPU_NAMES" || true)
+          else
+            NUM_GPUS=0
+          fi
+
+          if [[ "$NUM_GPUS" -ne "$EXPECTED_GPUS" ]]; then
+            STATUS=degraded
+          fi
+
+          if [[ "$STATUS" == "degraded" ]]; then
+            echo "Issues with GPU detected"
+            echo "main=degraded" >> "$GITHUB_OUTPUT"
+            echo "reboot_needed=true" >> "$GITHUB_OUTPUT"
+          else
+            echo "✅ VM $VM_NAME is healthy - found $NUM_GPUS/$EXPECTED_GPUS GPUs"
+            echo "main=healthy" >> "$GITHUB_OUTPUT"
+            echo "reboot_needed=false" >> "$GITHUB_OUTPUT"
+          fi
+
+      - name: Save reboot status to artifact
+        if: ${{ inputs.is_recheck != true }}
+        env:
+          VM_NAME: ${{ inputs.vm }}
+          STATUS: ${{ steps.status.outputs.main }}
+          REBOOT_NEEDED: ${{ steps.status.outputs.reboot_needed }}
+        run: |
+          mkdir -p /tmp/healthcheck-results
+          echo "$REBOOT_NEEDED" > "/tmp/healthcheck-results/${VM_NAME}-reboot-needed.txt"
+          echo "$STATUS" > "/tmp/healthcheck-results/${VM_NAME}-status.txt"
+
+      - name: Upload healthcheck results
+        if: ${{ inputs.is_recheck != true }}
+        uses: actions/upload-artifact@v4
+        with:
+          name: healthcheck-${{ inputs.vm }}
+          path: /tmp/healthcheck-results/
+          retention-days: 1
+
+      # First pass only: stop the runner service and reboot, detached and delayed
+      # so this job can finish reporting before the runner goes away.
+      - name: Take Action on Issues
+        if: ${{ (steps.status.outputs.main == 'degraded' || failure()) && inputs.is_recheck != true }}
+        continue-on-error: true
+        env:
+          REBOOT_NEEDED: ${{ steps.status.outputs.reboot_needed }}
+          VM_KEY: ${{ secrets.VM_KEY }}
+        run: |
+          if [[ "$REBOOT_NEEDED" == "true" ]]; then
+            echo "Rebooting VM..."
+            echo "Scheduling reboot in 30 seconds to allow workflow to complete..."
+            printf '%s\n' "$VM_KEY" | sudo -S bash -c 'nohup bash -c "sleep 30 && cd /home/azureuser/actions-runner && ./svc.sh stop && sleep 30 && reboot" > /dev/null 2>&1 &'
+            echo "Reboot scheduled, workflow will continue..."
+          fi
+
+      # Recheck only: alert Slack, then stop the runner service to take the VM offline.
+      # NOTE(review): SLACK_WEBHOOK_ADMIN is never used and the text reads "Hey : VM";
+      # presumably a mention of that admin was intended there - confirm.
+      - name: Send Slack Alert & Stop Service for Persistent Issues
+        if: ${{ (steps.status.outputs.main == 'degraded' || failure()) && inputs.is_recheck == true }}
+        continue-on-error: true
+        env:
+          VM_NAME: ${{ inputs.vm }}
+          SLACK_WEBHOOK_URL: ${{ secrets.SLACK_GITHUB_CI_WEBHOOK }}
+          VM_KEY: ${{ secrets.VM_KEY }}
+        run: |
+          MESSAGE=$(cat <<EOF
+          {
+            "blocks": [
+              {
+                "type": "section",
+                "text": {
+                  "type": "mrkdwn",
+                  "text": ":alert: VM bot 🤖: Hey : VM \`${VM_NAME}\` still has issues after reboot - stopping service and needs manual intervention."
+                }
+              }
+            ]
+          }
+          EOF
+          )
+
+          # A failed notification must not prevent taking the VM offline.
+          curl -fsS -X POST -H "Content-type: application/json" --data "$MESSAGE" "$SLACK_WEBHOOK_URL" \
+            || echo "WARNING: Slack notification failed"
+
+          echo "Recheck detected persistent issues - stopping runner service to take VM offline"
+          printf '%s\n' "$VM_KEY" | sudo -S bash -c 'nohup bash -c "sleep 30 && cd /home/azureuser/actions-runner && ./svc.sh stop" > /dev/null 2>&1 &'
diff --git a/.github/workflows/healthcheck_vms.yml b/.github/workflows/healthcheck_vms.yml
new file mode 100644
index 0000000000..40a5bc2d19
--- /dev/null
+++ b/.github/workflows/healthcheck_vms.yml
@@ -0,0 +1,145 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+name: VM Health Check and Reboot
+on:
+  schedule:
+    - cron: '0 7 * * *'
+  workflow_dispatch:
+
+jobs:
+  pre-flight:
+    runs-on: ubuntu-latest
+    outputs:
+      list-of-vms: ${{ steps.main.outputs.main }}
+    environment: main
+    steps:
+      - name: Get list of VMs
+        id: main
+        env:
+          GITHUB_TOKEN: ${{ secrets.PAT }}
+        run: |
+          # --fail turns an HTTP error into a step failure; per_page=100 raises the
+          # API's default page size of 30 so runners beyond the first page are kept.
+          RUNNERS=$(curl -fsSL \
+            -H "Accept: application/vnd.github+json" \
+            -H "Authorization: Bearer $GITHUB_TOKEN" \
+            -H "X-GitHub-Api-Version: 2022-11-28" \
+            "${{ github.api_url }}/repos/${{ github.repository }}/actions/runners?per_page=100")
+
+          # Matrix entries: every online runner whose name does not contain "cpu".
+          MATRIX=$(jq -c '[
+              .runners[]
+              | select(.status == "online")
+              | select(.name | contains("cpu") | not)
+              | {"vm": .name}
+            ]' <<< "$RUNNERS")
+          echo "main=$MATRIX" | tee -a "$GITHUB_OUTPUT"
+
+  healthcheck:
+    needs: pre-flight
+    # Job-level `if` is evaluated before the matrix, so an empty VM list skips the
+    # job instead of failing on an empty matrix.
+    if: ${{ needs.pre-flight.outputs.list-of-vms != '[]' }}
+    strategy:
+      fail-fast: false
+      matrix:
+        include: ${{ fromJSON(needs.pre-flight.outputs.list-of-vms )}}
+    uses: ./.github/workflows/_healthcheck_vm.yml
+    with:
+      vm: ${{ matrix.vm }}
+      n_gpus: "2"
+    secrets:
+      SLACK_WEBHOOK_ADMIN: ${{ secrets.SLACK_WEBHOOK_ADMIN }}
+      SLACK_GITHUB_CI_WEBHOOK: ${{ secrets.SLACK_GITHUB_CI_WEBHOOK }}
+      VM_KEY: ${{ secrets.VM_KEY }}
+      PAT: ${{ secrets.PAT }}
+
+  check-reboots-needed:
+    needs: [pre-flight, healthcheck]
+    if: ${{ always() }}
+    runs-on: ubuntu-latest
+    outputs:
+      has_reboots: ${{ steps.check-artifacts.outputs.has_reboots }}
+    steps:
+      - name: Download all healthcheck artifacts
+        uses: actions/download-artifact@v4
+        with:
+          pattern: healthcheck-*
+          path: ./healthcheck-results/
+          merge-multiple: true
+
+      - name: Check if any VMs needed reboots
+        id: check-artifacts
+        env:
+          VM_LIST: ${{ needs.pre-flight.outputs.list-of-vms }}
+        run: |
+          echo "Checking healthcheck artifacts for reboot status..."
+          HAS_REBOOTS=false
+
+          # Read one VM name per line so names are never word-split or globbed.
+          VM_NAMES=$(jq -r '.[].vm' <<< "$VM_LIST")
+          while IFS= read -r VM; do
+            [[ -n "$VM" ]] || continue
+            echo "Checking reboot status for VM: $VM"
+
+            REBOOT_FILE="./healthcheck-results/${VM}-reboot-needed.txt"
+            if [[ -f "$REBOOT_FILE" ]]; then
+              REBOOT_NEEDED=$(cat "$REBOOT_FILE")
+              echo "VM $VM reboot needed: $REBOOT_NEEDED"
+
+              if [[ "$REBOOT_NEEDED" == "true" ]]; then
+                echo "VM $VM needs/needed a reboot"
+                HAS_REBOOTS=true
+              fi
+            else
+              echo "WARNING: No artifact found for VM $VM"
+            fi
+          done <<< "$VM_NAMES"
+
+          if [[ "$HAS_REBOOTS" == "true" ]]; then
+            echo "At least one VM was rebooted"
+            echo "has_reboots=true" >> "$GITHUB_OUTPUT"
+          else
+            echo "No VMs were rebooted"
+            echo "has_reboots=false" >> "$GITHUB_OUTPUT"
+          fi
+
+  wait-for-reboot:
+    needs: check-reboots-needed
+    if: ${{ needs.check-reboots-needed.outputs.has_reboots == 'true' }}
+    runs-on: ubuntu-latest
+    steps:
+      - name: Wait for VMs to come back online
+        run: |
+          WAIT_MINUTES=3
+          echo "Waiting ${WAIT_MINUTES} minutes for rebooted VMs to come back online..."
+          sleep $((WAIT_MINUTES * 60))
+
+  recheck:
+    needs: [pre-flight, wait-for-reboot]
+    if: ${{ always() && needs.wait-for-reboot.result == 'success' }}
+    strategy:
+      fail-fast: false
+      matrix:
+        include: ${{ fromJSON(needs.pre-flight.outputs.list-of-vms )}}
+    uses: ./.github/workflows/_healthcheck_vm.yml
+    with:
+      vm: ${{ matrix.vm }}
+      n_gpus: "2"
+      is_recheck: true
+    secrets:
+      SLACK_WEBHOOK_ADMIN: ${{ secrets.SLACK_WEBHOOK_ADMIN }}
+      SLACK_GITHUB_CI_WEBHOOK: ${{ secrets.SLACK_GITHUB_CI_WEBHOOK }}
+      VM_KEY: ${{ secrets.VM_KEY }}
+      PAT: ${{ secrets.PAT }}