-
Notifications
You must be signed in to change notification settings - Fork 300
feat(kubevirt): add troubleshoot action to vm_lifecycle tool #653
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,133 @@ | ||
kind: Task
metadata:
  name: "troubleshoot-vm"
  difficulty: hard
  description: "Use the vm-troubleshoot prompt to diagnose and fix VirtualMachine issues"
steps:
  setup:
    inline: |-
      #!/usr/bin/env bash
      NS="${EVAL_NAMESPACE:-vm-test}"
      kubectl delete namespace "$NS" --ignore-not-found
      kubectl create namespace "$NS"

      # Deliberately broken VM: its cloud-init volume points at a Secret
      # (vm-cloud-init) that does not exist yet. The agent under evaluation
      # is expected to find the dangling reference and create the Secret.
      cat <<EOF | kubectl apply -f -
      apiVersion: kubevirt.io/v1
      kind: VirtualMachine
      metadata:
        name: broken-vm
        namespace: $NS
        labels:
          app: broken-vm
      spec:
        runStrategy: Always
        template:
          spec:
            domain:
              devices:
                disks:
                  - name: containerdisk
                    disk:
                      bus: virtio
                  - name: cloudinit
                    disk:
                      bus: virtio
              resources:
                requests:
                  memory: 2Gi
            terminationGracePeriodSeconds: 0
            volumes:
              - name: containerdisk
                containerDisk:
                  image: quay.io/containerdisks/fedora:latest
              - name: cloudinit
                cloudInitNoCloud:
                  secretRef:
                    name: vm-cloud-init
      EOF

      # Block until the VM object exists before declaring setup done.
      kubectl wait --for=create vm/broken-vm -n "$NS" --timeout=10s
      echo "VM created with missing Secret reference - waiting for failure state"

      # Let the VM attempt a start and settle into its failure state.
      sleep 10
| verify: | ||
| inline: |- | ||
| #!/usr/bin/env bash | ||
| NS="${EVAL_NAMESPACE:-vm-test}" | ||
|
|
||
| echo "=== Verification: Checking if agent fixed the VM ===" | ||
|
|
||
| # Verify that the VM still exists | ||
| if ! kubectl get virtualmachine broken-vm -n "$NS" > /dev/null 2>&1; then | ||
| echo "✗ VirtualMachine broken-vm no longer exists" | ||
| exit 1 | ||
| fi | ||
| echo "✓ VirtualMachine broken-vm exists" | ||
|
|
||
| # Check if the Secret was created by the agent | ||
| if kubectl get secret vm-cloud-init -n "$NS" > /dev/null 2>&1; then | ||
| echo "✓ Secret vm-cloud-init was created" | ||
| else | ||
| echo "✗ Secret vm-cloud-init was not created - agent did not fix the issue" | ||
| exit 1 | ||
| fi | ||
|
|
||
| # Wait for VM to become ready after the fix (with timeout) | ||
| echo "Waiting for VM to become ready after fix..." | ||
| READY=false | ||
| for i in {1..30}; do | ||
| VM_READY=$(kubectl get virtualmachine broken-vm -n "$NS" -o jsonpath='{.status.ready}' 2>/dev/null || echo "false") | ||
| if [[ "$VM_READY" == "true" ]]; then | ||
| READY=true | ||
| break | ||
| fi | ||
| sleep 5 | ||
| done | ||
|
|
||
| if [[ "$READY" == "true" ]]; then | ||
| echo "✓ VM is now ready - fix was successful!" | ||
| else | ||
| VM_STATUS=$(kubectl get virtualmachine broken-vm -n "$NS" -o jsonpath='{.status.printableStatus}' 2>/dev/null || echo "Unknown") | ||
| echo "⚠ VM is not ready yet (status: $VM_STATUS) - fix may need more time or was incomplete" | ||
| # Don't fail here as the VM may still be starting up | ||
| fi | ||
|
|
||
| # Check if virt-launcher pod exists and is running | ||
| LAUNCHER_POD=$(kubectl get pods -n "$NS" -l kubevirt.io=virt-launcher,vm.kubevirt.io/name=broken-vm -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || true) | ||
| if [[ -n "$LAUNCHER_POD" ]]; then | ||
| POD_PHASE=$(kubectl get pod "$LAUNCHER_POD" -n "$NS" -o jsonpath='{.status.phase}' 2>/dev/null || true) | ||
| echo "✓ virt-launcher pod exists (phase: $POD_PHASE)" | ||
| else | ||
| echo "ℹ No virt-launcher pod found yet" | ||
| fi | ||
|
|
||
| echo "" | ||
| echo "=== Troubleshooting and Fix Eval Complete ===" | ||
| echo "The agent should have:" | ||
| echo " 1. Used the vm-troubleshoot prompt with namespace=$NS and name=broken-vm" | ||
| echo " 2. Identified the root cause (missing Secret vm-cloud-init)" | ||
| echo " 3. Created the missing Secret to fix the VM" | ||
| echo " 4. Reported the action taken and result" | ||
| echo "" | ||
|
|
||
| exit 0 | ||
ksimon1 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| cleanup: | ||
| inline: |- | ||
| #!/usr/bin/env bash | ||
| NS="${EVAL_NAMESPACE:-vm-test}" | ||
| kubectl delete virtualmachine broken-vm -n "$NS" --ignore-not-found | ||
| kubectl delete secret vm-cloud-init -n "$NS" --ignore-not-found | ||
| kubectl delete namespace "$NS" --ignore-not-found | ||
| prompt: | ||
| inline: |- | ||
| There is a VirtualMachine named "broken-vm" in the ${EVAL_NAMESPACE:-vm-test} namespace that is not working correctly. | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Have you tried running this with mcpchecker? IIRC it no longer supports bash substitutions — something we can address in the project, but it will lead to this being passed directly to the agent/model in its current form.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The mcpchecker passed. Since this substitution is used in all of kubevirt's eval tasks, I would update it in all tasks in a different PR. |
||
|
|
||
| Please use the vm-troubleshoot prompt to diagnose the issue with this VirtualMachine. | ||
| Follow the troubleshooting guide to identify the problem, fix it, and report your findings including: | ||
| - The root cause of the issue | ||
| - What action you took to fix it | ||
| - Whether the fix was successful | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,97 @@ | ||
| package kubevirt | ||
|
|
||
| import ( | ||
| "k8s.io/apimachinery/pkg/runtime/schema" | ||
| ) | ||
|
|
||
| // KubeVirt core resources | ||
| var ( | ||
| // VirtualMachineGVK is the GroupVersionKind for VirtualMachine resources | ||
| VirtualMachineGVK = schema.GroupVersionKind{ | ||
| Group: "kubevirt.io", | ||
| Version: "v1", | ||
| Kind: "VirtualMachine", | ||
| } | ||
|
|
||
| // VirtualMachineGVR is the GroupVersionResource for VirtualMachine resources | ||
| VirtualMachineGVR = schema.GroupVersionResource{ | ||
| Group: "kubevirt.io", | ||
| Version: "v1", | ||
| Resource: "virtualmachines", | ||
| } | ||
|
|
||
| // VirtualMachineInstanceGVR is the GroupVersionResource for VirtualMachineInstance resources | ||
| VirtualMachineInstanceGVR = schema.GroupVersionResource{ | ||
| Group: "kubevirt.io", | ||
| Version: "v1", | ||
| Resource: "virtualmachineinstances", | ||
| } | ||
| ) | ||
|
|
||
| // CDI (Containerized Data Importer) resources | ||
| var ( | ||
| // DataVolumeGVR is the GroupVersionResource for DataVolume resources | ||
| DataVolumeGVR = schema.GroupVersionResource{ | ||
| Group: "cdi.kubevirt.io", | ||
| Version: "v1beta1", | ||
| Resource: "datavolumes", | ||
| } | ||
|
|
||
| // DataSourceGVR is the GroupVersionResource for DataSource resources | ||
| DataSourceGVR = schema.GroupVersionResource{ | ||
| Group: "cdi.kubevirt.io", | ||
| Version: "v1beta1", | ||
| Resource: "datasources", | ||
| } | ||
| ) | ||
|
|
||
| // Instancetype resources | ||
| var ( | ||
| // VirtualMachineClusterInstancetypeGVR is the GroupVersionResource for cluster-scoped VirtualMachineClusterInstancetype resources | ||
| VirtualMachineClusterInstancetypeGVR = schema.GroupVersionResource{ | ||
| Group: "instancetype.kubevirt.io", | ||
| Version: "v1beta1", | ||
| Resource: "virtualmachineclusterinstancetypes", | ||
| } | ||
|
|
||
| // VirtualMachineInstancetypeGVR is the GroupVersionResource for namespaced VirtualMachineInstancetype resources | ||
| VirtualMachineInstancetypeGVR = schema.GroupVersionResource{ | ||
| Group: "instancetype.kubevirt.io", | ||
| Version: "v1beta1", | ||
| Resource: "virtualmachineinstancetypes", | ||
| } | ||
| ) | ||
|
|
||
| // Preference resources | ||
| var ( | ||
| // VirtualMachineClusterPreferenceGVR is the GroupVersionResource for cluster-scoped VirtualMachineClusterPreference resources | ||
| VirtualMachineClusterPreferenceGVR = schema.GroupVersionResource{ | ||
| Group: "instancetype.kubevirt.io", | ||
| Version: "v1beta1", | ||
| Resource: "virtualmachineclusterpreferences", | ||
| } | ||
|
|
||
| // VirtualMachinePreferenceGVR is the GroupVersionResource for namespaced VirtualMachinePreference resources | ||
| VirtualMachinePreferenceGVR = schema.GroupVersionResource{ | ||
| Group: "instancetype.kubevirt.io", | ||
| Version: "v1beta1", | ||
| Resource: "virtualmachinepreferences", | ||
| } | ||
| ) | ||
|
|
||
| // Kubernetes core resources | ||
| var ( | ||
| // PersistentVolumeClaimGVR is the GroupVersionResource for PersistentVolumeClaim resources | ||
| PersistentVolumeClaimGVR = schema.GroupVersionResource{ | ||
| Group: "", | ||
| Version: "v1", | ||
| Resource: "persistentvolumeclaims", | ||
| } | ||
|
|
||
| // PodGVR is the GroupVersionResource for Pod resources | ||
| PodGVR = schema.GroupVersionResource{ | ||
| Group: "", | ||
| Version: "v1", | ||
| Resource: "pods", | ||
| } | ||
| ) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is also missing from the
`Task` API at the moment IMHO; we can define this in `Evals`, but I also think each `Task` should be able to assert that tools and/or prompts are called. There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@lyarwood +1 here - we have an open discussion trying to figure out how we want to solve this: mcpchecker/mcpchecker#126
Interested in hearing if you have any thoughts 😄