From f309907e95354d62d3d7cb7dedf519eadbc71fd7 Mon Sep 17 00:00:00 2001 From: Olivier Le Thanh Duong Date: Thu, 12 Sep 2024 16:31:39 +0200 Subject: [PATCH 1/3] Problem: QEMUVM killed before shutdown command When running the controller as a systemd service (the normal usecase in prod) and stopping the service the QEMU process was always killed BEFORE the shutdown command could be sent so the VM could not properly clean up As an additional symptom this error appeared and confused dev and user ``` Sep 05 12:24:03 testing-hetzner python3[2468548]: 2024-09-05 12:24:03,187 | ERROR | Task exception was never retrieved Sep 05 12:24:03 testing-hetzner python3[2468548]: future: exception=QMPCapabilitiesError()> Sep 05 12:24:03 testing-hetzner python3[2468548]: Traceback (most recent call last): Sep 05 12:24:03 testing-hetzner python3[2468548]: File "/opt/aleph-vm/aleph/vm/hypervisors/qemu/qemuvm.py", line 151, in stop Sep 05 12:24:03 testing-hetzner python3[2468548]: self.send_shutdown_message() Sep 05 12:24:03 testing-hetzner python3[2468548]: File "/opt/aleph-vm/aleph/vm/hypervisors/qemu/qemuvm.py", line 141, in send_shutdown_message Sep 05 12:24:03 testing-hetzner python3[2468548]: client = self._get_qmpclient() Sep 05 12:24:03 testing-hetzner python3[2468548]: ^^^^^^^^^^^^^^^^^^^^^ Sep 05 12:24:03 testing-hetzner python3[2468548]: File "/opt/aleph-vm/aleph/vm/hypervisors/qemu/qemuvm.py", line 136, in _get_qmpclient Sep 05 12:24:03 testing-hetzner python3[2468548]: client.connect() Sep 05 12:24:03 testing-hetzner python3[2468548]: File "/opt/aleph-vm/qmp.py", line 162, in connect Sep 05 12:24:03 testing-hetzner python3[2468548]: return self.__negotiate_capabilities() Sep 05 12:24:03 testing-hetzner python3[2468548]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Sep 05 12:24:03 testing-hetzner python3[2468548]: File "/opt/aleph-vm/qmp.py", line 88, in __negotiate_capabilities Sep 05 12:24:03 testing-hetzner python3[2468548]: raise QMPCapabilitiesError Sep 05 12:24:03 testing-hetzner python3[2468548]: qmp.QMPCapabilitiesError Sep 05 12:24:03 testing-hetzner python3[2468548]: 2024-09-05 12:24:03,285 | WARNING | Process terminated with 0 ``` Solution: Use mixed kill mode in Systemd, which will at first only send the term signal to the main process, and give the VM time to properly cleanup and shutdown. Note that some time the "shutdown" error is not acted upon so stoping the process is still necessary. It seems to happend when the boot is not completed yet. So a fallback kill is done after a timeout. --- .../etc/systemd/system/aleph-vm-controller@.service | 6 ++++++ src/aleph/vm/controllers/__main__.py | 1 + 2 files changed, 7 insertions(+) diff --git a/packaging/aleph-vm/etc/systemd/system/aleph-vm-controller@.service b/packaging/aleph-vm/etc/systemd/system/aleph-vm-controller@.service index c817aad1e..a6e74b592 100644 --- a/packaging/aleph-vm/etc/systemd/system/aleph-vm-controller@.service +++ b/packaging/aleph-vm/etc/systemd/system/aleph-vm-controller@.service @@ -11,6 +11,12 @@ WorkingDirectory=/opt/aleph-vm Environment=PYTHONPATH=/opt/aleph-vm/:$PYTHONPATH ExecStart=/usr/bin/python3 -m aleph.vm.controllers --config=/var/lib/aleph/vm/%i-controller.json Restart=on-failure +# Kill Mixed mode is used there so at first only the python controller process receive the SIGTERM signal +# That process catch it and send a qemu command to shut down the Guest VM (for QEMU VM), giving it a +# change to clean up properly and prevent disk corruption. +# After 30s (TimeoutStopSec) if the process is not stopped, the controller and subproccess will be SIGKILL +KillMode=mixed +TimeoutStopSec=30 [Install] WantedBy=multi-user.target diff --git a/src/aleph/vm/controllers/__main__.py b/src/aleph/vm/controllers/__main__.py index 19701c4bf..db5dc75ab 100644 --- a/src/aleph/vm/controllers/__main__.py +++ b/src/aleph/vm/controllers/__main__.py @@ -90,6 +90,7 @@ async def handle_persistent_vm(config: Configuration, execution: MicroVM | QemuV def callback(): """Callback for the signal handler to stop the VM and cleanup properly on SIGTERM.""" + print("Received SIGTERM") loop.create_task(execution.stop()) loop.add_signal_handler(signal.SIGTERM, callback) From 3d35e4bf6acd3dd8309b34f4659158c04be8c542 Mon Sep 17 00:00:00 2001 From: Hugo Herter Date: Fri, 13 Sep 2024 14:32:29 +0200 Subject: [PATCH 2/3] Update packaging/aleph-vm/etc/systemd/system/aleph-vm-controller@.service --- .../etc/systemd/system/aleph-vm-controller@.service | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/packaging/aleph-vm/etc/systemd/system/aleph-vm-controller@.service b/packaging/aleph-vm/etc/systemd/system/aleph-vm-controller@.service index a6e74b592..7bbfc67d8 100644 --- a/packaging/aleph-vm/etc/systemd/system/aleph-vm-controller@.service +++ b/packaging/aleph-vm/etc/systemd/system/aleph-vm-controller@.service @@ -11,10 +11,10 @@ WorkingDirectory=/opt/aleph-vm Environment=PYTHONPATH=/opt/aleph-vm/:$PYTHONPATH ExecStart=/usr/bin/python3 -m aleph.vm.controllers --config=/var/lib/aleph/vm/%i-controller.json Restart=on-failure -# Kill Mixed mode is used there so at first only the python controller process receive the SIGTERM signal -# That process catch it and send a qemu command to shut down the Guest VM (for QEMU VM), giving it a -# change to clean up properly and prevent disk corruption. -# After 30s (TimeoutStopSec) if the process is not stopped, the controller and subproccess will be SIGKILL +# KillMode=Mixed is used so initially only the Python controller process receives the SIGTERM signal. +# The controller catches it and sends a QEMU command to shut down the Guest VM, allowing it to clean up +# properly and avoid disk corruption. +# After 30s (TimeoutStopSec), if the process is still running, both the controller and subprocesses receive SIGKILL. KillMode=mixed TimeoutStopSec=30 From 4c171d07f7a166852c78335f47f5eecd63d438e7 Mon Sep 17 00:00:00 2001 From: Olivier Le Thanh Duong Date: Fri, 13 Sep 2024 16:24:46 +0200 Subject: [PATCH 3/3] Update src/aleph/vm/controllers/__main__.py Co-authored-by: Hugo Herter --- src/aleph/vm/controllers/__main__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/aleph/vm/controllers/__main__.py b/src/aleph/vm/controllers/__main__.py index db5dc75ab..519270b48 100644 --- a/src/aleph/vm/controllers/__main__.py +++ b/src/aleph/vm/controllers/__main__.py @@ -90,7 +90,7 @@ async def handle_persistent_vm(config: Configuration, execution: MicroVM | QemuV def callback(): """Callback for the signal handler to stop the VM and cleanup properly on SIGTERM.""" - print("Received SIGTERM") + logger.debug("Received SIGTERM") loop.create_task(execution.stop()) loop.add_signal_handler(signal.SIGTERM, callback)