From 8757b06b2cc8e5c2ac011d858d19dfdd5c621faf Mon Sep 17 00:00:00 2001 From: Takahiro Itazuri Date: Fri, 21 Feb 2025 12:46:10 +0000 Subject: [PATCH 1/2] refactor: Rename VmError::VmFd to VmError::CreateVm The error variant is used when creating a new VM from KVM FD via KVM_CREATE_VM call. On success, VM FD is returned. The previous error message doesn't tell a lie but is a bit unintuitive. Signed-off-by: Takahiro Itazuri --- src/vmm/src/vstate/vm/aarch64.rs | 2 +- src/vmm/src/vstate/vm/mod.rs | 4 ++-- src/vmm/src/vstate/vm/x86_64.rs | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/vmm/src/vstate/vm/aarch64.rs b/src/vmm/src/vstate/vm/aarch64.rs index a059803ba97..60ebf4d87d1 100644 --- a/src/vmm/src/vstate/vm/aarch64.rs +++ b/src/vmm/src/vstate/vm/aarch64.rs @@ -30,7 +30,7 @@ pub enum ArchVmError { impl ArchVm { /// Create a new `Vm` struct. pub fn new(kvm: &Kvm) -> Result { - let fd = kvm.fd.create_vm().map_err(VmError::VmFd)?; + let fd = kvm.fd.create_vm().map_err(VmError::CreateVm)?; Ok(ArchVm { fd, irqchip_handle: None, diff --git a/src/vmm/src/vstate/vm/mod.rs b/src/vmm/src/vstate/vm/mod.rs index c9c0e7e7c2f..f8345d2ab02 100644 --- a/src/vmm/src/vstate/vm/mod.rs +++ b/src/vmm/src/vstate/vm/mod.rs @@ -30,8 +30,8 @@ use crate::Vcpu; pub enum VmError { /// Cannot set the memory regions: {0} SetUserMemoryRegion(kvm_ioctls::Error), - /// Cannot open the VM file descriptor: {0} - VmFd(kvm_ioctls::Error), + /// Failed to create VM: {0} + CreateVm(kvm_ioctls::Error), /// {0} Arch(#[from] ArchVmError), /// Error during eventfd operations: {0} diff --git a/src/vmm/src/vstate/vm/x86_64.rs b/src/vmm/src/vstate/vm/x86_64.rs index f44e825b6f9..54b1fe41723 100644 --- a/src/vmm/src/vstate/vm/x86_64.rs +++ b/src/vmm/src/vstate/vm/x86_64.rs @@ -53,7 +53,7 @@ pub struct ArchVm { impl ArchVm { /// Create a new `Vm` struct. pub fn new(kvm: &crate::vstate::kvm::Kvm) -> Result { - let fd = kvm.fd.create_vm().map_err(VmError::VmFd)?; + let fd = kvm.fd.create_vm().map_err(VmError::CreateVm)?; let msrs_to_save = kvm.msrs_to_save().map_err(ArchVmError::GetMsrsToSave)?; fd.set_tss_address(u64_to_usize(crate::arch::x86_64::layout::KVM_TSS_ADDRESS)) From 5e0efad639ad262be3f64b03c9fc5ac4c9221162 Mon Sep 17 00:00:00 2001 From: Takahiro Itazuri Date: Fri, 21 Feb 2025 13:36:14 +0000 Subject: [PATCH 2/2] fix: Retry KVM_CREATE_VM on EINTR It is known that KVM_CREATE_VM fails with EINTR on heavily loaded machines with many VMs. It might be a kernel bug but apparently has not been fixed. To mitigate it, QEMU does an infinitely retry on EINTR. Similar, do retries up to 5 times. Signed-off-by: Takahiro Itazuri --- src/vmm/src/vstate/vm/aarch64.rs | 2 +- src/vmm/src/vstate/vm/mod.rs | 37 ++++++++++++++++++++++++++++++++ src/vmm/src/vstate/vm/x86_64.rs | 3 ++- 3 files changed, 40 insertions(+), 2 deletions(-) diff --git a/src/vmm/src/vstate/vm/aarch64.rs b/src/vmm/src/vstate/vm/aarch64.rs index 60ebf4d87d1..6aee3d8e609 100644 --- a/src/vmm/src/vstate/vm/aarch64.rs +++ b/src/vmm/src/vstate/vm/aarch64.rs @@ -30,7 +30,7 @@ pub enum ArchVmError { impl ArchVm { /// Create a new `Vm` struct. pub fn new(kvm: &Kvm) -> Result { - let fd = kvm.fd.create_vm().map_err(VmError::CreateVm)?; + let fd = Self::create_vm(kvm)?; Ok(ArchVm { fd, irqchip_handle: None, diff --git a/src/vmm/src/vstate/vm/mod.rs b/src/vmm/src/vstate/vm/mod.rs index f8345d2ab02..5f4c7bb53c5 100644 --- a/src/vmm/src/vstate/vm/mod.rs +++ b/src/vmm/src/vstate/vm/mod.rs @@ -9,6 +9,7 @@ use kvm_bindings::{kvm_userspace_memory_region, KVM_MEM_LOG_DIRTY_PAGES}; use kvm_ioctls::VmFd; use vmm_sys_util::eventfd::EventFd; +use crate::logger::info; use crate::vstate::memory::{Address, GuestMemory, GuestMemoryMmap, GuestMemoryRegion}; #[cfg(target_arch = "x86_64")] @@ -42,6 +43,42 @@ pub enum VmError { /// Contains Vm functions that are usable across CPU architectures impl Vm { + fn create_vm(kvm: &crate::vstate::kvm::Kvm) -> Result { + // It is known that KVM_CREATE_VM occasionally fails with EINTR on heavily loaded machines + // with many VMs. + // + // The behavior itself that KVM_CREATE_VM can return EINTR is intentional. This is because + // the KVM_CREATE_VM path includes mm_take_all_locks() that is CPU intensive and all CPU + // intensive syscalls should check for pending signals and return EINTR immediately to allow + // userland to remain interactive. + // https://lists.nongnu.org/archive/html/qemu-devel/2014-01/msg01740.html + // + // However, it is empirically confirmed that, even though there is no pending signal, + // KVM_CREATE_VM returns EINTR. + // https://lore.kernel.org/qemu-devel/8735e0s1zw.wl-maz@kernel.org/ + // + // To mitigate it, QEMU does an inifinite retry on EINTR that greatly improves reliabiliy: + // - https://github.com/qemu/qemu/commit/94ccff133820552a859c0fb95e33a539e0b90a75 + // - https://github.com/qemu/qemu/commit/bbde13cd14ad4eec18529ce0bf5876058464e124 + // + // Similarly, we do retries up to 5 times. Although Firecracker clients are also able to + // retry, they have to start Firecracker from scratch. Doing retries in Firecracker makes + // recovery faster and improves reliability. + const MAX_ATTEMPTS: u32 = 5; + for attempt in 1..=MAX_ATTEMPTS { + match kvm.fd.create_vm() { + Ok(fd) => return Ok(fd), + Err(e) if e.errno() == libc::EINTR && attempt < MAX_ATTEMPTS => { + info!("Attempt #{attempt} of KVM_CREATE_VM returned EINTR"); + // Exponential backoff (1us, 2us, 4us, and 8us => 15us in total) + std::thread::sleep(std::time::Duration::from_micros(2u64.pow(attempt - 1))); + } + Err(e) => return Err(VmError::CreateVm(e)), + } + } + unreachable!(); + } + /// Creates the specified number of [`Vcpu`]s. /// /// The returned [`EventFd`] is written to whenever any of the vcpus exit. diff --git a/src/vmm/src/vstate/vm/x86_64.rs b/src/vmm/src/vstate/vm/x86_64.rs index 54b1fe41723..c68e3c3fb1c 100644 --- a/src/vmm/src/vstate/vm/x86_64.rs +++ b/src/vmm/src/vstate/vm/x86_64.rs @@ -53,7 +53,8 @@ pub struct ArchVm { impl ArchVm { /// Create a new `Vm` struct. pub fn new(kvm: &crate::vstate::kvm::Kvm) -> Result { - let fd = kvm.fd.create_vm().map_err(VmError::CreateVm)?; + let fd = Self::create_vm(kvm)?; + let msrs_to_save = kvm.msrs_to_save().map_err(ArchVmError::GetMsrsToSave)?; fd.set_tss_address(u64_to_usize(crate::arch::x86_64::layout::KVM_TSS_ADDRESS))