diff --git a/CHANGELOG.md b/CHANGELOG.md index 7781f2850ff..63bfa406af2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -90,6 +90,14 @@ and this project adheres to This is to guarantee that the vCPU will continue receiving TSC interrupts after restoring from the snapshot even if an interrupt is lost when taking a snapshot. +- [#4666](https://github.com/firecracker-microvm/firecracker/pull/4666): Fixed + Firecracker sometimes restoring `MSR_IA32_TSC_DEADLINE` before `MSR_IA32_TSC`. + Now it always restores `MSR_IA32_TSC_DEADLINE` MSR after `MSR_IA32_TSC`, as + KVM relies on the guest TSC for correct restoration of + `MSR_IA32_TSC_DEADLINE`. This fixed guests using the `TSC_DEADLINE` hardware + feature receiving incorrect timer interrupts after snapshot restoration, which + could lead to them seemingly getting stuck in sleep-related syscalls (see also + https://github.com/firecracker-microvm/firecracker/pull/4099). ## \[1.7.0\] diff --git a/src/cpu-template-helper/src/template/dump/x86_64.rs b/src/cpu-template-helper/src/template/dump/x86_64.rs index 61be42d6aa3..5299c477788 100644 --- a/src/cpu-template-helper/src/template/dump/x86_64.rs +++ b/src/cpu-template-helper/src/template/dump/x86_64.rs @@ -1,7 +1,7 @@ // Copyright 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. // SPDX-License-Identifier: Apache-2.0 -use std::collections::HashMap; +use std::collections::BTreeMap; use vmm::arch::x86_64::msr::MsrRange; use vmm::arch_gen::x86::msr_index::*; @@ -44,7 +44,7 @@ fn cpuid_to_modifiers(cpuid: &Cpuid) -> Vec { .collect() } -fn msrs_to_modifier(msrs: &HashMap) -> Vec { +fn msrs_to_modifier(msrs: &BTreeMap) -> Vec { let mut msrs: Vec = msrs .iter() .map(|(index, value)| msr_modifier!(*index, *value)) @@ -189,8 +189,8 @@ mod tests { ] } - fn build_sample_msrs() -> HashMap { - let mut map = HashMap::from([ + fn build_sample_msrs() -> BTreeMap { + let mut map = BTreeMap::from([ // should be sorted in the result. 
(0x1, 0xffff_ffff_ffff_ffff), (0x5, 0xffff_ffff_0000_0000), diff --git a/src/vmm/src/cpu_config/x86_64/cpuid/common.rs b/src/vmm/src/cpu_config/x86_64/cpuid/common.rs index 707a19dc539..9d6465af1fa 100644 --- a/src/vmm/src/cpu_config/x86_64/cpuid/common.rs +++ b/src/vmm/src/cpu_config/x86_64/cpuid/common.rs @@ -53,7 +53,7 @@ pub fn get_vendor_id_from_host() -> Result<[u8; 12], GetCpuidError> { } /// Returns MSRs to be saved based on CPUID features that are enabled. -pub(crate) fn msrs_to_save_by_cpuid(cpuid: &kvm_bindings::CpuId) -> std::collections::HashSet { +pub(crate) fn msrs_to_save_by_cpuid(cpuid: &kvm_bindings::CpuId) -> Vec { /// Memory Protection Extensions const MPX_BITINDEX: u32 = 14; @@ -81,7 +81,7 @@ pub(crate) fn msrs_to_save_by_cpuid(cpuid: &kvm_bindings::CpuId) -> std::collect }}; } - let mut msrs = std::collections::HashSet::new(); + let mut msrs = Vec::new(); // Macro used for easy definition of CPUID-MSR dependencies. macro_rules! cpuid_msr_dep { diff --git a/src/vmm/src/cpu_config/x86_64/mod.rs b/src/vmm/src/cpu_config/x86_64/mod.rs index 13824880bbb..3a26c621194 100644 --- a/src/vmm/src/cpu_config/x86_64/mod.rs +++ b/src/vmm/src/cpu_config/x86_64/mod.rs @@ -10,7 +10,7 @@ pub mod static_cpu_templates; /// Module with test utils for custom CPU templates pub mod test_utils; -use std::collections::HashMap; +use std::collections::BTreeMap; use self::custom_cpu_template::CpuidRegister; use super::templates::CustomCpuTemplate; @@ -37,7 +37,7 @@ pub struct CpuConfiguration { /// Register values as a key pair for model specific registers /// Key: MSR address /// Value: MSR value - pub msrs: HashMap, + pub msrs: BTreeMap, } impl CpuConfiguration { @@ -187,14 +187,14 @@ mod tests { fn supported_cpu_config() -> CpuConfiguration { CpuConfiguration { cpuid: build_supported_cpuid(), - msrs: HashMap::from([(0x8000, 0b1000), (0x9999, 0b1010)]), + msrs: BTreeMap::from([(0x8000, 0b1000), (0x9999, 0b1010)]), } } fn unsupported_cpu_config() -> CpuConfiguration { 
CpuConfiguration { cpuid: build_supported_cpuid(), - msrs: HashMap::from([(0x8000, 0b1000), (0x8001, 0b1010)]), + msrs: BTreeMap::from([(0x8000, 0b1000), (0x8001, 0b1010)]), } } diff --git a/src/vmm/src/vstate/vcpu/mod.rs b/src/vmm/src/vstate/vcpu/mod.rs index b1b1e2c581e..ccaeaa54407 100644 --- a/src/vmm/src/vstate/vcpu/mod.rs +++ b/src/vmm/src/vstate/vcpu/mod.rs @@ -694,6 +694,8 @@ pub enum VcpuEmulation { pub mod tests { #![allow(clippy::undocumented_unsafe_blocks)] + #[cfg(target_arch = "x86_64")] + use std::collections::BTreeMap; use std::sync::{Arc, Barrier, Mutex}; use linux_loader::loader::KernelLoader; @@ -919,7 +921,7 @@ pub mod tests { smt: false, cpu_config: CpuConfiguration { cpuid: Cpuid::try_from(_vm.supported_cpuid().clone()).unwrap(), - msrs: std::collections::HashMap::new(), + msrs: BTreeMap::new(), }, }, ) diff --git a/src/vmm/src/vstate/vcpu/x86_64.rs b/src/vmm/src/vstate/vcpu/x86_64.rs index 9f4f55e57fb..d91bfd1cea2 100644 --- a/src/vmm/src/vstate/vcpu/x86_64.rs +++ b/src/vmm/src/vstate/vcpu/x86_64.rs @@ -5,7 +5,7 @@ // Use of this source code is governed by a BSD-style license that can be // found in the THIRD-PARTY file. 
-use std::collections::{HashMap, HashSet}; +use std::collections::BTreeMap; use std::fmt::Debug; use kvm_bindings::{ @@ -15,6 +15,7 @@ use kvm_bindings::{ use kvm_ioctls::{VcpuExit, VcpuFd}; use log::{error, warn}; use serde::{Deserialize, Serialize}; +use utils::fam; use crate::arch::x86_64::interrupts; use crate::arch::x86_64::msr::{create_boot_msr_entries, MsrError}; @@ -33,6 +34,16 @@ use crate::vstate::vm::Vm; const TSC_KHZ_TOL_NUMERATOR: i64 = 250; const TSC_KHZ_TOL_DENOMINATOR: i64 = 1_000_000; +/// A set of MSRs that should be restored separately after all other MSRs have already been restored +const DEFERRED_MSRS: [u32; 1] = [ + // MSR_IA32_TSC_DEADLINE must be restored after MSR_IA32_TSC, otherwise we risk "losing" timer + // interrupts across the snapshot restore boundary (due to KVM querying MSR_IA32_TSC upon + // writes to the TSC_DEADLINE MSR to determine whether it needs to prime a timer - if + // MSR_IA32_TSC is not initialized correctly, it can wrongly assume no timer needs to be + // primed, or the timer can be initialized with a wrong expiry). + MSR_IA32_TSC_DEADLINE, +]; + /// Errors associated with the wrappers over KVM ioctls. #[derive(Debug, PartialEq, Eq, thiserror::Error, displaydoc::Display)] pub enum KvmVcpuError { @@ -132,7 +143,9 @@ pub struct KvmVcpu { pub fd: VcpuFd, /// Vcpu peripherals, such as buses pub(super) peripherals: Peripherals, - msrs_to_save: HashSet, + /// The list of MSRs to include in a VM snapshot, in the same order as KVM returned them + /// from KVM_GET_MSR_INDEX_LIST + msrs_to_save: Vec, } /// Vcpu peripherals @@ -161,7 +174,7 @@ impl KvmVcpu { index, fd: kvm_vcpu, peripherals: Default::default(), - msrs_to_save: vm.msrs_to_save().as_slice().iter().copied().collect(), + msrs_to_save: vm.msrs_to_save().as_slice().to_vec(), }) } @@ -316,6 +329,39 @@ impl KvmVcpu { } } + /// Looks for MSRs from the [`DEFERRED_MSRS`] array and removes them from `msr_chunks`. 
+ /// Returns a new [`Msrs`] object containing all the removed MSRs. + /// + /// We use this to capture some causal dependencies between MSRs where the relative order + /// of restoration matters (e.g. MSR_IA32_TSC must be restored before MSR_IA32_TSC_DEADLINE). + fn extract_deferred_msrs(msr_chunks: &mut [Msrs]) -> Result { + // Use 0 here as FamStructWrapper doesn't really give an equivalent of `Vec::with_capacity`, + // and if we specify something N != 0 here, then it will create a FamStructWrapper with N + // elements pre-allocated and zero'd out. Unless we then actually "fill" all those N values, + // KVM will later yell at us about invalid MSRs. + let mut deferred_msrs = Msrs::new(0)?; + + for msrs in msr_chunks { + msrs.retain(|msr| { + if DEFERRED_MSRS.contains(&msr.index) { + deferred_msrs + .push(*msr) + .inspect_err(|err| { + error!( + "Failed to move MSR {} into later chunk: {:?}", + msr.index, err + ) + }) + .is_err() + } else { + true + } + }); + } + + Ok(deferred_msrs) + } + /// Get MSR chunks for the given MSR index list. /// /// KVM only supports getting `KVM_MAX_MSR_ENTRIES` at a time, so we divide @@ -338,7 +384,8 @@ impl KvmVcpu { ) -> Result, KvmVcpuError> { let num_chunks = msr_index_iter.len().div_ceil(KVM_MAX_MSR_ENTRIES); - let mut msr_chunks: Vec = Vec::with_capacity(num_chunks); + // + 1 for the chunk of deferred MSRs + let mut msr_chunks: Vec = Vec::with_capacity(num_chunks + 1); for _ in 0..num_chunks { let chunk_len = msr_index_iter.len().min(KVM_MAX_MSR_ENTRIES); @@ -348,6 +395,9 @@ impl KvmVcpu { Self::fix_zero_tsc_deadline_msr(&mut msr_chunks); + let deferred = Self::extract_deferred_msrs(&mut msr_chunks)?; + msr_chunks.push(deferred); + Ok(msr_chunks) } @@ -407,8 +457,8 @@ impl KvmVcpu { pub fn get_msrs( &self, msr_index_iter: impl ExactSizeIterator, - ) -> Result, KvmVcpuError> { - let mut msrs: HashMap = HashMap::new(); + ) -> Result, KvmVcpuError> { + let mut msrs = BTreeMap::new(); self.get_msr_chunks(msr_index_iter)? 
.iter() .for_each(|msr_chunk| { @@ -668,7 +718,7 @@ mod tests { use std::os::unix::io::AsRawFd; use kvm_bindings::kvm_msr_entry; - use kvm_ioctls::Cap; + use kvm_ioctls::{Cap, Kvm}; use super::*; use crate::arch::x86_64::cpu_model::CpuModel; @@ -853,7 +903,7 @@ mod tests { smt: false, cpu_config: CpuConfiguration { cpuid: Cpuid::try_from(vm.supported_cpuid().clone()).unwrap(), - msrs: HashMap::new(), + msrs: BTreeMap::new(), }, }; vcpu.configure(&vm_mem, GuestAddress(0), &vcpu_config) @@ -915,7 +965,7 @@ mod tests { smt: false, cpu_config: CpuConfiguration { cpuid: Cpuid::try_from(vm.supported_cpuid().clone()).unwrap(), - msrs: HashMap::new(), + msrs: BTreeMap::new(), }, }; vcpu.configure(&vm_mem, GuestAddress(0), &vcpu_config) @@ -1046,6 +1096,25 @@ mod tests { } } + #[test] + fn test_defer_msrs() { + let to_defer = DEFERRED_MSRS[0]; + + let mut msr_chunks = [msrs_from_entries(&[(to_defer, 0), (MSR_IA32_TSC, 1)])]; + + let deferred = KvmVcpu::extract_deferred_msrs(&mut msr_chunks).unwrap(); + + assert_eq!(deferred.as_slice().len(), 1, "did not correctly defer MSR"); + assert_eq!( + msr_chunks[0].as_slice().len(), + 1, + "deferred MSR not removed from chunk" + ); + + assert_eq!(deferred.as_slice()[0].index, to_defer); + assert_eq!(msr_chunks[0].as_slice()[0].index, MSR_IA32_TSC); + } + #[test] fn test_fix_zero_tsc_deadline_msr_zero_same_chunk() { // Place both TSC and TSC_DEADLINE MSRs in the same chunk. @@ -1096,4 +1165,34 @@ mod tests { &[(MSR_IA32_TSC_DEADLINE, 1), (MSR_IA32_TSC, 2)], ); } + + #[test] + fn test_get_msr_chunks_preserved_order() { + // Regression test for #4666 + + let kvm = Kvm::new().unwrap(); + let vm = Vm::new(Vec::new()).unwrap(); + let vcpu = KvmVcpu::new(0, &vm).unwrap(); + + // The list of supported MSR indices, in the order they were returned by KVM + let msrs_to_save = crate::arch::x86_64::msr::get_msrs_to_save(&kvm).unwrap(); + // The MSRs after processing. 
The order should be identical to the one returned by KVM, with + // the exception of deferred MSRs, which should be moved to the end (but show up in the same + // order as they are listed in [`DEFERRED_MSRS`]). + let msr_chunks = vcpu + .get_msr_chunks(vcpu.msrs_to_save.iter().copied()) + .unwrap(); + + msr_chunks + .iter() + .flat_map(|chunk| chunk.as_slice().iter()) + .zip( + msrs_to_save + .as_slice() + .iter() + .filter(|&idx| !DEFERRED_MSRS.contains(idx)) + .chain(DEFERRED_MSRS.iter()), + ) + .for_each(|(left, &right)| assert_eq!(left.index, right)); + } }