firecracker-microvm · pb8o · Jul 8, 2024 · Jul 5, 2024 · Jul 8, 2024 · Jul 8, 2024
@@ -90,6 +90,14 @@ and this project adheres to
   This is to guarantee that the vCPU will continue receiving TSC interrupts
   after restoring from the snapshot even if an interrupt is lost when taking a
   snapshot.
+- [#4666](https://github.com/firecracker-microvm/firecracker/pull/4666): Fixed
+  Firecracker sometimes restoring `MSR_IA32_TSC_DEADLINE` before `MSR_IA32_TSC`.
+  Now it always restores `MSR_IA32_TSC_DEADLINE` MSR after `MSR_IA32_TSC`, as
+  KVM relies on the guest TSC for correct restoration of
+  `MSR_IA32_TSC_DEADLINE`. This fixed guests using the `TSC_DEADLINE` hardware
+  feature receiving incorrect timer interrupts after snapshot restoration, which
+  could lead to them seemingly getting stuck in sleep-related syscalls (see also
+  https://github.com/firecracker-microvm/firecracker/pull/4099).
 
 ## \[1.7.0\]
 

diff --git a/src/cpu-template-helper/src/template/dump/x86_64.rs b/src/cpu-template-helper/src/template/dump/x86_64.rs
@@ -1,7 +1,7 @@
 // Copyright 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved.
 // SPDX-License-Identifier: Apache-2.0
 
-use std::collections::HashMap;
+use std::collections::BTreeMap;
 
 use vmm::arch::x86_64::msr::MsrRange;
 use vmm::arch_gen::x86::msr_index::*;
@@ -44,7 +44,7 @@ fn cpuid_to_modifiers(cpuid: &Cpuid) -> Vec<CpuidLeafModifier> {
         .collect()
 }
 
-fn msrs_to_modifier(msrs: &HashMap<u32, u64>) -> Vec<RegisterModifier> {
+fn msrs_to_modifier(msrs: &BTreeMap<u32, u64>) -> Vec<RegisterModifier> {
     let mut msrs: Vec<RegisterModifier> = msrs
         .iter()
         .map(|(index, value)| msr_modifier!(*index, *value))
@@ -189,8 +189,8 @@ mod tests {
         ]
     }
 
-    fn build_sample_msrs() -> HashMap<u32, u64> {
-        let mut map = HashMap::from([
+    fn build_sample_msrs() -> BTreeMap<u32, u64> {
+        let mut map = BTreeMap::from([
             // should be sorted in the result.
             (0x1, 0xffff_ffff_ffff_ffff),
             (0x5, 0xffff_ffff_0000_0000),

diff --git a/src/vmm/src/cpu_config/x86_64/cpuid/common.rs b/src/vmm/src/cpu_config/x86_64/cpuid/common.rs
@@ -53,7 +53,7 @@ pub fn get_vendor_id_from_host() -> Result<[u8; 12], GetCpuidError> {
 }
 
 /// Returns MSRs to be saved based on CPUID features that are enabled.
-pub(crate) fn msrs_to_save_by_cpuid(cpuid: &kvm_bindings::CpuId) -> std::collections::HashSet<u32> {
+pub(crate) fn msrs_to_save_by_cpuid(cpuid: &kvm_bindings::CpuId) -> Vec<u32> {
     /// Memory Protection Extensions
     const MPX_BITINDEX: u32 = 14;
 
@@ -81,7 +81,7 @@ pub(crate) fn msrs_to_save_by_cpuid(cpuid: &kvm_bindings::CpuId) -> std::collect
         }};
     }
 
-    let mut msrs = std::collections::HashSet::new();
+    let mut msrs = Vec::new();
 
     // Macro used for easy definition of CPUID-MSR dependencies.
     macro_rules! cpuid_msr_dep {

diff --git a/src/vmm/src/cpu_config/x86_64/mod.rs b/src/vmm/src/cpu_config/x86_64/mod.rs
@@ -10,7 +10,7 @@ pub mod static_cpu_templates;
 /// Module with test utils for custom CPU templates
 pub mod test_utils;
 
-use std::collections::HashMap;
+use std::collections::BTreeMap;
 
 use self::custom_cpu_template::CpuidRegister;
 use super::templates::CustomCpuTemplate;
@@ -37,7 +37,7 @@ pub struct CpuConfiguration {
     /// Register values as a key pair for model specific registers
     /// Key: MSR address
     /// Value: MSR value
-    pub msrs: HashMap<u32, u64>,
+    pub msrs: BTreeMap<u32, u64>,
 }
 
 impl CpuConfiguration {
@@ -187,14 +187,14 @@ mod tests {
     fn supported_cpu_config() -> CpuConfiguration {
         CpuConfiguration {
             cpuid: build_supported_cpuid(),
-            msrs: HashMap::from([(0x8000, 0b1000), (0x9999, 0b1010)]),
+            msrs: BTreeMap::from([(0x8000, 0b1000), (0x9999, 0b1010)]),
         }
     }
 
     fn unsupported_cpu_config() -> CpuConfiguration {
         CpuConfiguration {
             cpuid: build_supported_cpuid(),
-            msrs: HashMap::from([(0x8000, 0b1000), (0x8001, 0b1010)]),
+            msrs: BTreeMap::from([(0x8000, 0b1000), (0x8001, 0b1010)]),
         }
     }
 

diff --git a/src/vmm/src/vstate/vcpu/mod.rs b/src/vmm/src/vstate/vcpu/mod.rs
@@ -694,6 +694,8 @@ pub enum VcpuEmulation {
 pub mod tests {
     #![allow(clippy::undocumented_unsafe_blocks)]
 
+    #[cfg(target_arch = "x86_64")]
+    use std::collections::BTreeMap;
     use std::sync::{Arc, Barrier, Mutex};
 
     use linux_loader::loader::KernelLoader;
@@ -919,7 +921,7 @@ pub mod tests {
                         smt: false,
                         cpu_config: CpuConfiguration {
                             cpuid: Cpuid::try_from(_vm.supported_cpuid().clone()).unwrap(),
-                            msrs: std::collections::HashMap::new(),
+                            msrs: BTreeMap::new(),
                         },
                     },
                 )

diff --git a/src/vmm/src/vstate/vcpu/x86_64.rs b/src/vmm/src/vstate/vcpu/x86_64.rs
@@ -5,7 +5,7 @@
 // Use of this source code is governed by a BSD-style license that can be
 // found in the THIRD-PARTY file.
 
-use std::collections::{HashMap, HashSet};
+use std::collections::BTreeMap;
 use std::fmt::Debug;
 
 use kvm_bindings::{
@@ -15,6 +15,7 @@ use kvm_bindings::{
 use kvm_ioctls::{VcpuExit, VcpuFd};
 use log::{error, warn};
 use serde::{Deserialize, Serialize};
+use utils::fam;
 
 use crate::arch::x86_64::interrupts;
 use crate::arch::x86_64::msr::{create_boot_msr_entries, MsrError};
@@ -33,6 +34,16 @@ use crate::vstate::vm::Vm;
 const TSC_KHZ_TOL_NUMERATOR: i64 = 250;
 const TSC_KHZ_TOL_DENOMINATOR: i64 = 1_000_000;
 
+/// A set of MSRs that should be restored separately after all other MSRs have already been restored
+const DEFERRED_MSRS: [u32; 1] = [
+    // MSR_IA32_TSC_DEADLINE must be restored after MSR_IA32_TSC, otherwise we risk "losing" timer
+    // interrupts across the snapshot restore boundary (due to KVM querying MSR_IA32_TSC upon
+    // writes to the TSC_DEADLINE MSR to determine whether it needs to prime a timer - if
+    // MSR_IA32_TSC is not initialized correctly, it can wrongly assume no timer needs to be
+    // primed, or the timer can be initialized with a wrong expiry).
+    MSR_IA32_TSC_DEADLINE,
+];
+
 /// Errors associated with the wrappers over KVM ioctls.
 #[derive(Debug, PartialEq, Eq, thiserror::Error, displaydoc::Display)]
 pub enum KvmVcpuError {
@@ -132,7 +143,9 @@ pub struct KvmVcpu {
     pub fd: VcpuFd,
     /// Vcpu peripherals, such as buses
     pub(super) peripherals: Peripherals,
-    msrs_to_save: HashSet<u32>,
+    /// The list of MSRs to include in a VM snapshot, in the same order as KVM returned them
+    /// from KVM_GET_MSR_INDEX_LIST
+    msrs_to_save: Vec<u32>,
 }
 
 /// Vcpu peripherals
@@ -161,7 +174,7 @@ impl KvmVcpu {
             index,
             fd: kvm_vcpu,
             peripherals: Default::default(),
-            msrs_to_save: vm.msrs_to_save().as_slice().iter().copied().collect(),
+            msrs_to_save: vm.msrs_to_save().as_slice().to_vec(),
         })
     }
 
@@ -316,6 +329,39 @@ impl KvmVcpu {
         }
     }
 
+    /// Looks for MSRs from the [`DEFERRED_MSRS`] array and removes them from `msr_chunks`.
+    /// Returns a new [`Msrs`] object containing all the removed MSRs.
+    ///
+    /// We use this to capture some causal dependencies between MSRs where the relative order
+    /// of restoration matters (e.g. MSR_IA32_TSC must be restored before MSR_IA32_TSC_DEADLINE).
+    fn extract_deferred_msrs(msr_chunks: &mut [Msrs]) -> Result<Msrs, fam::Error> {
+        // Use 0 here as FamStructWrapper doesn't really give an equivalent of `Vec::with_capacity`,
+        // and if we specify something N != 0 here, then it will create a FamStructWrapper with N
+        // elements pre-allocated and zero'd out. Unless we then actually "fill" all those N values,
+        // KVM will later yell at us about invalid MSRs.
+        let mut deferred_msrs = Msrs::new(0)?;
+
+        for msrs in msr_chunks {
+            msrs.retain(|msr| {
+                if DEFERRED_MSRS.contains(&msr.index) {
+                    deferred_msrs
+                        .push(*msr)
+                        .inspect_err(|err| {
+                            error!(
+                                "Failed to move MSR {} into later chunk: {:?}",
+                                msr.index, err
+                            )
+                        })
+                        .is_err()
+                } else {
+                    true
+                }
+            });
+        }
+
+        Ok(deferred_msrs)
+    }
+
     /// Get MSR chunks for the given MSR index list.
     ///
     /// KVM only supports getting `KVM_MAX_MSR_ENTRIES` at a time, so we divide
@@ -338,7 +384,8 @@ impl KvmVcpu {
     ) -> Result<Vec<Msrs>, KvmVcpuError> {
         let num_chunks = msr_index_iter.len().div_ceil(KVM_MAX_MSR_ENTRIES);
 
-        let mut msr_chunks: Vec<Msrs> = Vec::with_capacity(num_chunks);
+        // + 1 for the chunk of deferred MSRs
+        let mut msr_chunks: Vec<Msrs> = Vec::with_capacity(num_chunks + 1);
 
         for _ in 0..num_chunks {
             let chunk_len = msr_index_iter.len().min(KVM_MAX_MSR_ENTRIES);
@@ -348,6 +395,9 @@ impl KvmVcpu {
 
         Self::fix_zero_tsc_deadline_msr(&mut msr_chunks);
 
+        let deferred = Self::extract_deferred_msrs(&mut msr_chunks)?;
+        msr_chunks.push(deferred);
+
         Ok(msr_chunks)
     }
 
@@ -407,8 +457,8 @@ impl KvmVcpu {
     pub fn get_msrs(
         &self,
         msr_index_iter: impl ExactSizeIterator<Item = u32>,
-    ) -> Result<HashMap<u32, u64>, KvmVcpuError> {
-        let mut msrs: HashMap<u32, u64> = HashMap::new();
+    ) -> Result<BTreeMap<u32, u64>, KvmVcpuError> {
+        let mut msrs = BTreeMap::new();
         self.get_msr_chunks(msr_index_iter)?
             .iter()
             .for_each(|msr_chunk| {
@@ -668,7 +718,7 @@ mod tests {
     use std::os::unix::io::AsRawFd;
 
     use kvm_bindings::kvm_msr_entry;
-    use kvm_ioctls::Cap;
+    use kvm_ioctls::{Cap, Kvm};
 
     use super::*;
     use crate::arch::x86_64::cpu_model::CpuModel;
@@ -853,7 +903,7 @@ mod tests {
             smt: false,
             cpu_config: CpuConfiguration {
                 cpuid: Cpuid::try_from(vm.supported_cpuid().clone()).unwrap(),
-                msrs: HashMap::new(),
+                msrs: BTreeMap::new(),
             },
         };
         vcpu.configure(&vm_mem, GuestAddress(0), &vcpu_config)
@@ -915,7 +965,7 @@ mod tests {
             smt: false,
             cpu_config: CpuConfiguration {
                 cpuid: Cpuid::try_from(vm.supported_cpuid().clone()).unwrap(),
-                msrs: HashMap::new(),
+                msrs: BTreeMap::new(),
             },
         };
         vcpu.configure(&vm_mem, GuestAddress(0), &vcpu_config)
@@ -1046,6 +1096,25 @@ mod tests {
         }
     }
 
+    #[test]
+    fn test_defer_msrs() {
+        let to_defer = DEFERRED_MSRS[0];
+
+        let mut msr_chunks = [msrs_from_entries(&[(to_defer, 0), (MSR_IA32_TSC, 1)])];
+
+        let deferred = KvmVcpu::extract_deferred_msrs(&mut msr_chunks).unwrap();
+
+        assert_eq!(deferred.as_slice().len(), 1, "did not correctly defer MSR");
+        assert_eq!(
+            msr_chunks[0].as_slice().len(),
+            1,
+            "deferred MSR not removed from chunk"
+        );
+
+        assert_eq!(deferred.as_slice()[0].index, to_defer);
+        assert_eq!(msr_chunks[0].as_slice()[0].index, MSR_IA32_TSC);
+    }
+
     #[test]
     fn test_fix_zero_tsc_deadline_msr_zero_same_chunk() {
         // Place both TSC and TSC_DEADLINE MSRs in the same chunk.
@@ -1096,4 +1165,34 @@ mod tests {
             &[(MSR_IA32_TSC_DEADLINE, 1), (MSR_IA32_TSC, 2)],
         );
     }
+
+    #[test]
+    fn test_get_msr_chunks_preserved_order() {
+        // Regression test for #4666
+
+        let kvm = Kvm::new().unwrap();
+        let vm = Vm::new(Vec::new()).unwrap();
+        let vcpu = KvmVcpu::new(0, &vm).unwrap();
+
+        // The list of supported MSR indices, in the order they were returned by KVM
+        let msrs_to_save = crate::arch::x86_64::msr::get_msrs_to_save(&kvm).unwrap();
+        // The MSRs after processing. The order should be identical to the one returned by KVM, with
+        // the exception of deferred MSRs, which should be moved to the end (but show up in the same
+        // order as they are listed in [`DEFERRED_MSRS`].
+        let msr_chunks = vcpu
+            .get_msr_chunks(vcpu.msrs_to_save.iter().copied())
+            .unwrap();
+
+        msr_chunks
+            .iter()
+            .flat_map(|chunk| chunk.as_slice().iter())
+            .zip(
+                msrs_to_save
+                    .as_slice()
+                    .iter()
+                    .filter(|&idx| !DEFERRED_MSRS.contains(idx))
+                    .chain(DEFERRED_MSRS.iter()),
+            )
+            .for_each(|(left, &right)| assert_eq!(left.index, right));
+    }
 }