Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,14 @@ and this project adheres to
This is to guarantee that the vCPU will continue receiving TSC interrupts
after restoring from the snapshot even if an interrupt is lost when taking a
snapshot.
- [#4666](https://github.com/firecracker-microvm/firecracker/pull/4666): Fixed
Firecracker sometimes restoring `MSR_IA32_TSC_DEADLINE` before `MSR_IA32_TSC`.
Now it always restores `MSR_IA32_TSC_DEADLINE` MSR after `MSR_IA32_TSC`, as
KVM relies on the guest TSC for correct restoration of
`MSR_IA32_TSC_DEADLINE`. This fixed guests using the `TSC_DEADLINE` hardware
feature receiving incorrect timer interrupts after snapshot restoration, which
could lead to them seemingly getting stuck in sleep-related syscalls (see also
https://github.com/firecracker-microvm/firecracker/pull/4099).

## \[1.7.0\]

Expand Down
8 changes: 4 additions & 4 deletions src/cpu-template-helper/src/template/dump/x86_64.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
// Copyright 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0

use std::collections::HashMap;
use std::collections::BTreeMap;

use vmm::arch::x86_64::msr::MsrRange;
use vmm::arch_gen::x86::msr_index::*;
Expand Down Expand Up @@ -44,7 +44,7 @@ fn cpuid_to_modifiers(cpuid: &Cpuid) -> Vec<CpuidLeafModifier> {
.collect()
}

fn msrs_to_modifier(msrs: &HashMap<u32, u64>) -> Vec<RegisterModifier> {
fn msrs_to_modifier(msrs: &BTreeMap<u32, u64>) -> Vec<RegisterModifier> {
let mut msrs: Vec<RegisterModifier> = msrs
.iter()
.map(|(index, value)| msr_modifier!(*index, *value))
Expand Down Expand Up @@ -189,8 +189,8 @@ mod tests {
]
}

fn build_sample_msrs() -> HashMap<u32, u64> {
let mut map = HashMap::from([
fn build_sample_msrs() -> BTreeMap<u32, u64> {
let mut map = BTreeMap::from([
// should be sorted in the result.
(0x1, 0xffff_ffff_ffff_ffff),
(0x5, 0xffff_ffff_0000_0000),
Expand Down
4 changes: 2 additions & 2 deletions src/vmm/src/cpu_config/x86_64/cpuid/common.rs
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ pub fn get_vendor_id_from_host() -> Result<[u8; 12], GetCpuidError> {
}

/// Returns MSRs to be saved based on CPUID features that are enabled.
pub(crate) fn msrs_to_save_by_cpuid(cpuid: &kvm_bindings::CpuId) -> std::collections::HashSet<u32> {
pub(crate) fn msrs_to_save_by_cpuid(cpuid: &kvm_bindings::CpuId) -> Vec<u32> {
/// Memory Protection Extensions
const MPX_BITINDEX: u32 = 14;

Expand Down Expand Up @@ -81,7 +81,7 @@ pub(crate) fn msrs_to_save_by_cpuid(cpuid: &kvm_bindings::CpuId) -> std::collect
}};
}

let mut msrs = std::collections::HashSet::new();
let mut msrs = Vec::new();

// Macro used for easy definition of CPUID-MSR dependencies.
macro_rules! cpuid_msr_dep {
Expand Down
8 changes: 4 additions & 4 deletions src/vmm/src/cpu_config/x86_64/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ pub mod static_cpu_templates;
/// Module with test utils for custom CPU templates
pub mod test_utils;

use std::collections::HashMap;
use std::collections::BTreeMap;

use self::custom_cpu_template::CpuidRegister;
use super::templates::CustomCpuTemplate;
Expand All @@ -37,7 +37,7 @@ pub struct CpuConfiguration {
/// Register values as a key pair for model specific registers
/// Key: MSR address
/// Value: MSR value
pub msrs: HashMap<u32, u64>,
pub msrs: BTreeMap<u32, u64>,
}

impl CpuConfiguration {
Expand Down Expand Up @@ -187,14 +187,14 @@ mod tests {
fn supported_cpu_config() -> CpuConfiguration {
CpuConfiguration {
cpuid: build_supported_cpuid(),
msrs: HashMap::from([(0x8000, 0b1000), (0x9999, 0b1010)]),
msrs: BTreeMap::from([(0x8000, 0b1000), (0x9999, 0b1010)]),
}
}

fn unsupported_cpu_config() -> CpuConfiguration {
CpuConfiguration {
cpuid: build_supported_cpuid(),
msrs: HashMap::from([(0x8000, 0b1000), (0x8001, 0b1010)]),
msrs: BTreeMap::from([(0x8000, 0b1000), (0x8001, 0b1010)]),
}
}

Expand Down
4 changes: 3 additions & 1 deletion src/vmm/src/vstate/vcpu/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -694,6 +694,8 @@ pub enum VcpuEmulation {
pub mod tests {
#![allow(clippy::undocumented_unsafe_blocks)]

#[cfg(target_arch = "x86_64")]
use std::collections::BTreeMap;
use std::sync::{Arc, Barrier, Mutex};

use linux_loader::loader::KernelLoader;
Expand Down Expand Up @@ -919,7 +921,7 @@ pub mod tests {
smt: false,
cpu_config: CpuConfiguration {
cpuid: Cpuid::try_from(_vm.supported_cpuid().clone()).unwrap(),
msrs: std::collections::HashMap::new(),
msrs: BTreeMap::new(),
},
},
)
Expand Down
117 changes: 108 additions & 9 deletions src/vmm/src/vstate/vcpu/x86_64.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
// Use of this source code is governed by a BSD-style license that can be
// found in the THIRD-PARTY file.

use std::collections::{HashMap, HashSet};
use std::collections::BTreeMap;
use std::fmt::Debug;

use kvm_bindings::{
Expand All @@ -15,6 +15,7 @@ use kvm_bindings::{
use kvm_ioctls::{VcpuExit, VcpuFd};
use log::{error, warn};
use serde::{Deserialize, Serialize};
use utils::fam;

use crate::arch::x86_64::interrupts;
use crate::arch::x86_64::msr::{create_boot_msr_entries, MsrError};
Expand All @@ -33,6 +34,16 @@ use crate::vstate::vm::Vm;
const TSC_KHZ_TOL_NUMERATOR: i64 = 250;
const TSC_KHZ_TOL_DENOMINATOR: i64 = 1_000_000;

/// A set of MSRs that should be restored separately after all other MSRs have already been restored
const DEFERRED_MSRS: [u32; 1] = [
// MSR_IA32_TSC_DEADLINE must be restored after MSR_IA32_TSC, otherwise we risk "losing" timer
// interrupts across the snapshot restore boundary (due to KVM querying MSR_IA32_TSC upon
// writes to the TSC_DEADLINE MSR to determine whether it needs to prime a timer - if
// MSR_IA32_TSC is not initialized correctly, it can wrongly assume no timer needs to be
// primed, or the timer can be initialized with a wrong expiry).
MSR_IA32_TSC_DEADLINE,
];

/// Errors associated with the wrappers over KVM ioctls.
#[derive(Debug, PartialEq, Eq, thiserror::Error, displaydoc::Display)]
pub enum KvmVcpuError {
Expand Down Expand Up @@ -132,7 +143,9 @@ pub struct KvmVcpu {
pub fd: VcpuFd,
/// Vcpu peripherals, such as buses
pub(super) peripherals: Peripherals,
msrs_to_save: HashSet<u32>,
/// The list of MSRs to include in a VM snapshot, in the same order as KVM returned them
/// from KVM_GET_MSR_INDEX_LIST
msrs_to_save: Vec<u32>,
}

/// Vcpu peripherals
Expand Down Expand Up @@ -161,7 +174,7 @@ impl KvmVcpu {
index,
fd: kvm_vcpu,
peripherals: Default::default(),
msrs_to_save: vm.msrs_to_save().as_slice().iter().copied().collect(),
msrs_to_save: vm.msrs_to_save().as_slice().to_vec(),
})
}

Expand Down Expand Up @@ -316,6 +329,39 @@ impl KvmVcpu {
}
}

/// Looks for MSRs from the [`DEFERRED_MSRS`] array and removes them from `msr_chunks`.
/// Returns a new [`Msrs`] object containing all the removed MSRs.
///
/// We use this to capture some causal dependencies between MSRs where the relative order
/// of restoration matters (e.g. MSR_IA32_TSC must be restored before MSR_IA32_TSC_DEADLINE).
fn extract_deferred_msrs(msr_chunks: &mut [Msrs]) -> Result<Msrs, fam::Error> {
// Use 0 here as FamStructWrapper doesn't really give an equivalent of `Vec::with_capacity`,
// and if we specify something N != 0 here, then it will create a FamStructWrapper with N
// elements pre-allocated and zero'd out. Unless we then actually "fill" all those N values,
// KVM will later yell at us about invalid MSRs.
let mut deferred_msrs = Msrs::new(0)?;

for msrs in msr_chunks {
msrs.retain(|msr| {
if DEFERRED_MSRS.contains(&msr.index) {
deferred_msrs
.push(*msr)
.inspect_err(|err| {
error!(
"Failed to move MSR {} into later chunk: {:?}",
msr.index, err
)
})
.is_err()
} else {
true
}
});
}

Ok(deferred_msrs)
}

/// Get MSR chunks for the given MSR index list.
///
/// KVM only supports getting `KVM_MAX_MSR_ENTRIES` at a time, so we divide
Expand All @@ -338,7 +384,8 @@ impl KvmVcpu {
) -> Result<Vec<Msrs>, KvmVcpuError> {
let num_chunks = msr_index_iter.len().div_ceil(KVM_MAX_MSR_ENTRIES);

let mut msr_chunks: Vec<Msrs> = Vec::with_capacity(num_chunks);
// + 1 for the chunk of deferred MSRs
let mut msr_chunks: Vec<Msrs> = Vec::with_capacity(num_chunks + 1);

for _ in 0..num_chunks {
let chunk_len = msr_index_iter.len().min(KVM_MAX_MSR_ENTRIES);
Expand All @@ -348,6 +395,9 @@ impl KvmVcpu {

Self::fix_zero_tsc_deadline_msr(&mut msr_chunks);

let deferred = Self::extract_deferred_msrs(&mut msr_chunks)?;
msr_chunks.push(deferred);

Ok(msr_chunks)
}

Expand Down Expand Up @@ -407,8 +457,8 @@ impl KvmVcpu {
pub fn get_msrs(
&self,
msr_index_iter: impl ExactSizeIterator<Item = u32>,
) -> Result<HashMap<u32, u64>, KvmVcpuError> {
let mut msrs: HashMap<u32, u64> = HashMap::new();
) -> Result<BTreeMap<u32, u64>, KvmVcpuError> {
let mut msrs = BTreeMap::new();
self.get_msr_chunks(msr_index_iter)?
.iter()
.for_each(|msr_chunk| {
Expand Down Expand Up @@ -668,7 +718,7 @@ mod tests {
use std::os::unix::io::AsRawFd;

use kvm_bindings::kvm_msr_entry;
use kvm_ioctls::Cap;
use kvm_ioctls::{Cap, Kvm};

use super::*;
use crate::arch::x86_64::cpu_model::CpuModel;
Expand Down Expand Up @@ -853,7 +903,7 @@ mod tests {
smt: false,
cpu_config: CpuConfiguration {
cpuid: Cpuid::try_from(vm.supported_cpuid().clone()).unwrap(),
msrs: HashMap::new(),
msrs: BTreeMap::new(),
},
};
vcpu.configure(&vm_mem, GuestAddress(0), &vcpu_config)
Expand Down Expand Up @@ -915,7 +965,7 @@ mod tests {
smt: false,
cpu_config: CpuConfiguration {
cpuid: Cpuid::try_from(vm.supported_cpuid().clone()).unwrap(),
msrs: HashMap::new(),
msrs: BTreeMap::new(),
},
};
vcpu.configure(&vm_mem, GuestAddress(0), &vcpu_config)
Expand Down Expand Up @@ -1046,6 +1096,25 @@ mod tests {
}
}

#[test]
fn test_defer_msrs() {
let to_defer = DEFERRED_MSRS[0];

let mut msr_chunks = [msrs_from_entries(&[(to_defer, 0), (MSR_IA32_TSC, 1)])];

let deferred = KvmVcpu::extract_deferred_msrs(&mut msr_chunks).unwrap();

assert_eq!(deferred.as_slice().len(), 1, "did not correctly defer MSR");
assert_eq!(
msr_chunks[0].as_slice().len(),
1,
"deferred MSR not removed from chunk"
);

assert_eq!(deferred.as_slice()[0].index, to_defer);
assert_eq!(msr_chunks[0].as_slice()[0].index, MSR_IA32_TSC);
}

#[test]
fn test_fix_zero_tsc_deadline_msr_zero_same_chunk() {
// Place both TSC and TSC_DEADLINE MSRs in the same chunk.
Expand Down Expand Up @@ -1096,4 +1165,34 @@ mod tests {
&[(MSR_IA32_TSC_DEADLINE, 1), (MSR_IA32_TSC, 2)],
);
}

#[test]
fn test_get_msr_chunks_preserved_order() {
// Regression test for #4666

let kvm = Kvm::new().unwrap();
let vm = Vm::new(Vec::new()).unwrap();
let vcpu = KvmVcpu::new(0, &vm).unwrap();

// The list of supported MSR indices, in the order they were returned by KVM
let msrs_to_save = crate::arch::x86_64::msr::get_msrs_to_save(&kvm).unwrap();
// The MSRs after processing. The order should be identical to the one returned by KVM, with
// the exception of deferred MSRs, which should be moved to the end (but show up in the same
// order as they are listed in [`DEFERRED_MSRS`].
let msr_chunks = vcpu
.get_msr_chunks(vcpu.msrs_to_save.iter().copied())
.unwrap();

msr_chunks
.iter()
.flat_map(|chunk| chunk.as_slice().iter())
.zip(
msrs_to_save
.as_slice()
.iter()
.filter(|&idx| !DEFERRED_MSRS.contains(idx))
.chain(DEFERRED_MSRS.iter()),
)
.for_each(|(left, &right)| assert_eq!(left.index, right));
}
}