diff --git a/CHANGELOG.md b/CHANGELOG.md
index c6293caa831..2b472acf2b2 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -15,6 +15,9 @@ and this project adheres to
   `VcpuExit::MmioRead`, `VcpuExit::MmioWrite`, `VcpuExit::IoIn` and
   `VcpuExit::IoOut`. The average for these VM exits is not emitted since it can
   be deduced from the available emitted metrics.
+- [#4360](https://github.com/firecracker-microvm/firecracker/pull/4360): Added
+  dev-preview support for backing a VM's guest memory by 2M hugetlbfs pages.
+  Please see the [documentation](docs/hugepages.md) for more information.
 
 ### Changed
 
diff --git a/docs/hugepages.md b/docs/hugepages.md
new file mode 100644
index 00000000000..a5105cc802b
--- /dev/null
+++ b/docs/hugepages.md
@@ -0,0 +1,55 @@
+# Backing Guest Memory by Huge Pages
+
+> \[!WARNING\]
+>
+> Support is currently in **developer preview**. See
+> [this section](RELEASE_POLICY.md#developer-preview-features) for more info.
+
+Firecracker supports backing the guest memory of a VM by 2MB hugetlbfs pages.
+This can be enabled by setting the `huge_pages` field of `PUT` or `PATCH`
+requests to the `/machine-config` endpoint to `2M`.
+
+Backing guest memory by huge pages can bring performance improvements for
+specific workloads, due to reduced TLB contention and lower overhead during
+virtual-to-physical address resolution. It can also help reduce the number of
+KVM exits required to rebuild extended page tables post snapshot restore, as
+well as improve boot times (by up to 50%, as measured by Firecracker's
+[boot time performance tests](../tests/integration_tests/performance/test_boottime.py)).
+
+Using hugetlbfs requires the host running Firecracker to have a pre-allocated
+pool of 2M pages. Should this pool be too small, Firecracker may behave
+erratically or receive the `SIGBUS` signal. This is because Firecracker uses
+the `MAP_NORESERVE` flag when mapping guest memory. With this flag, the kernel
+does not reserve sufficient hugetlbfs pages from the pool at the time of the
+`mmap` call, but instead claims them from the pool on demand. For details on
+how to manage this pool, please refer to the
+[Linux Documentation][hugetlbfs_docs], and see the example at the end of this
+page.
+
+## Huge Pages and Snapshotting
+
+Restoring a Firecracker snapshot of a microVM backed by huge pages will also
+use huge pages to back the restored guest. There is no option to switch
+between regular 4K pages and huge pages at restore time. Furthermore,
+snapshots of microVMs backed by huge pages can only be restored via UFFD.
+Lastly, note that even for guests backed by huge pages, differential snapshots
+always track write accesses to guest memory at 4K granularity.
+
+## Known Limitations
+
+Currently, hugetlbfs support is mutually exclusive with the following
+Firecracker features:
+
+- Memory Ballooning via the [Balloon Device](./ballooning.md)
+- Initrd
+
+## FAQ
+
+### Why does Firecracker not offer a transparent huge pages (THP) setting?
+
+Firecracker's guest memory is memfd-based. Linux (as of 6.1) does not offer a
+way to dynamically enable THP for such memory regions. Additionally, UFFD does
+not integrate with THP (no transparent huge pages will be allocated during
+userfaulting). Please refer to the [Linux Documentation][thp_docs] for more
+information.
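+
+## Example
+
+The following is a minimal sketch of bringing up a huge-pages-backed microVM
+via the HTTP API. It assumes the API server listens on
+`/tmp/firecracker.socket`; the pool size and memory size are placeholders to
+be adapted to the actual workload:
+
+```bash
+# Pre-allocate 64 2M pages (i.e. 128 MiB) in the host's hugetlbfs pool.
+echo 64 | sudo tee /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages
+
+# Configure the microVM to back its guest memory by 2M huge pages.
+curl --unix-socket /tmp/firecracker.socket -X PUT "http://localhost/machine-config" \
+    -H "Content-Type: application/json" \
+    -d '{"vcpu_count": 1, "mem_size_mib": 128, "huge_pages": "2M"}'
+```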
+
+[hugetlbfs_docs]: https://docs.kernel.org/admin-guide/mm/hugetlbpage.html
+[thp_docs]: https://www.kernel.org/doc/html/next/admin-guide/mm/transhuge.html#hugepages-in-tmpfs-shmem
diff --git a/docs/snapshotting/handling-page-faults-on-snapshot-resume.md b/docs/snapshotting/handling-page-faults-on-snapshot-resume.md
index 9f7d9314091..d699c5d24ee 100644
--- a/docs/snapshotting/handling-page-faults-on-snapshot-resume.md
+++ b/docs/snapshotting/handling-page-faults-on-snapshot-resume.md
@@ -161,7 +161,7 @@ connect/send data.
 ### Example
 
 An example of a handler process can be found
-[here](../../src/firecracker/examples/uffd/valid_handler.rs). The process is
+[here](../../src/firecracker/examples/uffd/valid_4k_handler.rs). The process is
 designed to tackle faults on a certain address by loading into memory the
 entire region that the address belongs to, but users can choose any other
 behavior that suits their use case best.
diff --git a/resources/overlay/usr/local/bin/fast_page_fault_helper.c b/resources/overlay/usr/local/bin/fast_page_fault_helper.c
new file mode 100644
index 00000000000..d304b97f94d
--- /dev/null
+++ b/resources/overlay/usr/local/bin/fast_page_fault_helper.c
@@ -0,0 +1,44 @@
+// Copyright 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+// Helper program for triggering fast page faults after UFFD snapshot restore.
+// Allocates a 128M memory area using mmap, touches every page in it using
+// memset, and then calls `sigwait` to wait for a SIGUSR1 signal. Upon
+// receiving this signal, it sets the entire memory area to 2, to trigger
+// fast page faults.
+// The idea is that an integration test takes a snapshot while the process is
+// waiting for the SIGUSR1 signal, and then sends the signal after restoring.
+// This way, the second `memset` will trigger a fast page fault for every page
+// in the memory region.
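+// (Here, a "fast" page fault is one that the kernel and KVM can resolve
+// without involving the UFFD handler again, e.g. by rebuilding an extended
+// page table entry for a page that is already present in host memory.)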
+
+#include <stdio.h>    // perror
+#include <signal.h>   // sigwait and friends
+#include <string.h>   // memset
+#include <sys/mman.h> // mmap
+
+#define MEM_SIZE_MIB (128 * 1024 * 1024) // 128 MiB, expressed in bytes
+
+int main(int argc, char *const argv[]) {
+    sigset_t set;
+    int signal;
+
+    sigemptyset(&set);
+    if(sigaddset(&set, SIGUSR1) == -1) {
+        perror("sigaddset");
+        return -1;
+    }
+
+    void *ptr = mmap(NULL, MEM_SIZE_MIB, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+
+    if(MAP_FAILED == ptr) {
+        perror("mmap");
+        return -1;
+    }
+
+    memset(ptr, 1, MEM_SIZE_MIB);
+
+    sigwait(&set, &signal);
+
+    memset(ptr, 2, MEM_SIZE_MIB);
+
+    return 0;
+}
\ No newline at end of file
diff --git a/resources/rebuild.sh b/resources/rebuild.sh
index 6a40812d6b0..fa2a1e9df6f 100755
--- a/resources/rebuild.sh
+++ b/resources/rebuild.sh
@@ -205,6 +205,7 @@ install_dependencies
 BIN=overlay/usr/local/bin
 compile_and_install $BIN/init.c $BIN/init
 compile_and_install $BIN/fillmem.c $BIN/fillmem
+compile_and_install $BIN/fast_page_fault_helper.c $BIN/fast_page_fault_helper
 compile_and_install $BIN/readmem.c $BIN/readmem
 if [ $ARCH == "aarch64" ]; then
     compile_and_install $BIN/devmemread.c $BIN/devmemread
diff --git a/src/firecracker/Cargo.toml b/src/firecracker/Cargo.toml
index 8da0d04de16..1da89fe698e 100644
--- a/src/firecracker/Cargo.toml
+++ b/src/firecracker/Cargo.toml
@@ -50,12 +50,20 @@ serde_json = "1.0.113"
 tracing = ["log-instrument", "seccompiler/tracing", "utils/tracing", "vmm/tracing"]
 
 [[example]]
-name = "uffd_malicious_handler"
-path = "examples/uffd/malicious_handler.rs"
+name = "uffd_malicious_4k_handler"
+path = "examples/uffd/malicious_4k_handler.rs"
 
 [[example]]
-name = "uffd_valid_handler"
-path = "examples/uffd/valid_handler.rs"
+name = "uffd_valid_4k_handler"
+path = "examples/uffd/valid_4k_handler.rs"
+
+[[example]]
+name = "uffd_valid_2m_handler"
+path = "examples/uffd/valid_2m_handler.rs"
+
+[[example]]
+name = "uffd_fault_all_handler"
+path = "examples/uffd/fault_all_handler.rs"
 
 [[example]]
 name = "seccomp_harmless"
diff --git a/src/firecracker/examples/uffd/fault_all_handler.rs b/src/firecracker/examples/uffd/fault_all_handler.rs
new file mode 100644
index 00000000000..1ab22ada680
--- /dev/null
+++ b/src/firecracker/examples/uffd/fault_all_handler.rs
@@ -0,0 +1,50 @@
+// Copyright 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+//! Provides functionality for a userspace page fault handler
+//! which loads the whole region from the backing memory file
+//! when a page fault occurs.
+
+mod uffd_utils;
+
+use std::fs::File;
+use std::os::unix::net::UnixListener;
+
+use uffd_utils::{Runtime, UffdHandler};
+use utils::get_page_size;
+
+fn main() {
+    let mut args = std::env::args();
+    let uffd_sock_path = args.nth(1).expect("No socket path given");
+    let mem_file_path = args.next().expect("No memory file given");
+
+    let file = File::open(mem_file_path).expect("Cannot open memfile");
+
+    // Get Uffd from UDS. We'll use the uffd to handle PFs for Firecracker.
+    let listener = UnixListener::bind(uffd_sock_path).expect("Cannot bind to socket path");
+    let (stream, _) = listener.accept().expect("Cannot listen on UDS socket");
+
+    // This handler serves any page fault by faulting in _all_ memory regions
+    // in their entirety, so the page size passed to the runtime below does
+    // not matter.
+    let len = get_page_size().unwrap(); // page size does not matter, we fault in everything on the first fault
+
+    let mut runtime = Runtime::new(stream, file);
+    runtime.run(len, |uffd_handler: &mut UffdHandler| {
+        // Read an event from the userfaultfd.
+        let event = uffd_handler
+            .read_event()
+            .expect("Failed to read uffd_msg")
+            .expect("uffd_msg not ready");
+
+        match event {
+            userfaultfd::Event::Pagefault { .. } => {
+                for region in uffd_handler.mem_regions.clone() {
+                    uffd_handler
+                        .serve_pf(region.mapping.base_host_virt_addr as _, region.mapping.size)
+                }
+            }
+            _ => panic!("Unexpected event on userfaultfd"),
+        }
+    });
+}
diff --git a/src/firecracker/examples/uffd/malicious_handler.rs b/src/firecracker/examples/uffd/malicious_4k_handler.rs
similarity index 95%
rename from src/firecracker/examples/uffd/malicious_handler.rs
rename to src/firecracker/examples/uffd/malicious_4k_handler.rs
index 9af94e057aa..157d3d7e147 100644
--- a/src/firecracker/examples/uffd/malicious_handler.rs
+++ b/src/firecracker/examples/uffd/malicious_4k_handler.rs
@@ -23,7 +23,7 @@ fn main() {
     let (stream, _) = listener.accept().expect("Cannot listen on UDS socket");
 
     let mut runtime = Runtime::new(stream, file);
-    runtime.run(|uffd_handler: &mut UffdHandler| {
+    runtime.run(4096, |uffd_handler: &mut UffdHandler| {
         // Read an event from the userfaultfd.
         let event = uffd_handler
             .read_event()
diff --git a/src/firecracker/examples/uffd/uffd_utils.rs b/src/firecracker/examples/uffd/uffd_utils.rs
index 822ce178fac..d517f785e19 100644
--- a/src/firecracker/examples/uffd/uffd_utils.rs
+++ b/src/firecracker/examples/uffd/uffd_utils.rs
@@ -12,7 +12,6 @@ use std::ptr;
 
 use serde::{Deserialize, Serialize};
 use userfaultfd::{Error, Event, Uffd};
-use utils::get_page_size;
 use utils::sock_ctrl_msg::ScmSocket;
 
 // This is the same as the one used in src/vmm.
@@ -33,7 +32,7 @@ pub struct GuestRegionUffdMapping {
     pub offset: u64,
 }
 
-#[derive(Debug, Clone)]
+#[derive(Debug, Clone, Copy)]
 pub enum MemPageState {
     Uninitialized,
     FromFile,
@@ -41,21 +40,27 @@ pub enum MemPageState {
     Anonymous,
 }
 
-#[derive(Debug)]
-struct MemRegion {
-    mapping: GuestRegionUffdMapping,
+#[derive(Debug, Clone)]
+pub struct MemRegion {
+    pub mapping: GuestRegionUffdMapping,
     page_states: HashMap<u64, MemPageState>,
 }
 
 #[derive(Debug)]
 pub struct UffdHandler {
-    mem_regions: Vec<MemRegion>,
+    pub mem_regions: Vec<MemRegion>,
+    page_size: usize,
     backing_buffer: *const u8,
     uffd: Uffd,
 }
 
 impl UffdHandler {
-    pub fn from_unix_stream(stream: &UnixStream, backing_buffer: *const u8, size: usize) -> Self {
+    pub fn from_unix_stream(
+        stream: &UnixStream,
+        page_size: usize,
+        backing_buffer: *const u8,
+        size: usize,
+    ) -> Self {
         let mut message_buf = vec![0u8; 1024];
         let (bytes_read, file) = stream
             .recv_with_fd(&mut message_buf[..])
@@ -71,13 +76,15 @@ impl UffdHandler {
         // Make sure memory size matches backing data size.
         assert_eq!(memsize, size);
+        assert!(page_size.is_power_of_two());
 
         let uffd = unsafe { Uffd::from_raw_fd(file.into_raw_fd()) };
 
-        let mem_regions = create_mem_regions(&mappings);
+        let mem_regions = create_mem_regions(&mappings, page_size);
 
         Self {
             mem_regions,
+            page_size,
             backing_buffer,
             uffd,
         }
@@ -87,21 +94,19 @@ impl UffdHandler {
         self.uffd.read_event()
     }
 
-    pub fn update_mem_state_mappings(&mut self, start: u64, end: u64, state: &MemPageState) {
+    pub fn update_mem_state_mappings(&mut self, start: u64, end: u64, state: MemPageState) {
         for region in self.mem_regions.iter_mut() {
             for (key, value) in region.page_states.iter_mut() {
                 if key >= &start && key < &end {
-                    *value = state.clone();
+                    *value = state;
                 }
             }
         }
     }
 
     pub fn serve_pf(&mut self, addr: *mut u8, len: usize) {
-        let page_size = get_page_size().unwrap();
-
         // Find the start of the page that the current faulting address belongs to.
-        let dst = (addr as usize & !(page_size as usize - 1)) as *mut libc::c_void;
+        let dst = (addr as usize & !(self.page_size - 1)) as *mut libc::c_void;
         let fault_page_addr = dst as u64;
 
         // Get the state of the current faulting page.
@@ -117,12 +122,12 @@ impl UffdHandler {
             // memory from the host (through balloon device)
             Some(MemPageState::Uninitialized) | Some(MemPageState::FromFile) => {
                 let (start, end) = self.populate_from_file(region, fault_page_addr, len);
-                self.update_mem_state_mappings(start, end, &MemPageState::FromFile);
+                self.update_mem_state_mappings(start, end, MemPageState::FromFile);
                 return;
             }
             Some(MemPageState::Removed) | Some(MemPageState::Anonymous) => {
                 let (start, end) = self.zero_out(fault_page_addr);
-                self.update_mem_state_mappings(start, end, &MemPageState::Anonymous);
+                self.update_mem_state_mappings(start, end, MemPageState::Anonymous);
                 return;
             }
             None => {}
@@ -152,17 +157,15 @@ impl UffdHandler {
     }
 
     fn zero_out(&mut self, addr: u64) -> (u64, u64) {
-        let page_size = get_page_size().unwrap();
-
         let ret = unsafe {
             self.uffd
-                .zeropage(addr as *mut _, page_size, true)
+                .zeropage(addr as *mut _, self.page_size, true)
                 .expect("Uffd zeropage failed")
         };
         // Make sure the UFFD zeroed out some bytes.
         assert!(ret > 0);
 
-        (addr, addr + page_size as u64)
+        (addr, addr + self.page_size as u64)
     }
 }
 
@@ -211,7 +214,7 @@ impl Runtime {
     /// When uffd is polled, page fault is handled by
     /// calling `pf_event_dispatch` with corresponding
     /// uffd object passed in.
-    pub fn run(&mut self, pf_event_dispatch: impl Fn(&mut UffdHandler)) {
+    pub fn run(&mut self, page_size: usize, pf_event_dispatch: impl Fn(&mut UffdHandler)) {
         let mut pollfds = vec![];
 
         // Poll the stream for incoming uffds
@@ -246,6 +249,7 @@ impl Runtime {
                         // Handle new uffd from stream
                         let handler = UffdHandler::from_unix_stream(
                             &self.stream,
+                            page_size,
                             self.backing_memory,
                             self.backing_memory_size,
                         );
@@ -270,8 +274,7 @@ impl Runtime {
     }
 }
 
-fn create_mem_regions(mappings: &Vec<GuestRegionUffdMapping>) -> Vec<MemRegion> {
-    let page_size = get_page_size().unwrap();
+fn create_mem_regions(mappings: &Vec<GuestRegionUffdMapping>, page_size: usize) -> Vec<MemRegion> {
     let mut mem_regions: Vec<MemRegion> = Vec::with_capacity(mappings.len());
 
     for r in mappings.iter() {
@@ -314,7 +317,7 @@ mod tests {
         let mut uninit_runtime = Box::new(MaybeUninit::<Runtime>::uninit());
         // We will use this pointer to bypass a bunch of Rust Safety
         // for the sake of convenience.
-        let runtime_ptr = uninit_runtime.as_ptr() as *const Runtime;
+        let runtime_ptr = uninit_runtime.as_ptr().cast::<Runtime>();
 
         let runtime_thread = std::thread::spawn(move || {
             let tmp_file = TempFile::new().unwrap();
@@ -327,7 +330,7 @@ mod tests {
             let (stream, _) = listener.accept().expect("Cannot listen on UDS socket");
             // Update runtime with actual runtime
             let runtime = uninit_runtime.write(Runtime::new(stream, file));
-            runtime.run(|_: &mut UffdHandler| {});
+            runtime.run(4096, |_: &mut UffdHandler| {});
         });
 
         // wait for runtime thread to initialize itself
diff --git a/src/firecracker/examples/uffd/valid_2m_handler.rs b/src/firecracker/examples/uffd/valid_2m_handler.rs
new file mode 100644
index 00000000000..d824ca01f55
--- /dev/null
+++ b/src/firecracker/examples/uffd/valid_2m_handler.rs
@@ -0,0 +1,51 @@
+// Copyright 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+//! Provides functionality for a userspace page fault handler
+//! which loads the whole region from the backing memory file
+//! when a page fault occurs.
+
+mod uffd_utils;
+
+use std::fs::File;
+use std::os::unix::net::UnixListener;
+
+use uffd_utils::{MemPageState, Runtime, UffdHandler};
+
+fn main() {
+    let mut args = std::env::args();
+    let uffd_sock_path = args.nth(1).expect("No socket path given");
+    let mem_file_path = args.next().expect("No memory file given");
+
+    let file = File::open(mem_file_path).expect("Cannot open memfile");
+
+    // Get Uffd from UDS. We'll use the uffd to handle PFs for Firecracker.
+    let listener = UnixListener::bind(uffd_sock_path).expect("Cannot bind to socket path");
+    let (stream, _) = listener.accept().expect("Cannot listen on UDS socket");
+
+    // Populate a single 2M page from the backing memory file on each fault.
+    // This is just an example, likely the worst-case latency scenario, of how
+    // memory can be loaded into guest RAM.
+    let len = 2 * 1024 * 1024;
+
+    let mut runtime = Runtime::new(stream, file);
+    runtime.run(len, |uffd_handler: &mut UffdHandler| {
+        // Read an event from the userfaultfd.
+        let event = uffd_handler
+            .read_event()
+            .expect("Failed to read uffd_msg")
+            .expect("uffd_msg not ready");
+
+        // We expect to receive either a Page Fault or Removed
+        // event (if the balloon device is enabled).
+        match event {
+            userfaultfd::Event::Pagefault { addr, .. } => uffd_handler.serve_pf(addr.cast(), len),
+            userfaultfd::Event::Remove { start, end } => uffd_handler.update_mem_state_mappings(
+                start as u64,
+                end as u64,
+                MemPageState::Removed,
+            ),
+            _ => panic!("Unexpected event on userfaultfd"),
+        }
+    });
+}
diff --git a/src/firecracker/examples/uffd/valid_handler.rs b/src/firecracker/examples/uffd/valid_4k_handler.rs
similarity index 95%
rename from src/firecracker/examples/uffd/valid_handler.rs
rename to src/firecracker/examples/uffd/valid_4k_handler.rs
index 609380afa8a..1f752f141f1 100644
--- a/src/firecracker/examples/uffd/valid_handler.rs
+++ b/src/firecracker/examples/uffd/valid_4k_handler.rs
@@ -30,7 +30,7 @@ fn main() {
     let len = get_page_size().unwrap();
 
     let mut runtime = Runtime::new(stream, file);
-    runtime.run(|uffd_handler: &mut UffdHandler| {
+    runtime.run(len, |uffd_handler: &mut UffdHandler| {
         // Read an event from the userfaultfd.
         let event = uffd_handler
             .read_event()
@@ -44,7 +44,7 @@ fn main() {
             userfaultfd::Event::Remove { start, end } => uffd_handler.update_mem_state_mappings(
                 start as u64,
                 end as u64,
-                &MemPageState::Removed,
+                MemPageState::Removed,
             ),
             _ => panic!("Unexpected event on userfaultfd"),
         }
diff --git a/src/firecracker/src/api_server/request/machine_configuration.rs b/src/firecracker/src/api_server/request/machine_configuration.rs
index eeb8216a523..746b1e19009 100644
--- a/src/firecracker/src/api_server/request/machine_configuration.rs
+++ b/src/firecracker/src/api_server/request/machine_configuration.rs
@@ -74,6 +74,7 @@ pub(crate) fn parse_patch_machine_config(body: &Body) -> Result<ParsedRequest, RequestError> {

 fn default_mem() -> GuestMemoryMmap {
-    GuestMemoryMmap::from_raw_regions(&[(GuestAddress(0), MEM_LEN)], true).unwrap()
+    GuestMemoryMmap::from_raw_regions(&[(GuestAddress(0), MEM_LEN)], true, HugePageConfig::None)
+        .unwrap()
 }
 
 fn check_dirty_mem(mem: &GuestMemoryMmap, addr: GuestAddress, len: u32) {
diff --git a/src/vmm/src/persist.rs b/src/vmm/src/persist.rs
index a1c432f72c8..0a3c912f8ce 100644
--- a/src/vmm/src/persist.rs
+++ b/src/vmm/src/persist.rs
@@ -32,7 +32,7 @@ use crate::resources::VmResources;
 use crate::snapshot::Snapshot;
 use crate::vmm_config::boot_source::BootSourceConfig;
 use crate::vmm_config::instance_info::InstanceInfo;
-use crate::vmm_config::machine_config::{MachineConfigUpdate, VmConfigError};
+use crate::vmm_config::machine_config::{HugePageConfig, MachineConfigUpdate, VmConfigError};
 use crate::vmm_config::snapshot::{
     CreateSnapshotParams, LoadSnapshotParams, MemBackendType, SnapshotType,
 };
@@ -54,6 +54,8 @@ pub struct VmInfo {
     pub cpu_template: StaticCpuTemplate,
     /// Boot source information.
     pub boot_source: BootSourceConfig,
+    /// Huge page configuration.
+    pub huge_pages: HugePageConfig,
 }
 
 impl From<&VmResources> for VmInfo {
@@ -63,6 +65,7 @@ impl From<&VmResources> for VmInfo {
             smt: value.vm_config.smt,
             cpu_template: StaticCpuTemplate::from(&value.vm_config.cpu_template),
             boot_source: value.boot_source_config().clone(),
+            huge_pages: value.vm_config.huge_pages,
         }
     }
 }
@@ -399,6 +402,7 @@ pub fn restore_from_snapshot(
             smt: Some(microvm_state.vm_info.smt),
             cpu_template: Some(microvm_state.vm_info.cpu_template),
             track_dirty_pages: Some(track_dirty_pages),
+            huge_pages: Some(microvm_state.vm_info.huge_pages),
         })
         .map_err(BuildMicrovmFromSnapshotError::VmUpdateConfig)?;
 
@@ -410,8 +414,13 @@ pub fn restore_from_snapshot(
     let (guest_memory, uffd) = match params.mem_backend.backend_type {
         MemBackendType::File => (
-            guest_memory_from_file(mem_backend_path, mem_state, track_dirty_pages)
-                .map_err(RestoreFromSnapshotGuestMemoryError::File)?,
+            guest_memory_from_file(
+                mem_backend_path,
+                mem_state,
+                track_dirty_pages,
+                vm_resources.vm_config.huge_pages,
+            )
+            .map_err(RestoreFromSnapshotGuestMemoryError::File)?,
             None,
         ),
         MemBackendType::Uffd => guest_memory_from_uffd(
@@ -421,6 +430,7 @@ pub fn restore_from_snapshot(
             // We enable the UFFD_FEATURE_EVENT_REMOVE feature only if a balloon device
             // is present in the microVM state.
             microvm_state.device_states.balloon_device.is_some(),
+            vm_resources.vm_config.huge_pages,
         )
         .map_err(RestoreFromSnapshotGuestMemoryError::Uffd)?,
     };
@@ -474,9 +484,11 @@ fn guest_memory_from_file(
     mem_file_path: &Path,
     mem_state: &GuestMemoryState,
     track_dirty_pages: bool,
+    huge_pages: HugePageConfig,
 ) -> Result<GuestMemoryMmap, GuestMemoryFromFileError> {
     let mem_file = File::open(mem_file_path)?;
-    let guest_mem = GuestMemoryMmap::from_state(Some(&mem_file), mem_state, track_dirty_pages)?;
+    let guest_mem =
+        GuestMemoryMmap::from_state(Some(&mem_file), mem_state, track_dirty_pages, huge_pages)?;
     Ok(guest_mem)
 }
 
@@ -500,8 +512,9 @@ fn guest_memory_from_uffd(
     mem_state: &GuestMemoryState,
     track_dirty_pages: bool,
     enable_balloon: bool,
+    huge_pages: HugePageConfig,
 ) -> Result<(GuestMemoryMmap, Option<Uffd>), GuestMemoryFromUffdError> {
-    let guest_memory = GuestMemoryMmap::from_state(None, mem_state, track_dirty_pages)?;
+    let guest_memory = GuestMemoryMmap::from_state(None, mem_state, track_dirty_pages, huge_pages)?;
 
     let mut uffd_builder = UffdBuilder::new();
 
diff --git a/src/vmm/src/resources.rs b/src/vmm/src/resources.rs
index 6d8b86eeb06..93b4b24822e 100644
--- a/src/vmm/src/resources.rs
+++ b/src/vmm/src/resources.rs
@@ -10,7 +10,7 @@ use utils::net::ipv4addr::is_link_local_valid;
 
 use crate::cpu_config::templates::CustomCpuTemplate;
 use crate::device_manager::persist::SharedDeviceType;
-use crate::logger::info;
+use crate::logger::{info, log_dev_preview_warning};
 use crate::mmds;
 use crate::mmds::data_store::{Mmds, MmdsVersion};
 use crate::mmds::ns::MmdsNetworkStack;
@@ -22,7 +22,7 @@ use crate::vmm_config::drive::*;
 use crate::vmm_config::entropy::*;
 use crate::vmm_config::instance_info::InstanceInfo;
 use crate::vmm_config::machine_config::{
-    MachineConfig, MachineConfigUpdate, VmConfig, VmConfigError,
+    HugePageConfig, MachineConfig, MachineConfigUpdate, VmConfig, VmConfigError,
 };
 use crate::vmm_config::metrics::{init_metrics, MetricsConfig, MetricsConfigError};
 use crate::vmm_config::mmds::{MmdsConfig, MmdsConfigError};
@@ -238,6 +238,10 @@ impl VmResources {
     /// Updates the configuration of the microVM.
pub fn update_vm_config(&mut self, update: &MachineConfigUpdate) -> Result<(), VmConfigError> { + if update.huge_pages.is_some() && update.huge_pages != Some(HugePageConfig::None) { + log_dev_preview_warning("Huge pages support", None); + } + let updated = self.vm_config.update(update)?; // The VM cannot have a memory size smaller than the target size @@ -253,6 +257,16 @@ impl VmResources { return Err(VmConfigError::IncompatibleBalloonSize); } + if self.balloon.get().is_some() && updated.huge_pages != HugePageConfig::None { + return Err(VmConfigError::BalloonAndHugePages); + } + + if self.boot_source.config.initrd_path.is_some() + && updated.huge_pages != HugePageConfig::None + { + return Err(VmConfigError::InitrdAndHugePages); + } + self.vm_config = updated; Ok(()) @@ -317,6 +331,10 @@ impl VmResources { return Err(BalloonConfigError::TooManyPagesRequested); } + if self.vm_config.huge_pages != HugePageConfig::None { + return Err(BalloonConfigError::HugePages); + } + self.balloon.set(config) } @@ -325,6 +343,12 @@ impl VmResources { &mut self, boot_source_cfg: BootSourceConfig, ) -> Result<(), BootSourceConfigError> { + if boot_source_cfg.initrd_path.is_some() + && self.vm_config.huge_pages != HugePageConfig::None + { + return Err(BootSourceConfigError::HugePagesAndInitRd); + } + self.set_boot_source_config(boot_source_cfg); self.boot_source.builder = Some(BootConfig::new(self.boot_source_config())?); Ok(()) @@ -468,6 +492,7 @@ mod tests { use std::str::FromStr; use serde_json::{Map, Value}; + use utils::kernel_version::KernelVersion; use utils::net::mac::MacAddr; use utils::tempfile::TempFile; @@ -481,7 +506,7 @@ mod tests { BootConfig, BootSource, BootSourceConfig, DEFAULT_KERNEL_CMDLINE, }; use crate::vmm_config::drive::{BlockBuilder, BlockDeviceConfig}; - use crate::vmm_config::machine_config::{MachineConfig, VmConfigError}; + use crate::vmm_config::machine_config::{HugePageConfig, MachineConfig, VmConfigError}; use crate::vmm_config::net::{NetBuilder, NetworkInterfaceConfig}; use crate::vmm_config::vsock::tests::default_config; use crate::vmm_config::RateLimiterConfig; @@ -1296,6 +1321,7 @@ mod tests { #[cfg(target_arch = "aarch64")] cpu_template: Some(StaticCpuTemplate::V1N1), track_dirty_pages: Some(false), + huge_pages: Some(HugePageConfig::None), }; assert_ne!( @@ -1364,6 +1390,23 @@ mod tests { // mem_size_mib compatible with balloon size. aux_vm_config.mem_size_mib = Some(256); vm_resources.update_vm_config(&aux_vm_config).unwrap(); + + // mem_size_mib incompatible with huge pages configuration + aux_vm_config.mem_size_mib = Some(129); + aux_vm_config.huge_pages = Some(HugePageConfig::Hugetlbfs2M); + assert_eq!( + vm_resources.update_vm_config(&aux_vm_config).unwrap_err(), + VmConfigError::InvalidMemorySize + ); + + if KernelVersion::get().unwrap() >= KernelVersion::new(5, 10, 0) { + // mem_size_mib compatible with huge page configuration + aux_vm_config.mem_size_mib = Some(2048); + // Remove the balloon device config that's added by `default_vm_resources` as it would + // trigger the "ballooning incompatible with huge pages" check. 
+ vm_resources.balloon = BalloonBuilder::new(); + vm_resources.update_vm_config(&aux_vm_config).unwrap(); + } } #[test] diff --git a/src/vmm/src/rpc_interface.rs b/src/vmm/src/rpc_interface.rs index 62a0fc10991..b8a3929f42b 100644 --- a/src/vmm/src/rpc_interface.rs +++ b/src/vmm/src/rpc_interface.rs @@ -1069,6 +1069,7 @@ mod tests { smt: value.vm_config.smt, cpu_template: StaticCpuTemplate::from(&value.vm_config.cpu_template), boot_source: value.boot_source_config().clone(), + huge_pages: value.vm_config.huge_pages, } } } diff --git a/src/vmm/src/utilities/test_utils/mod.rs b/src/vmm/src/utilities/test_utils/mod.rs index d15b9a85039..f46229b0566 100644 --- a/src/vmm/src/utilities/test_utils/mod.rs +++ b/src/vmm/src/utilities/test_utils/mod.rs @@ -13,6 +13,7 @@ use crate::seccomp_filters::get_empty_filters; use crate::utilities::mock_resources::{MockBootSourceConfig, MockVmConfig, MockVmResources}; use crate::vmm_config::boot_source::BootSourceConfig; use crate::vmm_config::instance_info::InstanceInfo; +use crate::vmm_config::machine_config::HugePageConfig; use crate::vstate::memory::{GuestMemoryExtension, GuestMemoryMmap}; use crate::{EventManager, Vmm}; @@ -30,7 +31,8 @@ pub fn single_region_mem_at(at: u64, size: usize) -> GuestMemoryMmap { /// Creates a [`GuestMemoryMmap`] with multiple regions and without dirty page tracking. pub fn multi_region_mem(regions: &[(GuestAddress, usize)]) -> GuestMemoryMmap { - GuestMemoryMmap::from_raw_regions(regions, false).expect("Cannot initialize memory") + GuestMemoryMmap::from_raw_regions(regions, false, HugePageConfig::None) + .expect("Cannot initialize memory") } /// Creates a [`GuestMemoryMmap`] of the given size with the contained regions laid out in diff --git a/src/vmm/src/vmm_config/balloon.rs b/src/vmm/src/vmm_config/balloon.rs index d359c871ece..4b4e229b9ec 100644 --- a/src/vmm/src/vmm_config/balloon.rs +++ b/src/vmm/src/vmm_config/balloon.rs @@ -28,6 +28,8 @@ pub enum BalloonConfigError { CreateFailure(crate::devices::virtio::balloon::BalloonError), /// Error updating the balloon device configuration: {0:?} UpdateFailure(std::io::Error), + /// Firecracker's huge pages support is incompatible with memory ballooning. + HugePages, } /// This struct represents the strongly typed equivalent of the json body diff --git a/src/vmm/src/vmm_config/boot_source.rs b/src/vmm/src/vmm_config/boot_source.rs index 24869f1be91..8374ae335a8 100644 --- a/src/vmm/src/vmm_config/boot_source.rs +++ b/src/vmm/src/vmm_config/boot_source.rs @@ -42,6 +42,8 @@ pub enum BootSourceConfigError { InvalidInitrdPath(io::Error), /// The kernel command line is invalid: {0} InvalidKernelCommandLine(String), + /// Firecracker's huge pages support is incompatible with initrds. + HugePagesAndInitRd, } /// Holds the kernel specification (both configuration as well as runtime details). diff --git a/src/vmm/src/vmm_config/machine_config.rs b/src/vmm/src/vmm_config/machine_config.rs index d8df1e8d9ad..b012cb2c2c5 100644 --- a/src/vmm/src/vmm_config/machine_config.rs +++ b/src/vmm/src/vmm_config/machine_config.rs @@ -3,6 +3,8 @@ use std::fmt::Debug; use serde::{Deserialize, Serialize}; +use utils::kernel_version; +use utils::kernel_version::KernelVersion; use crate::cpu_config::templates::{CpuTemplateType, CustomCpuTemplate, StaticCpuTemplate}; @@ -18,7 +20,7 @@ pub const MAX_SUPPORTED_VCPUS: u8 = 32; pub enum VmConfigError { /// The memory size (MiB) is smaller than the previously set balloon device target size. IncompatibleBalloonSize, - /// The memory size (MiB) is invalid. 
+    /// The memory size (MiB) is either 0, or not a multiple of the configured page size.
     InvalidMemorySize,
     /// The number of vCPUs must be greater than 0, less than {MAX_SUPPORTED_VCPUS:} and must be 1 or an even number if SMT is enabled.
     InvalidVcpuCount,
@@ -27,6 +29,70 @@ pub enum VmConfigError {
     /// Enabling simultaneous multithreading is not supported on aarch64.
     #[cfg(target_arch = "aarch64")]
     SmtNotSupported,
+    /// Could not determine host kernel version when checking hugetlbfs compatibility
+    KernelVersion,
+    /// Firecracker's hugetlbfs support requires at least host kernel 4.16.
+    HugetlbfsNotSupported,
+    /// Firecracker's huge pages support is incompatible with memory ballooning.
+    BalloonAndHugePages,
+    /// Firecracker's huge pages support is incompatible with initrds.
+    InitrdAndHugePages,
+}
+
+// We cannot do a `KernelVersion(kernel_version::Error)` variant because `kernel_version::Error`
+// does not implement `PartialEq, Eq` (due to containing an `io::Error`).
+impl From<kernel_version::Error> for VmConfigError {
+    fn from(_: kernel_version::Error) -> Self {
+        VmConfigError::KernelVersion
+    }
+}
+
+/// Describes the possible (huge)page configurations for a microVM's memory.
+#[derive(Clone, Copy, Debug, Default, PartialEq, Eq, Serialize, Deserialize)]
+pub enum HugePageConfig {
+    /// Do not use huge pages, i.e. back guest memory by 4K pages
+    #[default]
+    None,
+    /// Back guest memory by 2MB hugetlbfs pages
+    #[serde(rename = "2M")]
+    Hugetlbfs2M,
+}
+
+impl HugePageConfig {
+    /// Checks whether the given memory size (in MiB) is valid for this [`HugePageConfig`], i.e.
+    /// whether it is a multiple of the page size.
+    fn is_valid_mem_size(&self, mem_size_mib: usize) -> bool {
+        let divisor = match self {
+            // Any integer memory size expressed in MiB is trivially a multiple of the 4K
+            // page size, so there is nothing to check in this case.
+            HugePageConfig::None => 1,
+            HugePageConfig::Hugetlbfs2M => 2,
+        };
+
+        mem_size_mib % divisor == 0
+    }
+
+    /// Returns the flags that need to be passed to `mmap`, in addition to `MAP_ANONYMOUS`, to
+    /// create a mapping backed by huge pages as described by this [`HugePageConfig`].
+    pub fn mmap_flags(&self) -> libc::c_int {
+        match self {
+            HugePageConfig::None => 0,
+            HugePageConfig::Hugetlbfs2M => libc::MAP_HUGETLB | libc::MAP_HUGE_2MB,
+        }
+    }
+
+    /// Returns `true` iff this [`HugePageConfig`] describes a hugetlbfs-based configuration.
+    pub fn is_hugetlbfs(&self) -> bool {
+        matches!(self, HugePageConfig::Hugetlbfs2M)
+    }
+}
+
+impl From<HugePageConfig> for Option<memfd::HugetlbSize> {
+    fn from(value: HugePageConfig) -> Self {
+        match value {
+            HugePageConfig::None => None,
+            HugePageConfig::Hugetlbfs2M => Some(memfd::HugetlbSize::Huge2MB),
+        }
+    }
 }
 
 /// Struct used in PUT `/machine-config` API call.
@@ -46,6 +112,9 @@ pub struct MachineConfig {
     /// Enables or disables dirty page tracking. Enabling allows incremental snapshots.
     #[serde(default)]
     pub track_dirty_pages: bool,
+    /// Configures what page size Firecracker should use to back guest memory.
+    #[serde(default)]
+    pub huge_pages: HugePageConfig,
 }
 
 impl Default for MachineConfig {
@@ -78,6 +147,9 @@ pub struct MachineConfigUpdate {
     /// Enables or disables dirty page tracking. Enabling allows incremental snapshots.
     #[serde(skip_serializing_if = "Option::is_none")]
     pub track_dirty_pages: Option<bool>,
+    /// Configures what page size Firecracker should use to back guest memory.
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub huge_pages: Option<HugePageConfig>,
 }
 
 impl MachineConfigUpdate {
@@ -97,6 +169,7 @@ impl From<MachineConfig> for MachineConfigUpdate {
             smt: Some(cfg.smt),
             cpu_template: cfg.cpu_template,
             track_dirty_pages: Some(cfg.track_dirty_pages),
+            huge_pages: Some(cfg.huge_pages),
         }
     }
 }
@@ -114,6 +187,8 @@ pub struct VmConfig {
     pub cpu_template: Option<CpuTemplateType>,
     /// Enables or disables dirty page tracking. Enabling allows incremental snapshots.
     pub track_dirty_pages: bool,
+    /// Configures what page size Firecracker should use to back guest memory.
+    pub huge_pages: HugePageConfig,
 }
 
 impl VmConfig {
@@ -148,8 +223,9 @@ impl VmConfig {
         }
 
         let mem_size_mib = update.mem_size_mib.unwrap_or(self.mem_size_mib);
+        let page_config = update.huge_pages.unwrap_or(self.huge_pages);
 
-        if mem_size_mib == 0 {
+        if mem_size_mib == 0 || !page_config.is_valid_mem_size(mem_size_mib) {
             return Err(VmConfigError::InvalidMemorySize);
         }
 
@@ -159,12 +235,17 @@ impl VmConfig {
             Some(other) => Some(CpuTemplateType::Static(other)),
         };
 
+        if page_config.is_hugetlbfs() && KernelVersion::get()? < KernelVersion::new(4, 16, 0) {
+            return Err(VmConfigError::HugetlbfsNotSupported);
+        }
+
         Ok(VmConfig {
             vcpu_count,
             mem_size_mib,
             smt,
             cpu_template,
             track_dirty_pages: update.track_dirty_pages.unwrap_or(self.track_dirty_pages),
+            huge_pages: page_config,
         })
     }
 }
@@ -177,6 +258,7 @@ impl Default for VmConfig {
             smt: false,
             cpu_template: None,
             track_dirty_pages: false,
+            huge_pages: HugePageConfig::None,
         }
     }
 }
@@ -189,6 +271,31 @@ impl From<&VmConfig> for MachineConfig {
             smt: value.smt,
             cpu_template: value.cpu_template.as_ref().map(|template| template.into()),
             track_dirty_pages: value.track_dirty_pages,
+            huge_pages: value.huge_pages,
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use utils::kernel_version::KernelVersion;
+
+    use crate::vmm_config::machine_config::{
+        HugePageConfig, MachineConfigUpdate, VmConfig, VmConfigError,
+    };
+
+    #[test]
+    fn test_hugetlbfs_not_supported_4_14() {
+        if KernelVersion::get().unwrap() < KernelVersion::new(4, 16, 0) {
+            let base_config = VmConfig::default();
+            let update = MachineConfigUpdate {
+                huge_pages: Some(HugePageConfig::Hugetlbfs2M),
+                mem_size_mib: Some(1024),
+                ..Default::default()
+            };
+
+            let err = base_config.update(&update).unwrap_err();
+            assert_eq!(err, VmConfigError::HugetlbfsNotSupported)
+        }
+    }
+}
diff --git a/src/vmm/src/vstate/memory.rs b/src/vmm/src/vstate/memory.rs
index b2dca00b1b7..1dd89f9b104 100644
--- a/src/vmm/src/vstate/memory.rs
+++ b/src/vmm/src/vstate/memory.rs
@@ -19,6 +19,7 @@ pub use vm_memory::{
 };
 use vm_memory::{Error as VmMemoryError, GuestMemoryError, WriteVolatile};
 
+use crate::vmm_config::machine_config::HugePageConfig;
 use crate::DirtyBitmap;
 
 /// Type of GuestMemoryMmap.
@@ -49,6 +50,8 @@ pub enum MemoryError {
     Memfd(memfd::Error),
     /// Cannot resize memfd file: {0:?}
     MemfdSetLen(std::io::Error),
+    /// Cannot restore hugetlbfs backed snapshot by mapping the memory file. Please use uffd.
+    HugetlbfsSnapshot,
 }
 
 /// Defines the interface for snapshotting memory.
@@ -57,12 +60,17 @@ where
     Self: Sized,
 {
     /// Creates a GuestMemoryMmap with `size` in MiB backed by a memfd.
-    fn memfd_backed(mem_size_mib: usize, track_dirty_pages: bool) -> Result<Self, MemoryError>;
+    fn memfd_backed(
+        mem_size_mib: usize,
+        track_dirty_pages: bool,
+        huge_pages: HugePageConfig,
+    ) -> Result<Self, MemoryError>;
 
     /// Creates a GuestMemoryMmap from raw regions.
    fn from_raw_regions(
         regions: &[(GuestAddress, usize)],
         track_dirty_pages: bool,
+        huge_pages: HugePageConfig,
     ) -> Result<Self, MemoryError>;
 
     /// Creates a GuestMemoryMmap from raw regions.
@@ -78,6 +86,7 @@ where
         file: Option<&File>,
         state: &GuestMemoryState,
         track_dirty_pages: bool,
+        huge_pages: HugePageConfig,
     ) -> Result<Self, MemoryError>;
 
     /// Describes GuestMemoryMmap through a GuestMemoryState struct.
@@ -119,8 +128,12 @@ pub struct GuestMemoryState {
 
 impl GuestMemoryExtension for GuestMemoryMmap {
     /// Creates a GuestMemoryMmap with `size` in MiB backed by a memfd.
-    fn memfd_backed(mem_size_mib: usize, track_dirty_pages: bool) -> Result<Self, MemoryError> {
-        let memfd_file = create_memfd(mem_size_mib)?.into_file();
+    fn memfd_backed(
+        mem_size_mib: usize,
+        track_dirty_pages: bool,
+        huge_pages: HugePageConfig,
+    ) -> Result<Self, MemoryError> {
+        let memfd_file = create_memfd(mem_size_mib, huge_pages.into())?.into_file();
         let mut offset: u64 = 0;
         let regions = crate::arch::arch_memory_regions(mem_size_mib << 20)
@@ -140,9 +153,16 @@ impl GuestMemoryExtension for GuestMemoryMmap {
     fn from_raw_regions(
         regions: &[(GuestAddress, usize)],
         track_dirty_pages: bool,
+        huge_pages: HugePageConfig,
     ) -> Result<Self, MemoryError> {
         let prot = libc::PROT_READ | libc::PROT_WRITE;
-        let flags = libc::MAP_NORESERVE | libc::MAP_PRIVATE | libc::MAP_ANONYMOUS;
+        // MAP_NORESERVE for 4K-backed regions means that no swap space will be reserved for
+        // the region. For hugetlbfs regions, it means that pages in the hugetlbfs pool will
+        // not be reserved at mmap-time. This means that instead of failing at mmap-time if
+        // the hugetlbfs page pool is too small to accommodate the entire VM, Firecracker might
+        // receive a SIGBUS if a page fault cannot be served due to the pool being depleted.
+        let flags =
+            libc::MAP_NORESERVE | libc::MAP_PRIVATE | libc::MAP_ANONYMOUS | huge_pages.mmap_flags();
 
         let regions = regions
             .iter()
@@ -156,6 +176,7 @@ impl GuestMemoryExtension for GuestMemoryMmap {
                     .with_mmap_flags(flags)
                     .build()
                     .map_err(MemoryError::MmapRegionError)?;
+
                 GuestRegionMmap::new(region, *guest_address).map_err(MemoryError::VmMemoryError)
             })
             .collect::<Result<Vec<_>, MemoryError>>()?;
@@ -188,6 +209,7 @@ impl GuestMemoryExtension for GuestMemoryMmap {
                     .with_file_offset(file_offset)
                     .build()
                     .map_err(MemoryError::MmapRegionError)?;
+
                 GuestRegionMmap::new(region, guest_address).map_err(MemoryError::VmMemoryError)
             })
             .collect::<Result<Vec<_>, MemoryError>>()?;
@@ -201,9 +223,14 @@ impl GuestMemoryExtension for GuestMemoryMmap {
         file: Option<&File>,
         state: &GuestMemoryState,
         track_dirty_pages: bool,
+        huge_pages: HugePageConfig,
     ) -> Result<Self, MemoryError> {
         match file {
             Some(f) => {
+                if huge_pages.is_hugetlbfs() {
+                    return Err(MemoryError::HugetlbfsSnapshot);
+                }
+
                 let regions = state
                     .regions
                     .iter()
@@ -224,7 +251,7 @@ impl GuestMemoryExtension for GuestMemoryMmap {
                     .iter()
                     .map(|r| (GuestAddress(r.base_address), r.size))
                     .collect::<Vec<_>>();
-                Self::from_raw_regions(&regions, track_dirty_pages)
+                Self::from_raw_regions(&regions, track_dirty_pages, huge_pages)
             }
         }
     }
@@ -324,11 +351,15 @@ impl GuestMemoryExtension for GuestMemoryMmap {
     }
 }
 
-/// Creates a memfd file with the `size` in MiB.
-fn create_memfd(size: usize) -> Result<memfd::Memfd, MemoryError> {
+fn create_memfd(
+    size: usize,
+    hugetlb_size: Option<memfd::HugetlbSize>,
+) -> Result<memfd::Memfd, MemoryError> {
     let mem_size = size << 20;
     // Create a memfd.
-    let opts = memfd::MemfdOptions::default().allow_sealing(true);
+    let opts = memfd::MemfdOptions::default()
+        .hugetlb(hugetlb_size)
+        .allow_sealing(true);
     let mem_file = opts.create("guest_mem").map_err(MemoryError::Memfd)?;
 
     // Resize to guest mem size.
@@ -376,7 +407,8 @@ mod tests { (GuestAddress(0x30000), region_size), ]; - let guest_memory = GuestMemoryMmap::from_raw_regions(®ions, false).unwrap(); + let guest_memory = + GuestMemoryMmap::from_raw_regions(®ions, false, HugePageConfig::None).unwrap(); guest_memory.iter().for_each(|region| { assert!(region.bitmap().is_none()); }); @@ -392,7 +424,8 @@ mod tests { (GuestAddress(0x30000), region_size), ]; - let guest_memory = GuestMemoryMmap::from_raw_regions(®ions, true).unwrap(); + let guest_memory = + GuestMemoryMmap::from_raw_regions(®ions, true, HugePageConfig::None).unwrap(); guest_memory.iter().for_each(|region| { assert!(region.bitmap().is_some()); }); @@ -460,6 +493,25 @@ mod tests { } } + #[test] + fn test_from_state() { + let state = GuestMemoryState { + regions: vec![GuestMemoryRegionState { + base_address: 0, + size: 4096, + offset: 0, + }], + }; + let file = TempFile::new().unwrap().into_file(); + + // No mapping of snapshots that were taken with hugetlbfs enabled + let err = + GuestMemoryMmap::from_state(Some(&file), &state, false, HugePageConfig::Hugetlbfs2M) + .unwrap_err(); + + assert!(matches!(err, MemoryError::HugetlbfsSnapshot), "{:?}", err); + } + #[test] fn test_mark_dirty() { let page_size = get_page_size().unwrap(); @@ -470,7 +522,8 @@ mod tests { (GuestAddress(region_size as u64), region_size), // pages 3-5 (GuestAddress(region_size as u64 * 2), region_size), // pages 6-8 ]; - let guest_memory = GuestMemoryMmap::from_raw_regions(®ions, true).unwrap(); + let guest_memory = + GuestMemoryMmap::from_raw_regions(®ions, true, HugePageConfig::None).unwrap(); let dirty_map = [ // page 0: not dirty @@ -525,8 +578,12 @@ mod tests { let region_size = page_size * 3; // Test with a single region - let guest_memory = - GuestMemoryMmap::from_raw_regions(&[(GuestAddress(0), region_size)], false).unwrap(); + let guest_memory = GuestMemoryMmap::from_raw_regions( + &[(GuestAddress(0), region_size)], + false, + HugePageConfig::None, + ) + .unwrap(); check_serde(&guest_memory); // Test with some regions @@ -535,7 +592,8 @@ mod tests { (GuestAddress(region_size as u64), region_size), // pages 3-5 (GuestAddress(region_size as u64 * 2), region_size), // pages 6-8 ]; - let guest_memory = GuestMemoryMmap::from_raw_regions(®ions, true).unwrap(); + let guest_memory = + GuestMemoryMmap::from_raw_regions(®ions, true, HugePageConfig::None).unwrap(); check_serde(&guest_memory); } @@ -548,7 +606,9 @@ mod tests { (GuestAddress(0), page_size), (GuestAddress(page_size as u64 * 2), page_size), ]; - let guest_memory = GuestMemoryMmap::from_raw_regions(&mem_regions[..], true).unwrap(); + let guest_memory = + GuestMemoryMmap::from_raw_regions(&mem_regions[..], true, HugePageConfig::None) + .unwrap(); let expected_memory_state = GuestMemoryState { regions: vec![ @@ -573,7 +633,9 @@ mod tests { (GuestAddress(0), page_size * 3), (GuestAddress(page_size as u64 * 4), page_size * 3), ]; - let guest_memory = GuestMemoryMmap::from_raw_regions(&mem_regions[..], true).unwrap(); + let guest_memory = + GuestMemoryMmap::from_raw_regions(&mem_regions[..], true, HugePageConfig::None) + .unwrap(); let expected_memory_state = GuestMemoryState { regions: vec![ @@ -606,7 +668,8 @@ mod tests { (region_1_address, region_size), (region_2_address, region_size), ]; - let guest_memory = GuestMemoryMmap::from_raw_regions(&mem_regions, true).unwrap(); + let guest_memory = + GuestMemoryMmap::from_raw_regions(&mem_regions, true, HugePageConfig::None).unwrap(); // Check that Firecracker bitmap is clean. 
        guest_memory.iter().for_each(|r| {
             assert!(!r.bitmap().dirty_at(0));
         });
@@ -628,8 +691,13 @@ mod tests {
         let mut memory_file = TempFile::new().unwrap();
         guest_memory.dump(&mut memory_file).unwrap();
 
-        let restored_guest_memory =
-            GuestMemoryMmap::from_state(Some(&memory_file), &memory_state, false).unwrap();
+        let restored_guest_memory = GuestMemoryMmap::from_state(
+            Some(&memory_file),
+            &memory_state,
+            false,
+            HugePageConfig::None,
+        )
+        .unwrap();
 
         // Check that the region contents are the same.
         let mut restored_region = vec![0u8; page_size * 2];
@@ -656,7 +724,8 @@ mod tests {
             (region_1_address, region_size),
             (region_2_address, region_size),
         ];
-        let guest_memory = GuestMemoryMmap::from_raw_regions(&mem_regions, true).unwrap();
+        let guest_memory =
+            GuestMemoryMmap::from_raw_regions(&mem_regions, true, HugePageConfig::None).unwrap();
         // Check that Firecracker bitmap is clean.
         guest_memory.iter().for_each(|r| {
             assert!(!r.bitmap().dirty_at(0));
@@ -686,7 +755,8 @@ mod tests {
 
         // We can restore from this because this is the first dirty dump.
         let restored_guest_memory =
-            GuestMemoryMmap::from_state(Some(&file), &memory_state, false).unwrap();
+            GuestMemoryMmap::from_state(Some(&file), &memory_state, false, HugePageConfig::None)
+                .unwrap();
 
         // Check that the region contents are the same.
         let mut restored_region = vec![0u8; region_size];
@@ -735,7 +805,7 @@ mod tests {
         let size = 1;
         let size_mb = 1 << 20;
 
-        let memfd = create_memfd(size).unwrap();
+        let memfd = create_memfd(size, None).unwrap();
         assert_eq!(memfd.as_file().metadata().unwrap().len(), size_mb);
         memfd.as_file().set_len(0x69).unwrap_err();
diff --git a/src/vmm/tests/integration_tests.rs b/src/vmm/tests/integration_tests.rs
index 6e7436893a0..8fb7395ab00 100644
--- a/src/vmm/tests/integration_tests.rs
+++ b/src/vmm/tests/integration_tests.rs
@@ -16,6 +16,7 @@ use vmm::utilities::mock_resources::{MockVmResources, NOISY_KERNEL_IMAGE};
 use vmm::utilities::test_utils::dirty_tracking_vmm;
 use vmm::utilities::test_utils::{create_vmm, default_vmm, default_vmm_no_boot};
 use vmm::vmm_config::instance_info::{InstanceInfo, VmState};
+use vmm::vmm_config::machine_config::HugePageConfig;
 use vmm::vmm_config::snapshot::{CreateSnapshotParams, SnapshotType};
 use vmm::{DumpCpuConfigError, EventManager, FcExitCode};
 
@@ -236,6 +237,7 @@ fn verify_load_snapshot(snapshot_file: TempFile, memory_file: TempFile) {
         Some(memory_file.as_file()),
         &microvm_state.memory_state,
         false,
+        HugePageConfig::None,
     )
     .unwrap();
 
diff --git a/tests/conftest.py b/tests/conftest.py
index 1732bde4ab6..0149ceecf66 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -229,7 +229,7 @@ def uffd_handler_paths():
     """Build UFFD handler binaries."""
     handlers = {
         f"{handler}_handler": build_tools.get_example(f"uffd_{handler}_handler")
-        for handler in ["malicious", "valid"]
+        for handler in ["malicious_4k", "valid_4k", "valid_2m", "fault_all"]
     }
     yield handlers
 
diff --git a/tests/framework/microvm.py b/tests/framework/microvm.py
index af2f9a2a5f3..3b5ae14fc70 100644
--- a/tests/framework/microvm.py
+++ b/tests/framework/microvm.py
@@ -138,6 +138,13 @@ def delete(self):
         self.vmstate.unlink()
 
 
+class HugePagesConfig(str, Enum):
+    """Enum describing the huge pages configurations supported by Firecracker"""
+
+    NONE = "None"
+    HUGETLBFS_2MB = "2M"
+
+
 # pylint: disable=R0904
 class Microvm:
     """Class to represent a Firecracker microvm.
@@ -631,6 +638,7 @@ def basic_config( boot_args: str = None, use_initrd: bool = False, track_dirty_pages: bool = False, + huge_pages: HugePagesConfig = None, rootfs_io_engine=None, cpu_template: Optional[str] = None, enable_entropy_device=False, @@ -658,6 +666,7 @@ def basic_config( mem_size_mib=mem_size_mib, track_dirty_pages=track_dirty_pages, cpu_template=cpu_template, + huge_pages=huge_pages, ) self.vcpus_count = vcpu_count self.mem_size_bytes = mem_size_mib * 2**20 diff --git a/tests/framework/utils.py b/tests/framework/utils.py index 3e982fcbfee..1fec2ccd96a 100644 --- a/tests/framework/utils.py +++ b/tests/framework/utils.py @@ -1,7 +1,6 @@ # Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. # SPDX-License-Identifier: Apache-2.0 """Generic utility functions that are used in the framework.""" - import functools import glob import json diff --git a/tests/framework/utils_ftrace.py b/tests/framework/utils_ftrace.py new file mode 100644 index 00000000000..6e5d636e33d --- /dev/null +++ b/tests/framework/utils_ftrace.py @@ -0,0 +1,28 @@ +# Copyright 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: Apache-2.0 +"""Utilities for interacting with the kernel's ftrace subsystem""" +import contextlib + +from framework.utils import run_cmd + + +@contextlib.contextmanager +def ftrace_events(events: str = "*:*"): + """Temporarily enables the kernel's tracing functionality for the specified events + + Assumes that the caller is the only test executing on the host""" + + # We have to do system-wide tracing because inside docker we live in a pidns, but trace-cmd does not know about + # this. We don't know how to translate the pidns PID to one ftrace would understand, so we use the fact that only + # one vm is running at the same time, and thus we can attribute all KVM events to this one VM + run_cmd("mount -t tracefs nodev /sys/kernel/tracing") + run_cmd("echo > /sys/kernel/tracing/trace") # clear the trace buffers + run_cmd(f"echo {events} > /sys/kernel/tracing/set_event") + run_cmd("echo nop > /sys/kernel/tracing/current_tracer") + run_cmd("echo 1 > /sys/kernel/tracing/tracing_on") + + try: + yield + finally: + run_cmd("echo 0 > /sys/kernel/tracing/tracing_on") + run_cmd("umount /sys/kernel/tracing") diff --git a/tests/framework/vm_config.json b/tests/framework/vm_config.json index 32b4f615655..5df673308d9 100644 --- a/tests/framework/vm_config.json +++ b/tests/framework/vm_config.json @@ -21,7 +21,8 @@ "vcpu_count": 2, "mem_size_mib": 1024, "smt": false, - "track_dirty_pages": false + "track_dirty_pages": false, + "huge_pages": "None" }, "cpu-config": null, "balloon": null, diff --git a/tests/integration_tests/functional/test_api.py b/tests/integration_tests/functional/test_api.py index 95086b5cd43..4c3c8c6998d 100644 --- a/tests/integration_tests/functional/test_api.py +++ b/tests/integration_tests/functional/test_api.py @@ -399,7 +399,10 @@ def test_api_machine_config(uvm_plain): # Test invalid mem_size_mib = 0. with pytest.raises( - RuntimeError, match=re.escape("The memory size (MiB) is invalid.") + RuntimeError, + match=re.escape( + "The memory size (MiB) is either 0, or not a multiple of the configured page size." 
+ ), ): test_microvm.api.machine_config.patch(mem_size_mib=0) @@ -1105,6 +1108,7 @@ def test_get_full_config_after_restoring_snapshot(microvm_factory, uvm_nano): "mem_size_mib": 256, "smt": True, "track_dirty_pages": False, + "huge_pages": "None", } if cpu_vendor == utils_cpuid.CpuVendor.ARM: @@ -1221,6 +1225,7 @@ def test_get_full_config(uvm_plain): "mem_size_mib": 256, "smt": False, "track_dirty_pages": False, + "huge_pages": "None", } expected_cfg["cpu-config"] = None expected_cfg["boot-source"] = { diff --git a/tests/integration_tests/functional/test_uffd.py b/tests/integration_tests/functional/test_uffd.py index 6e7e96552a8..8d70cedff46 100644 --- a/tests/integration_tests/functional/test_uffd.py +++ b/tests/integration_tests/functional/test_uffd.py @@ -110,7 +110,7 @@ def test_valid_handler(uvm_plain, snapshot, uffd_handler_paths): # Spawn page fault handler process. _pf_handler = spawn_pf_handler( - vm, uffd_handler_paths["valid_handler"], snapshot.mem + vm, uffd_handler_paths["valid_4k_handler"], snapshot.mem ) vm.restore_from_snapshot(snapshot, resume=True, uffd_path=SOCKET_PATH) @@ -144,7 +144,7 @@ def test_malicious_handler(uvm_plain, snapshot, uffd_handler_paths): # Spawn page fault handler process. _pf_handler = spawn_pf_handler( - vm, uffd_handler_paths["malicious_handler"], snapshot.mem + vm, uffd_handler_paths["malicious_4k_handler"], snapshot.mem ) # We expect Firecracker to freeze while resuming from a snapshot diff --git a/tests/integration_tests/performance/test_huge_pages.py b/tests/integration_tests/performance/test_huge_pages.py new file mode 100644 index 00000000000..40817ffec3e --- /dev/null +++ b/tests/integration_tests/performance/test_huge_pages.py @@ -0,0 +1,322 @@ +# Copyright 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: Apache-2.0 +"""Integration tests for Firecracker's huge pages support""" +import signal +import time + +import pytest + +from framework import utils +from framework.microvm import HugePagesConfig +from framework.properties import global_props +from framework.utils_ftrace import ftrace_events +from integration_tests.functional.test_uffd import SOCKET_PATH, spawn_pf_handler + + +def check_hugetlbfs_in_use(pid: int, allocation_name: str): + """Asserts that the process with the given `pid` is using hugetlbfs pages somewhere. + + `allocation_name` should be the name of the smaps entry for which we want to verify that huge pages are used. 
+ For memfd-backed guest memory, this would be "memfd:guest_mem" (the `guest_mem` part originating from the name + we give the memfd in memory.rs), for anonymous memory this would be "/anon_hugepage" + """ + + # Format of a sample smaps entry: + # 7fc2bc400000-7fc2cc400000 rw-s 00000000 00:10 25488401 /memfd:guest_mem (deleted) + # Size: 262144 kB + # KernelPageSize: 2048 kB + # MMUPageSize: 2048 kB + # Rss: 0 kB + # Pss: 0 kB + # Pss_Dirty: 0 kB + # Shared_Clean: 0 kB + # Shared_Dirty: 0 kB + # Private_Clean: 0 kB + # Private_Dirty: 0 kB + # Referenced: 0 kB + # Anonymous: 0 kB + # LazyFree: 0 kB + # AnonHugePages: 0 kB + # ShmemPmdMapped: 0 kB + # FilePmdMapped: 0 kB + # Shared_Hugetlb: 0 kB + # Private_Hugetlb: 92160 kB + # Swap: 0 kB + # SwapPss: 0 kB + # Locked: 0 kB + # THPeligible: 0 + # ProtectionKey: 0 + cmd = f"cat /proc/{pid}/smaps | grep {allocation_name} -A 23 | grep KernelPageSize" + _, stdout, _ = utils.run_cmd(cmd) + + kernel_page_size_kib = int(stdout.split()[1]) + assert kernel_page_size_kib > 4 + + +@pytest.mark.skipif( + global_props.host_linux_version == "4.14", + reason="MFD_HUGETLB | MFD_ALLOW_SEALING only supported on kernels >= 4.16", +) +def test_hugetlbfs_boot(uvm_plain): + """Tests booting a microvm with guest memory backed by 2MB hugetlbfs pages""" + + uvm_plain.spawn() + uvm_plain.basic_config(huge_pages=HugePagesConfig.HUGETLBFS_2MB, mem_size_mib=128) + uvm_plain.add_net_iface() + uvm_plain.start() + + rc, _, _ = uvm_plain.ssh.run("true") + assert not rc + + check_hugetlbfs_in_use( + uvm_plain.firecracker_pid, + "memfd:guest_mem", + ) + + +@pytest.mark.skipif( + global_props.host_linux_version == "4.14", + reason="MFD_HUGETLB | MFD_ALLOW_SEALING only supported on kernels >= 4.16", +) +def test_hugetlbfs_snapshot( + microvm_factory, guest_kernel_linux_5_10, rootfs_ubuntu_22, uffd_handler_paths +): + """ + Test hugetlbfs snapshot restore via uffd + """ + + ### Create Snapshot ### + vm = microvm_factory.build(guest_kernel_linux_5_10, rootfs_ubuntu_22) + vm.memory_monitor = None + vm.spawn() + vm.basic_config(huge_pages=HugePagesConfig.HUGETLBFS_2MB, mem_size_mib=128) + vm.add_net_iface() + vm.start() + + # Wait for microvm to boot + rc, _, _ = vm.ssh.run("true") + assert not rc + + check_hugetlbfs_in_use(vm.firecracker_pid, "memfd:guest_mem") + + snapshot = vm.snapshot_full() + + vm.kill() + + ### Restore Snapshot ### + vm = microvm_factory.build() + vm.spawn() + + # Spawn page fault handler process. + _pf_handler = spawn_pf_handler( + vm, uffd_handler_paths["valid_2m_handler"], snapshot.mem + ) + + vm.restore_from_snapshot(snapshot, resume=True, uffd_path=SOCKET_PATH) + + # Verify if guest can run commands. + rc, _, _ = vm.ssh.run("true") + assert not rc + + check_hugetlbfs_in_use(vm.firecracker_pid, "/anon_hugepage") + + +@pytest.mark.skipif( + global_props.host_linux_version == "4.14", + reason="MFD_HUGETLB | MFD_ALLOW_SEALING only supported on kernels >= 4.16", +) +def test_hugetlbfs_diff_snapshot(microvm_factory, uvm_plain, uffd_handler_paths): + """ + Test hugetlbfs differential snapshot support. + + Despite guest memory being backed by huge pages, differential snapshots still work at 4K granularity. 
+ """ + + ### Create Snapshot ### + uvm_plain.memory_monitor = None + uvm_plain.spawn() + uvm_plain.basic_config( + huge_pages=HugePagesConfig.HUGETLBFS_2MB, + mem_size_mib=128, + track_dirty_pages=True, + ) + uvm_plain.add_net_iface() + uvm_plain.start() + + # Wait for microvm to boot + rc, _, _ = uvm_plain.ssh.run("true") + assert not rc + + base_snapshot = uvm_plain.snapshot_diff() + uvm_plain.resume() + + # Run command to dirty some pages + rc, _, _ = uvm_plain.ssh.run("sync") + assert not rc + + snapshot_diff = uvm_plain.snapshot_diff() + snapshot_merged = snapshot_diff.rebase_snapshot(base_snapshot) + + uvm_plain.kill() + + vm = microvm_factory.build() + vm.spawn() + + # Spawn page fault handler process. + _pf_handler = spawn_pf_handler( + vm, uffd_handler_paths["valid_2m_handler"], snapshot_merged.mem + ) + + vm.restore_from_snapshot(snapshot_merged, resume=True, uffd_path=SOCKET_PATH) + + # Verify if guest can run commands. + rc, _, _ = vm.ssh.run("true") + assert not rc + + +@pytest.mark.skipif( + global_props.host_linux_version == "4.14", + reason="MFD_HUGETLB | MFD_ALLOW_SEALING only supported on kernels >= 4.16", +) +@pytest.mark.parametrize("huge_pages", HugePagesConfig) +def test_ept_violation_count( + microvm_factory, + guest_kernel_linux_5_10, + rootfs_ubuntu_22, + uffd_handler_paths, + metrics, + huge_pages, +): + """ + Tests hugetlbfs snapshot restore with a UFFD handler that pre-faults the entire guest memory + on the first page fault. Records metrics about the number of EPT_VIOLATIONS encountered by KVM. + """ + + ### Create Snapshot ### + vm = microvm_factory.build(guest_kernel_linux_5_10, rootfs_ubuntu_22) + vm.memory_monitor = None + vm.spawn() + vm.basic_config(huge_pages=huge_pages, mem_size_mib=256) + vm.add_net_iface() + vm.start() + + metrics.set_dimensions( + { + "performance_test": "test_hugetlbfs_snapshot", + "huge_pages_config": str(huge_pages), + **vm.dimensions, + } + ) + + # Wait for microvm to boot. Then spawn fast_page_fault_helper to setup an environment where we can trigger + # a lot of fast_page_faults after restoring the snapshot. + rc, _, _ = vm.ssh.run( + "nohup /usr/local/bin/fast_page_fault_helper >/dev/null 2>&1 env.list - if [[ $performance_tweaks -eq 1 ]] && [[ "$(uname --machine)" == "x86_64" ]]; then - say "Detected CI and performance tests, tuning CPU frequency scaling and idle states for reduced variability" + if [[ $performance_tweaks -eq 1 ]]; then + if [[ "$(uname --machine)" == "x86_64" ]]; then + say "Detected CI and performance tests, tuning CPU frequency scaling and idle states for reduced variability" + + apply_performance_tweaks + fi + + # It seems that even if the tests using huge pages run sequentially on ag=1 agents, right-sizing the huge pages + # pool to the total number of huge pages used across all tests results in spurious failures with pool depletion + # anyway (something else on the host seems to be stealing our huge pages, and we cannot "ear mark" them for + # Firecracker processes). Thus, just allocate 4GB of them and call it a day. + say "Setting up huge pages pool" + num_hugetlbfs_pages=2048 + + huge_pages_old=$(cat /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages) + huge_pages_new=$(echo $num_hugetlbfs_pages |sudo tee /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages) + fi - apply_performance_tweaks + if [[ "$huge_pages_new" -ne "$num_hugetlbfs_pages" ]]; then + die "Failed to allocate $num_hugetlbfs_pages hugetlbfs pages, only got $huge_pages_new" fi say "Starting test run ..." 
@@ -727,8 +743,12 @@ cmd_test() {
     cmd_fix_perms
 
     # undo performance tweaks (in case the instance gets recycled for a non-perf test)
-    if [[ $performance_tweaks -eq 1 ]] && [[ "$(uname --machine)" == "x86_64" ]]; then
-        unapply_performance_tweaks
+    if [[ $performance_tweaks -eq 1 ]]; then
+        if [[ "$(uname --machine)" == "x86_64" ]]; then
+            unapply_performance_tweaks
+        fi
+
+        echo $huge_pages_old | sudo tee /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages >/dev/null
     fi
 
     # do not leave behind env.list file
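For reference (not part of the patch above), the state of the hugetlbfs pool that `devtool` now manages can be inspected with the standard sysfs and procfs interfaces:

```bash
# Current pool size and number of free 2M pages.
cat /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages
cat /sys/kernel/mm/hugepages/hugepages-2048kB/free_hugepages

# Aggregated accounting (HugePages_Total, HugePages_Free, ...).
grep Huge /proc/meminfo
```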