diff --git a/Cargo.lock b/Cargo.lock index cf18c4387dc..96b27b06aa1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -9,7 +9,7 @@ dependencies = [ "displaydoc", "thiserror", "vm-memory 0.16.0", - "zerocopy 0.8.7", + "zerocopy 0.8.9", ] [[package]] @@ -64,9 +64,9 @@ checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299" [[package]] name = "anstream" -version = "0.6.17" +version = "0.6.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "23a1e53f0f5d86382dafe1cf314783b2044280f406e7e1506368220ad11b1338" +checksum = "8acc5369981196006228e28809f761875c0327210a891e941f4c683b3a99529b" dependencies = [ "anstyle", "anstyle-parse", @@ -79,9 +79,9 @@ dependencies = [ [[package]] name = "anstyle" -version = "1.0.9" +version = "1.0.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8365de52b16c035ff4fcafe0092ba9390540e3e352870ac09933bebcaa2c8c56" +checksum = "55cc3b69f167a1ef2e161439aa98aed94e6028e5f9a59be9a6ffb47aef1651f9" [[package]] name = "anstyle-parse" @@ -111,6 +111,12 @@ dependencies = [ "windows-sys 0.59.0", ] +[[package]] +name = "anyhow" +version = "1.0.93" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c95c10ba0b00a02636238b814946408b1322d5ac4760326e6fb8ec956d85775" + [[package]] name = "arrayvec" version = "0.7.6" @@ -129,7 +135,7 @@ version = "0.12.13" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bf12b67bc9c5168f68655aadb2a12081689a58f1d9b1484705e4d1810ed6e4ac" dependencies = [ - "bindgen 0.69.4", + "bindgen 0.69.5", "cc", "cmake", "dunce", @@ -158,7 +164,7 @@ version = "0.22.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "df7a4168111d7eb622a31b214057b8509c0a7e1794f44c546d742330dc793972" dependencies = [ - "bindgen 0.69.4", + "bindgen 0.69.5", "cc", "cmake", "dunce", @@ -204,14 +210,14 @@ dependencies = [ [[package]] name = "bindgen" -version = "0.69.4" +version = "0.69.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a00dc851838a2120612785d195287475a3ac45514741da670b735818822129a0" +checksum = "271383c67ccabffb7381723dea0672a673f292304fcb45c01cc648c7a8d58088" dependencies = [ "bitflags 2.6.0", "cexpr", "clang-sys", - "itertools 0.10.5", + "itertools 0.12.1", "lazy_static", "lazycell", "log", @@ -261,9 +267,9 @@ checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" [[package]] name = "cc" -version = "1.1.31" +version = "1.1.36" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c2e7962b54006dcfcc61cb72735f4d89bb97061dd6a7ed882ec6b8ee53714c6f" +checksum = "baee610e9452a8f6f0a1b6194ec09ff9e2d85dea54432acdae41aa0761c95d70" dependencies = [ "jobserver", "libc", @@ -707,9 +713,9 @@ dependencies = [ [[package]] name = "hashbrown" -version = "0.15.0" +version = "0.15.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e087f84d4f86bf4b218b927129862374b72199ae7d8657835f1e89000eea4fb" +checksum = "3a9bfc1af68b1726ea47d3d5109de126281def866b33970e10fbab11b5dafab3" [[package]] name = "heck" @@ -783,6 +789,15 @@ dependencies = [ "either", ] +[[package]] +name = "itertools" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569" +dependencies = [ + "either", +] + [[package]] name = "itertools" version = "0.13.0" @@ -961,6 +976,32 @@ version = "1.12.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "c9be0862c1b3f26a88803c4a49de6889c10e608b3ee9344e6ef5b45fb37ad3d1" +[[package]] +name = "mshv-bindings" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "576504619272a742fa7b75e69c9cd92520df5b4b66181c55e0d3eeb10d8341f8" +dependencies = [ + "libc", + "num_enum", + "serde", + "serde_derive", + "vmm-sys-util", + "zerocopy 0.8.9", +] + +[[package]] +name = "mshv-ioctls" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ccd62dfa7e0448b49700744f4d23e28ed7a49e83087ba6d7c06c4ee18b8821c" +dependencies = [ + "libc", + "mshv-bindings", + "thiserror", + "vmm-sys-util", +] + [[package]] name = "nix" version = "0.27.1" @@ -1004,6 +1045,26 @@ dependencies = [ "libm", ] +[[package]] +name = "num_enum" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4e613fc340b2220f734a8595782c551f1250e969d87d3be1ae0579e8d4065179" +dependencies = [ + "num_enum_derive", +] + +[[package]] +name = "num_enum_derive" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af1844ef2428cc3e1cb900be36181049ef3d3193c63e43026cfe202983b27a56" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "once_cell" version = "1.20.2" @@ -1028,6 +1089,26 @@ version = "1.0.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a" +[[package]] +name = "pci" +version = "0.1.0" +dependencies = [ + "anyhow", + "byteorder", + "kvm-bindings", + "kvm-ioctls", + "libc", + "log", + "serde", + "thiserror", + "vfio-bindings 0.2.0", + "vfio-ioctls", + "vm-device", + "vm-memory 0.16.0", + "vm-system-allocator", + "vmm-sys-util", +] + [[package]] name = "peeking_take_while" version = "0.1.2" @@ -1187,9 +1268,9 @@ checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" [[package]] name = "rustix" -version = "0.38.38" +version = "0.38.39" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aa260229e6538e52293eeb577aabd09945a09d6d9cc0fc550ed7529056c2e32a" +checksum = "375116bee2be9ed569afe2154ea6a99dfdffd257f533f187498c2a8f5feaf4ee" dependencies = [ "bitflags 2.6.0", "errno", @@ -1323,9 +1404,9 @@ checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" [[package]] name = "syn" -version = "2.0.85" +version = "2.0.87" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5023162dfcd14ef8f32034d8bcd4cc5ddc61ef7a247c024a33e24e1f24d21b56" +checksum = "25aa4ce346d03a6dcd68dd8b4010bcb74e54e62c90c573f394c46eae99aba32d" dependencies = [ "proc-macro2", "quote", @@ -1334,18 +1415,18 @@ dependencies = [ [[package]] name = "thiserror" -version = "1.0.65" +version = "1.0.68" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5d11abd9594d9b38965ef50805c5e469ca9cc6f197f883f717e0269a3057b3d5" +checksum = "02dd99dc800bbb97186339685293e1cc5d9df1f8fae2d0aecd9ff1c77efea892" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" -version = "1.0.65" +version = "1.0.68" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ae71770322cbd277e69d762a16c444af02aa0575ac0d174f0b9562d3b37f8602" +checksum = "a7c61ec9a6f64d2793d8a45faba21efbe3ced62a886d44c36a009b2b519b4c7e" dependencies = [ "proc-macro2", "quote", @@ -1519,6 +1600,38 @@ version = "0.9.5" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" +[[package]] +name = "vfio-bindings" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4a21f546f2bda37f5a8cfb138c87f95b8e34d2d78d6a7a92ba3785f4e08604a7" +dependencies = [ + "vmm-sys-util", +] + +[[package]] +name = "vfio-bindings" +version = "0.4.0" +source = "git+https://github.com/rust-vmm/vfio?branch=main#b135b8305c2cc8ec333e0cf77a780445cc98dcee" + +[[package]] +name = "vfio-ioctls" +version = "0.2.0" +source = "git+https://github.com/rust-vmm/vfio?branch=main#b135b8305c2cc8ec333e0cf77a780445cc98dcee" +dependencies = [ + "byteorder", + "kvm-bindings", + "kvm-ioctls", + "libc", + "log", + "mshv-bindings", + "mshv-ioctls", + "thiserror", + "vfio-bindings 0.4.0", + "vm-memory 0.16.0", + "vmm-sys-util", +] + [[package]] name = "vhost" version = "0.12.1" @@ -1542,6 +1655,18 @@ dependencies = [ "thiserror", ] +[[package]] +name = "vm-device" +version = "0.1.0" +dependencies = [ + "anyhow", + "serde", + "thiserror", + "vfio-ioctls", + "vm-memory 0.16.0", + "vmm-sys-util", +] + [[package]] name = "vm-fdt" version = "0.3.0" @@ -1576,17 +1701,27 @@ version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3428ee25acbfc75ed14600f2043876e0889cbd57c39dd441191417377cdceda0" +[[package]] +name = "vm-system-allocator" +version = "0.1.0" +dependencies = [ + "libc", + "vm-memory 0.16.0", +] + [[package]] name = "vmm" version = "0.1.0" dependencies = [ "acpi_tables", "aes-gcm", + "anyhow", "arrayvec", "aws-lc-rs", "base64", "bincode", "bitflags 2.6.0", + "byteorder", "crc64", "criterion", "derive_more", @@ -1605,6 +1740,7 @@ dependencies = [ "log-instrument", "memfd", "micro_http", + "pci", "proptest", "seccompiler", "semver", @@ -1615,13 +1751,17 @@ dependencies = [ "timerfd", "userfaultfd", "utils", + "uuid", + "vfio-ioctls", "vhost", "vm-allocator", + "vm-device", "vm-fdt", "vm-memory 0.16.0", "vm-superio", + "vm-system-allocator", "vmm-sys-util", - "zerocopy 0.8.7", + "zerocopy 0.8.9", ] [[package]] @@ -1798,11 +1938,11 @@ dependencies = [ [[package]] name = "zerocopy" -version = "0.8.7" +version = "0.8.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bb3da5f7220f919a6c7af7c856435a68ee1582fd7a77aa72936257d8335bd6f6" +checksum = "f49e690f8f352f4a9ee8679a8c5921f42ffd0d6d6413a0a66b8e81cf524e109c" dependencies = [ - "zerocopy-derive 0.8.7", + "zerocopy-derive 0.8.9", ] [[package]] @@ -1818,9 +1958,9 @@ dependencies = [ [[package]] name = "zerocopy-derive" -version = "0.8.7" +version = "0.8.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2e5f54f3cc93cd80745404626681b4b9fca9a867bad5a8424b618eb0db1ae6ea" +checksum = "fa732fcc881df7a6fbe8e3ed17baadece53b379ad58fe2633396b1a2b108a7b1" dependencies = [ "proc-macro2", "quote", diff --git a/src/acpi-tables/src/lib.rs b/src/acpi-tables/src/lib.rs index 301a2d1cc95..ee24b1a2a60 100644 --- a/src/acpi-tables/src/lib.rs +++ b/src/acpi-tables/src/lib.rs @@ -10,6 +10,7 @@ pub mod aml; pub mod dsdt; pub mod fadt; pub mod madt; +pub mod mcfg; pub mod rsdp; pub mod xsdt; @@ -17,6 +18,7 @@ pub use aml::Aml; pub use dsdt::Dsdt; pub use fadt::Fadt; pub use madt::Madt; +pub use mcfg::Mcfg; pub use rsdp::Rsdp; pub use xsdt::Xsdt; use zerocopy::little_endian::{U32, U64}; diff --git a/src/acpi-tables/src/mcfg.rs b/src/acpi-tables/src/mcfg.rs new file mode 100644 index 00000000000..e914f0cae7b --- 
/dev/null
+++ b/src/acpi-tables/src/mcfg.rs
@@ -0,0 +1,78 @@
+// Copyright © 2019 Intel Corporation
+// Copyright © 2023 Rivos, Inc.
+// Copyright 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+use std::mem::size_of;
+use vm_memory::{Bytes, GuestAddress, GuestMemory};
+use zerocopy::{Immutable, IntoBytes};
+
+use crate::{checksum, Result, Sdt, SdtHeader};
+
+#[allow(dead_code)]
+#[repr(packed)]
+#[derive(Default, Debug, IntoBytes, Clone, Copy, Immutable)]
+struct PciRangeEntry {
+    pub base_address: u64,
+    pub segment: u16,
+    pub start: u8,
+    pub end: u8,
+    _reserved: u32,
+}
+
+#[allow(dead_code)]
+#[repr(packed)]
+#[derive(Clone, Copy, Debug, Default, IntoBytes, Immutable)]
+pub struct Mcfg {
+    header: SdtHeader,
+    _reserved: u64,
+    pci_range_entry: PciRangeEntry,
+}
+
+impl Mcfg {
+    pub fn new(
+        oem_id: [u8; 6],
+        oem_table_id: [u8; 8],
+        oem_revision: u32,
+        pci_mmio_config_addr: u64,
+    ) -> Self {
+        let header = SdtHeader::new(
+            *b"MCFG",
+            size_of::<Mcfg>()
+                .try_into()
+                .unwrap(),
+            1,
+            oem_id,
+            oem_table_id,
+            oem_revision,
+        );
+
+        let mut mcfg = Mcfg {
+            header,
+            pci_range_entry: PciRangeEntry {
+                base_address: pci_mmio_config_addr,
+                segment: 0,
+                start: 0,
+                end: 0,
+                ..Default::default()
+            },
+            ..Default::default()
+        };
+
+        mcfg.header.checksum = checksum(&[mcfg.as_bytes()]);
+
+        mcfg
+    }
+}
+
+impl Sdt for Mcfg {
+    fn len(&self) -> usize {
+        self.as_bytes().len()
+    }
+
+    fn write_to_guest<M: GuestMemory>(&mut self, mem: &M, address: GuestAddress) -> Result<()> {
+        mem.write_slice(self.as_bytes(), address)?;
+        Ok(())
+    }
+}
diff --git a/src/pci/Cargo.toml b/src/pci/Cargo.toml
new file mode 100644
index 00000000000..d0f3f6f9b31
--- /dev/null
+++ b/src/pci/Cargo.toml
@@ -0,0 +1,26 @@
+[package]
+authors = ["Samuel Ortiz <sameo@linux.intel.com>"]
+edition = "2021"
+name = "pci"
+version = "0.1.0"
+
+[features]
+default = []
+kvm = ["vfio-ioctls/kvm"]
+mshv = ["vfio-ioctls/mshv"]
+
+[dependencies]
+anyhow = "1.0.87"
+byteorder = "1.5.0"
+vmm-sys-util = ">=0.3.1"
+libc = "0.2.158"
+log = { version = "0.4.22", features = ["std", "serde"] }
+vm-memory = { version = "0.16.0", features = ["backend-mmap", "backend-bitmap"] }
+vfio-ioctls = { git = "https://github.com/rust-vmm/vfio", branch = "main" }
+serde = { version = "1.0.208", features = ["derive"] }
+thiserror = "1.0.62"
+vfio-bindings = { version = "0.2.0", features = ["fam-wrappers"] }
+kvm-bindings = { version = "0.10.0", features = ["fam-wrappers"] }
+kvm-ioctls = "0.19.0"
+vm-device = { path = "../vm-device" }
+vm-system-allocator = { path = "../vm-system-allocator" }
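For reference, a minimal usage sketch of the new `Mcfg` table. The OEM identifiers below are made-up placeholders, and it assumes `Mcfg` and the `Sdt` trait are exported from the acpi-tables crate root; the single `PciRangeEntry` advertises an ECAM window for segment 0 covering bus 0 only.

```rust
use acpi_tables::{Mcfg, Sdt};

fn build_mcfg_example() {
    // Hypothetical OEM id / table id / revision; only the ECAM base matters here.
    let mcfg = Mcfg::new(*b"ABCDEF", *b"EXAMPLE0", 1, 0xE000_0000);
    // 36-byte SDT header + 8 reserved bytes + one 16-byte PciRangeEntry.
    assert_eq!(mcfg.len(), 60);
}
```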
diff --git a/src/pci/src/bus.rs b/src/pci/src/bus.rs
new file mode 100644
index 00000000000..fe70fe6d6fc
--- /dev/null
+++ b/src/pci/src/bus.rs
@@ -0,0 +1,465 @@
+// Copyright 2018 The Chromium OS Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE-BSD-3-Clause file.
+//
+// SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause
+
+use std::any::Any;
+use std::collections::HashMap;
+use std::fmt::Debug;
+use std::ops::DerefMut;
+use std::sync::{Arc, Barrier, Mutex};
+
+use byteorder::{ByteOrder, LittleEndian};
+
+use crate::configuration::{
+    PciBarRegionType, PciBridgeSubclass, PciClassCode, PciConfiguration, PciHeaderType,
+};
+use crate::device::{DeviceRelocation, Error as PciDeviceError, PciDevice};
+use crate::PciBarConfiguration;
+
+const VENDOR_ID_INTEL: u16 = 0x8086;
+const DEVICE_ID_INTEL_VIRT_PCIE_HOST: u16 = 0x0d57;
+const NUM_DEVICE_IDS: usize = 32;
+
+/// Errors for device manager.
+#[derive(Debug)]
+pub enum PciRootError {
+    /// Could not allocate device address space for the device.
+    AllocateDeviceAddrs(PciDeviceError),
+    /// Could not allocate an IRQ number.
+    AllocateIrq,
+    /// Could not add a device to the port io bus.
+    PioInsert(vm_device::BusError),
+    /// Could not add a device to the mmio bus.
+    MmioInsert(vm_device::BusError),
+    /// Could not find an available device slot on the PCI bus.
+    NoPciDeviceSlotAvailable,
+    /// Invalid PCI device identifier provided.
+    InvalidPciDeviceSlot(usize),
+    /// Valid PCI device identifier but already used.
+    AlreadyInUsePciDeviceSlot(usize),
+}
+pub type Result<T> = std::result::Result<T, PciRootError>;
+
+/// Emulates the PCI Root bridge device.
+pub struct PciRoot {
+    /// Configuration space.
+    config: PciConfiguration,
+}
+
+impl Debug for PciRoot {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("PciRoot")
+            // TODO
+            .finish()
+    }
+}
+
+impl PciRoot {
+    /// Create an empty PCI root bridge.
+    pub fn new(config: Option<PciConfiguration>) -> Self {
+        if let Some(config) = config {
+            PciRoot { config }
+        } else {
+            PciRoot {
+                config: PciConfiguration::new(
+                    VENDOR_ID_INTEL,
+                    DEVICE_ID_INTEL_VIRT_PCIE_HOST,
+                    0,
+                    PciClassCode::BridgeDevice,
+                    &PciBridgeSubclass::HostBridge,
+                    None,
+                    PciHeaderType::Device,
+                    0,
+                    0,
+                    None,
+                    None,
+                ),
+            }
+        }
+    }
+}
+
+impl PciDevice for PciRoot {
+    fn write_config_register(
+        &mut self,
+        reg_idx: usize,
+        offset: u64,
+        data: &[u8],
+    ) -> Option<Arc<Barrier>> {
+        self.config.write_config_register(reg_idx, offset, data);
+        None
+    }
+
+    fn read_config_register(&mut self, reg_idx: usize) -> u32 {
+        self.config.read_reg(reg_idx)
+    }
+
+    fn as_any(&mut self) -> &mut dyn Any {
+        self
+    }
+
+    fn id(&self) -> Option<String> {
+        None
+    }
+}
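As a quick sanity check of the host-bridge defaults above (a hedged sketch, assuming these types are re-exported at the crate root): register 0 packs `(device_id << 16) | vendor_id`, so a freshly created root reports the Intel virtual PCIe host IDs.

```rust
use pci::{PciDevice, PciRoot};

fn host_bridge_ids() {
    let mut root = PciRoot::new(None);
    // (DEVICE_ID_INTEL_VIRT_PCIE_HOST << 16) | VENDOR_ID_INTEL
    assert_eq!(root.read_config_register(0), 0x0d57_8086);
}
```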
+
+pub struct PciBus {
+    /// Devices attached to this bus.
+    /// Device 0 is host bridge.
+    devices: HashMap<u32, Arc<Mutex<dyn PciDevice>>>,
+    device_reloc: Arc<dyn DeviceRelocation>,
+    device_ids: Vec<bool>,
+}
+
+impl PciBus {
+    pub fn new(pci_root: PciRoot, device_reloc: Arc<dyn DeviceRelocation>) -> Self {
+        let mut devices: HashMap<u32, Arc<Mutex<dyn PciDevice>>> = HashMap::new();
+        let mut device_ids: Vec<bool> = vec![false; NUM_DEVICE_IDS];
+
+        devices.insert(0, Arc::new(Mutex::new(pci_root)));
+        device_ids[0] = true;
+
+        PciBus {
+            devices,
+            device_reloc,
+            device_ids,
+        }
+    }
+
+    pub fn add_device(&mut self, device_id: u32, device: Arc<Mutex<dyn PciDevice>>) -> Result<()> {
+        self.devices.insert(device_id, device);
+        Ok(())
+    }
+
+    pub fn remove_by_device(&mut self, device: &Arc<Mutex<dyn PciDevice>>) -> Result<()> {
+        self.devices.retain(|_, dev| !Arc::ptr_eq(dev, device));
+        Ok(())
+    }
+
+    pub fn next_device_id(&mut self) -> Result<u32> {
+        for (idx, device_id) in self.device_ids.iter_mut().enumerate() {
+            if !(*device_id) {
+                *device_id = true;
+                return Ok(idx as u32);
+            }
+        }
+
+        Err(PciRootError::NoPciDeviceSlotAvailable)
+    }
+
+    pub fn get_device_id(&mut self, id: usize) -> Result<()> {
+        if id < NUM_DEVICE_IDS {
+            if !self.device_ids[id] {
+                self.device_ids[id] = true;
+                Ok(())
+            } else {
+                Err(PciRootError::AlreadyInUsePciDeviceSlot(id))
+            }
+        } else {
+            Err(PciRootError::InvalidPciDeviceSlot(id))
+        }
+    }
+
+    pub fn put_device_id(&mut self, id: usize) -> Result<()> {
+        if id < NUM_DEVICE_IDS {
+            self.device_ids[id] = false;
+            Ok(())
+        } else {
+            Err(PciRootError::InvalidPciDeviceSlot(id))
+        }
+    }
+}
+
+pub struct PciConfigIo {
+    /// Config space register.
+    config_address: u32,
+    pci_bus: Arc<Mutex<PciBus>>,
+}
+
+impl Debug for PciConfigIo {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("PciConfigIo")
+            // TODO
+            .finish()
+    }
+}
+
+impl PciConfigIo {
+    pub fn new(pci_bus: Arc<Mutex<PciBus>>) -> Self {
+        PciConfigIo {
+            config_address: 0,
+            pci_bus,
+        }
+    }
+
+    pub fn config_space_read(&self) -> u32 {
+        let enabled = (self.config_address & 0x8000_0000) != 0;
+        if !enabled {
+            return 0xffff_ffff;
+        }
+
+        let (bus, device, function, register) =
+            parse_io_config_address(self.config_address & !0x8000_0000);
+
+        // Only support one bus.
+        if bus != 0 {
+            return 0xffff_ffff;
+        }
+
+        // Don't support multi-function devices.
+        if function > 0 {
+            return 0xffff_ffff;
+        }
+
+        self.pci_bus
+            .as_ref()
+            .lock()
+            .unwrap()
+            .devices
+            .get(&(device as u32))
+            .map_or(0xffff_ffff, |d| {
+                d.lock().unwrap().read_config_register(register)
+            })
+    }
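The `config_address` dword written to port 0xCF8 follows the legacy layout decoded by `parse_io_config_address()` at the bottom of this file; a hypothetical inverse helper (not part of this PR) makes the bit packing explicit:

```rust
/// Builds the CONFIG_ADDRESS dword a guest writes to port 0xCF8
/// (inverse of parse_io_config_address() defined later in this file).
fn io_config_address(bus: u32, device: u32, function: u32, register: u32) -> u32 {
    0x8000_0000 // enable bit
        | ((bus & 0xff) << 16)
        | ((device & 0x1f) << 11)
        | ((function & 0x07) << 8)
        | ((register & 0x3f) << 2)
}
```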
+
+    pub fn config_space_write(&mut self, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> {
+        if offset as usize + data.len() > 4 {
+            return None;
+        }
+
+        let enabled = (self.config_address & 0x8000_0000) != 0;
+        if !enabled {
+            return None;
+        }
+
+        let (bus, device, _function, register) =
+            parse_io_config_address(self.config_address & !0x8000_0000);
+
+        // Only support one bus.
+        if bus != 0 {
+            return None;
+        }
+
+        let pci_bus = self.pci_bus.as_ref().lock().unwrap();
+        if let Some(d) = pci_bus.devices.get(&(device as u32)) {
+            let mut device = d.lock().unwrap();
+
+            // Find out if one of the device's BARs is being reprogrammed, and
+            // reprogram it if needed.
+            if let Some(params) = device.detect_bar_reprogramming(register, data) {
+                if let Err(e) = pci_bus.device_reloc.move_bar(
+                    params.old_base,
+                    params.new_base,
+                    params.len,
+                    device.deref_mut(),
+                    params.region_type,
+                ) {
+                    error!(
+                        "Failed moving device BAR: {}: 0x{:x}->0x{:x}(0x{:x})",
+                        e, params.old_base, params.new_base, params.len
+                    );
+                }
+            }
+
+            // Update the register value
+            device.write_config_register(register, offset, data)
+        } else {
+            None
+        }
+    }
+
+    fn set_config_address(&mut self, offset: u64, data: &[u8]) {
+        if offset as usize + data.len() > 4 {
+            return;
+        }
+        let (mask, value): (u32, u32) = match data.len() {
+            1 => (
+                0x0000_00ff << (offset * 8),
+                u32::from(data[0]) << (offset * 8),
+            ),
+            2 => (
+                0x0000_ffff << (offset * 16),
+                (u32::from(data[1]) << 8 | u32::from(data[0])) << (offset * 16),
+            ),
+            4 => (0xffff_ffff, LittleEndian::read_u32(data)),
+            _ => return,
+        };
+        self.config_address = (self.config_address & !mask) | value;
+    }
+
+    pub fn bus_read(&mut self, _base: u64, offset: u64, data: &mut [u8]) {
+        // `offset` is relative to 0xcf8
+        let value = match offset {
+            0..=3 => self.config_address,
+            4..=7 => self.config_space_read(),
+            _ => 0xffff_ffff,
+        };
+
+        // Only allow reads to the register boundary.
+        let start = offset as usize % 4;
+        let end = start + data.len();
+        if end <= 4 {
+            for i in start..end {
+                data[i - start] = (value >> (i * 8)) as u8;
+            }
+        } else {
+            for d in data {
+                *d = 0xff;
+            }
+        }
+    }
+
+    pub fn bus_write(&mut self, _base: u64, offset: u64, data: &[u8]) {
+        // `offset` is relative to 0xcf8
+        match offset {
+            o @ 0..=3 => self.set_config_address(o, data),
+            o @ 4..=7 => {
+                self.config_space_write(o - 4, data);
+            }
+            _ => (),
+        }
+    }
+}
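Putting the pieces together, a hedged end-to-end sketch of the legacy access path. It assumes the `DeviceRelocation::move_bar` signature matches the call in `config_space_write()` above (the trait declaration is cut off at the end of this diff) and that these types are exported from the crate root:

```rust
use std::sync::{Arc, Mutex};

use pci::configuration::PciBarRegionType;
use pci::{DeviceRelocation, PciBus, PciConfigIo, PciDevice, PciRoot};

struct NoopRelocation;
impl DeviceRelocation for NoopRelocation {
    // Assumed signature, derived from the move_bar() call site above.
    fn move_bar(
        &self,
        _old_base: u64,
        _new_base: u64,
        _len: u64,
        _pci_dev: &mut dyn PciDevice,
        _region_type: PciBarRegionType,
    ) -> Result<(), std::io::Error> {
        Ok(())
    }
}

fn legacy_config_read() {
    let bus = Arc::new(Mutex::new(PciBus::new(
        PciRoot::new(None),
        Arc::new(NoopRelocation),
    )));
    let mut pio = PciConfigIo::new(bus);

    // Select bus 0 / device 0 / function 0 / register 0 with the enable bit
    // set, then read CONFIG_DATA: the host bridge answers with its IDs.
    pio.bus_write(0, 0, &0x8000_0000u32.to_le_bytes());
    let mut id = [0u8; 4];
    pio.bus_read(0, 4, &mut id);
    assert_eq!(u32::from_le_bytes(id), 0x0d57_8086);
}
```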
+
+/// Emulates PCI memory-mapped configuration access mechanism.
+pub struct PciConfigMmio {
+    pci_bus: Arc<Mutex<PciBus>>,
+}
+
+impl Debug for PciConfigMmio {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("PciConfigMmio")
+            // TODO
+            .finish()
+    }
+}
+
+impl PciConfigMmio {
+    pub fn new(pci_bus: Arc<Mutex<PciBus>>) -> Self {
+        PciConfigMmio { pci_bus }
+    }
+
+    fn config_space_read(&self, config_address: u32) -> u32 {
+        let (bus, device, _function, register) = parse_mmio_config_address(config_address);
+
+        // Only support one bus.
+        if bus != 0 {
+            return 0xffff_ffff;
+        }
+
+        self.pci_bus
+            .lock()
+            .unwrap()
+            .devices
+            .get(&(device as u32))
+            .map_or(0xffff_ffff, |d| {
+                d.lock().unwrap().read_config_register(register)
+            })
+    }
+
+    fn config_space_write(&mut self, config_address: u32, offset: u64, data: &[u8]) {
+        if offset as usize + data.len() > 4 {
+            return;
+        }
+
+        let (bus, device, _function, register) = parse_mmio_config_address(config_address);
+
+        // Only support one bus.
+        if bus != 0 {
+            return;
+        }
+
+        let pci_bus = self.pci_bus.lock().unwrap();
+        if let Some(d) = pci_bus.devices.get(&(device as u32)) {
+            let mut device = d.lock().unwrap();
+
+            // Find out if one of the device's BARs is being reprogrammed, and
+            // reprogram it if needed.
+            if let Some(params) = device.detect_bar_reprogramming(register, data) {
+                if let Err(e) = pci_bus.device_reloc.move_bar(
+                    params.old_base,
+                    params.new_base,
+                    params.len,
+                    device.deref_mut(),
+                    params.region_type,
+                ) {
+                    error!(
+                        "Failed moving device BAR: {}: 0x{:x}->0x{:x}(0x{:x})",
+                        e, params.old_base, params.new_base, params.len
+                    );
+                }
+            }
+
+            // Update the register value
+            device.write_config_register(register, offset, data);
+        }
+    }
+
+    pub fn bus_read(&mut self, _base: u64, offset: u64, data: &mut [u8]) {
+        // Only allow reads to the register boundary.
+        let start = offset as usize % 4;
+        let end = start + data.len();
+        if end > 4 || offset > u64::from(u32::MAX) {
+            for d in data {
+                *d = 0xff;
+            }
+            return;
+        }
+
+        let value = self.config_space_read(offset as u32);
+        for i in start..end {
+            data[i - start] = (value >> (i * 8)) as u8;
+        }
+    }
+
+    pub fn bus_write(&mut self, _base: u64, offset: u64, data: &[u8]) {
+        if offset > u64::from(u32::MAX) {
+            return;
+        }
+        self.config_space_write(offset as u32, offset % 4, data);
+    }
+}
+
+fn shift_and_mask(value: u32, offset: usize, mask: u32) -> usize {
+    ((value >> offset) & mask) as usize
+}
+
+// Parse the MMIO address offset to a (bus, device, function, register) tuple.
+// See section 7.2.2 PCI Express Enhanced Configuration Access Mechanism (ECAM)
+// from the PCI Express Base Specification Revision 5.0 Version 1.0.
+fn parse_mmio_config_address(config_address: u32) -> (usize, usize, usize, usize) {
+    const BUS_NUMBER_OFFSET: usize = 20;
+    const BUS_NUMBER_MASK: u32 = 0x00ff;
+    const DEVICE_NUMBER_OFFSET: usize = 15;
+    const DEVICE_NUMBER_MASK: u32 = 0x1f;
+    const FUNCTION_NUMBER_OFFSET: usize = 12;
+    const FUNCTION_NUMBER_MASK: u32 = 0x07;
+    const REGISTER_NUMBER_OFFSET: usize = 2;
+    const REGISTER_NUMBER_MASK: u32 = 0x3ff;
+
+    (
+        shift_and_mask(config_address, BUS_NUMBER_OFFSET, BUS_NUMBER_MASK),
+        shift_and_mask(config_address, DEVICE_NUMBER_OFFSET, DEVICE_NUMBER_MASK),
+        shift_and_mask(config_address, FUNCTION_NUMBER_OFFSET, FUNCTION_NUMBER_MASK),
+        shift_and_mask(config_address, REGISTER_NUMBER_OFFSET, REGISTER_NUMBER_MASK),
+    )
+}
+
+// Parse the CONFIG_ADDRESS register to a (bus, device, function, register) tuple.
+fn parse_io_config_address(config_address: u32) -> (usize, usize, usize, usize) {
+    const BUS_NUMBER_OFFSET: usize = 16;
+    const BUS_NUMBER_MASK: u32 = 0x00ff;
+    const DEVICE_NUMBER_OFFSET: usize = 11;
+    const DEVICE_NUMBER_MASK: u32 = 0x1f;
+    const FUNCTION_NUMBER_OFFSET: usize = 8;
+    const FUNCTION_NUMBER_MASK: u32 = 0x07;
+    const REGISTER_NUMBER_OFFSET: usize = 2;
+    const REGISTER_NUMBER_MASK: u32 = 0x3f;
+
+    (
+        shift_and_mask(config_address, BUS_NUMBER_OFFSET, BUS_NUMBER_MASK),
+        shift_and_mask(config_address, DEVICE_NUMBER_OFFSET, DEVICE_NUMBER_MASK),
+        shift_and_mask(config_address, FUNCTION_NUMBER_OFFSET, FUNCTION_NUMBER_MASK),
+        shift_and_mask(config_address, REGISTER_NUMBER_OFFSET, REGISTER_NUMBER_MASK),
+    )
+}
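The ECAM layout that `parse_mmio_config_address()` decodes is easy to state in the forward direction; a hypothetical helper (not part of this PR) showing how a guest turns bus/device/function/register into an offset from the MCFG base:

```rust
/// Offset of a config register inside the ECAM window advertised by the MCFG.
fn ecam_offset(bus: u64, device: u64, function: u64, register: u64) -> u64 {
    ((bus & 0xff) << 20)
        | ((device & 0x1f) << 15)
        | ((function & 0x07) << 12)
        | ((register & 0x3ff) << 2)
}

// e.g. bus 0, device 3, function 0, register 1 (the command/status dword):
// ecam_offset(0, 3, 0, 1) == 0x1_8004
```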
diff --git a/src/pci/src/configuration.rs b/src/pci/src/configuration.rs
new file mode 100644
index 00000000000..38210a37bc3
--- /dev/null
+++ b/src/pci/src/configuration.rs
@@ -0,0 +1,1251 @@
+// Copyright 2018 The Chromium OS Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE-BSD-3-Clause file.
+//
+// SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause
+
+use std::fmt::{self, Display};
+use std::sync::{Arc, Mutex};
+
+use byteorder::{ByteOrder, LittleEndian};
+use serde::{Deserialize, Serialize};
+use vm_device::PciBarType;
+
+use crate::device::BarReprogrammingParams;
+use crate::{MsixConfig, PciInterruptPin};
+
+// The number of 32bit registers in the config space, 4096 bytes.
+const NUM_CONFIGURATION_REGISTERS: usize = 1024;
+
+const STATUS_REG: usize = 1;
+const STATUS_REG_CAPABILITIES_USED_MASK: u32 = 0x0010_0000;
+const BAR0_REG: usize = 4;
+const ROM_BAR_REG: usize = 12;
+const ROM_BAR_IDX: usize = 6;
+const BAR_IO_ADDR_MASK: u32 = 0xffff_fffc;
+const BAR_MEM_ADDR_MASK: u32 = 0xffff_fff0;
+const ROM_BAR_ADDR_MASK: u32 = 0xffff_f800;
+const MSI_CAPABILITY_REGISTER_MASK: u32 = 0x0071_0000;
+const MSIX_CAPABILITY_REGISTER_MASK: u32 = 0xc000_0000;
+const NUM_BAR_REGS: usize = 6;
+const CAPABILITY_LIST_HEAD_OFFSET: usize = 0x34;
+const FIRST_CAPABILITY_OFFSET: usize = 0x40;
+const CAPABILITY_MAX_OFFSET: usize = 192;
+
+const INTERRUPT_LINE_PIN_REG: usize = 15;
+
+pub const PCI_CONFIGURATION_ID: &str = "pci_configuration";
+
+/// Represents the types of PCI headers allowed in the configuration registers.
+#[derive(Copy, Clone)]
+pub enum PciHeaderType {
+    Device,
+    Bridge,
+}
+
+/// Classes of PCI nodes.
+#[allow(dead_code)]
+#[derive(Copy, Clone)]
+pub enum PciClassCode {
+    TooOld,
+    MassStorage,
+    NetworkController,
+    DisplayController,
+    MultimediaController,
+    MemoryController,
+    BridgeDevice,
+    SimpleCommunicationController,
+    BaseSystemPeripheral,
+    InputDevice,
+    DockingStation,
+    Processor,
+    SerialBusController,
+    WirelessController,
+    IntelligentIoController,
+    EncryptionController,
+    DataAcquisitionSignalProcessing,
+    Other = 0xff,
+}
+
+impl PciClassCode {
+    pub fn get_register_value(self) -> u8 {
+        self as u8
+    }
+}
+
+/// A PCI subclass. Each class in `PciClassCode` can specify a unique set of subclasses. This trait
+/// is implemented by each subclass. It allows use of a trait object to generate configurations.
+pub trait PciSubclass {
+    /// Convert this subclass to the value used in the PCI specification.
+    fn get_register_value(&self) -> u8;
+}
+
+/// Subclasses of the MultimediaController class.
+#[allow(dead_code)] +#[derive(Copy, Clone)] +pub enum PciMultimediaSubclass { + VideoController = 0x00, + AudioController = 0x01, + TelephonyDevice = 0x02, + AudioDevice = 0x03, + Other = 0x80, +} + +impl PciSubclass for PciMultimediaSubclass { + fn get_register_value(&self) -> u8 { + *self as u8 + } +} + +/// Subclasses of the BridgeDevice +#[allow(dead_code)] +#[derive(Copy, Clone)] +pub enum PciBridgeSubclass { + HostBridge = 0x00, + IsaBridge = 0x01, + EisaBridge = 0x02, + McaBridge = 0x03, + PciToPciBridge = 0x04, + PcmciaBridge = 0x05, + NuBusBridge = 0x06, + CardBusBridge = 0x07, + RacEwayBridge = 0x08, + PciToPciSemiTransparentBridge = 0x09, + InfiniBrandToPciHostBridge = 0x0a, + OtherBridgeDevice = 0x80, +} + +impl PciSubclass for PciBridgeSubclass { + fn get_register_value(&self) -> u8 { + *self as u8 + } +} + +/// Subclass of the SerialBus +#[allow(dead_code)] +#[derive(Copy, Clone)] +pub enum PciSerialBusSubClass { + Firewire = 0x00, + Accessbus = 0x01, + Ssa = 0x02, + Usb = 0x03, +} + +impl PciSubclass for PciSerialBusSubClass { + fn get_register_value(&self) -> u8 { + *self as u8 + } +} + +/// Mass Storage Sub Classes +#[allow(dead_code)] +#[derive(Copy, Clone)] +pub enum PciMassStorageSubclass { + ScsiStorage = 0x00, + IdeInterface = 0x01, + FloppyController = 0x02, + IpiController = 0x03, + RaidController = 0x04, + AtaController = 0x05, + SataController = 0x06, + SerialScsiController = 0x07, + NvmController = 0x08, + MassStorage = 0x80, +} + +impl PciSubclass for PciMassStorageSubclass { + fn get_register_value(&self) -> u8 { + *self as u8 + } +} + +/// Network Controller Sub Classes +#[allow(dead_code)] +#[derive(Copy, Clone)] +pub enum PciNetworkControllerSubclass { + EthernetController = 0x00, + TokenRingController = 0x01, + FddiController = 0x02, + AtmController = 0x03, + IsdnController = 0x04, + WorldFipController = 0x05, + PicmgController = 0x06, + InfinibandController = 0x07, + FabricController = 0x08, + NetworkController = 0x80, +} + +impl PciSubclass for PciNetworkControllerSubclass { + fn get_register_value(&self) -> u8 { + *self as u8 + } +} + +/// Trait to define a PCI class programming interface +/// +/// Each combination of `PciClassCode` and `PciSubclass` can specify a +/// set of register-level programming interfaces. +/// This trait is implemented by each programming interface. +/// It allows use of a trait object to generate configurations. +pub trait PciProgrammingInterface { + /// Convert this programming interface to the value used in the PCI specification. + fn get_register_value(&self) -> u8; +} + +/// Types of PCI capabilities. 
+#[derive(PartialEq, Eq, Copy, Clone)]
+#[allow(dead_code)]
+#[allow(non_camel_case_types)]
+#[repr(u8)]
+pub enum PciCapabilityId {
+    ListId = 0,
+    PowerManagement = 0x01,
+    AcceleratedGraphicsPort = 0x02,
+    VitalProductData = 0x03,
+    SlotIdentification = 0x04,
+    MessageSignalledInterrupts = 0x05,
+    CompactPciHotSwap = 0x06,
+    PciX = 0x07,
+    HyperTransport = 0x08,
+    VendorSpecific = 0x09,
+    Debugport = 0x0A,
+    CompactPciCentralResourceControl = 0x0B,
+    PciStandardHotPlugController = 0x0C,
+    BridgeSubsystemVendorDeviceId = 0x0D,
+    AgpTargetPciPcibridge = 0x0E,
+    SecureDevice = 0x0F,
+    PciExpress = 0x10,
+    MsiX = 0x11,
+    SataDataIndexConf = 0x12,
+    PciAdvancedFeatures = 0x13,
+    PciEnhancedAllocation = 0x14,
+}
+
+impl From<u8> for PciCapabilityId {
+    fn from(c: u8) -> Self {
+        match c {
+            0 => PciCapabilityId::ListId,
+            0x01 => PciCapabilityId::PowerManagement,
+            0x02 => PciCapabilityId::AcceleratedGraphicsPort,
+            0x03 => PciCapabilityId::VitalProductData,
+            0x04 => PciCapabilityId::SlotIdentification,
+            0x05 => PciCapabilityId::MessageSignalledInterrupts,
+            0x06 => PciCapabilityId::CompactPciHotSwap,
+            0x07 => PciCapabilityId::PciX,
+            0x08 => PciCapabilityId::HyperTransport,
+            0x09 => PciCapabilityId::VendorSpecific,
+            0x0A => PciCapabilityId::Debugport,
+            0x0B => PciCapabilityId::CompactPciCentralResourceControl,
+            0x0C => PciCapabilityId::PciStandardHotPlugController,
+            0x0D => PciCapabilityId::BridgeSubsystemVendorDeviceId,
+            0x0E => PciCapabilityId::AgpTargetPciPcibridge,
+            0x0F => PciCapabilityId::SecureDevice,
+            0x10 => PciCapabilityId::PciExpress,
+            0x11 => PciCapabilityId::MsiX,
+            0x12 => PciCapabilityId::SataDataIndexConf,
+            0x13 => PciCapabilityId::PciAdvancedFeatures,
+            0x14 => PciCapabilityId::PciEnhancedAllocation,
+            _ => PciCapabilityId::ListId,
+        }
+    }
+}
+
+/// Types of PCI Express capabilities.
+#[derive(PartialEq, Eq, Copy, Clone, Debug)]
+#[allow(dead_code)]
+#[repr(u16)]
+pub enum PciExpressCapabilityId {
+    NullCapability = 0x0000,
+    AdvancedErrorReporting = 0x0001,
+    VirtualChannelMultiFunctionVirtualChannelNotPresent = 0x0002,
+    DeviceSerialNumber = 0x0003,
+    PowerBudgeting = 0x0004,
+    RootComplexLinkDeclaration = 0x0005,
+    RootComplexInternalLinkControl = 0x0006,
+    RootComplexEventCollectorEndpointAssociation = 0x0007,
+    MultiFunctionVirtualChannel = 0x0008,
+    VirtualChannelMultiFunctionVirtualChannelPresent = 0x0009,
+    RootComplexRegisterBlock = 0x000a,
+    VendorSpecificExtendedCapability = 0x000b,
+    ConfigurationAccessCorrelation = 0x000c,
+    AccessControlServices = 0x000d,
+    AlternativeRoutingIdentificationInterpretation = 0x000e,
+    AddressTranslationServices = 0x000f,
+    SingleRootIoVirtualization = 0x0010,
+    DeprecatedMultiRootIoVirtualization = 0x0011,
+    Multicast = 0x0012,
+    PageRequestInterface = 0x0013,
+    ReservedForAmd = 0x0014,
+    ResizeableBar = 0x0015,
+    DynamicPowerAllocation = 0x0016,
+    ThpRequester = 0x0017,
+    LatencyToleranceReporting = 0x0018,
+    SecondaryPciExpress = 0x0019,
+    ProtocolMultiplexing = 0x001a,
+    ProcessAddressSpaceId = 0x001b,
+    LnRequester = 0x001c,
+    DownstreamPortContainment = 0x001d,
+    L1PmSubstates = 0x001e,
+    PrecisionTimeMeasurement = 0x001f,
+    PciExpressOverMphy = 0x0020,
+    FRSQueueing = 0x0021,
+    ReadinessTimeReporting = 0x0022,
+    DesignatedVendorSpecificExtendedCapability = 0x0023,
+    VfResizeableBar = 0x0024,
+    DataLinkFeature = 0x0025,
+    PhysicalLayerSixteenGts = 0x0026,
+    LaneMarginingAtTheReceiver = 0x0027,
+    HierarchyId = 0x0028,
+    NativePcieEnclosureManagement = 0x0029,
+    PhysicalLayerThirtyTwoGts = 0x002a,
+    AlternateProtocol = 0x002b,
+    SystemFirmwareIntermediary = 0x002c,
+    ShadowFunctions = 0x002d,
+    DataObjectExchange = 0x002e,
+    Reserved = 0x002f,
+    ExtendedCapabilitiesAbsence = 0xffff,
+}
+
+impl From<u16> for PciExpressCapabilityId {
+    fn from(c: u16) -> Self {
+        match c {
+            0x0000 => PciExpressCapabilityId::NullCapability,
+            0x0001 => PciExpressCapabilityId::AdvancedErrorReporting,
+            0x0002 => PciExpressCapabilityId::VirtualChannelMultiFunctionVirtualChannelNotPresent,
+            0x0003 => PciExpressCapabilityId::DeviceSerialNumber,
+            0x0004 => PciExpressCapabilityId::PowerBudgeting,
+            0x0005 => PciExpressCapabilityId::RootComplexLinkDeclaration,
+            0x0006 => PciExpressCapabilityId::RootComplexInternalLinkControl,
+            0x0007 => PciExpressCapabilityId::RootComplexEventCollectorEndpointAssociation,
+            0x0008 => PciExpressCapabilityId::MultiFunctionVirtualChannel,
+            0x0009 => PciExpressCapabilityId::VirtualChannelMultiFunctionVirtualChannelPresent,
+            0x000a => PciExpressCapabilityId::RootComplexRegisterBlock,
+            0x000b => PciExpressCapabilityId::VendorSpecificExtendedCapability,
+            0x000c => PciExpressCapabilityId::ConfigurationAccessCorrelation,
+            0x000d => PciExpressCapabilityId::AccessControlServices,
+            0x000e => PciExpressCapabilityId::AlternativeRoutingIdentificationInterpretation,
+            0x000f => PciExpressCapabilityId::AddressTranslationServices,
+            0x0010 => PciExpressCapabilityId::SingleRootIoVirtualization,
+            0x0011 => PciExpressCapabilityId::DeprecatedMultiRootIoVirtualization,
+            0x0012 => PciExpressCapabilityId::Multicast,
+            0x0013 => PciExpressCapabilityId::PageRequestInterface,
+            0x0014 => PciExpressCapabilityId::ReservedForAmd,
+            0x0015 => PciExpressCapabilityId::ResizeableBar,
+            0x0016 => PciExpressCapabilityId::DynamicPowerAllocation,
+            0x0017 => PciExpressCapabilityId::ThpRequester,
+            0x0018 => PciExpressCapabilityId::LatencyToleranceReporting,
+            0x0019 => PciExpressCapabilityId::SecondaryPciExpress,
+            0x001a => PciExpressCapabilityId::ProtocolMultiplexing,
+            0x001b => PciExpressCapabilityId::ProcessAddressSpaceId,
+            0x001c => PciExpressCapabilityId::LnRequester,
+            0x001d => PciExpressCapabilityId::DownstreamPortContainment,
+            0x001e => PciExpressCapabilityId::L1PmSubstates,
+            0x001f => PciExpressCapabilityId::PrecisionTimeMeasurement,
+            0x0020 => PciExpressCapabilityId::PciExpressOverMphy,
+            0x0021 => PciExpressCapabilityId::FRSQueueing,
+            0x0022 => PciExpressCapabilityId::ReadinessTimeReporting,
+            0x0023 => PciExpressCapabilityId::DesignatedVendorSpecificExtendedCapability,
+            0x0024 => PciExpressCapabilityId::VfResizeableBar,
+            0x0025 => PciExpressCapabilityId::DataLinkFeature,
+            0x0026 => PciExpressCapabilityId::PhysicalLayerSixteenGts,
+            0x0027 => PciExpressCapabilityId::LaneMarginingAtTheReceiver,
+            0x0028 => PciExpressCapabilityId::HierarchyId,
+            0x0029 => PciExpressCapabilityId::NativePcieEnclosureManagement,
+            0x002a => PciExpressCapabilityId::PhysicalLayerThirtyTwoGts,
+            0x002b => PciExpressCapabilityId::AlternateProtocol,
+            0x002c => PciExpressCapabilityId::SystemFirmwareIntermediary,
+            0x002d => PciExpressCapabilityId::ShadowFunctions,
+            0x002e => PciExpressCapabilityId::DataObjectExchange,
+            0xffff => PciExpressCapabilityId::ExtendedCapabilitiesAbsence,
+            _ => PciExpressCapabilityId::Reserved,
+        }
+    }
+}
+
+/// A PCI capability list. Devices can optionally specify capabilities in their configuration space.
+pub trait PciCapability {
+    fn bytes(&self) -> &[u8];
+    fn id(&self) -> PciCapabilityId;
+}
+
+fn encode_32_bits_bar_size(bar_size: u32) -> Option<u32> {
+    if bar_size > 0 {
+        return Some(!(bar_size - 1));
+    }
+    None
+}
+
+fn decode_32_bits_bar_size(bar_size: u32) -> Option<u32> {
+    if bar_size > 0 {
+        return Some(!bar_size + 1);
+    }
+    None
+}
+
+fn encode_64_bits_bar_size(bar_size: u64) -> Option<(u32, u32)> {
+    if bar_size > 0 {
+        let result = !(bar_size - 1);
+        let result_hi = (result >> 32) as u32;
+        let result_lo = (result & 0xffff_ffff) as u32;
+        return Some((result_hi, result_lo));
+    }
+    None
+}
+
+fn decode_64_bits_bar_size(bar_size_hi: u32, bar_size_lo: u32) -> Option<u64> {
+    let bar_size: u64 = ((bar_size_hi as u64) << 32) | (bar_size_lo as u64);
+    if bar_size > 0 {
+        return Some(!bar_size + 1);
+    }
+    None
+}
+
+#[derive(Debug, Default, Clone, Copy, Serialize, Deserialize)]
+struct PciBar {
+    addr: u32,
+    size: u32,
+    used: bool,
+    r#type: Option<PciBarRegionType>,
+}
+
+#[derive(Serialize, Deserialize)]
+pub struct PciConfigurationState {
+    registers: Vec<u32>,
+    writable_bits: Vec<u32>,
+    bars: Vec<PciBar>,
+    rom_bar_addr: u32,
+    rom_bar_size: u32,
+    rom_bar_used: bool,
+    last_capability: Option<(usize, usize)>,
+    msix_cap_reg_idx: Option<usize>,
+}
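The helpers above implement the standard BAR sizing encoding (the encoded value is the two's-complement mask of the region size); a small hedged check of the round trip, written as it could appear in this file's tests:

```rust
fn bar_size_round_trip() {
    // 32-bit: a 4 KiB BAR encodes as !(0x1000 - 1) = 0xffff_f000.
    assert_eq!(encode_32_bits_bar_size(0x1000), Some(0xffff_f000));
    assert_eq!(decode_32_bits_bar_size(0xffff_f000), Some(0x1000));

    // 64-bit: a 4 GiB BAR splits the mask across the high/low registers.
    assert_eq!(encode_64_bits_bar_size(0x1_0000_0000), Some((0xffff_ffff, 0)));
    assert_eq!(decode_64_bits_bar_size(0xffff_ffff, 0), Some(0x1_0000_0000));
}
```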
+
+/// Contains the configuration space of a PCI node.
+///
+/// See the [specification](https://en.wikipedia.org/wiki/PCI_configuration_space).
+/// The configuration space is accessed with DWORD reads and writes from the guest.
+pub struct PciConfiguration {
+    registers: [u32; NUM_CONFIGURATION_REGISTERS],
+    writable_bits: [u32; NUM_CONFIGURATION_REGISTERS], // writable bits for each register.
+    bars: [PciBar; NUM_BAR_REGS],
+    rom_bar_addr: u32,
+    rom_bar_size: u32,
+    rom_bar_used: bool,
+    // Contains the byte offset and size of the last capability.
+    last_capability: Option<(usize, usize)>,
+    msix_cap_reg_idx: Option<usize>,
+    msix_config: Option<Arc<Mutex<MsixConfig>>>,
+}
+
+/// See pci_regs.h in kernel
+#[derive(Copy, Clone, PartialEq, Eq, Serialize, Deserialize, Debug)]
+pub enum PciBarRegionType {
+    Memory32BitRegion = 0,
+    IoRegion = 0x01,
+    Memory64BitRegion = 0x04,
+}
+
+impl From<PciBarType> for PciBarRegionType {
+    fn from(type_: PciBarType) -> Self {
+        match type_ {
+            PciBarType::Io => PciBarRegionType::IoRegion,
+            PciBarType::Mmio32 => PciBarRegionType::Memory32BitRegion,
+            PciBarType::Mmio64 => PciBarRegionType::Memory64BitRegion,
+        }
+    }
+}
+
+impl From<PciBarRegionType> for PciBarType {
+    fn from(val: PciBarRegionType) -> Self {
+        match val {
+            PciBarRegionType::IoRegion => PciBarType::Io,
+            PciBarRegionType::Memory32BitRegion => PciBarType::Mmio32,
+            PciBarRegionType::Memory64BitRegion => PciBarType::Mmio64,
+        }
+    }
+}
+
+#[derive(Copy, Clone)]
+pub enum PciBarPrefetchable {
+    NotPrefetchable = 0,
+    Prefetchable = 0x08,
+}
+
+impl From<PciBarPrefetchable> for bool {
+    fn from(val: PciBarPrefetchable) -> Self {
+        match val {
+            PciBarPrefetchable::NotPrefetchable => false,
+            PciBarPrefetchable::Prefetchable => true,
+        }
+    }
+}
+
+#[derive(Copy, Clone)]
+pub struct PciBarConfiguration {
+    addr: u64,
+    size: u64,
+    idx: usize,
+    region_type: PciBarRegionType,
+    prefetchable: PciBarPrefetchable,
+}
+
+#[derive(Debug)]
+pub enum Error {
+    BarAddressInvalid(u64, u64),
+    BarInUse(usize),
+    BarInUse64(usize),
+    BarInvalid(usize),
+    BarInvalid64(usize),
+    BarSizeInvalid(u64),
+    CapabilityEmpty,
+    CapabilityLengthInvalid(usize),
+    CapabilitySpaceFull(usize),
+    Decode32BarSize,
+    Decode64BarSize,
+    Encode32BarSize,
+    Encode64BarSize,
+    RomBarAddressInvalid(u64, u64),
+    RomBarInUse(usize),
+    RomBarInvalid(usize),
+    RomBarSizeInvalid(u64),
+}
+pub type Result<T> = std::result::Result<T, Error>;
+
+impl std::error::Error for Error {}
+
+impl Display for Error {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        use self::Error::*;
+        match self {
+            BarAddressInvalid(a, s) => write!(f, "address {a} size {s} too big"),
+            BarInUse(b) => write!(f, "bar {b} already used"),
+            BarInUse64(b) => write!(f, "64bit bar {b} already used (requires two regs)"),
+            BarInvalid(b) => write!(f, "bar {} invalid, max {}", b, NUM_BAR_REGS - 1),
+            BarInvalid64(b) => write!(
+                f,
+                "64bit bar {} invalid, requires two regs, max {}",
+                b,
+                NUM_BAR_REGS - 1
+            ),
+            BarSizeInvalid(s) => write!(f, "bar size {s} not a power of two"),
+            CapabilityEmpty => write!(f, "empty capabilities are invalid"),
+            CapabilityLengthInvalid(l) => write!(f, "Invalid capability length {l}"),
+            CapabilitySpaceFull(s) => write!(f, "capability of size {s} doesn't fit"),
+            Decode32BarSize => write!(f, "failed to decode 32 bits BAR size"),
+            Decode64BarSize => write!(f, "failed to decode 64 bits BAR size"),
+            Encode32BarSize => write!(f, "failed to encode 32 bits BAR size"),
+            Encode64BarSize => write!(f, "failed to encode 64 bits BAR size"),
+            RomBarAddressInvalid(a, s) => write!(f, "address {a} size {s} too big"),
+            RomBarInUse(b) => write!(f, "rom bar {b} already used"),
+            RomBarInvalid(b) => write!(f, "rom bar {} invalid, max {}", b, NUM_BAR_REGS - 1),
+            RomBarSizeInvalid(s) => write!(f, "rom bar size {s} not a power of two"),
+        }
+    }
+}
+
+impl PciConfiguration {
+    #[allow(clippy::too_many_arguments)]
+    pub fn new(
+        vendor_id: u16,
+        device_id: u16,
+        revision_id: u8,
+        class_code: PciClassCode,
+        subclass: &dyn PciSubclass,
+        programming_interface: Option<&dyn PciProgrammingInterface>,
+        header_type: PciHeaderType,
+        subsystem_vendor_id: u16,
+        subsystem_id: u16,
+        msix_config: Option<Arc<Mutex<MsixConfig>>>,
+        state: Option<PciConfigurationState>,
+    ) -> Self {
+        let (
+            registers,
+            writable_bits,
+            bars,
+            rom_bar_addr,
+            rom_bar_size,
+            rom_bar_used,
+            last_capability,
+            msix_cap_reg_idx,
+        ) = if let Some(state) = state {
+            (
+                state.registers.try_into().unwrap(),
+                state.writable_bits.try_into().unwrap(),
+                state.bars.try_into().unwrap(),
+                state.rom_bar_addr,
+                state.rom_bar_size,
+                state.rom_bar_used,
+                state.last_capability,
+                state.msix_cap_reg_idx,
+            )
+        } else {
+            let mut registers = [0u32; NUM_CONFIGURATION_REGISTERS];
+            let mut writable_bits = [0u32; NUM_CONFIGURATION_REGISTERS];
+            registers[0] = u32::from(device_id) << 16 | u32::from(vendor_id);
+            // TODO(dverkamp): Status should be write-1-to-clear
+            writable_bits[1] = 0x0000_ffff; // Status (r/o), command (r/w)
+            let pi = if let Some(pi) = programming_interface {
+                pi.get_register_value()
+            } else {
+                0
+            };
+            registers[2] = u32::from(class_code.get_register_value()) << 24
+                | u32::from(subclass.get_register_value()) << 16
+                | u32::from(pi) << 8
+                | u32::from(revision_id);
+            writable_bits[3] = 0x0000_00ff; // Cacheline size (r/w)
+            match header_type {
+                PciHeaderType::Device => {
+                    registers[3] = 0x0000_0000; // Header type 0 (device)
+                    writable_bits[15] = 0x0000_00ff; // Interrupt line (r/w)
+                }
+                PciHeaderType::Bridge => {
+                    registers[3] = 0x0001_0000; // Header type 1 (bridge)
+                    writable_bits[9] = 0xfff0_fff0; // Memory base and limit
+                    writable_bits[15] = 0xffff_00ff; // Bridge control (r/w), interrupt line (r/w)
+                }
+            };
+            registers[11] = u32::from(subsystem_id) << 16 | u32::from(subsystem_vendor_id);
+
+            (
+                registers,
+                writable_bits,
+                [PciBar::default(); NUM_BAR_REGS],
+                0,
+                0,
+                false,
+                None,
+                None,
+            )
+        };
+
+        PciConfiguration {
+            registers,
+            writable_bits,
+            bars,
+            rom_bar_addr,
+            rom_bar_size,
+            rom_bar_used,
+            last_capability,
+            msix_cap_reg_idx,
+            msix_config,
+        }
+    }
+
+    fn state(&self) -> PciConfigurationState {
+        PciConfigurationState {
+            registers: self.registers.to_vec(),
+            writable_bits: self.writable_bits.to_vec(),
+            bars: self.bars.to_vec(),
+            rom_bar_addr: self.rom_bar_addr,
+            rom_bar_size: self.rom_bar_size,
+            rom_bar_used: self.rom_bar_used,
+            last_capability: self.last_capability,
+            msix_cap_reg_idx: self.msix_cap_reg_idx,
+        }
+    }
+
+    /// Reads a 32bit register from `reg_idx` in the register map.
+    pub fn read_reg(&self, reg_idx: usize) -> u32 {
+        *(self.registers.get(reg_idx).unwrap_or(&0xffff_ffff))
+    }
+
+    /// Writes a 32bit register to `reg_idx` in the register map.
+    pub fn write_reg(&mut self, reg_idx: usize, value: u32) {
+        let mut mask = self.writable_bits[reg_idx];
+
+        if (BAR0_REG..BAR0_REG + NUM_BAR_REGS).contains(&reg_idx) {
+            // Handle very specific case where the BAR is being written with
+            // all 1's to retrieve the BAR size during next BAR reading.
+            if value == 0xffff_ffff {
+                mask &= self.bars[reg_idx - 4].size;
+            }
+        } else if reg_idx == ROM_BAR_REG {
+            // Handle very specific case where the BAR is being written with
+            // all 1's on bits 31-11 to retrieve the BAR size during next BAR
+            // reading.
+            if value & ROM_BAR_ADDR_MASK == ROM_BAR_ADDR_MASK {
+                mask &= self.rom_bar_size;
+            }
+        }
+
+        if let Some(r) = self.registers.get_mut(reg_idx) {
+            *r = (*r & !self.writable_bits[reg_idx]) | (value & mask);
+        } else {
+            warn!("bad PCI register write {}", reg_idx);
+        }
+    }
+
+    /// Writes a 16bit word to `offset`. `offset` must be 16bit aligned.
+    pub fn write_word(&mut self, offset: usize, value: u16) {
+        let shift = match offset % 4 {
+            0 => 0,
+            2 => 16,
+            _ => {
+                warn!("bad PCI config write offset {}", offset);
+                return;
+            }
+        };
+        let reg_idx = offset / 4;
+
+        if let Some(r) = self.registers.get_mut(reg_idx) {
+            let writable_mask = self.writable_bits[reg_idx];
+            let mask = (0xffffu32 << shift) & writable_mask;
+            let shifted_value = (u32::from(value) << shift) & writable_mask;
+            *r = *r & !mask | shifted_value;
+        } else {
+            warn!("bad PCI config write offset {}", offset);
+        }
+    }
+
+    /// Writes a byte to `offset`.
+    pub fn write_byte(&mut self, offset: usize, value: u8) {
+        self.write_byte_internal(offset, value, true);
+    }
+
+    /// Writes a byte to `offset`, optionally enforcing read-only bits.
+    fn write_byte_internal(&mut self, offset: usize, value: u8, apply_writable_mask: bool) {
+        let shift = (offset % 4) * 8;
+        let reg_idx = offset / 4;
+
+        if let Some(r) = self.registers.get_mut(reg_idx) {
+            let writable_mask = if apply_writable_mask {
+                self.writable_bits[reg_idx]
+            } else {
+                0xffff_ffff
+            };
+            let mask = (0xffu32 << shift) & writable_mask;
+            let shifted_value = (u32::from(value) << shift) & writable_mask;
+            *r = *r & !mask | shifted_value;
+        } else {
+            warn!("bad PCI config write offset {}", offset);
+        }
+    }
+
+    /// Adds a region specified by `config`. Configures the specified BAR(s) to
+    /// report this region and size to the guest kernel. Enforces a few constraints
+    /// (i.e., region size must be a power of two, register not already used).
+    pub fn add_pci_bar(&mut self, config: &PciBarConfiguration) -> Result<()> {
+        let bar_idx = config.idx;
+        let reg_idx = BAR0_REG + bar_idx;
+
+        if self.bars[bar_idx].used {
+            return Err(Error::BarInUse(bar_idx));
+        }
+
+        if config.size.count_ones() != 1 {
+            return Err(Error::BarSizeInvalid(config.size));
+        }
+
+        if bar_idx >= NUM_BAR_REGS {
+            return Err(Error::BarInvalid(bar_idx));
+        }
+
+        let end_addr = config
+            .addr
+            .checked_add(config.size - 1)
+            .ok_or(Error::BarAddressInvalid(config.addr, config.size))?;
+        match config.region_type {
+            PciBarRegionType::Memory32BitRegion | PciBarRegionType::IoRegion => {
+                if end_addr > u64::from(u32::MAX) {
+                    return Err(Error::BarAddressInvalid(config.addr, config.size));
+                }
+
+                // Encode the BAR size as expected by the software running in
+                // the guest.
+                self.bars[bar_idx].size =
+                    encode_32_bits_bar_size(config.size as u32).ok_or(Error::Encode32BarSize)?;
+            }
+            PciBarRegionType::Memory64BitRegion => {
+                if bar_idx + 1 >= NUM_BAR_REGS {
+                    return Err(Error::BarInvalid64(bar_idx));
+                }
+
+                if self.bars[bar_idx + 1].used {
+                    return Err(Error::BarInUse64(bar_idx));
+                }
+
+                // Encode the BAR size as expected by the software running in
+                // the guest.
+                let (bar_size_hi, bar_size_lo) =
+                    encode_64_bits_bar_size(config.size).ok_or(Error::Encode64BarSize)?;
+
+                self.registers[reg_idx + 1] = (config.addr >> 32) as u32;
+                self.writable_bits[reg_idx + 1] = 0xffff_ffff;
+                self.bars[bar_idx + 1].addr = self.registers[reg_idx + 1];
+                self.bars[bar_idx].size = bar_size_lo;
+                self.bars[bar_idx + 1].size = bar_size_hi;
+                self.bars[bar_idx + 1].used = true;
+            }
+        }
+
+        let (mask, lower_bits) = match config.region_type {
+            PciBarRegionType::Memory32BitRegion | PciBarRegionType::Memory64BitRegion => (
+                BAR_MEM_ADDR_MASK,
+                config.prefetchable as u32 | config.region_type as u32,
+            ),
+            PciBarRegionType::IoRegion => (BAR_IO_ADDR_MASK, config.region_type as u32),
+        };
+
+        self.registers[reg_idx] = ((config.addr as u32) & mask) | lower_bits;
+        self.writable_bits[reg_idx] = mask;
+        self.bars[bar_idx].addr = self.registers[reg_idx];
+        self.bars[bar_idx].used = true;
+        self.bars[bar_idx].r#type = Some(config.region_type);
+
+        Ok(())
+    }
+
+    /// Adds rom expansion BAR.
+    pub fn add_pci_rom_bar(&mut self, config: &PciBarConfiguration, active: u32) -> Result<()> {
+        let bar_idx = config.idx;
+        let reg_idx = ROM_BAR_REG;
+
+        if self.rom_bar_used {
+            return Err(Error::RomBarInUse(bar_idx));
+        }
+
+        if config.size.count_ones() != 1 {
+            return Err(Error::RomBarSizeInvalid(config.size));
+        }
+
+        if bar_idx != ROM_BAR_IDX {
+            return Err(Error::RomBarInvalid(bar_idx));
+        }
+
+        let end_addr = config
+            .addr
+            .checked_add(config.size - 1)
+            .ok_or(Error::RomBarAddressInvalid(config.addr, config.size))?;
+
+        if end_addr > u64::from(u32::MAX) {
+            return Err(Error::RomBarAddressInvalid(config.addr, config.size));
+        }
+
+        self.registers[reg_idx] = (config.addr as u32) | active;
+        self.writable_bits[reg_idx] = ROM_BAR_ADDR_MASK;
+        self.rom_bar_addr = self.registers[reg_idx];
+        self.rom_bar_size =
+            encode_32_bits_bar_size(config.size as u32).ok_or(Error::Encode32BarSize)?;
+        self.rom_bar_used = true;
+
+        Ok(())
+    }
+
+    /// Returns the address of the given BAR region.
+    pub fn get_bar_addr(&self, bar_num: usize) -> u64 {
+        let bar_idx = BAR0_REG + bar_num;
+
+        let mut addr = u64::from(self.bars[bar_num].addr & self.writable_bits[bar_idx]);
+
+        if let Some(bar_type) = self.bars[bar_num].r#type {
+            if bar_type == PciBarRegionType::Memory64BitRegion {
+                addr |= u64::from(self.bars[bar_num + 1].addr) << 32;
+            }
+        }
+
+        addr
+    }
+
+    /// Configures the IRQ line and pin used by this device.
+    pub fn set_irq(&mut self, line: u8, pin: PciInterruptPin) {
+        // `pin` is 1-based in the pci config space.
+        let pin_idx = (pin as u32) + 1;
+        self.registers[INTERRUPT_LINE_PIN_REG] = (self.registers[INTERRUPT_LINE_PIN_REG]
+            & 0xffff_0000)
+            | (pin_idx << 8)
+            | u32::from(line);
+    }
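To make the BAR sizing handshake concrete, a hedged sketch of what `add_pci_bar()` plus `write_reg()` give the guest (values chosen for illustration; written as it could appear in this file's tests):

```rust
fn bar_sizing_handshake() {
    let mut cfg = PciConfiguration::new(
        0x1234,
        0x5678,
        0x1,
        PciClassCode::MassStorage,
        &PciMassStorageSubclass::NvmController,
        None,
        PciHeaderType::Device,
        0,
        0,
        None,
        None,
    );
    cfg.add_pci_bar(
        &PciBarConfiguration::default()
            .set_index(0)
            .set_address(0x8000_0000)
            .set_size(0x1000)
            .set_region_type(PciBarRegionType::Memory32BitRegion),
    )
    .unwrap();

    // The guest writes all 1's to BAR0 (register 4)...
    cfg.write_config_register(4, 0, &0xffff_ffffu32.to_le_bytes());
    // ...and reads back the encoded size mask: !(0x1000 - 1) = 0xffff_f000.
    assert_eq!(cfg.read_config_register(4), 0xffff_f000);
}
```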
+
+    /// Adds the capability `cap_data` to the list of capabilities.
+    /// `cap_data` should include the two-byte PCI capability header (type, next),
+    /// but not populate it. Correct values will be generated automatically based
+    /// on `cap_data.id()`.
+    pub fn add_capability(&mut self, cap_data: &dyn PciCapability) -> Result<usize> {
+        let total_len = cap_data.bytes().len();
+        // Check that the length is valid.
+        if cap_data.bytes().is_empty() {
+            return Err(Error::CapabilityEmpty);
+        }
+        let (cap_offset, tail_offset) = match self.last_capability {
+            Some((offset, len)) => (Self::next_dword(offset, len), offset + 1),
+            None => (FIRST_CAPABILITY_OFFSET, CAPABILITY_LIST_HEAD_OFFSET),
+        };
+        let end_offset = cap_offset
+            .checked_add(total_len)
+            .ok_or(Error::CapabilitySpaceFull(total_len))?;
+        if end_offset > CAPABILITY_MAX_OFFSET {
+            return Err(Error::CapabilitySpaceFull(total_len));
+        }
+        self.registers[STATUS_REG] |= STATUS_REG_CAPABILITIES_USED_MASK;
+        self.write_byte_internal(tail_offset, cap_offset as u8, false);
+        self.write_byte_internal(cap_offset, cap_data.id() as u8, false);
+        self.write_byte_internal(cap_offset + 1, 0, false); // Next pointer.
+        for (i, byte) in cap_data.bytes().iter().enumerate() {
+            self.write_byte_internal(cap_offset + i + 2, *byte, false);
+        }
+        self.last_capability = Some((cap_offset, total_len));
+
+        match cap_data.id() {
+            PciCapabilityId::MessageSignalledInterrupts => {
+                self.writable_bits[cap_offset / 4] = MSI_CAPABILITY_REGISTER_MASK;
+            }
+            PciCapabilityId::MsiX => {
+                self.msix_cap_reg_idx = Some(cap_offset / 4);
+                self.writable_bits[self.msix_cap_reg_idx.unwrap()] = MSIX_CAPABILITY_REGISTER_MASK;
+            }
+            _ => {}
+        }
+
+        Ok(cap_offset)
+    }
+
+    // Find the next aligned offset after the one given.
+    fn next_dword(offset: usize, len: usize) -> usize {
+        let next = offset + len;
+        (next + 3) & !3
+    }
+
+    pub fn write_config_register(&mut self, reg_idx: usize, offset: u64, data: &[u8]) {
+        if offset as usize + data.len() > 4 {
+            return;
+        }
+
+        // Handle potential write to MSI-X message control register
+        if let Some(msix_cap_reg_idx) = self.msix_cap_reg_idx {
+            if let Some(msix_config) = &self.msix_config {
+                if msix_cap_reg_idx == reg_idx && offset == 2 && data.len() == 2 {
+                    msix_config
+                        .lock()
+                        .unwrap()
+                        .set_msg_ctl(LittleEndian::read_u16(data));
+                } else if msix_cap_reg_idx == reg_idx && offset == 0 && data.len() == 4 {
+                    msix_config
+                        .lock()
+                        .unwrap()
+                        .set_msg_ctl((LittleEndian::read_u32(data) >> 16) as u16);
+                }
+            }
+        }
+
+        match data.len() {
+            1 => self.write_byte(reg_idx * 4 + offset as usize, data[0]),
+            2 => self.write_word(
+                reg_idx * 4 + offset as usize,
+                u16::from(data[0]) | u16::from(data[1]) << 8,
+            ),
+            4 => self.write_reg(reg_idx, LittleEndian::read_u32(data)),
+            _ => (),
+        }
+    }
+
+    pub fn read_config_register(&self, reg_idx: usize) -> u32 {
+        self.read_reg(reg_idx)
+    }
+
+    pub fn detect_bar_reprogramming(
+        &mut self,
+        reg_idx: usize,
+        data: &[u8],
+    ) -> Option<BarReprogrammingParams> {
+        if data.len() != 4 {
+            return None;
+        }
+
+        let value = LittleEndian::read_u32(data);
+
+        let mask = self.writable_bits[reg_idx];
+        if (BAR0_REG..BAR0_REG + NUM_BAR_REGS).contains(&reg_idx) {
+            // Ignore the case where the BAR size is being asked for.
+            if value == 0xffff_ffff {
+                return None;
+            }
+
+            let bar_idx = reg_idx - 4;
+            // Handle special case where the address being written is
+            // different from the address initially provided. This is a
+            // BAR reprogramming case which needs to be properly caught.
+            if let Some(bar_type) = self.bars[bar_idx].r#type {
+                // In case of 64 bits memory BAR, we don't do anything until
+                // the upper BAR is modified, otherwise we would be moving the
+                // BAR to a wrong location in memory.
+                if bar_type == PciBarRegionType::Memory64BitRegion {
+                    return None;
+                }
+
+                // Ignore the case where the value is unchanged.
+ if (value & mask) == (self.bars[bar_idx].addr & mask) { + return None; + } + + info!( + "Detected BAR reprogramming: (BAR {}) 0x{:x}->0x{:x}", + reg_idx, self.registers[reg_idx], value + ); + let old_base = u64::from(self.bars[bar_idx].addr & mask); + let new_base = u64::from(value & mask); + let len = u64::from( + decode_32_bits_bar_size(self.bars[bar_idx].size) + .ok_or(Error::Decode32BarSize) + .unwrap(), + ); + let region_type = bar_type; + + self.bars[bar_idx].addr = value; + + return Some(BarReprogrammingParams { + old_base, + new_base, + len, + region_type, + }); + } else if (reg_idx > BAR0_REG) + && ((self.registers[reg_idx - 1] & self.writable_bits[reg_idx - 1]) + != (self.bars[bar_idx - 1].addr & self.writable_bits[reg_idx - 1]) + || (value & mask) != (self.bars[bar_idx].addr & mask)) + { + info!( + "Detected BAR reprogramming: (BAR {}) 0x{:x}->0x{:x}", + reg_idx, self.registers[reg_idx], value + ); + let old_base = u64::from(self.bars[bar_idx].addr & mask) << 32 + | u64::from(self.bars[bar_idx - 1].addr & self.writable_bits[reg_idx - 1]); + let new_base = u64::from(value & mask) << 32 + | u64::from(self.registers[reg_idx - 1] & self.writable_bits[reg_idx - 1]); + let len = + decode_64_bits_bar_size(self.bars[bar_idx].size, self.bars[bar_idx - 1].size) + .ok_or(Error::Decode64BarSize) + .unwrap(); + let region_type = PciBarRegionType::Memory64BitRegion; + + self.bars[bar_idx].addr = value; + self.bars[bar_idx - 1].addr = self.registers[reg_idx - 1]; + + return Some(BarReprogrammingParams { + old_base, + new_base, + len, + region_type, + }); + } + } else if reg_idx == ROM_BAR_REG && (value & mask) != (self.rom_bar_addr & mask) { + // Ignore the case where the BAR size is being asked for. + if value & ROM_BAR_ADDR_MASK == ROM_BAR_ADDR_MASK { + return None; + } + + info!( + "Detected ROM BAR reprogramming: (BAR {}) 0x{:x}->0x{:x}", + reg_idx, self.registers[reg_idx], value + ); + let old_base = u64::from(self.rom_bar_addr & mask); + let new_base = u64::from(value & mask); + let len = u64::from( + decode_32_bits_bar_size(self.rom_bar_size) + .ok_or(Error::Decode32BarSize) + .unwrap(), + ); + let region_type = PciBarRegionType::Memory32BitRegion; + + self.rom_bar_addr = value; + + return Some(BarReprogrammingParams { + old_base, + new_base, + len, + region_type, + }); + } + + None + } +} + +impl Default for PciBarConfiguration { + fn default() -> Self { + PciBarConfiguration { + idx: 0, + addr: 0, + size: 0, + region_type: PciBarRegionType::Memory64BitRegion, + prefetchable: PciBarPrefetchable::NotPrefetchable, + } + } +} + +impl PciBarConfiguration { + pub fn new( + idx: usize, + size: u64, + region_type: PciBarRegionType, + prefetchable: PciBarPrefetchable, + ) -> Self { + PciBarConfiguration { + idx, + addr: 0, + size, + region_type, + prefetchable, + } + } + + #[must_use] + pub fn set_index(mut self, idx: usize) -> Self { + self.idx = idx; + self + } + + #[must_use] + pub fn set_address(mut self, addr: u64) -> Self { + self.addr = addr; + self + } + + #[must_use] + pub fn set_size(mut self, size: u64) -> Self { + self.size = size; + self + } + + #[must_use] + pub fn set_region_type(mut self, region_type: PciBarRegionType) -> Self { + self.region_type = region_type; + self + } + + #[must_use] + pub fn set_prefetchable(mut self, prefetchable: PciBarPrefetchable) -> Self { + self.prefetchable = prefetchable; + self + } + + pub fn idx(&self) -> usize { + self.idx + } + + pub fn addr(&self) -> u64 { + self.addr + } + + pub fn size(&self) -> u64 { + self.size + } + + pub fn 
region_type(&self) -> PciBarRegionType { + self.region_type + } + + pub fn prefetchable(&self) -> PciBarPrefetchable { + self.prefetchable + } +} + +#[cfg(test)] +mod tests { + use vm_memory::ByteValued; + + use super::*; + + #[repr(packed)] + #[derive(Clone, Copy, Default)] + #[allow(dead_code)] + struct TestCap { + len: u8, + foo: u8, + } + + // SAFETY: All members are simple numbers and any value is valid. + unsafe impl ByteValued for TestCap {} + + impl PciCapability for TestCap { + fn bytes(&self) -> &[u8] { + self.as_slice() + } + + fn id(&self) -> PciCapabilityId { + PciCapabilityId::VendorSpecific + } + } + + #[test] + fn add_capability() { + let mut cfg = PciConfiguration::new( + 0x1234, + 0x5678, + 0x1, + PciClassCode::MultimediaController, + &PciMultimediaSubclass::AudioController, + None, + PciHeaderType::Device, + 0xABCD, + 0x2468, + None, + None, + ); + + // Add two capabilities with different contents. + let cap1 = TestCap { len: 4, foo: 0xAA }; + let cap1_offset = cfg.add_capability(&cap1).unwrap(); + assert_eq!(cap1_offset % 4, 0); + + let cap2 = TestCap { + len: 0x04, + foo: 0x55, + }; + let cap2_offset = cfg.add_capability(&cap2).unwrap(); + assert_eq!(cap2_offset % 4, 0); + + // The capability list head should be pointing to cap1. + let cap_ptr = cfg.read_reg(CAPABILITY_LIST_HEAD_OFFSET / 4) & 0xFF; + assert_eq!(cap1_offset, cap_ptr as usize); + + // Verify the contents of the capabilities. + let cap1_data = cfg.read_reg(cap1_offset / 4); + assert_eq!(cap1_data & 0xFF, 0x09); // capability ID + assert_eq!((cap1_data >> 8) & 0xFF, cap2_offset as u32); // next capability pointer + assert_eq!((cap1_data >> 16) & 0xFF, 0x04); // cap1.len + assert_eq!((cap1_data >> 24) & 0xFF, 0xAA); // cap1.foo + + let cap2_data = cfg.read_reg(cap2_offset / 4); + assert_eq!(cap2_data & 0xFF, 0x09); // capability ID + assert_eq!((cap2_data >> 8) & 0xFF, 0x00); // next capability pointer + assert_eq!((cap2_data >> 16) & 0xFF, 0x04); // cap2.len + assert_eq!((cap2_data >> 24) & 0xFF, 0x55); // cap2.foo + } + + #[derive(Copy, Clone)] + enum TestPi { + Test = 0x5a, + } + + impl PciProgrammingInterface for TestPi { + fn get_register_value(&self) -> u8 { + *self as u8 + } + } + + #[test] + fn class_code() { + let cfg = PciConfiguration::new( + 0x1234, + 0x5678, + 0x1, + PciClassCode::MultimediaController, + &PciMultimediaSubclass::AudioController, + Some(&TestPi::Test), + PciHeaderType::Device, + 0xABCD, + 0x2468, + None, + None, + ); + + let class_reg = cfg.read_reg(2); + let class_code = (class_reg >> 24) & 0xFF; + let subclass = (class_reg >> 16) & 0xFF; + let prog_if = (class_reg >> 8) & 0xFF; + assert_eq!(class_code, 0x04); + assert_eq!(subclass, 0x01); + assert_eq!(prog_if, 0x5a); + } +} diff --git a/src/pci/src/device.rs b/src/pci/src/device.rs new file mode 100644 index 00000000000..fec1c149e2b --- /dev/null +++ b/src/pci/src/device.rs @@ -0,0 +1,137 @@ +// Copyright 2018 The Chromium OS Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE-BSD-3-Clause file. +// +// SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause + +use std::any::Any; +use std::fmt::{self, Display}; +use std::sync::{Arc, Barrier, Mutex}; +use std::{io, result}; + +use vm_system_allocator::{AddressAllocator, SystemAllocator}; +use vm_device::Resource; + +use crate::configuration::{self, PciBarRegionType}; +use crate::PciBarConfiguration; + +#[derive(Debug)] +pub enum Error { + /// Setup of the device capabilities failed. 
+    CapabilitiesSetup(configuration::Error),
+    /// Allocating space for an IO BAR failed.
+    IoAllocationFailed(u64),
+    /// Registering an IO BAR failed.
+    IoRegistrationFailed(u64, configuration::Error),
+    /// Expected resource not found.
+    MissingResource,
+    /// Invalid resource.
+    InvalidResource(Resource),
+}
+pub type Result<T> = std::result::Result<T, Error>;
+
+impl Display for Error {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        use self::Error::*;
+
+        match self {
+            CapabilitiesSetup(e) => write!(f, "failed to add capability {e}"),
+            IoAllocationFailed(size) => {
+                write!(f, "failed to allocate space for an IO BAR, size={size}")
+            }
+            IoRegistrationFailed(addr, e) => {
+                write!(f, "failed to register an IO BAR, addr={addr} err={e}")
+            }
+            MissingResource => write!(f, "failed to find expected resource"),
+            InvalidResource(r) => write!(f, "invalid resource {r:?}"),
+        }
+    }
+}
+
+#[derive(Clone, Copy)]
+pub struct BarReprogrammingParams {
+    pub old_base: u64,
+    pub new_base: u64,
+    pub len: u64,
+    pub region_type: PciBarRegionType,
+}
+
+pub trait PciDevice: Send {
+    /// Allocates the needed PCI BARs space using the `allocate` function which takes a size
+    /// and returns an address. Returns the list of BAR configurations that were allocated.
+    fn allocate_bars(
+        &mut self,
+        _allocator: &Arc<Mutex<SystemAllocator>>,
+        _mmio32_allocator: &mut AddressAllocator,
+        _mmio64_allocator: &mut AddressAllocator,
+        _resources: Option<Vec<Resource>>,
+    ) -> Result<Vec<PciBarConfiguration>> {
+        Ok(Vec::new())
+    }
+
+    /// Frees the PCI BARs previously allocated with a call to allocate_bars().
+    fn free_bars(
+        &mut self,
+        _allocator: &mut SystemAllocator,
+        _mmio32_allocator: &mut AddressAllocator,
+        _mmio64_allocator: &mut AddressAllocator,
+    ) -> Result<()> {
+        Ok(())
+    }
+
+    /// Sets a register in the configuration space.
+    /// * `reg_idx` - The index of the config register to modify.
+    /// * `offset` - Offset into the register.
+    fn write_config_register(
+        &mut self,
+        reg_idx: usize,
+        offset: u64,
+        data: &[u8],
+    ) -> Option<Arc<Barrier>>;
+    /// Gets a register from the configuration space.
+    /// * `reg_idx` - The index of the config register to read.
+    fn read_config_register(&mut self, reg_idx: usize) -> u32;
+    /// Detects if a BAR is being reprogrammed.
+    fn detect_bar_reprogramming(
+        &mut self,
+        _reg_idx: usize,
+        _data: &[u8],
+    ) -> Option<BarReprogrammingParams> {
+        None
+    }
+    /// Reads from a BAR region mapped into the device.
+    /// * `addr` - The guest address inside the BAR.
+    /// * `data` - Filled with the data from `addr`.
+    fn read_bar(&mut self, _base: u64, _offset: u64, _data: &mut [u8]) {}
+    /// Writes to a BAR region mapped into the device.
+    /// * `addr` - The guest address inside the BAR.
+    /// * `data` - The data to write.
+    fn write_bar(&mut self, _base: u64, _offset: u64, _data: &[u8]) -> Option<Arc<Barrier>> {
+        None
+    }
+    /// Relocates the BAR to a different address in guest address space.
+    fn move_bar(&mut self, _old_base: u64, _new_base: u64) -> result::Result<(), io::Error> {
+        Ok(())
+    }
+    /// Provides a mutable reference to the Any trait. This is useful to let
+    /// the caller have access to the underlying type behind the trait.
+    fn as_any(&mut self) -> &mut dyn Any;
+
+    /// Optionally returns a unique identifier.
+    fn id(&self) -> Option<String>;
+}
+
+/// This trait defines a set of functions which can be triggered whenever a
+/// PCI device is modified in any way.
+pub trait DeviceRelocation: Send + Sync {
+    /// The BAR needs to be moved to a different location in the guest address
+    /// space. This follows a decision from the software running in the guest.
+ fn move_bar( + &self, + old_base: u64, + new_base: u64, + len: u64, + pci_dev: &mut dyn PciDevice, + region_type: PciBarRegionType, + ) -> result::Result<(), io::Error>; +} diff --git a/src/pci/src/lib.rs b/src/pci/src/lib.rs new file mode 100644 index 00000000000..ece54d0bf80 --- /dev/null +++ b/src/pci/src/lib.rs @@ -0,0 +1,186 @@ +// Copyright 2018 The Chromium OS Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE-BSD-3-Clause file. +// +// SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause + +//! Implements pci devices and busses. +#[macro_use] +extern crate log; + +mod bus; +pub mod configuration; +pub mod device; +pub mod msi; +pub mod msix; +pub mod vfio; + +use std::fmt::{self, Display}; +use std::num::ParseIntError; +use std::str::FromStr; + +use serde::de::Visitor; + +pub use self::bus::{PciBus, PciConfigIo, PciConfigMmio, PciRoot, PciRootError}; +pub use self::configuration::{ + PciBarConfiguration, PciBarPrefetchable, PciBarRegionType, PciCapability, PciCapabilityId, + PciClassCode, PciConfiguration, PciExpressCapabilityId, PciHeaderType, PciMassStorageSubclass, + PciNetworkControllerSubclass, PciProgrammingInterface, PciSerialBusSubClass, PciSubclass, + PCI_CONFIGURATION_ID, +}; +pub use self::device::{ + BarReprogrammingParams, DeviceRelocation, Error as PciDeviceError, PciDevice, +}; +pub use self::msi::{msi_num_enabled_vectors, MsiCap, MsiConfig}; +pub use self::msix::{MsixCap, MsixConfig, MsixTableEntry, MSIX_CONFIG_ID, MSIX_TABLE_ENTRY_SIZE}; +pub use self::vfio::{MmioRegion, VfioDmaMapping, VfioPciDevice, VfioPciError}; + +/// PCI has four interrupt pins A->D. +#[derive(Copy, Clone)] +pub enum PciInterruptPin { + IntA, + IntB, + IntC, + IntD, +} + +impl PciInterruptPin { + pub fn to_mask(self) -> u32 { + self as u32 + } +} + +#[cfg(target_arch = "x86_64")] +pub const PCI_CONFIG_IO_PORT: u64 = 0xcf8; +#[cfg(target_arch = "x86_64")] +pub const PCI_CONFIG_IO_PORT_SIZE: u64 = 0x8; + +#[derive(Clone, Copy, PartialEq, Eq, PartialOrd)] +pub struct PciBdf(u32); + +struct PciBdfVisitor; + +impl<'de> Visitor<'de> for PciBdfVisitor { + type Value = PciBdf; + + fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result { + formatter.write_str("struct PciBdf") + } + + fn visit_str(self, v: &str) -> Result + where + E: serde::de::Error, + { + Ok(v.into()) + } +} + +impl<'de> serde::Deserialize<'de> for PciBdf { + fn deserialize(deserializer: D) -> Result + where + D: serde::Deserializer<'de>, + { + deserializer.deserialize_str(PciBdfVisitor) + } +} + +impl serde::Serialize for PciBdf { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + serializer.collect_str(&self.to_string()) + } +} + +impl PciBdf { + pub fn segment(&self) -> u16 { + ((self.0 >> 16) & 0xffff) as u16 + } + + pub fn bus(&self) -> u8 { + ((self.0 >> 8) & 0xff) as u8 + } + + pub fn device(&self) -> u8 { + ((self.0 >> 3) & 0x1f) as u8 + } + + pub fn function(&self) -> u8 { + (self.0 & 0x7) as u8 + } + + pub fn new(segment: u16, bus: u8, device: u8, function: u8) -> Self { + Self( + (segment as u32) << 16 + | (bus as u32) << 8 + | ((device & 0x1f) as u32) << 3 + | (function & 0x7) as u32, + ) + } +} + +impl From for PciBdf { + fn from(bdf: u32) -> Self { + Self(bdf) + } +} + +impl From for u32 { + fn from(bdf: PciBdf) -> Self { + bdf.0 + } +} + +impl From<&PciBdf> for u32 { + fn from(bdf: &PciBdf) -> Self { + bdf.0 + } +} + +impl From for u16 { + fn from(bdf: PciBdf) -> Self { + (bdf.0 & 0xffff) as 
u16 + } +} + +impl From<&PciBdf> for u16 { + fn from(bdf: &PciBdf) -> Self { + (bdf.0 & 0xffff) as u16 + } +} + +impl Display for PciBdf { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "{:04x}:{:02x}:{:02x}.{:01x}", + self.segment(), + self.bus(), + self.device(), + self.function() + ) + } +} + +impl FromStr for PciBdf { + type Err = ParseIntError; + + fn from_str(s: &str) -> Result { + let items: Vec<&str> = s.split('.').collect(); + assert_eq!(items.len(), 2); + let function = u8::from_str_radix(items[1], 16)?; + let items: Vec<&str> = items[0].split(':').collect(); + assert_eq!(items.len(), 3); + let segment = u16::from_str_radix(items[0], 16)?; + let bus = u8::from_str_radix(items[1], 16)?; + let device = u8::from_str_radix(items[2], 16)?; + Ok(PciBdf::new(segment, bus, device, function)) + } +} + +impl From<&str> for PciBdf { + fn from(bdf: &str) -> Self { + Self::from_str(bdf).unwrap() + } +} diff --git a/src/pci/src/msi.rs b/src/pci/src/msi.rs new file mode 100644 index 00000000000..c8b41e68823 --- /dev/null +++ b/src/pci/src/msi.rs @@ -0,0 +1,284 @@ +// Copyright © 2019 Intel Corporation +// +// SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause +// + +use std::io; +use std::sync::Arc; + +use byteorder::{ByteOrder, LittleEndian}; +use serde::{Deserialize, Serialize}; +use thiserror::Error; +use vm_device::interrupt::{ + InterruptIndex, InterruptSourceConfig, InterruptSourceGroup, MsiIrqSourceConfig, +}; + +// MSI control masks +const MSI_CTL_ENABLE: u16 = 0x1; +const MSI_CTL_MULTI_MSG_ENABLE: u16 = 0x70; +const MSI_CTL_64_BITS: u16 = 0x80; +const MSI_CTL_PER_VECTOR: u16 = 0x100; + +// MSI message offsets +const MSI_MSG_CTL_OFFSET: u64 = 0x2; +const MSI_MSG_ADDR_LO_OFFSET: u64 = 0x4; + +// MSI message masks +const MSI_MSG_ADDR_LO_MASK: u32 = 0xffff_fffc; + +pub fn msi_num_enabled_vectors(msg_ctl: u16) -> usize { + let field = (msg_ctl >> 4) & 0x7; + + if field > 5 { + return 0; + } + + 1 << field +} + +#[derive(Error, Debug)] +pub enum Error { + #[error("Failed enabling the interrupt route: {0}")] + EnableInterruptRoute(io::Error), + #[error("Failed updating the interrupt route: {0}")] + UpdateInterruptRoute(io::Error), +} + +pub const MSI_CONFIG_ID: &str = "msi_config"; + +#[derive(Clone, Copy, Default, Serialize, Deserialize)] +pub struct MsiCap { + // Message Control Register + // 0: MSI enable. + // 3-1; Multiple message capable. + // 6-4: Multiple message enable. + // 7: 64 bits address capable. + // 8: Per-vector masking capable. + // 15-9: Reserved. + pub msg_ctl: u16, + // Message Address (LSB) + // 1-0: Reserved. + // 31-2: Message address. + pub msg_addr_lo: u32, + // Message Upper Address (MSB) + // 31-0: Message address. + pub msg_addr_hi: u32, + // Message Data + // 15-0: Message data. + pub msg_data: u16, + // Mask Bits + // 31-0: Mask bits. + pub mask_bits: u32, + // Pending Bits + // 31-0: Pending bits. 
+    pub pending_bits: u32,
+}
+
+impl MsiCap {
+    fn addr_64_bits(&self) -> bool {
+        self.msg_ctl & MSI_CTL_64_BITS == MSI_CTL_64_BITS
+    }
+
+    fn per_vector_mask(&self) -> bool {
+        self.msg_ctl & MSI_CTL_PER_VECTOR == MSI_CTL_PER_VECTOR
+    }
+
+    fn enabled(&self) -> bool {
+        self.msg_ctl & MSI_CTL_ENABLE == MSI_CTL_ENABLE
+    }
+
+    fn num_enabled_vectors(&self) -> usize {
+        msi_num_enabled_vectors(self.msg_ctl)
+    }
+
+    fn vector_masked(&self, vector: usize) -> bool {
+        if !self.per_vector_mask() {
+            return false;
+        }
+
+        (self.mask_bits >> vector) & 0x1 == 0x1
+    }
+
+    fn size(&self) -> u64 {
+        // Base capability: 0xa bytes (id, next, msg_ctl, msg_addr_lo, msg_data).
+        let mut size: u64 = 0xa;
+
+        if self.addr_64_bits() {
+            size += 0x4;
+        }
+        if self.per_vector_mask() {
+            size += 0xa;
+        }
+
+        size
+    }
+
+    fn update(&mut self, offset: u64, data: &[u8]) {
+        // Calculate the message data offset depending on the address being
+        // 32 or 64 bits, the upper address offset if the address is 64 bits,
+        // and the mask bits offset based on the address width and on whether
+        // per-vector masking is enabled.
+        let (msg_data_offset, addr_hi_offset, mask_bits_offset): (u64, Option<u64>, Option<u64>) =
+            if self.addr_64_bits() {
+                let mask_bits = if self.per_vector_mask() {
+                    Some(0x10)
+                } else {
+                    None
+                };
+                (0xc, Some(0x8), mask_bits)
+            } else {
+                let mask_bits = if self.per_vector_mask() {
+                    Some(0xc)
+                } else {
+                    None
+                };
+                (0x8, None, mask_bits)
+            };
+
+        // Update the cache without overriding the read-only bits.
+        match data.len() {
+            2 => {
+                let value = LittleEndian::read_u16(data);
+                match offset {
+                    MSI_MSG_CTL_OFFSET => {
+                        self.msg_ctl = (self.msg_ctl & !(MSI_CTL_ENABLE | MSI_CTL_MULTI_MSG_ENABLE))
+                            | (value & (MSI_CTL_ENABLE | MSI_CTL_MULTI_MSG_ENABLE))
+                    }
+                    x if x == msg_data_offset => self.msg_data = value,
+                    _ => error!("invalid offset"),
+                }
+            }
+            4 => {
+                let value = LittleEndian::read_u32(data);
+                match offset {
+                    0x0 => {
+                        self.msg_ctl = (self.msg_ctl & !(MSI_CTL_ENABLE | MSI_CTL_MULTI_MSG_ENABLE))
+                            | ((value >> 16) as u16 & (MSI_CTL_ENABLE | MSI_CTL_MULTI_MSG_ENABLE))
+                    }
+                    MSI_MSG_ADDR_LO_OFFSET => self.msg_addr_lo = value & MSI_MSG_ADDR_LO_MASK,
+                    x if x == msg_data_offset => self.msg_data = value as u16,
+                    x if addr_hi_offset.is_some() && x == addr_hi_offset.unwrap() => {
+                        self.msg_addr_hi = value
+                    }
+                    x if mask_bits_offset.is_some() && x == mask_bits_offset.unwrap() => {
+                        self.mask_bits = value
+                    }
+                    _ => error!("invalid offset"),
+                }
+            }
+            _ => error!("invalid data length"),
+        }
+    }
+}
+
+#[derive(Serialize, Deserialize)]
+pub struct MsiConfigState {
+    cap: MsiCap,
+}
+
+pub struct MsiConfig {
+    pub cap: MsiCap,
+    interrupt_source_group: Arc<dyn InterruptSourceGroup>,
+}
+
+impl MsiConfig {
+    pub fn new(
+        msg_ctl: u16,
+        interrupt_source_group: Arc<dyn InterruptSourceGroup>,
+        state: Option<MsiConfigState>,
+    ) -> Result<Self, Error> {
+        let cap = if let Some(state) = state {
+            if state.cap.enabled() {
+                for idx in 0..state.cap.num_enabled_vectors() {
+                    let config = MsiIrqSourceConfig {
+                        high_addr: state.cap.msg_addr_hi,
+                        low_addr: state.cap.msg_addr_lo,
+                        data: state.cap.msg_data as u32,
+                        devid: 0,
+                    };
+
+                    interrupt_source_group
+                        .update(
+                            idx as InterruptIndex,
+                            InterruptSourceConfig::MsiIrq(config),
+                            state.cap.vector_masked(idx),
+                            false,
+                        )
+                        .map_err(Error::UpdateInterruptRoute)?;
+                }
+
+                interrupt_source_group
+                    .set_gsi()
+                    .map_err(Error::EnableInterruptRoute)?;
+
+                interrupt_source_group
+                    .enable()
+                    .map_err(Error::EnableInterruptRoute)?;
+            }
+
+            state.cap
+        } else {
+            MsiCap {
+                msg_ctl,
+                ..Default::default()
+            }
+        };
+
+        Ok(MsiConfig {
+            cap,
+            interrupt_source_group,
+        })
+    }
+
+    fn
state(&self) -> MsiConfigState { + MsiConfigState { cap: self.cap } + } + + pub fn enabled(&self) -> bool { + self.cap.enabled() + } + + pub fn size(&self) -> u64 { + self.cap.size() + } + + pub fn num_enabled_vectors(&self) -> usize { + self.cap.num_enabled_vectors() + } + + pub fn update(&mut self, offset: u64, data: &[u8]) { + let old_enabled = self.cap.enabled(); + + self.cap.update(offset, data); + + if self.cap.enabled() { + for idx in 0..self.num_enabled_vectors() { + let config = MsiIrqSourceConfig { + high_addr: self.cap.msg_addr_hi, + low_addr: self.cap.msg_addr_lo, + data: self.cap.msg_data as u32, + devid: 0, + }; + + if let Err(e) = self.interrupt_source_group.update( + idx as InterruptIndex, + InterruptSourceConfig::MsiIrq(config), + self.cap.vector_masked(idx), + true, + ) { + error!("Failed updating vector: {:?}", e); + } + } + + if !old_enabled { + if let Err(e) = self.interrupt_source_group.enable() { + error!("Failed enabling irq_fd: {:?}", e); + } + } + } else if old_enabled { + if let Err(e) = self.interrupt_source_group.disable() { + error!("Failed disabling irq_fd: {:?}", e); + } + } + } +} diff --git a/src/pci/src/msix.rs b/src/pci/src/msix.rs new file mode 100644 index 00000000000..c393085667c --- /dev/null +++ b/src/pci/src/msix.rs @@ -0,0 +1,552 @@ +// Copyright © 2019 Intel Corporation +// +// SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause +// + +use std::sync::Arc; +use std::{io, result}; + +use byteorder::{ByteOrder, LittleEndian}; +use serde::{Deserialize, Serialize}; +use vm_device::interrupt::{ + InterruptIndex, InterruptSourceConfig, InterruptSourceGroup, MsiIrqSourceConfig, +}; +use vm_memory::ByteValued; + +use crate::{PciCapability, PciCapabilityId}; + +const MAX_MSIX_VECTORS_PER_DEVICE: u16 = 2048; +const MSIX_TABLE_ENTRIES_MODULO: u64 = 16; +const MSIX_PBA_ENTRIES_MODULO: u64 = 8; +const BITS_PER_PBA_ENTRY: usize = 64; +const FUNCTION_MASK_BIT: u8 = 14; +const MSIX_ENABLE_BIT: u8 = 15; +const FUNCTION_MASK_MASK: u16 = (1 << FUNCTION_MASK_BIT) as u16; +const MSIX_ENABLE_MASK: u16 = (1 << MSIX_ENABLE_BIT) as u16; +pub const MSIX_TABLE_ENTRY_SIZE: usize = 16; +pub const MSIX_CONFIG_ID: &str = "msix_config"; + +#[derive(Debug)] +pub enum Error { + /// Failed enabling the interrupt route. + EnableInterruptRoute(io::Error), + /// Failed updating the interrupt route. 
+ UpdateInterruptRoute(io::Error), +} + +#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)] +pub struct MsixTableEntry { + pub msg_addr_lo: u32, + pub msg_addr_hi: u32, + pub msg_data: u32, + pub vector_ctl: u32, +} + +impl MsixTableEntry { + pub fn masked(&self) -> bool { + self.vector_ctl & 0x1 == 0x1 + } +} + +impl Default for MsixTableEntry { + fn default() -> Self { + MsixTableEntry { + msg_addr_lo: 0, + msg_addr_hi: 0, + msg_data: 0, + vector_ctl: 0x1, + } + } +} + +#[derive(Serialize, Deserialize)] +pub struct MsixConfigState { + table_entries: Vec, + pba_entries: Vec, + masked: bool, + enabled: bool, +} + +pub struct MsixConfig { + pub table_entries: Vec, + pub pba_entries: Vec, + pub devid: u32, + interrupt_source_group: Arc, + masked: bool, + enabled: bool, +} + +impl MsixConfig { + pub fn new( + msix_vectors: u16, + interrupt_source_group: Arc, + devid: u32, + state: Option, + ) -> result::Result { + assert!(msix_vectors <= MAX_MSIX_VECTORS_PER_DEVICE); + + let (table_entries, pba_entries, masked, enabled) = if let Some(state) = state { + if state.enabled && !state.masked { + for (idx, table_entry) in state.table_entries.iter().enumerate() { + if table_entry.masked() { + continue; + } + + let config = MsiIrqSourceConfig { + high_addr: table_entry.msg_addr_hi, + low_addr: table_entry.msg_addr_lo, + data: table_entry.msg_data, + devid, + }; + + interrupt_source_group + .update( + idx as InterruptIndex, + InterruptSourceConfig::MsiIrq(config), + state.masked, + true, + ) + .map_err(Error::UpdateInterruptRoute)?; + + interrupt_source_group + .enable() + .map_err(Error::EnableInterruptRoute)?; + } + } + + ( + state.table_entries, + state.pba_entries, + state.masked, + state.enabled, + ) + } else { + let mut table_entries: Vec = Vec::new(); + table_entries.resize_with(msix_vectors as usize, Default::default); + let mut pba_entries: Vec = Vec::new(); + let num_pba_entries: usize = ((msix_vectors as usize) / BITS_PER_PBA_ENTRY) + 1; + pba_entries.resize_with(num_pba_entries, Default::default); + + (table_entries, pba_entries, true, false) + }; + + Ok(MsixConfig { + table_entries, + pba_entries, + devid, + interrupt_source_group, + masked, + enabled, + }) + } + + fn state(&self) -> MsixConfigState { + MsixConfigState { + table_entries: self.table_entries.clone(), + pba_entries: self.pba_entries.clone(), + masked: self.masked, + enabled: self.enabled, + } + } + + pub fn masked(&self) -> bool { + self.masked + } + + pub fn enabled(&self) -> bool { + self.enabled + } + + pub fn set_msg_ctl(&mut self, reg: u16) { + let old_masked = self.masked; + let old_enabled = self.enabled; + + self.masked = ((reg >> FUNCTION_MASK_BIT) & 1u16) == 1u16; + self.enabled = ((reg >> MSIX_ENABLE_BIT) & 1u16) == 1u16; + + // Update interrupt routing + if old_masked != self.masked || old_enabled != self.enabled { + if self.enabled && !self.masked { + debug!("MSI-X enabled for device 0x{:x}", self.devid); + for (idx, table_entry) in self.table_entries.iter().enumerate() { + let config = MsiIrqSourceConfig { + high_addr: table_entry.msg_addr_hi, + low_addr: table_entry.msg_addr_lo, + data: table_entry.msg_data, + devid: self.devid, + }; + + if let Err(e) = self.interrupt_source_group.update( + idx as InterruptIndex, + InterruptSourceConfig::MsiIrq(config), + table_entry.masked(), + true, + ) { + error!("Failed updating vector: {:?}", e); + } + } + } else if old_enabled || !old_masked { + debug!("MSI-X disabled for device 0x{:x}", self.devid); + if let Err(e) = self.interrupt_source_group.disable() { + 
error!("Failed disabling irq_fd: {:?}", e); + } + } + } + + // If the Function Mask bit was set, and has just been cleared, it's + // important to go through the entire PBA to check if there was any + // pending MSI-X message to inject, given that the vector is not + // masked. + if old_masked && !self.masked { + for (index, entry) in self.table_entries.clone().iter().enumerate() { + if !entry.masked() && self.get_pba_bit(index as u16) == 1 { + self.inject_msix_and_clear_pba(index); + } + } + } + } + + pub fn read_table(&self, offset: u64, data: &mut [u8]) { + assert!((data.len() == 4 || data.len() == 8)); + + let index: usize = (offset / MSIX_TABLE_ENTRIES_MODULO) as usize; + let modulo_offset = offset % MSIX_TABLE_ENTRIES_MODULO; + + if index >= self.table_entries.len() { + debug!("Invalid MSI-X table entry index {index}"); + data.copy_from_slice(&[0xff; 8][..data.len()]); + return; + } + + match data.len() { + 4 => { + let value = match modulo_offset { + 0x0 => self.table_entries[index].msg_addr_lo, + 0x4 => self.table_entries[index].msg_addr_hi, + 0x8 => self.table_entries[index].msg_data, + 0xc => self.table_entries[index].vector_ctl, + _ => { + error!("invalid offset"); + 0 + } + }; + + debug!("MSI_R TABLE offset 0x{:x} data 0x{:x}", offset, value); + LittleEndian::write_u32(data, value); + } + 8 => { + let value = match modulo_offset { + 0x0 => { + (u64::from(self.table_entries[index].msg_addr_hi) << 32) + | u64::from(self.table_entries[index].msg_addr_lo) + } + 0x8 => { + (u64::from(self.table_entries[index].vector_ctl) << 32) + | u64::from(self.table_entries[index].msg_data) + } + _ => { + error!("invalid offset"); + 0 + } + }; + + debug!("MSI_R TABLE offset 0x{:x} data 0x{:x}", offset, value); + LittleEndian::write_u64(data, value); + } + _ => { + error!("invalid data length"); + } + } + } + + pub fn write_table(&mut self, offset: u64, data: &[u8]) { + assert!((data.len() == 4 || data.len() == 8)); + + let index: usize = (offset / MSIX_TABLE_ENTRIES_MODULO) as usize; + let modulo_offset = offset % MSIX_TABLE_ENTRIES_MODULO; + + if index >= self.table_entries.len() { + debug!("Invalid MSI-X table entry index {index}"); + return; + } + + // Store the value of the entry before modification + let old_entry = self.table_entries[index].clone(); + + match data.len() { + 4 => { + let value = LittleEndian::read_u32(data); + match modulo_offset { + 0x0 => self.table_entries[index].msg_addr_lo = value, + 0x4 => self.table_entries[index].msg_addr_hi = value, + 0x8 => self.table_entries[index].msg_data = value, + 0xc => { + self.table_entries[index].vector_ctl = value; + } + _ => error!("invalid offset"), + }; + + debug!("MSI_W TABLE offset 0x{:x} data 0x{:x}", offset, value); + } + 8 => { + let value = LittleEndian::read_u64(data); + match modulo_offset { + 0x0 => { + self.table_entries[index].msg_addr_lo = (value & 0xffff_ffffu64) as u32; + self.table_entries[index].msg_addr_hi = (value >> 32) as u32; + } + 0x8 => { + self.table_entries[index].msg_data = (value & 0xffff_ffffu64) as u32; + self.table_entries[index].vector_ctl = (value >> 32) as u32; + } + _ => error!("invalid offset"), + }; + + debug!("MSI_W TABLE offset 0x{:x} data 0x{:x}", offset, value); + } + _ => error!("invalid data length"), + }; + + let table_entry = &self.table_entries[index]; + + // Optimisation to avoid excessive updates + if &old_entry == table_entry { + return; + } + + // Update interrupt routes + // Optimisation: only update routes if the entry is not masked; + // this is safe because if the entry is masked 
(starts masked as per spec) + // in the table then it won't be triggered. (See: #4273) + if self.enabled && !self.masked && !table_entry.masked() { + let config = MsiIrqSourceConfig { + high_addr: table_entry.msg_addr_hi, + low_addr: table_entry.msg_addr_lo, + data: table_entry.msg_data, + devid: self.devid, + }; + + if let Err(e) = self.interrupt_source_group.update( + index as InterruptIndex, + InterruptSourceConfig::MsiIrq(config), + table_entry.masked(), + true, + ) { + error!("Failed updating vector: {:?}", e); + } + } + + // After the MSI-X table entry has been updated, it is necessary to + // check if the vector control masking bit has changed. In case the + // bit has been flipped from 1 to 0, we need to inject a MSI message + // if the corresponding pending bit from the PBA is set. Once the MSI + // has been injected, the pending bit in the PBA needs to be cleared. + // All of this is valid only if MSI-X has not been masked for the whole + // device. + + // Check if bit has been flipped + if !self.masked() + && self.enabled() + && old_entry.masked() + && !table_entry.masked() + && self.get_pba_bit(index as u16) == 1 + { + self.inject_msix_and_clear_pba(index); + } + } + + pub fn read_pba(&mut self, offset: u64, data: &mut [u8]) { + assert!((data.len() == 4 || data.len() == 8)); + + let index: usize = (offset / MSIX_PBA_ENTRIES_MODULO) as usize; + let modulo_offset = offset % MSIX_PBA_ENTRIES_MODULO; + + if index >= self.pba_entries.len() { + debug!("Invalid MSI-X PBA entry index {index}"); + data.copy_from_slice(&[0xff; 8][..data.len()]); + return; + } + + match data.len() { + 4 => { + let value: u32 = match modulo_offset { + 0x0 => (self.pba_entries[index] & 0xffff_ffffu64) as u32, + 0x4 => (self.pba_entries[index] >> 32) as u32, + _ => { + error!("invalid offset"); + 0 + } + }; + + debug!("MSI_R PBA offset 0x{:x} data 0x{:x}", offset, value); + LittleEndian::write_u32(data, value); + } + 8 => { + let value: u64 = match modulo_offset { + 0x0 => self.pba_entries[index], + _ => { + error!("invalid offset"); + 0 + } + }; + + debug!("MSI_R PBA offset 0x{:x} data 0x{:x}", offset, value); + LittleEndian::write_u64(data, value); + } + _ => { + error!("invalid data length"); + } + } + } + + pub fn write_pba(&mut self, _offset: u64, _data: &[u8]) { + error!("Pending Bit Array is read only"); + } + + pub fn set_pba_bit(&mut self, vector: u16, reset: bool) { + assert!(vector < MAX_MSIX_VECTORS_PER_DEVICE); + + let index: usize = (vector as usize) / BITS_PER_PBA_ENTRY; + let shift: usize = (vector as usize) % BITS_PER_PBA_ENTRY; + let mut mask: u64 = (1 << shift) as u64; + + if reset { + mask = !mask; + self.pba_entries[index] &= mask; + } else { + self.pba_entries[index] |= mask; + } + } + + fn get_pba_bit(&self, vector: u16) -> u8 { + assert!(vector < MAX_MSIX_VECTORS_PER_DEVICE); + + let index: usize = (vector as usize) / BITS_PER_PBA_ENTRY; + let shift: usize = (vector as usize) % BITS_PER_PBA_ENTRY; + + ((self.pba_entries[index] >> shift) & 0x0000_0001u64) as u8 + } + + fn inject_msix_and_clear_pba(&mut self, vector: usize) { + // Inject the MSI message + match self + .interrupt_source_group + .trigger(vector as InterruptIndex) + { + Ok(_) => debug!("MSI-X injected on vector control flip"), + Err(e) => error!("failed to inject MSI-X: {}", e), + } + + // Clear the bit from PBA + self.set_pba_bit(vector as u16, true); + } +} + +#[allow(dead_code)] +#[repr(packed)] +#[derive(Clone, Copy, Default, Serialize, Deserialize)] +pub struct MsixCap { + // Message Control Register + // 10-0: MSI-X 
Table size + // 13-11: Reserved + // 14: Mask. Mask all MSI-X when set. + // 15: Enable. Enable all MSI-X when set. + pub msg_ctl: u16, + // Table. Contains the offset and the BAR indicator (BIR) + // 2-0: Table BAR indicator (BIR). Can be 0 to 5. + // 31-3: Table offset in the BAR pointed by the BIR. + pub table: u32, + // Pending Bit Array. Contains the offset and the BAR indicator (BIR) + // 2-0: PBA BAR indicator (BIR). Can be 0 to 5. + // 31-3: PBA offset in the BAR pointed by the BIR. + pub pba: u32, +} + +// SAFETY: All members are simple numbers and any value is valid. +unsafe impl ByteValued for MsixCap {} + +impl PciCapability for MsixCap { + fn bytes(&self) -> &[u8] { + self.as_slice() + } + + fn id(&self) -> PciCapabilityId { + PciCapabilityId::MsiX + } +} + +impl MsixCap { + pub fn new( + table_pci_bar: u8, + table_size: u16, + table_off: u32, + pba_pci_bar: u8, + pba_off: u32, + ) -> Self { + assert!(table_size < MAX_MSIX_VECTORS_PER_DEVICE); + + // Set the table size and enable MSI-X. + let msg_ctl: u16 = 0x8000u16 + table_size - 1; + + MsixCap { + msg_ctl, + table: (table_off & 0xffff_fff8u32) | u32::from(table_pci_bar & 0x7u8), + pba: (pba_off & 0xffff_fff8u32) | u32::from(pba_pci_bar & 0x7u8), + } + } + + pub fn set_msg_ctl(&mut self, data: u16) { + self.msg_ctl = (self.msg_ctl & !(FUNCTION_MASK_MASK | MSIX_ENABLE_MASK)) + | (data & (FUNCTION_MASK_MASK | MSIX_ENABLE_MASK)); + } + + pub fn masked(&self) -> bool { + (self.msg_ctl >> FUNCTION_MASK_BIT) & 0x1 == 0x1 + } + + pub fn enabled(&self) -> bool { + (self.msg_ctl >> MSIX_ENABLE_BIT) & 0x1 == 0x1 + } + + pub fn table_offset(&self) -> u32 { + self.table & 0xffff_fff8 + } + + pub fn pba_offset(&self) -> u32 { + self.pba & 0xffff_fff8 + } + + pub fn table_set_offset(&mut self, addr: u32) { + self.table &= 0x7; + self.table += addr; + } + + pub fn pba_set_offset(&mut self, addr: u32) { + self.pba &= 0x7; + self.pba += addr; + } + + pub fn table_bir(&self) -> u32 { + self.table & 0x7 + } + + pub fn pba_bir(&self) -> u32 { + self.pba & 0x7 + } + + pub fn table_size(&self) -> u16 { + (self.msg_ctl & 0x7ff) + 1 + } + + pub fn table_range(&self) -> (u64, u64) { + // The table takes 16 bytes per entry. + let size = self.table_size() as u64 * 16; + (self.table_offset() as u64, size) + } + + pub fn pba_range(&self) -> (u64, u64) { + // The table takes 1 bit per entry modulo 8 bytes. 
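That is, one pending bit per vector, padded up to whole 8-byte words. A quick standalone sketch of the arithmetic used below, with hypothetical vector counts:

```rust
// Pending Bit Array size: one bit per MSI-X vector, rounded up to
// whole 8-byte words, matching the computation that follows.
fn pba_size_bytes(table_size: u64) -> u64 {
    ((table_size / 64) + 1) * 8
}

fn main() {
    assert_eq!(pba_size_bytes(1), 8); // a single vector still needs one word
    assert_eq!(pba_size_bytes(64), 16); // the formula always adds a word, even at an exact multiple
    assert_eq!(pba_size_bytes(2048), 264); // the per-device maximum
}
```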
+ let size = ((self.table_size() as u64 / 64) + 1) * 8; + (self.pba_offset() as u64, size) + } +} diff --git a/src/pci/src/vfio.rs b/src/pci/src/vfio.rs new file mode 100644 index 00000000000..de7a962b7d7 --- /dev/null +++ b/src/pci/src/vfio.rs @@ -0,0 +1,1960 @@ +// Copyright © 2019 Intel Corporation +// +// SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause +// + +use core::fmt; +use std::any::Any; +use std::collections::{BTreeMap, HashMap}; +use std::fmt::{Debug, Formatter}; +use std::io; +use std::os::unix::io::AsRawFd; +use std::ptr::null_mut; +use std::sync::{Arc, Barrier, Mutex}; + +use anyhow::{anyhow, Error}; +use byteorder::{ByteOrder, LittleEndian}; +use kvm_ioctls::VmFd; +use libc::{sysconf, _SC_PAGESIZE}; +use serde::{Deserialize, Serialize}; +use thiserror::Error; +use vfio_bindings::bindings::vfio::*; +use vfio_ioctls::{ + VfioContainer, VfioDevice, VfioIrq, VfioRegionInfoCap, VfioRegionSparseMmapArea, +}; +use vm_system_allocator::page_size::{ + align_page_size_down, align_page_size_up, is_4k_aligned, is_4k_multiple, is_page_size_aligned, +}; +use vm_system_allocator::{AddressAllocator, SystemAllocator}; +use vm_device::dma_mapping::ExternalDmaMapping; +use vm_device::interrupt::{ + InterruptIndex, InterruptManager, InterruptSourceGroup, MsiIrqGroupConfig, +}; +use vm_device::Resource; +use vm_memory::{Address, GuestAddress, GuestAddressSpace, GuestMemory, GuestUsize}; +use vmm_sys_util::eventfd::EventFd; + +use crate::msi::{MsiConfigState, MSI_CONFIG_ID}; +use crate::msix::MsixConfigState; +use crate::{ + msi_num_enabled_vectors, BarReprogrammingParams, MsiCap, MsiConfig, MsixCap, MsixConfig, + PciBarConfiguration, PciBarPrefetchable, PciBarRegionType, PciBdf, PciCapabilityId, + PciClassCode, PciConfiguration, PciDevice, PciDeviceError, PciExpressCapabilityId, + PciHeaderType, PciSubclass, MSIX_CONFIG_ID, MSIX_TABLE_ENTRY_SIZE, PCI_CONFIGURATION_ID, +}; + +pub use kvm_bindings::kvm_userspace_memory_region as MemoryRegion; + +pub(crate) const VFIO_COMMON_ID: &str = "vfio_common"; + +#[derive(Debug, Error)] +pub enum VfioPciError { + #[error("Failed to create user memory region: {0}")] + MapRegionGuest(#[source] Error), + #[error("Failed to DMA map: {0}")] + DmaMap(#[source] vfio_ioctls::VfioError), + #[error("Failed to DMA unmap: {0}")] + DmaUnmap(#[source] vfio_ioctls::VfioError), + #[error("Failed to enable INTx: {0}")] + EnableIntx(#[source] VfioError), + #[error("Failed to enable MSI: {0}")] + EnableMsi(#[source] VfioError), + #[error("Failed to enable MSI-x: {0}")] + EnableMsix(#[source] VfioError), + #[error("Failed to mmap the area")] + MmapArea, + #[error("Failed to notifier's eventfd")] + MissingNotifier, + #[error("Invalid region alignment")] + RegionAlignment, + #[error("Invalid region size")] + RegionSize, + #[error("Failed to retrieve MsiConfigState: {0}")] + RetrieveMsiConfigState(#[source] anyhow::Error), + #[error("Failed to retrieve MsixConfigState: {0}")] + RetrieveMsixConfigState(#[source] anyhow::Error), + #[error("Failed to retrieve PciConfigurationState: {0}")] + RetrievePciConfigurationState(#[source] anyhow::Error), + #[error("Failed to retrieve VfioCommonState: {0}")] + RetrieveVfioCommonState(#[source] anyhow::Error), +} + +#[derive(Copy, Clone)] +enum PciVfioSubclass { + VfioSubclass = 0xff, +} + +impl PciSubclass for PciVfioSubclass { + fn get_register_value(&self) -> u8 { + *self as u8 + } +} + +enum InterruptUpdateAction { + EnableMsi, + DisableMsi, + EnableMsix, + DisableMsix, +} + +#[derive(Serialize, Deserialize)] +struct IntxState { + 
enabled: bool, +} + +pub(crate) struct VfioIntx { + interrupt_source_group: Arc, + enabled: bool, +} + +#[derive(Serialize, Deserialize)] +struct MsiState { + cap: MsiCap, + cap_offset: u32, +} + +pub(crate) struct VfioMsi { + pub(crate) cfg: MsiConfig, + cap_offset: u32, + interrupt_source_group: Arc, +} + +impl VfioMsi { + fn update(&mut self, offset: u64, data: &[u8]) -> Option { + let old_enabled = self.cfg.enabled(); + + self.cfg.update(offset, data); + + let new_enabled = self.cfg.enabled(); + + if !old_enabled && new_enabled { + return Some(InterruptUpdateAction::EnableMsi); + } + + if old_enabled && !new_enabled { + return Some(InterruptUpdateAction::DisableMsi); + } + + None + } +} + +#[derive(Serialize, Deserialize)] +struct MsixState { + cap: MsixCap, + cap_offset: u32, + bdf: u32, +} + +pub(crate) struct VfioMsix { + pub(crate) bar: MsixConfig, + cap: MsixCap, + cap_offset: u32, + interrupt_source_group: Arc, +} + +impl VfioMsix { + fn update(&mut self, offset: u64, data: &[u8]) -> Option { + let old_enabled = self.bar.enabled(); + + // Update "Message Control" word + if offset == 2 && data.len() == 2 { + self.bar.set_msg_ctl(LittleEndian::read_u16(data)); + } + + let new_enabled = self.bar.enabled(); + + if !old_enabled && new_enabled { + return Some(InterruptUpdateAction::EnableMsix); + } + + if old_enabled && !new_enabled { + return Some(InterruptUpdateAction::DisableMsix); + } + + None + } + + fn table_accessed(&self, bar_index: u32, offset: u64) -> bool { + let table_offset: u64 = u64::from(self.cap.table_offset()); + let table_size: u64 = u64::from(self.cap.table_size()) * (MSIX_TABLE_ENTRY_SIZE as u64); + let table_bir: u32 = self.cap.table_bir(); + + bar_index == table_bir && offset >= table_offset && offset < table_offset + table_size + } +} + +pub(crate) struct Interrupt { + pub(crate) intx: Option, + pub(crate) msi: Option, + pub(crate) msix: Option, +} + +impl Interrupt { + fn update_msi(&mut self, offset: u64, data: &[u8]) -> Option { + if let Some(ref mut msi) = &mut self.msi { + let action = msi.update(offset, data); + return action; + } + + None + } + + fn update_msix(&mut self, offset: u64, data: &[u8]) -> Option { + if let Some(ref mut msix) = &mut self.msix { + let action = msix.update(offset, data); + return action; + } + + None + } + + fn accessed(&self, offset: u64) -> Option<(PciCapabilityId, u64)> { + if let Some(msi) = &self.msi { + if offset >= u64::from(msi.cap_offset) + && offset < u64::from(msi.cap_offset) + msi.cfg.size() + { + return Some(( + PciCapabilityId::MessageSignalledInterrupts, + u64::from(msi.cap_offset), + )); + } + } + + if let Some(msix) = &self.msix { + if offset == u64::from(msix.cap_offset) { + return Some((PciCapabilityId::MsiX, u64::from(msix.cap_offset))); + } + } + + None + } + + fn msix_table_accessed(&self, bar_index: u32, offset: u64) -> bool { + if let Some(msix) = &self.msix { + return msix.table_accessed(bar_index, offset); + } + + false + } + + fn msix_write_table(&mut self, offset: u64, data: &[u8]) { + if let Some(ref mut msix) = &mut self.msix { + let offset = offset - u64::from(msix.cap.table_offset()); + msix.bar.write_table(offset, data) + } + } + + fn msix_read_table(&self, offset: u64, data: &mut [u8]) { + if let Some(msix) = &self.msix { + let offset = offset - u64::from(msix.cap.table_offset()); + msix.bar.read_table(offset, data) + } + } + + pub(crate) fn intx_in_use(&self) -> bool { + if let Some(intx) = &self.intx { + return intx.enabled; + } + + false + } +} + +#[derive(Copy, Clone)] +pub struct 
UserMemoryRegion { + pub slot: u32, + pub start: u64, + pub size: u64, + pub host_addr: u64, +} + +#[derive(Clone)] +pub struct MmioRegion { + pub start: GuestAddress, + pub length: GuestUsize, + pub(crate) type_: PciBarRegionType, + pub(crate) index: u32, + pub(crate) user_memory_regions: Vec, +} + +trait MmioRegionRange { + fn check_range(&self, guest_addr: u64, size: u64) -> bool; + fn find_user_address(&self, guest_addr: u64) -> Result; +} + +impl MmioRegionRange for Vec { + // Check if a guest address is within the range of mmio regions + fn check_range(&self, guest_addr: u64, size: u64) -> bool { + for region in self.iter() { + let Some(guest_addr_end) = guest_addr.checked_add(size) else { + return false; + }; + let Some(region_end) = region.start.raw_value().checked_add(region.length) else { + return false; + }; + if guest_addr >= region.start.raw_value() && guest_addr_end <= region_end { + return true; + } + } + false + } + + // Locate the user region address for a guest address within all mmio regions + fn find_user_address(&self, guest_addr: u64) -> Result { + for region in self.iter() { + for user_region in region.user_memory_regions.iter() { + if guest_addr >= user_region.start + && guest_addr < user_region.start + user_region.size + { + return Ok(user_region.host_addr + (guest_addr - user_region.start)); + } + } + } + + Err(io::Error::new( + io::ErrorKind::Other, + format!("unable to find user address: 0x{guest_addr:x}"), + )) + } +} + +#[derive(Debug, Error)] +pub enum VfioError { + #[error("Kernel VFIO error: {0}")] + KernelVfio(#[source] vfio_ioctls::VfioError), +} + +pub(crate) trait Vfio: Send + Sync { + fn read_config_byte(&self, offset: u32) -> u8 { + let mut data: [u8; 1] = [0]; + self.read_config(offset, &mut data); + data[0] + } + + fn read_config_word(&self, offset: u32) -> u16 { + let mut data: [u8; 2] = [0, 0]; + self.read_config(offset, &mut data); + u16::from_le_bytes(data) + } + + fn read_config_dword(&self, offset: u32) -> u32 { + let mut data: [u8; 4] = [0, 0, 0, 0]; + self.read_config(offset, &mut data); + u32::from_le_bytes(data) + } + + fn write_config_dword(&self, offset: u32, buf: u32) { + let data: [u8; 4] = buf.to_le_bytes(); + self.write_config(offset, &data) + } + + fn read_config(&self, offset: u32, data: &mut [u8]) { + self.region_read(VFIO_PCI_CONFIG_REGION_INDEX, offset.into(), data.as_mut()); + } + + fn write_config(&self, offset: u32, data: &[u8]) { + self.region_write(VFIO_PCI_CONFIG_REGION_INDEX, offset.into(), data) + } + + fn enable_msi(&self, fds: Vec<&EventFd>) -> Result<(), VfioError> { + self.enable_irq(VFIO_PCI_MSI_IRQ_INDEX, fds) + } + + fn disable_msi(&self) -> Result<(), VfioError> { + self.disable_irq(VFIO_PCI_MSI_IRQ_INDEX) + } + + fn enable_msix(&self, fds: Vec<&EventFd>) -> Result<(), VfioError> { + self.enable_irq(VFIO_PCI_MSIX_IRQ_INDEX, fds) + } + + fn disable_msix(&self) -> Result<(), VfioError> { + self.disable_irq(VFIO_PCI_MSIX_IRQ_INDEX) + } + + fn region_read(&self, _index: u32, _offset: u64, _data: &mut [u8]) { + unimplemented!() + } + + fn region_write(&self, _index: u32, _offset: u64, _data: &[u8]) { + unimplemented!() + } + + fn get_irq_info(&self, _irq_index: u32) -> Option { + unimplemented!() + } + + fn enable_irq(&self, _irq_index: u32, _event_fds: Vec<&EventFd>) -> Result<(), VfioError> { + unimplemented!() + } + + fn disable_irq(&self, _irq_index: u32) -> Result<(), VfioError> { + unimplemented!() + } + + fn unmask_irq(&self, _irq_index: u32) -> Result<(), VfioError> { + unimplemented!() + } +} + +struct 
VfioDeviceWrapper { + device: Arc, +} + +impl VfioDeviceWrapper { + fn new(device: Arc) -> Self { + Self { device } + } +} + +impl Vfio for VfioDeviceWrapper { + fn region_read(&self, index: u32, offset: u64, data: &mut [u8]) { + self.device.region_read(index, data, offset) + } + + fn region_write(&self, index: u32, offset: u64, data: &[u8]) { + self.device.region_write(index, data, offset) + } + + fn get_irq_info(&self, irq_index: u32) -> Option { + self.device.get_irq_info(irq_index).copied() + } + + fn enable_irq(&self, irq_index: u32, event_fds: Vec<&EventFd>) -> Result<(), VfioError> { + self.device + .enable_irq(irq_index, event_fds) + .map_err(VfioError::KernelVfio) + } + + fn disable_irq(&self, irq_index: u32) -> Result<(), VfioError> { + self.device + .disable_irq(irq_index) + .map_err(VfioError::KernelVfio) + } + + fn unmask_irq(&self, irq_index: u32) -> Result<(), VfioError> { + self.device + .unmask_irq(irq_index) + .map_err(VfioError::KernelVfio) + } +} + +#[derive(Serialize, Deserialize)] +struct VfioCommonState { + intx_state: Option, + msi_state: Option, + msix_state: Option, +} + +pub(crate) struct ConfigPatch { + mask: u32, + patch: u32, +} + +pub(crate) struct VfioCommon { + pub(crate) configuration: PciConfiguration, + pub(crate) mmio_regions: Vec, + pub(crate) interrupt: Interrupt, + pub(crate) msi_interrupt_manager: Arc>, + pub(crate) legacy_interrupt_group: Option>, + pub(crate) vfio_wrapper: Arc, + pub(crate) patches: HashMap, + x_nv_gpudirect_clique: Option, +} + +impl VfioCommon { + pub(crate) fn new( + msi_interrupt_manager: Arc>, + legacy_interrupt_group: Option>, + vfio_wrapper: Arc, + subclass: &dyn PciSubclass, + bdf: PciBdf, + x_nv_gpudirect_clique: Option, + ) -> Result { + let pci_configuration_state = None; + + let configuration = PciConfiguration::new( + 0, + 0, + 0, + PciClassCode::Other, + subclass, + None, + PciHeaderType::Device, + 0, + 0, + None, + pci_configuration_state, + ); + + let mut vfio_common = VfioCommon { + mmio_regions: Vec::new(), + configuration, + interrupt: Interrupt { + intx: None, + msi: None, + msix: None, + }, + msi_interrupt_manager, + legacy_interrupt_group, + vfio_wrapper, + patches: HashMap::new(), + x_nv_gpudirect_clique, + }; + + let state: Option = None; + let msi_state = None; + let msix_state = None; + + if let Some(state) = state.as_ref() { + vfio_common.set_state(state, msi_state, msix_state)?; + } else { + vfio_common.parse_capabilities(bdf); + vfio_common.initialize_legacy_interrupt()?; + } + + Ok(vfio_common) + } + + /// In case msix table offset is not page size aligned, we need do some fixup to achieve it. + /// Because we don't want the MMIO RW region and trap region overlap each other. + fn fixup_msix_region(&mut self, bar_id: u32, region_size: u64) -> u64 { + if let Some(msix) = self.interrupt.msix.as_mut() { + let msix_cap = &mut msix.cap; + + // Suppose table_bir equals to pba_bir here. Am I right? 
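The computation that follows assumes the table and the PBA share a BAR (as the comment above notes) and, when the table offset is not page-aligned, grows the region so a page-aligned copy of the table plus PBA can be relocated into its upper half. A self-contained sketch of that size arithmetic, assuming 4 KiB pages in place of `align_page_size_up` (the `fixup` helper and values are hypothetical):

```rust
// Double the BAR so its first half stays direct-mapped while the
// relocated, page-aligned MSI-X table + PBA occupy the second half.
fn fixup(region_size: u64, table_size: u64, pba_size: u64) -> (u64, u64) {
    let page = 4096u64;
    let align_up = |v: u64| (v + page - 1) & !(page - 1);
    let msix_sz = align_up(table_size + pba_size);
    let size = region_size.max(msix_sz) * 2;
    (size, size / 2) // (new region size, new table offset)
}

fn main() {
    // A 16 KiB BAR whose table + PBA fit in one page grows to 32 KiB,
    // with the table moved to offset 16 KiB.
    assert_eq!(fixup(0x4000, 0x800, 0x8), (0x8000, 0x4000));
}
```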
+ let (table_offset, table_size) = msix_cap.table_range(); + if is_page_size_aligned(table_offset) || msix_cap.table_bir() != bar_id { + return region_size; + } + + let (pba_offset, pba_size) = msix_cap.pba_range(); + let msix_sz = align_page_size_up(table_size + pba_size); + // Expand region to hold RW and trap region which both page size aligned + let size = std::cmp::max(region_size * 2, msix_sz * 2); + // let table starts from the middle of the region + msix_cap.table_set_offset((size / 2) as u32); + msix_cap.pba_set_offset((size / 2 + pba_offset - table_offset) as u32); + + size + } else { + // MSI-X not supported for this device + region_size + } + } + + // The `allocator` argument is unused on `aarch64` + #[allow(unused_variables)] + pub(crate) fn allocate_bars( + &mut self, + allocator: &Arc>, + mmio32_allocator: &mut AddressAllocator, + mmio64_allocator: &mut AddressAllocator, + resources: Option>, + ) -> Result, PciDeviceError> { + let mut bars = Vec::new(); + let mut bar_id = VFIO_PCI_BAR0_REGION_INDEX; + + // Going through all regular regions to compute the BAR size. + // We're not saving the BAR address to restore it, because we + // are going to allocate a guest address for each BAR and write + // that new address back. + while bar_id < VFIO_PCI_CONFIG_REGION_INDEX { + let mut region_size: u64 = 0; + let mut region_type = PciBarRegionType::Memory32BitRegion; + let mut prefetchable = PciBarPrefetchable::NotPrefetchable; + let mut flags: u32 = 0; + + let mut restored_bar_addr = None; + if let Some(resources) = &resources { + for resource in resources { + if let Resource::PciBar { + index, + base, + size, + type_, + .. + } = resource + { + if *index == bar_id as usize { + restored_bar_addr = Some(GuestAddress(*base)); + region_size = *size; + region_type = PciBarRegionType::from(*type_); + break; + } + } + } + if restored_bar_addr.is_none() { + bar_id += 1; + continue; + } + } else { + let bar_offset = if bar_id == VFIO_PCI_ROM_REGION_INDEX { + (PCI_ROM_EXP_BAR_INDEX * 4) as u32 + } else { + PCI_CONFIG_BAR_OFFSET + bar_id * 4 + }; + + // First read flags + flags = self.vfio_wrapper.read_config_dword(bar_offset); + + // Is this an IO BAR? + let io_bar = if bar_id != VFIO_PCI_ROM_REGION_INDEX { + matches!(flags & PCI_CONFIG_IO_BAR, PCI_CONFIG_IO_BAR) + } else { + false + }; + + // Is this a 64-bit BAR? + let is_64bit_bar = if bar_id != VFIO_PCI_ROM_REGION_INDEX { + matches!( + flags & PCI_CONFIG_MEMORY_BAR_64BIT, + PCI_CONFIG_MEMORY_BAR_64BIT + ) + } else { + false + }; + + if matches!( + flags & PCI_CONFIG_BAR_PREFETCHABLE, + PCI_CONFIG_BAR_PREFETCHABLE + ) { + prefetchable = PciBarPrefetchable::Prefetchable + }; + + // To get size write all 1s + self.vfio_wrapper + .write_config_dword(bar_offset, 0xffff_ffff); + + // And read back BAR value. 
The device will write zeros for bits it doesn't care about + let mut lower = self.vfio_wrapper.read_config_dword(bar_offset); + + if io_bar { + // Mask flag bits (lowest 2 for I/O bars) + lower &= !0b11; + + // BAR is not enabled + if lower == 0 { + bar_id += 1; + continue; + } + + // IO BAR + region_type = PciBarRegionType::IoRegion; + + // Invert bits and add 1 to calculate size + region_size = (!lower + 1) as u64; + } else if is_64bit_bar { + // 64 bits Memory BAR + region_type = PciBarRegionType::Memory64BitRegion; + + // Query size of upper BAR of 64-bit BAR + let upper_offset: u32 = PCI_CONFIG_BAR_OFFSET + (bar_id + 1) * 4; + self.vfio_wrapper + .write_config_dword(upper_offset, 0xffff_ffff); + let upper = self.vfio_wrapper.read_config_dword(upper_offset); + + let mut combined_size = u64::from(upper) << 32 | u64::from(lower); + + // Mask out flag bits (lowest 4 for memory bars) + combined_size &= !0b1111; + + // BAR is not enabled + if combined_size == 0 { + bar_id += 1; + continue; + } + + // Invert and add 1 to to find size + region_size = !combined_size + 1; + } else { + region_type = PciBarRegionType::Memory32BitRegion; + + // Mask out flag bits (lowest 4 for memory bars) + lower &= !0b1111; + + if lower == 0 { + bar_id += 1; + continue; + } + + // Invert and add 1 to to find size + region_size = (!lower + 1) as u64; + } + } + + let bar_addr = match region_type { + PciBarRegionType::IoRegion => { + #[cfg(not(target_arch = "x86_64"))] + unimplemented!(); + + // The address needs to be 4 bytes aligned. + #[cfg(target_arch = "x86_64")] + allocator + .lock() + .unwrap() + .allocate_io_addresses(restored_bar_addr, region_size, Some(0x4)) + .ok_or(PciDeviceError::IoAllocationFailed(region_size))? + } + PciBarRegionType::Memory32BitRegion => { + // BAR allocation must be naturally aligned + mmio32_allocator + .allocate(restored_bar_addr, region_size, Some(region_size)) + .ok_or(PciDeviceError::IoAllocationFailed(region_size))? + } + PciBarRegionType::Memory64BitRegion => { + // We need do some fixup to keep MMIO RW region and msix cap region page size + // aligned. + region_size = self.fixup_msix_region(bar_id, region_size); + mmio64_allocator + .allocate( + restored_bar_addr, + region_size, + Some(std::cmp::max( + // SAFETY: FFI call. Trivially safe. + unsafe { sysconf(_SC_PAGESIZE) as GuestUsize }, + region_size, + )), + ) + .ok_or(PciDeviceError::IoAllocationFailed(region_size))? + } + }; + + // We can now build our BAR configuration block. 
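The sequence just performed (write all ones, read back, mask the flag bits, then invert and add one) is the standard PCI BAR sizing probe. A minimal self-contained sketch of the 32-bit memory case, with hypothetical read-back values:

```rust
// Decode a 32-bit memory BAR size from the value read back after
// writing all 1s, mirroring the probe above.
fn mem32_bar_size(readback: u32) -> Option<u64> {
    // Mask out the low four flag bits of a memory BAR.
    let base = readback & !0b1111;
    if base == 0 {
        return None; // BAR not implemented
    }
    // Invert and add one: the size is the smallest range the device decodes.
    Some(u64::from((!base).wrapping_add(1)))
}

fn main() {
    assert_eq!(mem32_bar_size(0xffff_f000), Some(0x1000)); // 4 KiB BAR
    assert_eq!(mem32_bar_size(0xfff0_0000), Some(0x0010_0000)); // 1 MiB BAR
    assert_eq!(mem32_bar_size(0), None); // absent BAR
}
```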
+ let bar = PciBarConfiguration::default() + .set_index(bar_id as usize) + .set_address(bar_addr.raw_value()) + .set_size(region_size) + .set_region_type(region_type) + .set_prefetchable(prefetchable); + + if bar_id == VFIO_PCI_ROM_REGION_INDEX { + self.configuration + .add_pci_rom_bar(&bar, flags & 0x1) + .map_err(|e| PciDeviceError::IoRegistrationFailed(bar_addr.raw_value(), e))?; + } else { + self.configuration + .add_pci_bar(&bar) + .map_err(|e| PciDeviceError::IoRegistrationFailed(bar_addr.raw_value(), e))?; + } + + bars.push(bar); + self.mmio_regions.push(MmioRegion { + start: bar_addr, + length: region_size, + type_: region_type, + index: bar_id, + user_memory_regions: Vec::new(), + }); + + bar_id += 1; + if region_type == PciBarRegionType::Memory64BitRegion { + bar_id += 1; + } + } + + Ok(bars) + } + + // The `allocator` argument is unused on `aarch64` + #[allow(unused_variables)] + pub(crate) fn free_bars( + &mut self, + allocator: &mut SystemAllocator, + mmio32_allocator: &mut AddressAllocator, + mmio64_allocator: &mut AddressAllocator, + ) -> Result<(), PciDeviceError> { + for region in self.mmio_regions.iter() { + match region.type_ { + PciBarRegionType::IoRegion => { + #[cfg(target_arch = "x86_64")] + allocator.free_io_addresses(region.start, region.length); + #[cfg(not(target_arch = "x86_64"))] + error!("I/O region is not supported"); + } + PciBarRegionType::Memory32BitRegion => { + mmio32_allocator.free(region.start, region.length); + } + PciBarRegionType::Memory64BitRegion => { + mmio64_allocator.free(region.start, region.length); + } + } + } + Ok(()) + } + + pub(crate) fn parse_msix_capabilities(&mut self, cap: u8) -> MsixCap { + let msg_ctl = self.vfio_wrapper.read_config_word((cap + 2).into()); + + let table = self.vfio_wrapper.read_config_dword((cap + 4).into()); + + let pba = self.vfio_wrapper.read_config_dword((cap + 8).into()); + + MsixCap { + msg_ctl, + table, + pba, + } + } + + pub(crate) fn initialize_msix( + &mut self, + msix_cap: MsixCap, + cap_offset: u32, + bdf: PciBdf, + state: Option, + ) { + let interrupt_source_group = self + .msi_interrupt_manager + .create_group(MsiIrqGroupConfig { + base: 0, + count: msix_cap.table_size() as InterruptIndex, + }) + .unwrap(); + + let msix_config = MsixConfig::new( + msix_cap.table_size(), + interrupt_source_group.clone(), + bdf.into(), + state, + ) + .unwrap(); + + self.interrupt.msix = Some(VfioMsix { + bar: msix_config, + cap: msix_cap, + cap_offset, + interrupt_source_group, + }); + } + + pub(crate) fn parse_msi_capabilities(&mut self, cap: u8) -> u16 { + self.vfio_wrapper.read_config_word((cap + 2).into()) + } + + pub(crate) fn initialize_msi( + &mut self, + msg_ctl: u16, + cap_offset: u32, + state: Option, + ) { + let interrupt_source_group = self + .msi_interrupt_manager + .create_group(MsiIrqGroupConfig { + base: 0, + count: msi_num_enabled_vectors(msg_ctl) as InterruptIndex, + }) + .unwrap(); + + let msi_config = MsiConfig::new(msg_ctl, interrupt_source_group.clone(), state).unwrap(); + + self.interrupt.msi = Some(VfioMsi { + cfg: msi_config, + cap_offset, + interrupt_source_group, + }); + } + + pub(crate) fn get_msix_cap_idx(&self) -> Option { + let mut cap_next = self + .vfio_wrapper + .read_config_byte(PCI_CONFIG_CAPABILITY_OFFSET); + + while cap_next != 0 { + let cap_id = self.vfio_wrapper.read_config_byte(cap_next.into()); + if PciCapabilityId::from(cap_id) == PciCapabilityId::MsiX { + return Some(cap_next as usize); + } else { + cap_next = self.vfio_wrapper.read_config_byte((cap_next + 1).into()); + } + } + 
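This lookup, like `parse_capabilities` below, walks the standard capability list: start from the head pointer at config offset 0x34 and follow each capability's next pointer. A simplified sketch over a raw config-space snapshot (the helper and layout here are hypothetical, and the walk is bounded so a malformed, looping list cannot hang it):

```rust
// Walk a standard PCI capability list in a 256-byte config-space
// snapshot: head pointer at 0x34, each capability = [id, next, ...].
fn find_capability(config: &[u8; 256], wanted_id: u8) -> Option<usize> {
    let mut offset = config[0x34] as usize;
    // Config space holds well under 64 capabilities; bound the walk.
    for _ in 0..64 {
        if offset == 0 || offset + 1 >= config.len() {
            return None;
        }
        if config[offset] == wanted_id {
            return Some(offset);
        }
        offset = config[offset + 1] as usize;
    }
    None
}

fn main() {
    let mut cfg = [0u8; 256];
    cfg[0x34] = 0x40; // list head -> 0x40
    cfg[0x40] = 0x05; // MSI capability ID
    cfg[0x41] = 0x70; // next -> 0x70
    cfg[0x70] = 0x11; // MSI-X capability ID
    cfg[0x71] = 0x00; // end of list
    assert_eq!(find_capability(&cfg, 0x11), Some(0x70));
    assert_eq!(find_capability(&cfg, 0x03), None);
}
```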
+ None + } + + pub(crate) fn parse_capabilities(&mut self, bdf: PciBdf) { + let mut cap_iter = self + .vfio_wrapper + .read_config_byte(PCI_CONFIG_CAPABILITY_OFFSET); + + let mut pci_express_cap_found = false; + let mut power_management_cap_found = false; + + while cap_iter != 0 { + let cap_id = self.vfio_wrapper.read_config_byte(cap_iter.into()); + + match PciCapabilityId::from(cap_id) { + PciCapabilityId::MessageSignalledInterrupts => { + if let Some(irq_info) = self.vfio_wrapper.get_irq_info(VFIO_PCI_MSI_IRQ_INDEX) { + if irq_info.count > 0 { + // Parse capability only if the VFIO device + // supports MSI. + let msg_ctl = self.parse_msi_capabilities(cap_iter); + self.initialize_msi(msg_ctl, cap_iter as u32, None); + } + } + } + PciCapabilityId::MsiX => { + if let Some(irq_info) = self.vfio_wrapper.get_irq_info(VFIO_PCI_MSIX_IRQ_INDEX) + { + if irq_info.count > 0 { + // Parse capability only if the VFIO device + // supports MSI-X. + let msix_cap = self.parse_msix_capabilities(cap_iter); + self.initialize_msix(msix_cap, cap_iter as u32, bdf, None); + } + } + } + PciCapabilityId::PciExpress => pci_express_cap_found = true, + PciCapabilityId::PowerManagement => power_management_cap_found = true, + _ => {} + }; + + let cap_next = self.vfio_wrapper.read_config_byte((cap_iter + 1).into()); + if cap_next == 0 { + break; + } + + cap_iter = cap_next; + } + + if let Some(clique_id) = self.x_nv_gpudirect_clique { + self.add_nv_gpudirect_clique_cap(cap_iter, clique_id); + } + + if pci_express_cap_found && power_management_cap_found { + self.parse_extended_capabilities(); + } + } + + fn add_nv_gpudirect_clique_cap(&mut self, cap_iter: u8, clique_id: u8) { + // Turing, Ampere, Hopper, and Lovelace GPUs have dedicated space + // at 0xD4 for this capability. + let cap_offset = 0xd4u32; + + let reg_idx = (cap_iter / 4) as usize; + self.patches.insert( + reg_idx, + ConfigPatch { + mask: 0x0000_ff00, + patch: cap_offset << 8, + }, + ); + + let reg_idx = (cap_offset / 4) as usize; + self.patches.insert( + reg_idx, + ConfigPatch { + mask: 0xffff_ffff, + patch: 0x50080009u32, + }, + ); + self.patches.insert( + reg_idx + 1, + ConfigPatch { + mask: 0xffff_ffff, + patch: u32::from(clique_id) << 19 | 0x5032, + }, + ); + } + + fn parse_extended_capabilities(&mut self) { + let mut current_offset = PCI_CONFIG_EXTENDED_CAPABILITY_OFFSET; + + loop { + let ext_cap_hdr = self.vfio_wrapper.read_config_dword(current_offset); + + let cap_id: u16 = (ext_cap_hdr & 0xffff) as u16; + let cap_next: u16 = ((ext_cap_hdr >> 20) & 0xfff) as u16; + + match PciExpressCapabilityId::from(cap_id) { + PciExpressCapabilityId::AlternativeRoutingIdentificationInterpretation + | PciExpressCapabilityId::ResizeableBar + | PciExpressCapabilityId::SingleRootIoVirtualization => { + let reg_idx = (current_offset / 4) as usize; + self.patches.insert( + reg_idx, + ConfigPatch { + mask: 0x0000_ffff, + patch: PciExpressCapabilityId::NullCapability as u32, + }, + ); + } + _ => {} + } + + if cap_next == 0 { + break; + } + + current_offset = cap_next.into(); + } + } + + pub(crate) fn enable_intx(&mut self) -> Result<(), VfioPciError> { + if let Some(intx) = &mut self.interrupt.intx { + if !intx.enabled { + if let Some(eventfd) = intx.interrupt_source_group.notifier(0) { + self.vfio_wrapper + .enable_irq(VFIO_PCI_INTX_IRQ_INDEX, vec![&eventfd]) + .map_err(VfioPciError::EnableIntx)?; + + intx.enabled = true; + } else { + return Err(VfioPciError::MissingNotifier); + } + } + } + + Ok(()) + } + + pub(crate) fn disable_intx(&mut self) { + if let Some(intx) = 
&mut self.interrupt.intx { + if intx.enabled { + if let Err(e) = self.vfio_wrapper.disable_irq(VFIO_PCI_INTX_IRQ_INDEX) { + error!("Could not disable INTx: {}", e); + } else { + intx.enabled = false; + } + } + } + } + + pub(crate) fn enable_msi(&self) -> Result<(), VfioPciError> { + if let Some(msi) = &self.interrupt.msi { + let mut irq_fds: Vec = Vec::new(); + for i in 0..msi.cfg.num_enabled_vectors() { + if let Some(eventfd) = msi.interrupt_source_group.notifier(i as InterruptIndex) { + irq_fds.push(eventfd); + } else { + return Err(VfioPciError::MissingNotifier); + } + } + + self.vfio_wrapper + .enable_msi(irq_fds.iter().collect()) + .map_err(VfioPciError::EnableMsi)?; + } + + Ok(()) + } + + pub(crate) fn disable_msi(&self) { + if let Err(e) = self.vfio_wrapper.disable_msi() { + error!("Could not disable MSI: {}", e); + } + } + + pub(crate) fn enable_msix(&self) -> Result<(), VfioPciError> { + if let Some(msix) = &self.interrupt.msix { + let mut irq_fds: Vec = Vec::new(); + for i in 0..msix.bar.table_entries.len() { + if let Some(eventfd) = msix.interrupt_source_group.notifier(i as InterruptIndex) { + irq_fds.push(eventfd); + } else { + return Err(VfioPciError::MissingNotifier); + } + } + + self.vfio_wrapper + .enable_msix(irq_fds.iter().collect()) + .map_err(VfioPciError::EnableMsix)?; + } + + Ok(()) + } + + pub(crate) fn disable_msix(&self) { + if let Err(e) = self.vfio_wrapper.disable_msix() { + error!("Could not disable MSI-X: {}", e); + } + } + + pub(crate) fn initialize_legacy_interrupt(&mut self) -> Result<(), VfioPciError> { + if let Some(irq_info) = self.vfio_wrapper.get_irq_info(VFIO_PCI_INTX_IRQ_INDEX) { + if irq_info.count == 0 { + // A count of 0 means the INTx IRQ is not supported, therefore + // it shouldn't be initialized. + return Ok(()); + } + } + + if let Some(interrupt_source_group) = self.legacy_interrupt_group.clone() { + self.interrupt.intx = Some(VfioIntx { + interrupt_source_group, + enabled: false, + }); + + self.enable_intx()?; + } + + Ok(()) + } + + pub(crate) fn update_msi_capabilities( + &mut self, + offset: u64, + data: &[u8], + ) -> Result<(), VfioPciError> { + match self.interrupt.update_msi(offset, data) { + Some(InterruptUpdateAction::EnableMsi) => { + // Disable INTx before we can enable MSI + self.disable_intx(); + self.enable_msi()?; + } + Some(InterruptUpdateAction::DisableMsi) => { + // Fallback onto INTx when disabling MSI + self.disable_msi(); + self.enable_intx()?; + } + _ => {} + } + + Ok(()) + } + + pub(crate) fn update_msix_capabilities( + &mut self, + offset: u64, + data: &[u8], + ) -> Result<(), VfioPciError> { + match self.interrupt.update_msix(offset, data) { + Some(InterruptUpdateAction::EnableMsix) => { + // Disable INTx before we can enable MSI-X + self.disable_intx(); + self.enable_msix()?; + } + Some(InterruptUpdateAction::DisableMsix) => { + // Fallback onto INTx when disabling MSI-X + self.disable_msix(); + self.enable_intx()?; + } + _ => {} + } + + Ok(()) + } + + pub(crate) fn find_region(&self, addr: u64) -> Option { + for region in self.mmio_regions.iter() { + if addr >= region.start.raw_value() + && addr < region.start.unchecked_add(region.length).raw_value() + { + return Some(region.clone()); + } + } + None + } + + fn make_user_memory_region( + slot: u32, + guest_phys_addr: u64, + memory_size: u64, + userspace_addr: u64, + readonly: bool, + log_dirty_pages: bool, + ) -> MemoryRegion { + use kvm_bindings::{KVM_MEM_LOG_DIRTY_PAGES, KVM_MEM_READONLY}; + MemoryRegion { + slot, + guest_phys_addr, + memory_size, + userspace_addr, + 
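+            // The two flag bits below are OR-ed together; each contributes 0
+            // when its corresponding boolean is false.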
flags: if readonly { KVM_MEM_READONLY } else { 0 } + | if log_dirty_pages { + KVM_MEM_LOG_DIRTY_PAGES + } else { + 0 + }, + } + } + + pub(crate) fn read_bar(&mut self, base: u64, offset: u64, data: &mut [u8]) { + let addr = base + offset; + if let Some(region) = self.find_region(addr) { + let offset = addr - region.start.raw_value(); + + if self.interrupt.msix_table_accessed(region.index, offset) { + self.interrupt.msix_read_table(offset, data); + } else { + self.vfio_wrapper.region_read(region.index, offset, data); + } + } + + // INTx EOI + // The guest reading from the BAR potentially means the interrupt has + // been received and can be acknowledged. + if self.interrupt.intx_in_use() { + if let Err(e) = self.vfio_wrapper.unmask_irq(VFIO_PCI_INTX_IRQ_INDEX) { + error!("Failed unmasking INTx IRQ: {}", e); + } + } + } + + pub(crate) fn write_bar( + &mut self, + base: u64, + offset: u64, + data: &[u8], + ) -> Option> { + let addr = base + offset; + if let Some(region) = self.find_region(addr) { + let offset = addr - region.start.raw_value(); + + // If the MSI-X table is written to, we need to update our cache. + if self.interrupt.msix_table_accessed(region.index, offset) { + self.interrupt.msix_write_table(offset, data); + } else { + self.vfio_wrapper.region_write(region.index, offset, data); + } + } + + // INTx EOI + // The guest writing to the BAR potentially means the interrupt has + // been received and can be acknowledged. + if self.interrupt.intx_in_use() { + if let Err(e) = self.vfio_wrapper.unmask_irq(VFIO_PCI_INTX_IRQ_INDEX) { + error!("Failed unmasking INTx IRQ: {}", e); + } + } + + None + } + + pub(crate) fn write_config_register( + &mut self, + reg_idx: usize, + offset: u64, + data: &[u8], + ) -> Option> { + // When the guest wants to write to a BAR, we trap it into + // our local configuration space. We're not reprogramming + // VFIO device. + if (PCI_CONFIG_BAR0_INDEX..PCI_CONFIG_BAR0_INDEX + BAR_NUMS).contains(®_idx) + || reg_idx == PCI_ROM_EXP_BAR_INDEX + { + // We keep our local cache updated with the BARs. + // We'll read it back from there when the guest is asking + // for BARs (see read_config_register()). + self.configuration + .write_config_register(reg_idx, offset, data); + return None; + } + + let reg = (reg_idx * PCI_CONFIG_REGISTER_SIZE) as u64; + + // If the MSI or MSI-X capabilities are accessed, we need to + // update our local cache accordingly. + // Depending on how the capabilities are modified, this could + // trigger a VFIO MSI or MSI-X toggle. + if let Some((cap_id, cap_base)) = self.interrupt.accessed(reg) { + let cap_offset: u64 = reg - cap_base + offset; + match cap_id { + PciCapabilityId::MessageSignalledInterrupts => { + if let Err(e) = self.update_msi_capabilities(cap_offset, data) { + error!("Could not update MSI capabilities: {}", e); + } + } + PciCapabilityId::MsiX => { + if let Err(e) = self.update_msix_capabilities(cap_offset, data) { + error!("Could not update MSI-X capabilities: {}", e); + } + } + _ => {} + } + } + + // Make sure to write to the device's PCI config space after MSI/MSI-X + // interrupts have been enabled/disabled. In case of MSI, when the + // interrupts are enabled through VFIO (using VFIO_DEVICE_SET_IRQS), + // the MSI Enable bit in the MSI capability structure found in the PCI + // config space is disabled by default. 
That's why when the guest is + // enabling this bit, we first need to enable the MSI interrupts with + // VFIO through VFIO_DEVICE_SET_IRQS ioctl, and only after we can write + // to the device region to update the MSI Enable bit. + self.vfio_wrapper.write_config((reg + offset) as u32, data); + + None + } + + pub(crate) fn read_config_register(&mut self, reg_idx: usize) -> u32 { + // When reading the BARs, we trap it and return what comes + // from our local configuration space. We want the guest to + // use that and not the VFIO device BARs as it does not map + // with the guest address space. + if (PCI_CONFIG_BAR0_INDEX..PCI_CONFIG_BAR0_INDEX + BAR_NUMS).contains(®_idx) + || reg_idx == PCI_ROM_EXP_BAR_INDEX + { + return self.configuration.read_reg(reg_idx); + } + + if let Some(id) = self.get_msix_cap_idx() { + let msix = self.interrupt.msix.as_mut().unwrap(); + if reg_idx * 4 == id + 4 { + return msix.cap.table; + } else if reg_idx * 4 == id + 8 { + return msix.cap.pba; + } + } + + // Since we don't support passing multi-functions devices, we should + // mask the multi-function bit, bit 7 of the Header Type byte on the + // register 3. + let mask = if reg_idx == PCI_HEADER_TYPE_REG_INDEX { + 0xff7f_ffff + } else { + 0xffff_ffff + }; + + // The config register read comes from the VFIO device itself. + let mut value = self.vfio_wrapper.read_config_dword((reg_idx * 4) as u32) & mask; + + if let Some(config_patch) = self.patches.get(®_idx) { + value = (value & !config_patch.mask) | config_patch.patch; + } + + value + } + + fn state(&self) -> VfioCommonState { + let intx_state = self.interrupt.intx.as_ref().map(|intx| IntxState { + enabled: intx.enabled, + }); + + let msi_state = self.interrupt.msi.as_ref().map(|msi| MsiState { + cap: msi.cfg.cap, + cap_offset: msi.cap_offset, + }); + + let msix_state = self.interrupt.msix.as_ref().map(|msix| MsixState { + cap: msix.cap, + cap_offset: msix.cap_offset, + bdf: msix.bar.devid, + }); + + VfioCommonState { + intx_state, + msi_state, + msix_state, + } + } + + fn set_state( + &mut self, + state: &VfioCommonState, + msi_state: Option, + msix_state: Option, + ) -> Result<(), VfioPciError> { + if let (Some(intx), Some(interrupt_source_group)) = + (&state.intx_state, self.legacy_interrupt_group.clone()) + { + self.interrupt.intx = Some(VfioIntx { + interrupt_source_group, + enabled: false, + }); + + if intx.enabled { + self.enable_intx()?; + } + } + + if let Some(msi) = &state.msi_state { + self.initialize_msi(msi.cap.msg_ctl, msi.cap_offset, msi_state); + } + + if let Some(msix) = &state.msix_state { + self.initialize_msix(msix.cap, msix.cap_offset, msix.bdf.into(), msix_state); + } + + Ok(()) + } +} + +/// VfioPciDevice represents a VFIO PCI device. +/// This structure implements the BusDevice and PciDevice traits. +/// +/// A VfioPciDevice is bound to a VfioDevice and is also a PCI device. +/// The VMM creates a VfioDevice, then assigns it to a VfioPciDevice, +/// which then gets added to the PCI bus. 
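+///
+/// A rough usage sketch (not a doctest; the setup of the VM handle, the VFIO
+/// container/device and the interrupt manager is elided, and the variable
+/// names are illustrative only):
+///
+/// ```ignore
+/// let mut vfio_pci = VfioPciDevice::new(
+///     "vfio0".to_string(),
+///     vm,
+///     vfio_device,
+///     container,
+///     msi_interrupt_manager,
+///     None,  // no legacy interrupt group
+///     false, // not attached to a virtual IOMMU
+///     bdf,
+///     memory_slot,
+///     None,  // no NVIDIA GPUDirect clique
+/// )?;
+/// // Map the device BARs so guest accesses bypass the VMM.
+/// vfio_pci.map_mmio_regions()?;
+/// ```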
+pub struct VfioPciDevice { + id: String, + vm: Arc>, + device: Arc, + container: Arc, + common: VfioCommon, + iommu_attached: bool, + memory_slot: Arc u32 + Send + Sync>, +} + +impl Debug for VfioPciDevice { + fn fmt(&self, f: &mut Formatter) -> fmt::Result { + f.debug_struct("VfioPciDevice") + .finish() + } +} + +impl VfioPciDevice { + /// Constructs a new Vfio Pci device for the given Vfio device + #[allow(clippy::too_many_arguments)] + pub fn new( + id: String, + vm: Arc>, + device: VfioDevice, + container: Arc, + msi_interrupt_manager: Arc>, + legacy_interrupt_group: Option>, + iommu_attached: bool, + bdf: PciBdf, + memory_slot: Arc u32 + Send + Sync>, + x_nv_gpudirect_clique: Option, + ) -> Result { + let device = Arc::new(device); + device.reset(); + + let vfio_wrapper = VfioDeviceWrapper::new(Arc::clone(&device)); + + let common = VfioCommon::new( + msi_interrupt_manager, + legacy_interrupt_group, + Arc::new(vfio_wrapper) as Arc, + &PciVfioSubclass::VfioSubclass, + bdf, + x_nv_gpudirect_clique, + )?; + + let vfio_pci_device = VfioPciDevice { + id, + vm: vm.clone(), + device, + container, + common, + iommu_attached, + memory_slot, + }; + + Ok(vfio_pci_device) + } + + pub fn iommu_attached(&self) -> bool { + self.iommu_attached + } + + fn generate_sparse_areas( + caps: &[VfioRegionInfoCap], + region_index: u32, + region_start: u64, + region_size: u64, + vfio_msix: Option<&VfioMsix>, + ) -> Result, VfioPciError> { + for cap in caps { + match cap { + VfioRegionInfoCap::SparseMmap(sparse_mmap) => return Ok(sparse_mmap.areas.clone()), + VfioRegionInfoCap::MsixMappable => { + if !is_4k_aligned(region_start) { + error!( + "Region start address 0x{:x} must be at least aligned on 4KiB", + region_start + ); + return Err(VfioPciError::RegionAlignment); + } + if !is_4k_multiple(region_size) { + error!( + "Region size 0x{:x} must be at least a multiple of 4KiB", + region_size + ); + return Err(VfioPciError::RegionSize); + } + + // In case the region contains the MSI-X vectors table or + // the MSI-X PBA table, we must calculate the subregions + // around them, leading to a list of sparse areas. + // We want to make sure we will still trap MMIO accesses + // to these MSI-X specific ranges. If these region don't align + // with pagesize, we can achieve it by enlarging its range. + // + // Using a BtreeMap as the list provided through the iterator is sorted + // by key. This ensures proper split of the whole region. 
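+                    // Worked example (illustrative numbers only): for a
+                    // 0x4000-byte region whose MSI-X table occupies
+                    // [0x1000, 0x2000), the loop below produces the sparse
+                    // areas [0x0, 0x1000) and [0x2000, 0x4000), leaving the
+                    // table range trapped by the VMM.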
+                    let mut inter_ranges = BTreeMap::new();
+                    if let Some(msix) = vfio_msix {
+                        if region_index == msix.cap.table_bir() {
+                            let (offset, size) = msix.cap.table_range();
+                            let offset = align_page_size_down(offset);
+                            let size = align_page_size_up(size);
+                            inter_ranges.insert(offset, size);
+                        }
+                        if region_index == msix.cap.pba_bir() {
+                            let (offset, size) = msix.cap.pba_range();
+                            let offset = align_page_size_down(offset);
+                            let size = align_page_size_up(size);
+                            inter_ranges.insert(offset, size);
+                        }
+                    }
+
+                    let mut sparse_areas = Vec::new();
+                    let mut current_offset = 0;
+                    for (range_offset, range_size) in inter_ranges {
+                        if range_offset > current_offset {
+                            sparse_areas.push(VfioRegionSparseMmapArea {
+                                offset: current_offset,
+                                size: range_offset - current_offset,
+                            });
+                        }
+                        current_offset = align_page_size_down(range_offset + range_size);
+                    }
+
+                    if region_size > current_offset {
+                        sparse_areas.push(VfioRegionSparseMmapArea {
+                            offset: current_offset,
+                            size: region_size - current_offset,
+                        });
+                    }
+
+                    return Ok(sparse_areas);
+                }
+                _ => {}
+            }
+        }
+
+        // In case no relevant capabilities have been found, create a single
+        // sparse area corresponding to the entire MMIO region.
+        Ok(vec![VfioRegionSparseMmapArea {
+            offset: 0,
+            size: region_size,
+        }])
+    }
+
+    /// Map MMIO regions into the guest, and avoid VM exits when the guest
+    /// tries to reach those regions.
+    ///
+    /// The VM handle and the memory-slot closure provided when the device was
+    /// created are used to register the VFIO MMIO regions as user memory
+    /// regions.
+    pub fn map_mmio_regions(&mut self) -> Result<(), VfioPciError> {
+        let fd = self.device.as_raw_fd();
+
+        for region in self.common.mmio_regions.iter_mut() {
+            let region_flags = self.device.get_region_flags(region.index);
+            if region_flags & VFIO_REGION_INFO_FLAG_MMAP != 0 {
+                let mut prot = 0;
+                if region_flags & VFIO_REGION_INFO_FLAG_READ != 0 {
+                    prot |= libc::PROT_READ;
+                }
+                if region_flags & VFIO_REGION_INFO_FLAG_WRITE != 0 {
+                    prot |= libc::PROT_WRITE;
+                }
+
+                // Retrieve the list of capabilities found on the region
+                let caps = if region_flags & VFIO_REGION_INFO_FLAG_CAPS != 0 {
+                    self.device.get_region_caps(region.index)
+                } else {
+                    Vec::new()
+                };
+
+                // Don't try to mmap the region if it contains the MSI-X table
+                // or the MSI-X PBA subregion, and if we couldn't find
+                // MSIX_MAPPABLE in the list of supported capabilities.
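+                // Leaving such a region unmapped means every guest access to
+                // it keeps trapping into the VMM, which is what the MSI-X
+                // table and PBA emulation relies on.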
+ if let Some(msix) = self.common.interrupt.msix.as_ref() { + if (region.index == msix.cap.table_bir() || region.index == msix.cap.pba_bir()) + && !caps.contains(&VfioRegionInfoCap::MsixMappable) + { + continue; + } + } + + let mmap_size = self.device.get_region_size(region.index); + let mmap_offset = self.device.get_region_offset(region.index); + + let sparse_areas = Self::generate_sparse_areas( + &caps, + region.index, + region.start.0, + mmap_size, + self.common.interrupt.msix.as_ref(), + )?; + + for area in sparse_areas.iter() { + // SAFETY: FFI call with correct arguments + let host_addr = unsafe { + libc::mmap( + null_mut(), + area.size as usize, + prot, + libc::MAP_SHARED, + fd, + mmap_offset as libc::off_t + area.offset as libc::off_t, + ) + }; + + if host_addr == libc::MAP_FAILED { + error!( + "Could not mmap sparse area (offset = 0x{:x}, size = 0x{:x}): {}", + area.offset, + area.size, + std::io::Error::last_os_error() + ); + return Err(VfioPciError::MmapArea); + } + + if !is_page_size_aligned(area.size) || !is_page_size_aligned(area.offset) { + warn!( + "Could not mmap sparse area that is not page size aligned (offset = 0x{:x}, size = 0x{:x})", + area.offset, + area.size, + ); + return Ok(()); + } + + let user_memory_region = UserMemoryRegion { + slot: (self.memory_slot)(), + start: region.start.0 + area.offset, + size: area.size, + host_addr: host_addr as u64, + }; + + region.user_memory_regions.push(user_memory_region); + + let mem_region = VfioCommon::make_user_memory_region( + user_memory_region.slot, + user_memory_region.start, + user_memory_region.size, + user_memory_region.host_addr, + false, + false, + ); + + unsafe { + self.vm.lock().expect("Poisoned lock") + .set_user_memory_region(mem_region) + .map_err(|e| VfioPciError::MapRegionGuest(e.into()))?; + } + + if !self.iommu_attached { + self.container + .vfio_dma_map( + user_memory_region.start, + user_memory_region.size, + user_memory_region.host_addr, + ) + .map_err(VfioPciError::DmaMap)?; + } + } + } + } + + Ok(()) + } + + pub fn unmap_mmio_regions(&mut self) { + for region in self.common.mmio_regions.iter() { + for user_memory_region in region.user_memory_regions.iter() { + // Unmap from vfio container + if !self.iommu_attached { + if let Err(e) = self + .container + .vfio_dma_unmap(user_memory_region.start, user_memory_region.size) + { + error!("Could not unmap mmio region from vfio container: {}", e); + } + } + + // Remove region + let r = VfioCommon::make_user_memory_region( + user_memory_region.slot, + user_memory_region.start, + user_memory_region.size, + user_memory_region.host_addr, + false, + false, + ); + + if let Err(e) = unsafe { self.vm.lock().expect("Poisoned lock").set_user_memory_region(r) } { + error!("Could not remove the userspace memory region: {}", e); + } + + // SAFETY: FFI call with correct arguments + let ret = unsafe { + libc::munmap( + user_memory_region.host_addr as *mut libc::c_void, + user_memory_region.size as usize, + ) + }; + if ret != 0 { + error!( + "Could not unmap region {}, error:{}", + region.index, + io::Error::last_os_error() + ); + } + } + } + } + + pub fn dma_map(&self, iova: u64, size: u64, user_addr: u64) -> Result<(), VfioPciError> { + if !self.iommu_attached { + self.container + .vfio_dma_map(iova, size, user_addr) + .map_err(VfioPciError::DmaMap)?; + } + + Ok(()) + } + + pub fn dma_unmap(&self, iova: u64, size: u64) -> Result<(), VfioPciError> { + if !self.iommu_attached { + self.container + .vfio_dma_unmap(iova, size) + .map_err(VfioPciError::DmaUnmap)?; + } + + Ok(()) + 
} + + pub fn mmio_regions(&self) -> Vec { + self.common.mmio_regions.clone() + } +} + +impl Drop for VfioPciDevice { + fn drop(&mut self) { + self.unmap_mmio_regions(); + + if let Some(msix) = &self.common.interrupt.msix { + if msix.bar.enabled() { + self.common.disable_msix(); + } + } + + if let Some(msi) = &self.common.interrupt.msi { + if msi.cfg.enabled() { + self.common.disable_msi() + } + } + + if self.common.interrupt.intx_in_use() { + self.common.disable_intx(); + } + } +} + +impl VfioPciDevice { + pub fn bus_read(&mut self, base: u64, offset: u64, data: &mut [u8]) { + self.read_bar(base, offset, data) + } + + pub fn bus_write(&mut self, base: u64, offset: u64, data: &[u8]) { + self.write_bar(base, offset, data); + () + } +} + +// First BAR offset in the PCI config space. +const PCI_CONFIG_BAR_OFFSET: u32 = 0x10; +// Capability register offset in the PCI config space. +const PCI_CONFIG_CAPABILITY_OFFSET: u32 = 0x34; +// Extended capabilities register offset in the PCI config space. +const PCI_CONFIG_EXTENDED_CAPABILITY_OFFSET: u32 = 0x100; +// IO BAR when first BAR bit is 1. +const PCI_CONFIG_IO_BAR: u32 = 0x1; +// 64-bit memory bar flag. +const PCI_CONFIG_MEMORY_BAR_64BIT: u32 = 0x4; +// Prefetchable BAR bit +const PCI_CONFIG_BAR_PREFETCHABLE: u32 = 0x8; +// PCI config register size (4 bytes). +const PCI_CONFIG_REGISTER_SIZE: usize = 4; +// Number of BARs for a PCI device +const BAR_NUMS: usize = 6; +// PCI Header Type register index +const PCI_HEADER_TYPE_REG_INDEX: usize = 3; +// First BAR register index +const PCI_CONFIG_BAR0_INDEX: usize = 4; +// PCI ROM expansion BAR register index +const PCI_ROM_EXP_BAR_INDEX: usize = 12; + +impl PciDevice for VfioPciDevice { + fn allocate_bars( + &mut self, + allocator: &Arc>, + mmio32_allocator: &mut AddressAllocator, + mmio64_allocator: &mut AddressAllocator, + resources: Option>, + ) -> Result, PciDeviceError> { + self.common + .allocate_bars(allocator, mmio32_allocator, mmio64_allocator, resources) + } + + fn free_bars( + &mut self, + allocator: &mut SystemAllocator, + mmio32_allocator: &mut AddressAllocator, + mmio64_allocator: &mut AddressAllocator, + ) -> Result<(), PciDeviceError> { + self.common + .free_bars(allocator, mmio32_allocator, mmio64_allocator) + } + + fn write_config_register( + &mut self, + reg_idx: usize, + offset: u64, + data: &[u8], + ) -> Option> { + self.common.write_config_register(reg_idx, offset, data) + } + + fn read_config_register(&mut self, reg_idx: usize) -> u32 { + self.common.read_config_register(reg_idx) + } + + fn detect_bar_reprogramming( + &mut self, + reg_idx: usize, + data: &[u8], + ) -> Option { + self.common + .configuration + .detect_bar_reprogramming(reg_idx, data) + } + + fn read_bar(&mut self, base: u64, offset: u64, data: &mut [u8]) { + self.common.read_bar(base, offset, data) + } + + fn write_bar(&mut self, base: u64, offset: u64, data: &[u8]) -> Option> { + self.common.write_bar(base, offset, data) + } + + fn move_bar(&mut self, old_base: u64, new_base: u64) -> Result<(), io::Error> { + for region in self.common.mmio_regions.iter_mut() { + if region.start.raw_value() == old_base { + region.start = GuestAddress(new_base); + + for user_memory_region in region.user_memory_regions.iter_mut() { + // Remove old region + let old_mem_region = VfioCommon::make_user_memory_region( + user_memory_region.slot, + user_memory_region.start, + user_memory_region.size, + user_memory_region.host_addr, + false, + false, + ); + + unsafe { self.vm.lock().expect("Poisoned 
lock").set_user_memory_region(old_mem_region) } + .map_err(|e| io::Error::new(io::ErrorKind::Other, e))?; + + // Update the user memory region with the correct start address. + if new_base > old_base { + user_memory_region.start += new_base - old_base; + } else { + user_memory_region.start -= old_base - new_base; + } + + // Insert new region + let new_mem_region = VfioCommon::make_user_memory_region( + user_memory_region.slot, + user_memory_region.start, + user_memory_region.size, + user_memory_region.host_addr, + false, + false, + ); + + unsafe { self.vm.lock().expect("Poisoned lock").set_user_memory_region(new_mem_region) } + .map_err(|e| io::Error::new(io::ErrorKind::Other, e))?; + } + } + } + + Ok(()) + } + + fn as_any(&mut self) -> &mut dyn Any { + self + } + + fn id(&self) -> Option { + Some(self.id.clone()) + } +} + +/// This structure implements the ExternalDmaMapping trait. It is meant to +/// be used when the caller tries to provide a way to update the mappings +/// associated with a specific VFIO container. +pub struct VfioDmaMapping { + container: Arc, + memory: Arc, + mmio_regions: Arc>>, +} + +impl VfioDmaMapping { + /// Create a DmaMapping object. + /// # Parameters + /// * `container`: VFIO container object. + /// * `memory`: guest memory to mmap. + /// * `mmio_regions`: mmio_regions to mmap. + pub fn new( + container: Arc, + memory: Arc, + mmio_regions: Arc>>, + ) -> Self { + VfioDmaMapping { + container, + memory, + mmio_regions, + } + } +} + +impl ExternalDmaMapping for VfioDmaMapping { + fn map(&self, iova: u64, gpa: u64, size: u64) -> std::result::Result<(), io::Error> { + let mem = self.memory.memory(); + let guest_addr = GuestAddress(gpa); + let user_addr = if mem.check_range(guest_addr, size as usize) { + match mem.get_host_address(guest_addr) { + Ok(t) => t as u64, + Err(e) => { + return Err(io::Error::new( + io::ErrorKind::Other, + format!("unable to retrieve user address for gpa 0x{gpa:x} from guest memory region: {e}") + )); + } + } + } else if self.mmio_regions.lock().unwrap().check_range(gpa, size) { + self.mmio_regions.lock().unwrap().find_user_address(gpa)? 
+ } else { + return Err(io::Error::new( + io::ErrorKind::Other, + format!("failed to locate guest address 0x{gpa:x} in guest memory"), + )); + }; + + self.container + .vfio_dma_map(iova, size, user_addr) + .map_err(|e| { + io::Error::new( + io::ErrorKind::Other, + format!( + "failed to map memory for VFIO container, \ + iova 0x{iova:x}, gpa 0x{gpa:x}, size 0x{size:x}: {e:?}" + ), + ) + }) + } + + fn unmap(&self, iova: u64, size: u64) -> std::result::Result<(), io::Error> { + self.container.vfio_dma_unmap(iova, size).map_err(|e| { + io::Error::new( + io::ErrorKind::Other, + format!( + "failed to unmap memory for VFIO container, \ + iova 0x{iova:x}, size 0x{size:x}: {e:?}" + ), + ) + }) + } +} diff --git a/src/vm-device/Cargo.toml b/src/vm-device/Cargo.toml new file mode 100644 index 00000000000..8e19a548857 --- /dev/null +++ b/src/vm-device/Cargo.toml @@ -0,0 +1,18 @@ +[package] +authors = ["The Cloud Hypervisor Authors"] +edition = "2021" +name = "vm-device" +version = "0.1.0" + +[features] +default = [] +kvm = ["vfio-ioctls/kvm"] +mshv = ["vfio-ioctls/mshv"] + +[dependencies] +anyhow = "1.0.87" +serde = { version = "1.0.208", features = ["derive", "rc"] } +thiserror = "1.0.62" +vfio-ioctls = { git = "https://github.com/rust-vmm/vfio", branch = "main" } +vm-memory = { version = "0.16.0", features = ["backend-mmap"] } +vmm-sys-util = ">=0.3.1" diff --git a/src/vm-device/src/bus.rs b/src/vm-device/src/bus.rs new file mode 100644 index 00000000000..3817d443f35 --- /dev/null +++ b/src/vm-device/src/bus.rs @@ -0,0 +1,377 @@ +// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 +// +// Portions Copyright 2017 The Chromium OS Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE-BSD-3-Clause file. + +//! Handles routing to devices in an address space. + +use std::cmp::Ordering; +use std::collections::btree_map::BTreeMap; +use std::sync::{Arc, Barrier, Mutex, RwLock, Weak}; +use std::{convert, error, fmt, io, result}; + +/// Trait for devices that respond to reads or writes in an arbitrary address space. +/// +/// The device does not care where it exists in address space as each method is only given an offset +/// into its allocated portion of address space. +#[allow(unused_variables)] +pub trait BusDevice: Send { + /// Reads at `offset` from this device + fn read(&mut self, base: u64, offset: u64, data: &mut [u8]) {} + /// Writes at `offset` into this device + fn write(&mut self, base: u64, offset: u64, data: &[u8]) -> Option> { + None + } +} + +#[allow(unused_variables)] +pub trait BusDeviceSync: Send + Sync { + /// Reads at `offset` from this device + fn read(&self, base: u64, offset: u64, data: &mut [u8]) {} + /// Writes at `offset` into this device + fn write(&self, base: u64, offset: u64, data: &[u8]) -> Option> { + None + } +} + +impl BusDeviceSync for Mutex { + /// Reads at `offset` from this device + fn read(&self, base: u64, offset: u64, data: &mut [u8]) { + self.lock() + .expect("Failed to acquire device lock") + .read(base, offset, data) + } + /// Writes at `offset` into this device + fn write(&self, base: u64, offset: u64, data: &[u8]) -> Option> { + self.lock() + .expect("Failed to acquire device lock") + .write(base, offset, data) + } +} + +#[derive(Debug)] +pub enum Error { + /// The insertion failed because the new device overlapped with an old device. + Overlap, + /// Failed to operate on zero sized range. 
+    ZeroSizedRange,
+    /// Failed to find address range.
+    MissingAddressRange,
+}
+
+pub type Result<T> = result::Result<T, Error>;
+
+impl fmt::Display for Error {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        write!(f, "bus_error: {self:?}")
+    }
+}
+
+impl error::Error for Error {}
+
+impl convert::From<Error> for io::Error {
+    fn from(e: Error) -> Self {
+        io::Error::new(io::ErrorKind::Other, e)
+    }
+}
+
+/// Holds a base and length representing the address space occupied by a `BusDevice`.
+///
+/// * base - The address at which the range starts.
+/// * len - The length of the range in bytes.
+#[derive(Debug, Copy, Clone)]
+pub struct BusRange {
+    pub base: u64,
+    pub len: u64,
+}
+
+impl BusRange {
+    /// Returns true if there is overlap with the given range.
+    pub fn overlaps(&self, base: u64, len: u64) -> bool {
+        self.base < (base + len) && base < self.base + self.len
+    }
+}
+
+impl Eq for BusRange {}
+
+impl PartialEq for BusRange {
+    fn eq(&self, other: &BusRange) -> bool {
+        self.base == other.base
+    }
+}
+
+impl Ord for BusRange {
+    fn cmp(&self, other: &BusRange) -> Ordering {
+        self.base.cmp(&other.base)
+    }
+}
+
+impl PartialOrd for BusRange {
+    fn partial_cmp(&self, other: &BusRange) -> Option<Ordering> {
+        Some(self.cmp(other))
+    }
+}
+
+/// A device container for routing reads and writes over some address space.
+///
+/// This doesn't have any restrictions on what kind of device or address space this applies to. The
+/// only restriction is that no two devices can overlap in this address space.
+#[derive(Default)]
+pub struct Bus {
+    devices: RwLock<BTreeMap<BusRange, Weak<dyn BusDeviceSync>>>,
+}
+
+impl Bus {
+    /// Constructs a bus with an empty address space.
+    pub fn new() -> Bus {
+        Bus {
+            devices: RwLock::new(BTreeMap::new()),
+        }
+    }
+
+    fn first_before(&self, addr: u64) -> Option<(BusRange, Arc<dyn BusDeviceSync>)> {
+        let devices = self.devices.read().unwrap();
+        let (range, dev) = devices
+            .range(..=BusRange { base: addr, len: 1 })
+            .next_back()?;
+        dev.upgrade().map(|d| (*range, d.clone()))
+    }
+
+    #[allow(clippy::type_complexity)]
+    fn resolve(&self, addr: u64) -> Option<(u64, u64, Arc<dyn BusDeviceSync>)> {
+        if let Some((range, dev)) = self.first_before(addr) {
+            let offset = addr - range.base;
+            if offset < range.len {
+                return Some((range.base, offset, dev));
+            }
+        }
+        None
+    }
+
+    pub fn insert(&self, device: Arc<dyn BusDeviceSync>, base: u64, len: u64) -> Result<()> {
+        if len == 0 {
+            return Err(Error::ZeroSizedRange);
+        }
+
+        // Reject all cases where the new device's range overlaps with an existing device.
+        if self
+            .devices
+            .read()
+            .unwrap()
+            .iter()
+            .any(|(range, _dev)| range.overlaps(base, len))
+        {
+            return Err(Error::Overlap);
+        }
+
+        if self
+            .devices
+            .write()
+            .unwrap()
+            .insert(BusRange { base, len }, Arc::downgrade(&device))
+            .is_some()
+        {
+            return Err(Error::Overlap);
+        }
+
+        Ok(())
+    }
+
+    /// Removes the device at the given address space range.
+    pub fn remove(&self, base: u64, len: u64) -> Result<()> {
+        if len == 0 {
+            return Err(Error::ZeroSizedRange);
+        }
+
+        let bus_range = BusRange { base, len };
+
+        if self.devices.write().unwrap().remove(&bus_range).is_none() {
+            return Err(Error::MissingAddressRange);
+        }
+
+        Ok(())
+    }
+
+    /// Removes all entries referencing the given device.
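+    /// A device may have been inserted at several address ranges, so all
+    /// matching entries are dropped in one pass.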
+    pub fn remove_by_device(&self, device: &Arc<dyn BusDeviceSync>) -> Result<()> {
+        let mut device_list = self.devices.write().unwrap();
+        let mut remove_key_list = Vec::new();
+
+        for (key, value) in device_list.iter() {
+            if Arc::ptr_eq(&value.upgrade().unwrap(), device) {
+                remove_key_list.push(*key);
+            }
+        }
+
+        for key in remove_key_list.iter() {
+            device_list.remove(key);
+        }
+
+        Ok(())
+    }
+
+    /// Updates the address range for an existing device.
+    pub fn update_range(
+        &self,
+        old_base: u64,
+        old_len: u64,
+        new_base: u64,
+        new_len: u64,
+    ) -> Result<()> {
+        // Retrieve the device corresponding to the range
+        let device = if let Some((_, _, dev)) = self.resolve(old_base) {
+            dev.clone()
+        } else {
+            return Err(Error::MissingAddressRange);
+        };
+
+        // Remove the old address range
+        self.remove(old_base, old_len)?;
+
+        // Insert the new address range
+        self.insert(device, new_base, new_len)
+    }
+
+    /// Reads data from the device that owns the range containing `addr` and puts it into `data`.
+    ///
+    /// Returns `Ok(())` on success; otherwise `data` is untouched.
+    pub fn read(&self, addr: u64, data: &mut [u8]) -> Result<()> {
+        if let Some((base, offset, dev)) = self.resolve(addr) {
+            // OK to unwrap as lock() failing is a serious error condition and should panic.
+            dev.read(base, offset, data);
+            Ok(())
+        } else {
+            Err(Error::MissingAddressRange)
+        }
+    }
+
+    /// Writes `data` to the device that owns the range containing `addr`.
+    ///
+    /// On success, returns the device's optional barrier; otherwise the write is not performed.
+    pub fn write(&self, addr: u64, data: &[u8]) -> Result<Option<Arc<Barrier>>> {
+        if let Some((base, offset, dev)) = self.resolve(addr) {
+            // OK to unwrap as lock() failing is a serious error condition and should panic.
+            Ok(dev.write(base, offset, data))
+        } else {
+            Err(Error::MissingAddressRange)
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    struct DummyDevice;
+    impl BusDeviceSync for DummyDevice {}
+
+    struct ConstantDevice;
+    impl BusDeviceSync for ConstantDevice {
+        fn read(&self, _base: u64, offset: u64, data: &mut [u8]) {
+            for (i, v) in data.iter_mut().enumerate() {
+                *v = (offset as u8) + (i as u8);
+            }
+        }
+
+        fn write(&self, _base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> {
+            for (i, v) in data.iter().enumerate() {
+                assert_eq!(*v, (offset as u8) + (i as u8))
+            }
+
+            None
+        }
+    }
+
+    #[test]
+    fn bus_insert() {
+        let bus = Bus::new();
+        let dummy = Arc::new(DummyDevice);
+        bus.insert(dummy.clone(), 0x10, 0).unwrap_err();
+        bus.insert(dummy.clone(), 0x10, 0x10).unwrap();
+
+        let result = bus.insert(dummy.clone(), 0x0f, 0x10);
+        assert_eq!(format!("{result:?}"), "Err(Overlap)");
+
+        bus.insert(dummy.clone(), 0x10, 0x10).unwrap_err();
+        bus.insert(dummy.clone(), 0x10, 0x15).unwrap_err();
+        bus.insert(dummy.clone(), 0x12, 0x15).unwrap_err();
+        bus.insert(dummy.clone(), 0x12, 0x01).unwrap_err();
+        bus.insert(dummy.clone(), 0x0, 0x20).unwrap_err();
+        bus.insert(dummy.clone(), 0x20, 0x05).unwrap();
+        bus.insert(dummy.clone(), 0x25, 0x05).unwrap();
+        bus.insert(dummy, 0x0, 0x10).unwrap();
+    }
+
+    #[test]
+    #[allow(clippy::redundant_clone)]
+    fn bus_read_write() {
+        let bus = Bus::new();
+        let dummy = Arc::new(DummyDevice);
+        bus.insert(dummy.clone(), 0x10, 0x10).unwrap();
+        bus.read(0x10, &mut [0, 0, 0, 0]).unwrap();
+        bus.write(0x10, &[0, 0, 0, 0]).unwrap();
+        bus.read(0x11, &mut [0, 0, 0, 0]).unwrap();
+        bus.write(0x11, &[0, 0, 0, 0]).unwrap();
+        bus.read(0x16, &mut [0, 0, 0, 0]).unwrap();
+        bus.write(0x16, &[0, 0, 0, 0]).unwrap();
+        bus.read(0x20, &mut [0, 0, 0, 0]).unwrap_err();
+        bus.write(0x20, &[0, 0,
0, 0]).unwrap_err(); + bus.read(0x06, &mut [0, 0, 0, 0]).unwrap_err(); + bus.write(0x06, &[0, 0, 0, 0]).unwrap_err(); + } + + #[test] + #[allow(clippy::redundant_clone)] + fn bus_read_write_values() { + let bus = Bus::new(); + let dummy = Arc::new(ConstantDevice); + bus.insert(dummy.clone(), 0x10, 0x10).unwrap(); + + let mut values = [0, 1, 2, 3]; + bus.read(0x10, &mut values).unwrap(); + assert_eq!(values, [0, 1, 2, 3]); + bus.write(0x10, &values).unwrap(); + bus.read(0x15, &mut values).unwrap(); + assert_eq!(values, [5, 6, 7, 8]); + bus.write(0x15, &values).unwrap(); + } + + #[test] + #[allow(clippy::redundant_clone)] + fn busrange_cmp() { + let range = BusRange { base: 0x10, len: 2 }; + assert_eq!(range, BusRange { base: 0x10, len: 3 }); + assert_eq!(range, BusRange { base: 0x10, len: 2 }); + + assert!(range < BusRange { base: 0x12, len: 1 }); + assert!(range < BusRange { base: 0x12, len: 3 }); + + assert_eq!(range, range.clone()); + + let bus = Bus::new(); + let mut data = [1, 2, 3, 4]; + let device = Arc::new(DummyDevice); + bus.insert(device.clone(), 0x10, 0x10).unwrap(); + bus.write(0x10, &data).unwrap(); + bus.read(0x10, &mut data).unwrap(); + assert_eq!(data, [1, 2, 3, 4]); + } + + #[test] + fn bus_range_overlap() { + let a = BusRange { + base: 0x1000, + len: 0x400, + }; + assert!(a.overlaps(0x1000, 0x400)); + assert!(a.overlaps(0xf00, 0x400)); + assert!(a.overlaps(0x1000, 0x01)); + assert!(a.overlaps(0xfff, 0x02)); + assert!(a.overlaps(0x1100, 0x100)); + assert!(a.overlaps(0x13ff, 0x100)); + assert!(!a.overlaps(0x1400, 0x100)); + assert!(!a.overlaps(0xf00, 0x100)); + } +} diff --git a/src/vm-device/src/dma_mapping/mod.rs b/src/vm-device/src/dma_mapping/mod.rs new file mode 100644 index 00000000000..69cd880eea3 --- /dev/null +++ b/src/vm-device/src/dma_mapping/mod.rs @@ -0,0 +1,17 @@ +// Copyright © 2021 Intel Corporation +// +// SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause + +/// Trait to trigger DMA mapping updates for devices managed by virtio-iommu +/// +/// Trait meant for triggering the DMA mapping update related to an external +/// device not managed fully through virtio. It is dedicated to virtio-iommu +/// in order to trigger the map update anytime the mapping is updated from the +/// guest. +pub trait ExternalDmaMapping: Send + Sync { + /// Map a memory range + fn map(&self, iova: u64, gpa: u64, size: u64) -> std::result::Result<(), std::io::Error>; + + /// Unmap a memory range + fn unmap(&self, iova: u64, size: u64) -> std::result::Result<(), std::io::Error>; +} diff --git a/src/vm-device/src/interrupt/mod.rs b/src/vm-device/src/interrupt/mod.rs new file mode 100644 index 00000000000..1f845b80197 --- /dev/null +++ b/src/vm-device/src/interrupt/mod.rs @@ -0,0 +1,199 @@ +// Copyright (C) 2019 Alibaba Cloud. All rights reserved. +// Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// Copyright © 2019 Intel Corporation +// +// SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause + +//! Traits and Structs to manage interrupt sources for devices. +//! +//! In system programming, an interrupt is a signal to the processor emitted by hardware or +//! software indicating an event that needs immediate attention. An interrupt alerts the processor +//! to a high-priority condition requiring the interruption of the current code the processor is +//! executing. The processor responds by suspending its current activities, saving its state, and +//! executing a function called an interrupt handler (or an interrupt service routine, ISR) to deal +//! 
with the event. This interruption is temporary, and, after the interrupt handler finishes,
+//! unless handling the interrupt has emitted a fatal error, the processor resumes normal
+//! activities.
+//!
+//! Hardware interrupts are used by devices to communicate that they require attention from the
+//! operating system, or a bare-metal program running on the CPU if there are no OSes. The act of
+//! initiating a hardware interrupt is referred to as an interrupt request (IRQ). Different devices
+//! are usually associated with different interrupts using a unique value associated with each
+//! interrupt. This makes it possible to know which hardware device caused which interrupts.
+//! These interrupt values are often called IRQ lines, or just interrupt lines.
+//!
+//! Nowadays, IRQ lines are not the only mechanism to deliver device interrupts to processors.
+//! MSI [(Message Signaled Interrupt)](https://en.wikipedia.org/wiki/Message_Signaled_Interrupts)
+//! is another commonly used alternative in-band method of signaling an interrupt, using special
+//! in-band messages to replace traditional out-of-band assertion of dedicated interrupt lines.
+//! While more complex to implement in a device, message signaled interrupts have some significant
+//! advantages over pin-based out-of-band interrupt signaling. Message signaled interrupts have
+//! been supported by the PCI bus since version 2.2, and by the later PCI Express bus. Some
+//! non-PCI architectures also use message signaled interrupts.
+//!
+//! While IRQ is a term commonly used by Operating Systems when dealing with hardware
+//! interrupts, the IRQ numbers managed by OSes are independent of the ones managed by the VMM.
+//! For simplicity's sake, the term `Interrupt Source` is used instead of IRQ to represent both
+//! pin-based interrupts and MSI interrupts.
+//!
+//! A device may support multiple types of interrupts, and each type of interrupt may support one
+//! or multiple interrupt sources. For example, a PCI device may support:
+//! * Legacy Irq: exactly one interrupt source.
+//! * PCI MSI Irq: 1, 2, 4, 8, 16 or 32 interrupt sources.
+//! * PCI MSIx Irq: up to 2048 interrupt sources.
+//!
+//! A distinct Interrupt Source Identifier (ISID) will be assigned to each interrupt source.
+//! An ID allocator will be used to allocate and free Interrupt Source Identifiers for devices.
+//! To decouple the vm-device crate from the ID allocator, the vm-device crate doesn't take the
+//! responsibility to allocate/free Interrupt Source IDs but only makes use of assigned IDs.
+//!
+//! The overall flow to deal with interrupts is:
+//! * The VMM creates an interrupt manager
+//! * The VMM creates a device manager, passing on a reference to the interrupt manager
+//! * The device manager passes on a reference to the interrupt manager to all registered devices
+//! * The guest kernel loads drivers for virtual devices
+//! * The guest device driver determines the type and number of interrupts needed, and updates the
+//!   device configuration
+//! * The virtual device backend requests the interrupt manager to create an interrupt group
+//!   according to guest configuration information
+
+use std::{
+    fmt::{Debug, Formatter},
+    sync::Arc,
+};
+use vmm_sys_util::eventfd::EventFd;
+
+/// Reuse std::io::Result to simplify interoperability among crates.
+pub type Result<T> = std::io::Result<T>;
+
+/// Data type to store an interrupt source identifier.
+pub type InterruptIndex = u32;
+
+/// Configuration data for legacy interrupts.
+///
+/// On x86 platforms, legacy interrupts are those routed through PICs or IOAPICs.
+#[derive(Copy, Clone, Debug)]
+pub struct LegacyIrqSourceConfig {
+    pub irqchip: u32,
+    pub pin: u32,
+}
+
+/// Configuration data for MSI/MSI-X interrupts.
+///
+/// On x86 platforms, these interrupts are vectors delivered directly to the LAPIC.
+#[derive(Copy, Clone, Debug, Default)]
+pub struct MsiIrqSourceConfig {
+    /// High address used to deliver the message signalled interrupt.
+    pub high_addr: u32,
+    /// Low address used to deliver the message signalled interrupt.
+    pub low_addr: u32,
+    /// Data written to deliver the message signalled interrupt.
+    pub data: u32,
+    /// Unique ID of the device delivering the message signalled interrupt.
+    pub devid: u32,
+}
+
+/// Configuration data for an interrupt source.
+#[derive(Copy, Clone, Debug)]
+pub enum InterruptSourceConfig {
+    /// Configuration data for Legacy interrupts.
+    LegacyIrq(LegacyIrqSourceConfig),
+    /// Configuration data for PciMsi, PciMsix and generic MSI interrupts.
+    MsiIrq(MsiIrqSourceConfig),
+}
+
+/// Configuration data for legacy, pin based interrupt groups.
+///
+/// A legacy interrupt group only takes one irq number as its configuration.
+#[derive(Copy, Clone, Debug)]
+pub struct LegacyIrqGroupConfig {
+    /// Legacy irq number.
+    pub irq: InterruptIndex,
+}
+
+/// Configuration data for MSI/MSI-X interrupt groups.
+///
+/// MSI/MSI-X interrupt groups are basically a set of vectors.
+#[derive(Copy, Clone, Debug)]
+pub struct MsiIrqGroupConfig {
+    /// First index of the MSI/MSI-X interrupt vectors.
+    pub base: InterruptIndex,
+    /// Number of vectors in the MSI/MSI-X group.
+    pub count: InterruptIndex,
+}
+
+/// Trait to manage interrupt sources for virtual device backends.
+///
+/// InterruptManager implementations should protect themselves from concurrent accesses
+/// internally, so they can be invoked from multi-threaded contexts.
+pub trait InterruptManager: Send + Sync {
+    type GroupConfig;
+
+    /// Create an [InterruptSourceGroup](trait.InterruptSourceGroup.html) object to manage
+    /// interrupt sources for a virtual device.
+    ///
+    /// An [InterruptSourceGroup](trait.InterruptSourceGroup.html) object manages all interrupt
+    /// sources of the same type for a virtual device.
+    ///
+    /// # Arguments
+    /// * config: group configuration, e.g. the base Interrupt Source ID and the number of
+    ///   interrupt sources to be managed by the group object.
+    fn create_group(&self, config: Self::GroupConfig) -> Result<Arc<dyn InterruptSourceGroup>>;
+
+    /// Destroy an [InterruptSourceGroup](trait.InterruptSourceGroup.html) object created by
+    /// [create_group()](trait.InterruptManager.html#tymethod.create_group).
+    ///
+    /// Assume the caller takes the responsibility to disable all interrupt sources of the group
+    /// before calling destroy_group(). This assumption helps to simplify InterruptSourceGroup
+    /// implementations.
+    fn destroy_group(&self, group: Arc<dyn InterruptSourceGroup>) -> Result<()>;
+}
+
+impl Debug for dyn InterruptManager<GroupConfig = MsiIrqGroupConfig> {
+    fn fmt(&self, f: &mut Formatter) -> std::fmt::Result {
+        write!(f, "MsiIrqInterruptManager")
+    }
+}
+
+pub trait InterruptSourceGroup: Send + Sync {
+    /// Enable the interrupt sources in the group to generate interrupts.
+    fn enable(&self) -> Result<()> {
+        // Not all interrupt sources can be enabled.
+        // To accommodate this, we can have a no-op here.
+        Ok(())
+    }
+
+    /// Disable the interrupt sources in the group from generating interrupts.
+ fn disable(&self) -> Result<()> { + // Not all interrupt sources can be disabled. + // To accommodate this, we can have a no-op here. + Ok(()) + } + + /// Inject an interrupt from this interrupt source into the guest. + fn trigger(&self, index: InterruptIndex) -> Result<()>; + + /// Returns an interrupt notifier from this interrupt. + /// + /// An interrupt notifier allows for external components and processes + /// to inject interrupts into a guest, by writing to the file returned + /// by this method. + #[allow(unused_variables)] + fn notifier(&self, index: InterruptIndex) -> Option; + + /// Update the interrupt source group configuration. + /// + /// # Arguments + /// * index: sub-index into the group. + /// * config: configuration data for the interrupt source. + /// * masked: if the interrupt is masked + /// * set_gsi: whether update the GSI routing table. + fn update( + &self, + index: InterruptIndex, + config: InterruptSourceConfig, + masked: bool, + set_gsi: bool, + ) -> Result<()>; + + /// Set the interrupt group GSI routing table. + fn set_gsi(&self) -> Result<()>; +} diff --git a/src/vm-device/src/lib.rs b/src/vm-device/src/lib.rs new file mode 100644 index 00000000000..c10731ea95a --- /dev/null +++ b/src/vm-device/src/lib.rs @@ -0,0 +1,60 @@ +// Copyright © 2020 Intel Corporation +// +// SPDX-License-Identifier: Apache-2.0 +// + +use serde::{Deserialize, Serialize}; + +mod bus; +pub mod dma_mapping; +pub mod interrupt; + +pub use self::bus::{Bus, BusDevice, BusDeviceSync, Error as BusError}; + +/// Type of Message Signalled Interrupt +#[derive(Copy, Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] +pub enum MsiIrqType { + /// PCI MSI IRQ numbers. + PciMsi, + /// PCI MSIx IRQ numbers. + PciMsix, + /// Generic MSI IRQ numbers. + GenericMsi, +} + +#[derive(Copy, Clone, PartialEq, Eq, Serialize, Deserialize, Debug)] +pub enum PciBarType { + Io, + Mmio32, + Mmio64, +} + +/// Enumeration for device resources. +#[allow(missing_docs)] +#[derive(Clone, Debug, Serialize, Deserialize)] +pub enum Resource { + /// IO Port address range. + PioAddressRange { base: u16, size: u16 }, + /// Memory Mapped IO address range. + MmioAddressRange { base: u64, size: u64 }, + /// PCI BAR + PciBar { + index: usize, + base: u64, + size: u64, + type_: PciBarType, + prefetchable: bool, + }, + /// Legacy IRQ number. + LegacyIrq(u32), + /// Message Signaled Interrupt + MsiIrq { + ty: MsiIrqType, + base: u32, + size: u32, + }, + /// Network Interface Card MAC address. + MacAddress(String), + /// KVM memslot index. + KvmMemSlot(u32), +} diff --git a/src/vm-system-allocator/Cargo.toml b/src/vm-system-allocator/Cargo.toml new file mode 100644 index 00000000000..e0c2caf46cc --- /dev/null +++ b/src/vm-system-allocator/Cargo.toml @@ -0,0 +1,9 @@ +[package] +name = "vm-system-allocator" +authors = ["The Chromium OS Authors"] +edition = "2021" +version = "0.1.0" + +[dependencies] +libc = "0.2.158" +vm-memory = "0.16.0" diff --git a/src/vm-system-allocator/src/address.rs b/src/vm-system-allocator/src/address.rs new file mode 100644 index 00000000000..14e0335cdac --- /dev/null +++ b/src/vm-system-allocator/src/address.rs @@ -0,0 +1,401 @@ +// Copyright 2018 The Chromium OS Authors. All rights reserved. +// Copyright © 2019 Intel Corporation +// +// Portions Copyright 2017 The Chromium OS Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE-BSD-3-Clause file. 
+// +// SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause + +use std::collections::btree_map::BTreeMap; +use std::result; + +use vm_memory::{Address, GuestAddress, GuestUsize}; + +#[derive(Debug)] +pub enum Error { + Overflow, + Overlap, + UnalignedAddress, +} + +pub type Result = result::Result; + +/// Manages allocating address ranges. +/// Use `AddressAllocator` whenever an address range needs to be allocated to different users. +/// +/// # Examples +/// +/// ``` +/// # use vm_allocator::AddressAllocator; +/// # use vm_memory::{Address, GuestAddress, GuestUsize}; +/// AddressAllocator::new(GuestAddress(0x1000), 0x10000).map(|mut pool| { +/// assert_eq!(pool.allocate(None, 0x110, Some(0x100)), Some(GuestAddress(0x10e00))); +/// assert_eq!(pool.allocate(None, 0x100, Some(0x100)), Some(GuestAddress(0x10d00))); +/// }); +/// ``` +#[derive(Debug, Eq, PartialEq)] +pub struct AddressAllocator { + base: GuestAddress, + end: GuestAddress, + ranges: BTreeMap, +} + +impl AddressAllocator { + /// Creates a new `AddressAllocator` for managing a range of addresses. + /// Can return `None` if `base` + `size` overflows a u64. + /// + /// * `base` - The starting address of the range to manage. + /// * `size` - The size of the address range in bytes. + pub fn new(base: GuestAddress, size: GuestUsize) -> Option { + if size == 0 { + return None; + } + + let end = base.checked_add(size - 1)?; + + let mut allocator = AddressAllocator { + base, + end, + ranges: BTreeMap::new(), + }; + + // Insert the last address as a zero size range. + // This is our end of address space marker. + allocator.ranges.insert(base.checked_add(size)?, 0); + + Some(allocator) + } + + fn align_address(&self, address: GuestAddress, alignment: GuestUsize) -> GuestAddress { + let align_adjust = if address.raw_value() % alignment != 0 { + alignment - (address.raw_value() % alignment) + } else { + 0 + }; + + address.unchecked_add(align_adjust) + } + + fn available_range( + &self, + req_address: GuestAddress, + req_size: GuestUsize, + alignment: GuestUsize, + ) -> Result { + let aligned_address = self.align_address(req_address, alignment); + + // The requested address should be aligned. + if aligned_address != req_address { + return Err(Error::UnalignedAddress); + } + + // The aligned address should be within the address space range. + if aligned_address >= self.end || aligned_address < self.base { + return Err(Error::Overflow); + } + + let mut prev_end_address = self.base; + for (address, size) in self.ranges.iter() { + if aligned_address <= *address { + // Do we overlap with the previous range? + if prev_end_address > aligned_address { + return Err(Error::Overlap); + } + + // Do we have enough space? + if address + .unchecked_sub(aligned_address.raw_value()) + .raw_value() + < req_size + { + return Err(Error::Overlap); + } + + return Ok(aligned_address); + } + + prev_end_address = address.unchecked_add(*size); + } + + // We have not found a range that starts after the requested address, + // despite having a marker at the end of our range. 
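+        // Since the end-of-space marker is always present, this fall-through
+        // is not expected to be hit in practice.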
+ Err(Error::Overflow) + } + + fn first_available_range( + &self, + req_size: GuestUsize, + alignment: GuestUsize, + ) -> Option { + let reversed_ranges: Vec<(&GuestAddress, &GuestUsize)> = self.ranges.iter().rev().collect(); + + for (idx, (address, _size)) in reversed_ranges.iter().enumerate() { + let next_range_idx = idx + 1; + let prev_end_address = if next_range_idx >= reversed_ranges.len() { + self.base + } else { + reversed_ranges[next_range_idx] + .0 + .unchecked_add(*(reversed_ranges[next_range_idx].1)) + }; + + // If we have enough space between this range and the previous one, + // we return the start of this range minus the requested size. + // As each new range is allocated at the end of the available address space, + // we will tend to always allocate new ranges there as well. In other words, + // ranges accumulate at the end of the address space. + if let Some(size_delta) = + address.checked_sub(self.align_address(prev_end_address, alignment).raw_value()) + { + let adjust = if alignment > 1 { alignment - 1 } else { 0 }; + if size_delta.raw_value() >= req_size { + return Some( + self.align_address(address.unchecked_sub(req_size + adjust), alignment), + ); + } + } + } + + None + } + + /// Allocates a range of addresses from the managed region. Returns `Some(allocated_address)` + /// when successful, or `None` if an area of `size` can't be allocated or if alignment isn't + /// a power of two. + pub fn allocate( + &mut self, + address: Option, + size: GuestUsize, + align_size: Option, + ) -> Option { + if size == 0 { + return None; + } + + let alignment = align_size.unwrap_or(4); + if !alignment.is_power_of_two() || alignment == 0 { + return None; + } + + let new_addr = match address { + Some(req_address) => match self.available_range(req_address, size, alignment) { + Ok(addr) => addr, + Err(_) => { + return None; + } + }, + None => self.first_available_range(size, alignment)?, + }; + + self.ranges.insert(new_addr, size); + + Some(new_addr) + } + + /// Free an already allocated address range. + /// We can only free a range if it matches exactly an already allocated range. 
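+    /// Passing a size that does not exactly match the recorded allocation
+    /// leaves the allocator unchanged.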
+ pub fn free(&mut self, address: GuestAddress, size: GuestUsize) { + if let Some(&range_size) = self.ranges.get(&address) { + if size == range_size { + self.ranges.remove(&address); + } + } + } + + /// Start address of the allocator + pub fn base(&self) -> GuestAddress { + self.base + } + + /// Last address of the allocator + pub fn end(&self) -> GuestAddress { + self.end + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn new_fails_overflow() { + assert_eq!(AddressAllocator::new(GuestAddress(u64::MAX), 0x100), None); + } + + #[test] + fn new_fails_size_zero() { + assert_eq!(AddressAllocator::new(GuestAddress(0x1000), 0), None); + } + + #[test] + fn allocate_fails_alignment_zero() { + let mut pool = AddressAllocator::new(GuestAddress(0x1000), 0x10000).unwrap(); + assert_eq!( + pool.allocate(Some(GuestAddress(0x1000)), 0x100, Some(0)), + None + ); + } + + #[test] + fn allocate_fails_alignment_non_power_of_two() { + let mut pool = AddressAllocator::new(GuestAddress(0x1000), 0x10000).unwrap(); + assert_eq!( + pool.allocate(Some(GuestAddress(0x1000)), 0x100, Some(200)), + None + ); + } + + #[test] + fn allocate_fails_not_enough_space() { + let mut pool = AddressAllocator::new(GuestAddress(0x1000), 0x1000).unwrap(); + assert_eq!( + pool.allocate(None, 0x800, Some(0x100)), + Some(GuestAddress(0x1800)) + ); + assert_eq!(pool.allocate(None, 0x900, Some(0x100)), None); + assert_eq!( + pool.allocate(None, 0x400, Some(0x100)), + Some(GuestAddress(0x1400)) + ); + } + + #[test] + fn allocate_alignment() { + let mut pool = AddressAllocator::new(GuestAddress(0x1000), 0x10000).unwrap(); + assert_eq!( + pool.allocate(None, 0x110, Some(0x100)), + Some(GuestAddress(0x10e00)) + ); + assert_eq!( + pool.allocate(None, 0x100, Some(0x100)), + Some(GuestAddress(0x10d00)) + ); + assert_eq!( + pool.allocate(None, 0x10, Some(0x100)), + Some(GuestAddress(0x10c00)) + ); + } + + #[test] + fn allocate_address() { + let mut pool = AddressAllocator::new(GuestAddress(0x1000), 0x1000).unwrap(); + assert_eq!( + pool.allocate(Some(GuestAddress(0x1200)), 0x800, None), + Some(GuestAddress(0x1200)) + ); + + assert_eq!( + pool.allocate(Some(GuestAddress(0x1a00)), 0x100, None), + Some(GuestAddress(0x1a00)) + ); + } + + #[test] + fn allocate_address_alignment() { + let mut pool = AddressAllocator::new(GuestAddress(0x1000), 0x1000).unwrap(); + assert_eq!( + pool.allocate(Some(GuestAddress(0x1200)), 0x800, Some(0x100)), + Some(GuestAddress(0x1200)) + ); + + // Unaligned request + assert_eq!( + pool.allocate(Some(GuestAddress(0x1210)), 0x800, Some(0x100)), + None + ); + + // Aligned request + assert_eq!( + pool.allocate(Some(GuestAddress(0x1b00)), 0x100, Some(0x100)), + Some(GuestAddress(0x1b00)) + ); + } + + #[test] + fn allocate_address_not_enough_space() { + let mut pool = AddressAllocator::new(GuestAddress(0x1000), 0x1000).unwrap(); + + // First range is [0x1200:0x1a00] + assert_eq!( + pool.allocate(Some(GuestAddress(0x1200)), 0x800, Some(0x100)), + Some(GuestAddress(0x1200)) + ); + + // Second range is [0x1c00:0x1e00] + assert_eq!( + pool.allocate(Some(GuestAddress(0x1c00)), 0x200, Some(0x100)), + Some(GuestAddress(0x1c00)) + ); + + // There is 0x200 between the first 2 ranges. 
+ // We ask for an available address but the range is too big + assert_eq!( + pool.allocate(Some(GuestAddress(0x1b00)), 0x800, Some(0x100)), + None + ); + + // We ask for an available address, with a small enough range + assert_eq!( + pool.allocate(Some(GuestAddress(0x1b00)), 0x100, Some(0x100)), + Some(GuestAddress(0x1b00)) + ); + } + + #[test] + fn allocate_address_free_and_realloc() { + let mut pool = AddressAllocator::new(GuestAddress(0x1000), 0x1000).unwrap(); + + // First range is [0x1200:0x1a00] + assert_eq!( + pool.allocate(Some(GuestAddress(0x1200)), 0x800, Some(0x100)), + Some(GuestAddress(0x1200)) + ); + + pool.free(GuestAddress(0x1200), 0x800); + + assert_eq!( + pool.allocate(Some(GuestAddress(0x1200)), 0x800, Some(0x100)), + Some(GuestAddress(0x1200)) + ); + } + + #[test] + fn allocate_address_free_fail_and_realloc() { + let mut pool = AddressAllocator::new(GuestAddress(0x1000), 0x1000).unwrap(); + + // First range is [0x1200:0x1a00] + assert_eq!( + pool.allocate(Some(GuestAddress(0x1200)), 0x800, Some(0x100)), + Some(GuestAddress(0x1200)) + ); + + // We try to free a range smaller than the allocated one. + pool.free(GuestAddress(0x1200), 0x100); + + assert_eq!( + pool.allocate(Some(GuestAddress(0x1200)), 0x800, Some(0x100)), + None + ); + } + + #[test] + fn allocate_address_fail_free_and_realloc() { + let mut pool = AddressAllocator::new(GuestAddress(0x1000), 0x1000).unwrap(); + + // First allocation fails + assert_eq!( + pool.allocate(Some(GuestAddress(0x1200)), 0x2000, Some(0x100)), + None + ); + + // We try to free a range that was not allocated. + pool.free(GuestAddress(0x1200), 0x2000); + + // Now we try an allocation that should succeed. + assert_eq!( + pool.allocate(Some(GuestAddress(0x1200)), 0x800, Some(0x100)), + Some(GuestAddress(0x1200)) + ); + } +} diff --git a/src/vm-system-allocator/src/gsi.rs b/src/vm-system-allocator/src/gsi.rs new file mode 100644 index 00000000000..19dec1776f5 --- /dev/null +++ b/src/vm-system-allocator/src/gsi.rs @@ -0,0 +1,114 @@ +// Copyright © 2019 Intel Corporation +// +// SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause + +#[cfg(target_arch = "x86_64")] +use std::collections::btree_map::BTreeMap; +use std::result; + +#[derive(Debug)] +pub enum Error { + Overflow, +} + +pub type Result = result::Result; + +/// GsiApic +#[cfg(target_arch = "x86_64")] +#[derive(Copy, Clone)] +pub struct GsiApic { + base: u32, + irqs: u32, +} + +#[cfg(target_arch = "x86_64")] +impl GsiApic { + /// New GSI APIC + pub fn new(base: u32, irqs: u32) -> Self { + GsiApic { base, irqs } + } +} + +/// GsiAllocator +pub struct GsiAllocator { + #[cfg(target_arch = "x86_64")] + apics: BTreeMap, + next_irq: u32, + next_gsi: u32, +} + +impl GsiAllocator { + #[cfg(target_arch = "x86_64")] + /// New GSI allocator + pub fn new(apics: Vec) -> Self { + let mut allocator = GsiAllocator { + apics: BTreeMap::new(), + next_irq: 0xffff_ffff, + next_gsi: 0, + }; + + for apic in &apics { + if apic.base < allocator.next_irq { + allocator.next_irq = apic.base; + } + + if apic.base + apic.irqs > allocator.next_gsi { + allocator.next_gsi = apic.base + apic.irqs; + } + + allocator.apics.insert(apic.base, apic.irqs); + } + + allocator + } + + #[cfg(target_arch = "aarch64")] + /// New GSI allocator + pub fn new() -> Self { + GsiAllocator { + next_irq: arch::IRQ_BASE, + next_gsi: arch::IRQ_BASE, + } + } + + /// Allocate a GSI + pub fn allocate_gsi(&mut self) -> Result { + let gsi = self.next_gsi; + self.next_gsi = self.next_gsi.checked_add(1).ok_or(Error::Overflow)?; + Ok(gsi) + } + + 
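+    // Legacy IRQs are handed out from inside the declared APIC ranges,
+    // while fresh GSIs (used e.g. for MSI/MSI-X routing) start right after
+    // the highest APIC range.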
+    #[cfg(target_arch = "x86_64")]
+    /// Allocate an IRQ
+    pub fn allocate_irq(&mut self) -> Result<u32> {
+        let mut irq: u32 = 0;
+        for (base, irqs) in self.apics.iter() {
+            // HACKHACK - This only works with 1 single IOAPIC...
+            if self.next_irq >= *base && self.next_irq < *base + *irqs {
+                irq = self.next_irq;
+                self.next_irq += 1;
+            }
+        }
+
+        if irq == 0 {
+            return Err(Error::Overflow);
+        }
+
+        Ok(irq)
+    }
+
+    #[cfg(target_arch = "aarch64")]
+    /// Allocate an IRQ
+    pub fn allocate_irq(&mut self) -> Result<u32> {
+        let irq = self.next_irq;
+        self.next_irq = self.next_irq.checked_add(1).ok_or(Error::Overflow)?;
+        Ok(irq)
+    }
+}
+
+#[cfg(target_arch = "aarch64")]
+impl Default for GsiAllocator {
+    fn default() -> Self {
+        GsiAllocator::new()
+    }
+}
diff --git a/src/vm-system-allocator/src/lib.rs b/src/vm-system-allocator/src/lib.rs
new file mode 100644
index 00000000000..801eff9faaa
--- /dev/null
+++ b/src/vm-system-allocator/src/lib.rs
@@ -0,0 +1,23 @@
+// Copyright 2018 The Chromium OS Authors. All rights reserved.
+// Copyright © 2019 Intel Corporation
+//
+// Portions Copyright 2017 The Chromium OS Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE-BSD-3-Clause file.
+//
+// SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause
+#![deny(missing_docs)]
+
+//! Manages system resources that can be allocated to VMs and their devices.
+
+mod address;
+mod gsi;
+/// Page size related utility functions.
+pub mod page_size;
+mod system;
+
+pub use crate::address::AddressAllocator;
+pub use crate::gsi::GsiAllocator;
+#[cfg(target_arch = "x86_64")]
+pub use crate::gsi::GsiApic;
+pub use crate::system::SystemAllocator;
diff --git a/src/vm-system-allocator/src/page_size.rs b/src/vm-system-allocator/src/page_size.rs
new file mode 100644
index 00000000000..96ae01edf79
--- /dev/null
+++ b/src/vm-system-allocator/src/page_size.rs
@@ -0,0 +1,38 @@
+// Copyright 2023 Arm Limited (or its affiliates). All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+use libc::{sysconf, _SC_PAGESIZE};
+
+/// Get the host page size.
+pub fn get_page_size() -> u64 {
+    // SAFETY: FFI call. Trivially safe.
+    unsafe { sysconf(_SC_PAGESIZE) as u64 }
+}
+
+/// Round `address` up to the next page-size boundary.
+pub fn align_page_size_up(address: u64) -> u64 {
+    let page_size = get_page_size();
+    (address + page_size - 1) & !(page_size - 1)
+}
+
+/// Round `address` down to the previous page-size boundary.
+pub fn align_page_size_down(address: u64) -> u64 {
+    let page_size = get_page_size();
+    address & !(page_size - 1)
+}
+
+/// Test if `address` is 4K aligned.
+pub fn is_4k_aligned(address: u64) -> bool {
+    (address & 0xfff) == 0
+}
+
+/// Test if `size` is a multiple of 4K.
+pub fn is_4k_multiple(size: u64) -> bool {
+    (size & 0xfff) == 0
+}
+
+/// Test if `address` is aligned to the host page size.
+pub fn is_page_size_aligned(address: u64) -> bool {
+    let page_size = get_page_size();
+    address & (page_size - 1) == 0
+}
diff --git a/src/vm-system-allocator/src/system.rs b/src/vm-system-allocator/src/system.rs
new file mode 100644
index 00000000000..f709ae101ad
--- /dev/null
+++ b/src/vm-system-allocator/src/system.rs
@@ -0,0 +1,139 @@
+// Copyright 2018 The Chromium OS Authors. All rights reserved.
+// Copyright © 2019 Intel Corporation
+//
+// Portions Copyright 2017 The Chromium OS Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE-BSD-3-Clause file.
+//
+// SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause
+
+use std::fmt::{Debug, Formatter};
+
+use vm_memory::{GuestAddress, GuestUsize};
+
+use crate::address::AddressAllocator;
+use crate::gsi::GsiAllocator;
+#[cfg(target_arch = "x86_64")]
+use crate::gsi::GsiApic;
+use crate::page_size::get_page_size;
+
+/// Manages allocating system resources such as address space and interrupt numbers.
+///
+/// # Example - Use the `SystemAllocator` builder.
+///
+/// ```
+/// # #[cfg(target_arch = "x86_64")]
+/// # use vm_allocator::{GsiApic, SystemAllocator};
+/// # #[cfg(target_arch = "aarch64")]
+/// # use vm_allocator::SystemAllocator;
+/// # use vm_memory::{Address, GuestAddress, GuestUsize};
+/// let mut allocator = SystemAllocator::new(
+///     #[cfg(target_arch = "x86_64")] GuestAddress(0x1000),
+///     #[cfg(target_arch = "x86_64")] 0x10000,
+///     GuestAddress(0x10000000), 0x10000000,
+///     #[cfg(target_arch = "x86_64")] vec![GsiApic::new(5, 19)]).unwrap();
+/// #[cfg(target_arch = "x86_64")]
+/// assert_eq!(allocator.allocate_irq(), Some(5));
+/// #[cfg(target_arch = "aarch64")]
+/// assert_eq!(allocator.allocate_irq(), Some(32));
+/// #[cfg(target_arch = "x86_64")]
+/// assert_eq!(allocator.allocate_irq(), Some(6));
+/// #[cfg(target_arch = "aarch64")]
+/// assert_eq!(allocator.allocate_irq(), Some(33));
+/// assert_eq!(allocator.allocate_platform_mmio_addresses(None, 0x1000, Some(0x1000)), Some(GuestAddress(0x1fff_f000)));
+///
+/// ```
+pub struct SystemAllocator {
+    #[cfg(target_arch = "x86_64")]
+    io_address_space: AddressAllocator,
+    platform_mmio_address_space: AddressAllocator,
+    gsi_allocator: GsiAllocator,
+}
+
+impl Debug for SystemAllocator {
+    fn fmt(&self, f: &mut Formatter) -> std::fmt::Result {
+        f.debug_struct("SystemAllocator")
+            .finish()
+    }
+}
+
+impl SystemAllocator {
+    /// Creates a new `SystemAllocator` for managing addresses and irq numbers.
+    /// Can return `None` if `base` + `size` overflows a `u64`.
+    ///
+    /// * `io_base` - (X86) The starting address of IO memory.
+    /// * `io_size` - (X86) The size of IO memory.
+    /// * `platform_mmio_base` - The starting address of platform MMIO memory.
+    /// * `platform_mmio_size` - The size of platform MMIO memory.
+    /// * `apics` - (X86) Vector of APICs.
+    ///
+    pub fn new(
+        #[cfg(target_arch = "x86_64")] io_base: GuestAddress,
+        #[cfg(target_arch = "x86_64")] io_size: GuestUsize,
+        platform_mmio_base: GuestAddress,
+        platform_mmio_size: GuestUsize,
+        #[cfg(target_arch = "x86_64")] apics: Vec<GsiApic>,
+    ) -> Option<Self> {
+        Some(SystemAllocator {
+            #[cfg(target_arch = "x86_64")]
+            io_address_space: AddressAllocator::new(io_base, io_size)?,
+            platform_mmio_address_space: AddressAllocator::new(
+                platform_mmio_base,
+                platform_mmio_size,
+            )?,
+            #[cfg(target_arch = "x86_64")]
+            gsi_allocator: GsiAllocator::new(apics),
+            #[cfg(target_arch = "aarch64")]
+            gsi_allocator: GsiAllocator::new(),
+        })
+    }
+
+    /// Reserves the next available system irq number.
+    pub fn allocate_irq(&mut self) -> Option<u32> {
+        self.gsi_allocator.allocate_irq().ok()
+    }
+
+    /// Reserves the next available GSI.
+    pub fn allocate_gsi(&mut self) -> Option<u32> {
+        self.gsi_allocator.allocate_gsi().ok()
+    }
+
+    #[cfg(target_arch = "x86_64")]
+    /// Reserves a section of `size` bytes of IO address space.
+    pub fn allocate_io_addresses(
+        &mut self,
+        address: Option<GuestAddress>,
+        size: GuestUsize,
+        align_size: Option<GuestUsize>,
+    ) -> Option<GuestAddress> {
+        self.io_address_space
+            .allocate(address, size, Some(align_size.unwrap_or(0x1)))
+    }
+
+    /// Reserves a section of `size` bytes of platform MMIO address space.
+    pub fn allocate_platform_mmio_addresses(
+        &mut self,
+        address: Option<GuestAddress>,
+        size: GuestUsize,
+        align_size: Option<GuestUsize>,
+    ) -> Option<GuestAddress> {
+        self.platform_mmio_address_space.allocate(
+            address,
+            size,
+            Some(align_size.unwrap_or_else(get_page_size)),
+        )
+    }
+
+    #[cfg(target_arch = "x86_64")]
+    /// Free an IO address range.
+    /// We can only free a range if it matches exactly an already allocated range.
+    pub fn free_io_addresses(&mut self, address: GuestAddress, size: GuestUsize) {
+        self.io_address_space.free(address, size)
+    }
+
+    /// Free a platform MMIO address range.
+    /// We can only free a range if it matches exactly an already allocated range.
+    pub fn free_platform_mmio_addresses(&mut self, address: GuestAddress, size: GuestUsize) {
+        self.platform_mmio_address_space.free(address, size)
+    }
+}
diff --git a/src/vmm/Cargo.toml b/src/vmm/Cargo.toml
index a55cf721e43..0da3ec70604 100644
--- a/src/vmm/Cargo.toml
+++ b/src/vmm/Cargo.toml
@@ -9,6 +9,7 @@ license = "Apache-2.0"
 bench = false
 
 [dependencies]
+anyhow = "1.0.87"
 acpi_tables = { path = "../acpi-tables" }
 aes-gcm = { version = "0.10.1", default-features = false, features = ["aes"] }
 arrayvec = { version = "0.7.6", optional = true }
@@ -16,6 +17,7 @@ aws-lc-rs = { version = "1.10.0", features = ["bindgen"] }
 base64 = "0.22.1"
 bincode = "1.2.1"
 bitflags = "2.6.0"
+byteorder = "1.4.3"
 crc64 = "2.0.0"
 derive_more = { version = "1.0.0", default-features = false, features = ["from", "display"] }
 displaydoc = "0.2.5"
@@ -32,6 +34,7 @@ log-instrument = { path = "../log-instrument", optional = true }
 memfd = "0.6.3"
 micro_http = { git = "https://github.com/firecracker-microvm/micro-http" }
+pci = { path = "../pci" }
 seccompiler = { path = "../seccompiler" }
 semver = { version = "1.0.23", features = ["serde"] }
 serde = { version = "1.0.214", features = ["derive", "rc"] }
@@ -41,8 +44,12 @@ thiserror = "1.0.65"
 timerfd = "1.5.0"
 userfaultfd = "0.8.1"
 utils = { path = "../utils" }
+uuid = "1.8.0"
+vfio-ioctls = { git = "https://github.com/rust-vmm/vfio", branch = "main" }
 vhost = { version = "0.12.1", features = ["vhost-user-frontend"] }
 vm-allocator = "0.1.0"
+vm-system-allocator = { path = "../vm-system-allocator" }
+vm-device = { path = "../vm-device" }
 vm-memory = { version = "0.16.0", features = ["backend-mmap", "backend-bitmap"] }
 vm-superio = "0.8.0"
 vmm-sys-util = { version = "0.12.1", features = ["with-serde"] }
diff --git a/src/vmm/src/acpi/mod.rs b/src/vmm/src/acpi/mod.rs
index 247c832e665..3e6181d0473 100644
--- a/src/vmm/src/acpi/mod.rs
+++ b/src/vmm/src/acpi/mod.rs
@@ -2,7 +2,7 @@
 // SPDX-License-Identifier: Apache-2.0
 
 use acpi_tables::fadt::{FADT_F_HW_REDUCED_ACPI, FADT_F_PWR_BUTTON, FADT_F_SLP_BUTTON};
-use acpi_tables::{aml, Aml, Dsdt, Fadt, Madt, Rsdp, Sdt, Xsdt};
+use acpi_tables::{aml, Aml, Dsdt, Fadt, Madt, Mcfg, Rsdp, Sdt, Xsdt};
 use log::{debug, error};
 use vm_allocator::AllocPolicy;
 
@@ -12,6 +12,7 @@ use crate::acpi::x86_64::{
 use crate::device_manager::acpi::ACPIDeviceManager;
 use crate::device_manager::mmio::MMIODeviceManager;
 use crate::device_manager::resources::ResourceAllocator;
+use crate::devices::pci_segment::PciSegment;
 use crate::vstate::memory::{GuestAddress, GuestMemoryMmap};
 use crate::Vcpu;
 
@@ -80,6 +81,7 @@ impl<'a> AcpiTableWriter<'a> {
     fn build_dsdt(
         &mut self,
         mmio_device_manager: &MMIODeviceManager,
+        pci_segment: Option<&PciSegment>,
         acpi_device_manager: &ACPIDeviceManager,
     ) -> Result<u64, AcpiError> {
         let mut dsdt_data = Vec::new();
@@ -93,6 +95,10 @@ impl<'a> AcpiTableWriter<'a> {
         // Architecture specific DSDT data
         setup_arch_dsdt(&mut dsdt_data)?;
 
+        if let Some(pci_segment) = pci_segment {
+            pci_segment.append_aml_bytes(&mut dsdt_data);
+        }
+
         let mut dsdt = Dsdt::new(OEM_ID, *b"FCVMDSDT", OEM_REVISION, dsdt_data);
 
         self.write_acpi_table(&mut dsdt)
     }
@@ -128,16 +134,34 @@ impl<'a> AcpiTableWriter<'a> {
     /// Build the XSDT table for the guest
     ///
     /// Currently, we pass to the guest just FADT and MADT tables.
-    fn build_xsdt(&mut self, fadt_addr: u64, madt_addr: u64) -> Result<u64, AcpiError> {
+    fn build_xsdt(&mut self, fadt_addr: u64, madt_addr: u64, mcfg_addr: Option<u64>) -> Result<u64, AcpiError> {
+        let tables = if let Some(mcfg_addr) = mcfg_addr {
+            vec![fadt_addr, madt_addr, mcfg_addr]
+        } else {
+            vec![fadt_addr, madt_addr]
+        };
         let mut xsdt = Xsdt::new(
             OEM_ID,
             *b"FCMVXSDT",
             OEM_REVISION,
-            vec![fadt_addr, madt_addr],
+            tables,
         );
 
         self.write_acpi_table(&mut xsdt)
     }
 
+    /// Build the MCFG table for the guest.
+    ///
+    /// The MCFG table advertises the base address of the PCI MMCONFIG (ECAM)
+    /// region to the guest.
+    fn build_mcfg(&mut self, pci_mmio_config_addr: u64) -> Result<u64, AcpiError> {
+        let mut mcfg = Mcfg::new(
+            OEM_ID,
+            *b"CHMCFG  ",
+            OEM_REVISION,
+            pci_mmio_config_addr,
+        );
+        self.write_acpi_table(&mut mcfg)
+    }
+
     /// Build the RSDP pointer for the guest.
     ///
     /// This will build the RSDP pointer which points to the XSDT table and write it in guest
@@ -166,6 +190,8 @@ pub(crate) fn create_acpi_tables(
     resource_allocator: &mut ResourceAllocator,
     mmio_device_manager: &MMIODeviceManager,
     acpi_device_manager: &ACPIDeviceManager,
+    pci_segment: Option<&PciSegment>,
+    pci_mmio_config_addr: u64,
     vcpus: &[Vcpu],
 ) -> Result<(), AcpiError> {
     let mut writer = AcpiTableWriter {
@@ -173,10 +199,11 @@ pub(crate) fn create_acpi_tables(
         resource_allocator,
     };
 
-    let dsdt_addr = writer.build_dsdt(mmio_device_manager, acpi_device_manager)?;
+    let dsdt_addr = writer.build_dsdt(mmio_device_manager, pci_segment, acpi_device_manager)?;
     let fadt_addr = writer.build_fadt(dsdt_addr)?;
     let madt_addr = writer.build_madt(vcpus.len().try_into().unwrap())?;
-    let xsdt_addr = writer.build_xsdt(fadt_addr, madt_addr)?;
+    let mcfg_addr = pci_segment.map(|_| writer.build_mcfg(pci_mmio_config_addr)).transpose()?;
+    let xsdt_addr = writer.build_xsdt(fadt_addr, madt_addr, mcfg_addr)?;
 
     writer.build_rsdp(xsdt_addr)
 }
diff --git a/src/vmm/src/acpi/x86_64.rs b/src/vmm/src/acpi/x86_64.rs
index 7d2e9f0364a..43e029a6720 100644
--- a/src/vmm/src/acpi/x86_64.rs
+++ b/src/vmm/src/acpi/x86_64.rs
@@ -33,11 +33,11 @@ pub(crate) fn setup_arch_fadt(fadt: &mut Fadt) {
     // neither do we support ASPM, or MSI type of interrupts.
     // More info here:
     // https://uefi.org/specs/ACPI/6.5/05_ACPI_Software_Programming_Model.html?highlight=0a06#ia-pc-boot-architecture-flags
-    fadt.setup_iapc_flags(
-        1 << IAPC_BOOT_ARG_FLAGS_VGA_NOT_PRESENT
-            | 1 << IAPC_BOOT_ARG_FLAGS_PCI_ASPM
-            | 1 << IAPC_BOOT_ARG_FLAGS_MSI_NOT_PRESENT,
-    );
+    // fadt.setup_iapc_flags(
+    //     1 << IAPC_BOOT_ARG_FLAGS_VGA_NOT_PRESENT
+    //         | 1 << IAPC_BOOT_ARG_FLAGS_PCI_ASPM
+    //         | 1 << IAPC_BOOT_ARG_FLAGS_MSI_NOT_PRESENT,
+    // );
 }
 
 #[inline(always)]
diff --git a/src/vmm/src/arch/mod.rs b/src/vmm/src/arch/mod.rs
index f5a2f98cb7c..0f944a1c05c 100644
--- a/src/vmm/src/arch/mod.rs
+++ b/src/vmm/src/arch/mod.rs
@@ -1,6 +1,9 @@
 // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
 // SPDX-License-Identifier: Apache-2.0
 
+#![deny(missing_docs)]
+//! Implements platform specific functionality.
+//! Supported platforms: x86_64 and aarch64.
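// Editor's note — background, not code from this patch: the MCFG table
// built above advertises PCI_MMCONFIG_START as the ECAM base. A guest then
// reaches each function's 4 KiB config window at a fixed offset from that
// base, using the standard PCIe ECAM layout:
//
//     fn ecam_offset(bus: u64, device: u64, function: u64) -> u64 {
//         (bus << 20) | (device << 15) | (function << 12)
//     }
//
// e.g. bus 0, device 1, function 0 is configured at PCI_MMCONFIG_START + 0x8000.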
 use std::fmt;
 
 use serde::{Deserialize, Serialize};
@@ -25,7 +28,8 @@ pub use crate::arch::x86_64::{
     arch_memory_regions, configure_system, get_kernel_start, initrd_load_addr, layout::APIC_ADDR,
     layout::CMDLINE_MAX_SIZE, layout::IOAPIC_ADDR, layout::IRQ_BASE, layout::IRQ_MAX,
     layout::SYSTEM_MEM_SIZE, layout::SYSTEM_MEM_START, ConfigurationError, MMIO_MEM_SIZE,
-    MMIO_MEM_START,
+    MMIO_MEM_START, layout::PCI_MMCONFIG_SIZE, layout::PCI_MMCONFIG_START,
+    layout::MEM_32BIT_DEVICES_START, layout::MEM_32BIT_DEVICES_SIZE,
 };
 
 /// Types of devices that can get attached to this platform.
diff --git a/src/vmm/src/arch/x86_64/layout.rs b/src/vmm/src/arch/x86_64/layout.rs
index 01355b3018a..7ed1384f3b5 100644
--- a/src/vmm/src/arch/x86_64/layout.rs
+++ b/src/vmm/src/arch/x86_64/layout.rs
@@ -66,3 +66,24 @@ pub const SYSTEM_MEM_START: u64 = 0x9fc00;
 /// 257KiB is more than we need, however we reserve this space for potential future use of
 /// ACPI features (new tables and/or devices).
 pub const SYSTEM_MEM_SIZE: u64 = RSDP_ADDR - SYSTEM_MEM_START;
+
+// ** 32-bit reserved area (start: 3GiB, length: 1GiB) **
+/// Start of the 32-bit reserved area (3GiB).
+pub const MEM_32BIT_RESERVED_START: u64 = 0xc000_0000;
+
+/// Size of the 32-bit reserved area (1GiB).
+pub const MEM_32BIT_RESERVED_SIZE: u64 = 1024 << 20;
+
+// Sub range: 32-bit PCI devices (start: 3GiB, length: 640MiB)
+/// Start of the 32-bit PCI device window.
+pub const MEM_32BIT_DEVICES_START: u64 = MEM_32BIT_RESERVED_START;
+/// Size of the 32-bit PCI device window (640MiB).
+pub const MEM_32BIT_DEVICES_SIZE: u64 = 640 << 20;
+
+// PCI MMCONFIG space (start: after the device space, length: 256MiB)
+/// Start of the PCI MMCONFIG (ECAM) region.
+pub const PCI_MMCONFIG_START: u64 = MEM_32BIT_DEVICES_START + MEM_32BIT_DEVICES_SIZE;
+/// Size of the PCI MMCONFIG (ECAM) region (256MiB).
+pub const PCI_MMCONFIG_SIZE: u64 = 256 << 20;
+/// Per-segment slice of MMCONFIG: one bus worth of config space (256 functions x 4KiB).
+pub const PCI_MMIO_CONFIG_SIZE_PER_SEGMENT: u64 = 4096 * 256;
\ No newline at end of file
diff --git a/src/vmm/src/arch/x86_64/mod.rs b/src/vmm/src/arch/x86_64/mod.rs
index 1066d9734c3..d7e0eb2a8ab 100644
--- a/src/vmm/src/arch/x86_64/mod.rs
+++ b/src/vmm/src/arch/x86_64/mod.rs
@@ -21,6 +21,7 @@ pub mod regs;
 #[allow(missing_docs)]
 pub mod gen;
 
+use layout::PCI_MMCONFIG_SIZE;
 use linux_loader::configurator::linux::LinuxBootConfigurator;
 use linux_loader::configurator::{BootConfigurator, BootParams};
 use linux_loader::loader::bootparam::boot_params;
@@ -158,6 +159,8 @@ pub fn configure_system(
         E820_RESERVED,
     )?;
 
+    add_e820_entry(&mut params, layout::PCI_MMCONFIG_START, PCI_MMCONFIG_SIZE, E820_RESERVED)?;
+
     let last_addr = guest_mem.last_addr();
     if last_addr < end_32bit_gap_start {
         add_e820_entry(
diff --git a/src/vmm/src/arch/x86_64/msr.rs b/src/vmm/src/arch/x86_64/msr.rs
index 325d6ed6b29..9e5a358dc5c 100644
--- a/src/vmm/src/arch/x86_64/msr.rs
+++ b/src/vmm/src/arch/x86_64/msr.rs
@@ -394,6 +394,11 @@ pub fn get_msrs_to_dump(kvm_fd: &Kvm) -> Result<MsrList, MsrError> {
     Ok(msr_index_list)
 }
 
+/// IA32_MTRR_DEF_TYPE MSR: E (MTRRs enabled) flag, bit 11
+pub const MTRR_ENABLE: u64 = 0x800;
+/// Default memory type: writeback
+pub const MTRR_MEM_TYPE_WB: u64 = 0x6;
+
 /// Creates and populates required MSR entries for booting Linux on X86_64.
pub fn create_boot_msr_entries() -> Vec { let msr_entry_default = |msr| kvm_msr_entry { @@ -419,6 +424,11 @@ pub fn create_boot_msr_entries() -> Vec { data: u64::from(MSR_IA32_MISC_ENABLE_FAST_STRING), ..Default::default() }, + kvm_msr_entry { + index: MSR_MTRRdefType, + data: u64::from(MTRR_ENABLE | MTRR_MEM_TYPE_WB), + ..Default::default() + }, ] } diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index 1f52fdad063..ecbf3cd8cc0 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -5,13 +5,17 @@ #[cfg(target_arch = "x86_64")] use std::convert::TryFrom; -use std::fmt::Debug; -use std::io::{self, Seek, SeekFrom}; +use std::fmt::{Debug, Display, Formatter}; +use std::io::{self, Read, Seek, SeekFrom}; +use std::os::unix::io::{AsRawFd, FromRawFd, RawFd}; +use std::path::Path; #[cfg(feature = "gdb")] use std::sync::mpsc; use std::sync::{Arc, Mutex}; use event_manager::{MutEventSubscriber, SubscriberOps}; +use kvm_bindings::{kvm_create_device, kvm_device_type_KVM_DEV_TYPE_VFIO}; +use kvm_ioctls::{DeviceFd, IoEventAddress, NoDatamatch, VmFd}; use libc::EFD_NONBLOCK; use linux_loader::cmdline::Cmdline as LoaderKernelCmdline; #[cfg(target_arch = "x86_64")] @@ -19,9 +23,13 @@ use linux_loader::loader::elf::Elf as Loader; #[cfg(target_arch = "aarch64")] use linux_loader::loader::pe::PE as Loader; use linux_loader::loader::KernelLoader; +use pci::{DeviceRelocation, PciBarConfiguration, PciBarRegionType, PciBdf, PciDevice, VfioPciDevice, VfioPciError}; use seccompiler::BpfThreadMap; use userfaultfd::Uffd; use utils::time::TimestampUs; +use vfio_ioctls::{VfioContainer, VfioDevice, VfioDeviceFd}; +use vm_system_allocator::{GsiApic, AddressAllocator, SystemAllocator}; +use vm_device::interrupt::{InterruptManager, MsiIrqGroupConfig}; use vm_memory::ReadVolatile; #[cfg(target_arch = "aarch64")] use vm_superio::Rtc; @@ -30,7 +38,7 @@ use vmm_sys_util::eventfd::EventFd; #[cfg(target_arch = "x86_64")] use crate::acpi; -use crate::arch::InitrdConfig; +use crate::arch::{InitrdConfig, MEM_32BIT_DEVICES_SIZE, MEM_32BIT_DEVICES_START, PCI_MMCONFIG_START}; #[cfg(target_arch = "aarch64")] use crate::construct_kvm_mpidrs; use crate::cpu_config::templates::{ @@ -40,7 +48,7 @@ use crate::cpu_config::templates::{ use crate::device_manager::acpi::ACPIDeviceManager; #[cfg(target_arch = "x86_64")] use crate::device_manager::legacy::PortIODeviceManager; -use crate::device_manager::mmio::MMIODeviceManager; +use crate::device_manager::mmio::{MMIODeviceManager, MmioError}; use crate::device_manager::persist::{ ACPIDeviceManagerConstructorArgs, ACPIDeviceManagerRestoreError, MMIODevManagerConstructorArgs, }; @@ -50,17 +58,21 @@ use crate::devices::legacy::serial::SerialOut; #[cfg(target_arch = "aarch64")] use crate::devices::legacy::RTCDevice; use crate::devices::legacy::{EventFdTrigger, SerialEventsWrapper, SerialWrapper}; +use crate::devices::pci_segment::PciSegment; +use crate::devices::virtio::transport::VirtioPciDevice; +use pci::{PciBus, PciConfigIo, PciConfigMmio, PciRoot}; use crate::devices::virtio::balloon::Balloon; use crate::devices::virtio::block::device::Block; use crate::devices::virtio::device::VirtioDevice; -use crate::devices::virtio::mmio::MmioTransport; +use crate::devices::virtio::transport::MmioTransport; use crate::devices::virtio::net::Net; use crate::devices::virtio::rng::Entropy; use crate::devices::virtio::vsock::{Vsock, VsockUnixBackend}; -use crate::devices::BusDevice; +use crate::devices::{virtio, Bus, BusDevice}; #[cfg(feature = "gdb")] use crate::gdb; -use 
crate::logger::{debug, error}; +use crate::interrupt::MsiInterruptManager; +use crate::logger::{debug, info, error}; use crate::persist::{MicrovmState, MicrovmStateError}; use crate::resources::VmResources; use crate::snapshot::Persist; @@ -68,10 +80,10 @@ use crate::utils::u64_to_usize; use crate::vmm_config::boot_source::BootConfig; use crate::vmm_config::instance_info::InstanceInfo; use crate::vmm_config::machine_config::{VmConfig, VmConfigError}; -use crate::vstate::memory::{GuestAddress, GuestMemory, GuestMemoryMmap}; +use crate::vstate::memory::{GuestAddress, GuestMemory, GuestMemoryMmap, GuestMemoryRegion}; use crate::vstate::vcpu::{Vcpu, VcpuConfig, VcpuError}; use crate::vstate::vm::Vm; -use crate::{device_manager, EventManager, Vmm, VmmError}; +use crate::{device_manager, AddressManager, EventManager, Vmm, VmmError}; /// Errors associated with starting the instance. #[derive(Debug, thiserror::Error, displaydoc::Display)] @@ -138,6 +150,12 @@ pub enum StartMicrovmError { /// Error cloning Vcpu fds #[cfg(feature = "gdb")] VcpuFdCloneError(#[from] crate::vstate::vcpu::CopyKvmFdError), + /// Error creating Vfio device + VfioError(vfio_ioctls::VfioError), + /// Error setting up Vfio PCI device + VfioPciError(VfioPciError), + /// TODO + Unknown, } /// It's convenient to automatically convert `linux_loader::cmdline::Error`s @@ -148,6 +166,183 @@ impl std::convert::From for StartMicrovmError { } } +fn create_passthrough_device(vm: &VmFd) -> DeviceFd { + let mut vfio_dev = kvm_create_device { + type_: kvm_device_type_KVM_DEV_TYPE_VFIO, + fd: 0, + flags: 0, + }; + + vm.create_device(&mut vfio_dev).unwrap() +} + +fn register_pci_device_mapping( + dev: Arc>, + #[cfg(target_arch = "x86_64")] io_bus: &mut Bus, + mmio_bus: &mut Bus, + bars: Vec, +) -> Result<(), VmmError> { + for bar in bars { + match bar.region_type() { + PciBarRegionType::IoRegion => { + #[cfg(target_arch = "x86_64")] + io_bus + .insert(dev.clone(), bar.addr(), bar.size()) + .map_err(|e| VmmError::DeviceManager(MmioError::BusInsert(e)))?; + #[cfg(not(target_arch = "x86_64"))] + error!("I/O region is not supported"); + } + PciBarRegionType::Memory32BitRegion | PciBarRegionType::Memory64BitRegion => { + mmio_bus + .insert(dev.clone(), bar.addr(), bar.size()) + .map_err(|e| VmmError::DeviceManager(MmioError::BusInsert(e)))?; + } + } + } + Ok(()) +} + +fn add_pci_device( + bus_device: Arc>, + pci_segment: &PciSegment, + dev_manager: &mut MMIODeviceManager, + pio_manager: &mut PortIODeviceManager, + allocator: Arc>, + bdf: PciBdf, +) -> Result<(), VmmError> { + let bars = bus_device.lock().unwrap().pci_device_mut().unwrap() + .allocate_bars( + &allocator, + &mut pci_segment + .mem32_allocator + .lock() + .unwrap(), + &mut pci_segment + .mem64_allocator + .lock() + .unwrap(), + None, + ) + .map_err(|_| VmmError::Unknown)?; + + let mut pci_bus = pci_segment + .pci_bus + .lock() + .unwrap(); + + pci_bus + .add_device(bdf.device() as u32, bus_device.clone()) + .map_err(|_| VmmError::Unknown)?; + + register_pci_device_mapping( + bus_device, + #[cfg(target_arch = "x86_64")] + &mut pio_manager.io_bus, + &mut dev_manager.bus, + bars.clone() + )?; + + Ok(()) +} + +fn add_vfio_device( + vmm: &mut Vmm, + fd: &DeviceFd, + device_path: &Path, + memory_slot: Arc u32 + Send + Sync>, +) -> Result<(), StartMicrovmError>{ + let pci_segment = vmm.pci_segment.as_ref().expect("pci should be enabled"); + + // We need to shift the device id since the 3 first bits + // are dedicated to the PCI function, and we know we don't + // do multifunction. 
Also, because we only support one PCI + // bus, the bus 0, we don't need to add anything to the + // global device ID. + let pci_device_id = pci_segment.pci_bus.lock().expect("bad lock").next_device_id().unwrap(); + let pci_device_bdf = pci_device_id << 3; + + // Safe because we know the RawFd is valid. + // + // This dup() is mandatory to be able to give full ownership of the + // file descriptor to the DeviceFd::from_raw_fd() function later in + // the code. + // + // This is particularly needed so that VfioContainer will still have + // a valid file descriptor even if DeviceManager, and therefore the + // passthrough_device are dropped. In case of Drop, the file descriptor + // would be closed, but Linux would still have the duplicated file + // descriptor opened from DeviceFd, preventing from unexpected behavior + // where the VfioContainer would try to use a closed file descriptor. + let dup_device_fd = unsafe { libc::dup(fd.as_raw_fd()) }; + + // SAFETY the raw fd conversion here is safe because: + // 1. This function is only called on KVM, see the feature guard above. + // 2. When running on KVM, passthrough_device wraps around DeviceFd. + // 3. The conversion here extracts the raw fd and then turns the raw fd into a DeviceFd + // of the same (correct) type. + let vfio_container = Arc::new( + VfioContainer::new(Some(Arc::new(VfioDeviceFd::new_from_kvm(unsafe { DeviceFd::from_raw_fd(dup_device_fd) })))) + .map_err(StartMicrovmError::VfioError)?, + ); + let vfio_device = VfioDevice::new(device_path, Arc::clone(&vfio_container)) + .map_err(StartMicrovmError::VfioError)?; + + let vfio_pci_device = + BusDevice::VfioPciDevice(VfioPciDevice::new( + pci_device_id.to_string(), + vmm.extra_fd.as_ref().expect("pci should be enabled").clone(), + vfio_device, + vfio_container.clone(), + vmm.msi_interrupt_manager.as_ref().expect("pci should be enabled").clone(), + None, + false, + pci_device_bdf.into(), + memory_slot, + None + ).unwrap()); + + let vfio_pci_device = Arc::new(Mutex::new(vfio_pci_device)); + + add_pci_device( + vfio_pci_device.clone(), + pci_segment, + &mut vmm.mmio_device_manager, + &mut vmm.pio_device_manager, + vmm.allocator.as_ref().expect("pci should be enabled").clone(), + pci_device_bdf.into() + ).unwrap(); + + // Register DMA mapping in IOMMU. + for (_index, region) in vmm.guest_memory.iter().enumerate() { + info!( + "Mapping DMA for {:x} len {:x} at hva {:x}", + region.start_addr().0, + region.len() as u64, + // memory.get_host_address(region.start_addr()).unwrap() as u64 + region.as_ptr() as u64 + ); + vfio_pci_device.lock().expect("poisoned lock") + .vfio_pci_device_ref() + .unwrap() + .dma_map( + region.start_addr().0, + region.len() as u64, + // memory.get_host_address(region.start_addr()).unwrap() as u64, + region.as_ptr() as u64 + ).map_err(StartMicrovmError::VfioPciError)?; + } + Ok(()) +} + +// The MMIO address space size is subtracted with 64k. 
This is done for the +// following reasons: +// - Reduce the addressable space size by at least 4k to workaround a Linux +// bug when the VMM allocates devices at the end of the addressable space +// - Windows requires the addressable space size to be 64k aligned +fn mmio_address_space_size(phys_bits: u8) -> u64 { + (1 << phys_bits) - (1 << 16) +} + #[cfg_attr(target_arch = "aarch64", allow(unused))] fn create_vmm_and_vcpus( instance_info: &InstanceInfo, @@ -157,12 +352,13 @@ fn create_vmm_and_vcpus( track_dirty_pages: bool, vcpu_count: u8, kvm_capabilities: Vec, + pci_enabled: bool, ) -> Result<(Vmm, Vec), StartMicrovmError> { use self::StartMicrovmError::*; // Set up Kvm Vm and register memory regions. // Build custom CPU config if a custom template is provided. - let mut vm = Vm::new(kvm_capabilities) + let (mut vm, extra_fd) = Vm::new(kvm_capabilities) .map_err(VmmError::Vm) .map_err(StartMicrovmError::Internal)?; vm.memory_init(&guest_memory, track_dirty_pages) @@ -176,7 +372,7 @@ fn create_vmm_and_vcpus( let resource_allocator = ResourceAllocator::new()?; // Instantiate the MMIO device manager. - let mmio_device_manager = MMIODeviceManager::new(); + let mut mmio_device_manager = MMIODeviceManager::new(); // Instantiate ACPI device manager. let acpi_device_manager = ACPIDeviceManager::new(); @@ -184,7 +380,7 @@ fn create_vmm_and_vcpus( // For x86_64 we need to create the interrupt controller before calling `KVM_CREATE_VCPUS` // while on aarch64 we need to do it the other way around. #[cfg(target_arch = "x86_64")] - let (vcpus, pio_device_manager) = { + let (vcpus, mut pio_device_manager) = { setup_interrupt_controller(&mut vm)?; let vcpus = create_vcpus(&vm, vcpu_count, &vcpus_exit_evt).map_err(Internal)?; @@ -202,15 +398,101 @@ fn create_vmm_and_vcpus( .map_err(Internal)?; // create pio dev manager with legacy devices - let pio_device_manager = { - // TODO Remove these unwraps. - let mut pio_dev_mgr = PortIODeviceManager::new(serial_device, reset_evt).unwrap(); - pio_dev_mgr.register_devices(vm.fd()).unwrap(); - pio_dev_mgr - }; + let pio_dev_mgr = PortIODeviceManager::new(serial_device, reset_evt).unwrap(); - (vcpus, pio_device_manager) + (vcpus, pio_dev_mgr) }; + + let (pci_segment, msi_interrupt_manager, allocator, extra_fd) = if pci_enabled { + + // Create a system resources allocator. + // TODO: use ResourceAllocator + const NUM_IOAPIC_PINS: usize = 24; + const X86_64_IRQ_BASE: u32 = 5; + + let allocator = Arc::new(Mutex::new( + SystemAllocator::new( + #[cfg(target_arch = "x86_64")] + { + GuestAddress(0) + }, + #[cfg(target_arch = "x86_64")] + { + 1 << 16 + }, + GuestAddress(0), + mmio_address_space_size(46), + // GuestAddress(crate::arch::MEM_32BIT_DEVICES_START), + // crate::arch::MEM_32BIT_DEVICES_SIZE, + #[cfg(target_arch = "x86_64")] + vec![GsiApic::new( + X86_64_IRQ_BASE, + NUM_IOAPIC_PINS as u32 - X86_64_IRQ_BASE, + )], + ) + .unwrap() + )); + + let vm_fd = Arc::new(Mutex::new(extra_fd)); + // First we create the MSI interrupt manager, the legacy one is created + // later, after the IOAPIC device creation. + // The reason we create the MSI one first is because the IOAPIC needs it, + // and then the legacy interrupt manager needs an IOAPIC. So we're + // handling a linear dependency chain: + // msi_interrupt_manager <- IOAPIC <- legacy_interrupt_manager. 
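// Editor's note — arithmetic check, not part of the patch: with
// phys_bits = 46 as used below, mmio_address_space_size() yields
//
//     (1 << 46) - (1 << 16) = 0x4000_0000_0000 - 0x1_0000
//                           = 0x3fff_ffff_0000
//
// i.e. the top 64 KiB of the 46-bit physical address space is left
// unusable, for the two reasons given in the function's comment above.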
+ let msi_interrupt_manager: Arc> = + Arc::new(MsiInterruptManager::new( + Arc::clone(&allocator), + Arc::clone(&vm_fd), + )); + + // alignment 4 << 10 + let pci_mmio32_allocator = Arc::new(Mutex::new( + AddressAllocator::new(GuestAddress(MEM_32BIT_DEVICES_START), MEM_32BIT_DEVICES_SIZE).unwrap(), + )); + + // alignment 4 << 30 + let pci_mmio64_allocator = Arc::new(Mutex::new( + AddressAllocator::new( + GuestAddress(0), + mmio_address_space_size(46), + ).unwrap() + )); + + // TODO: allocate GSI for legacy interrupts + // let irqs = resource_allocator.allocate_gsi(8).unwrap(); + // let mut pci_irq_slots: [u8; 32] = [0; 32]; + // for i in 0..32 { + // pci_irq_slots[i] = irqs[i % 8] as u8; + // } + let pci_irq_slots: [u8; 32] = [(NUM_IOAPIC_PINS-1) as u8; 32]; + + let address_manager = Arc::new(AddressManager{ + allocator: allocator.clone(), + io_bus: Arc::new(pio_device_manager.io_bus.clone()), + mmio_bus: Arc::new(mmio_device_manager.bus.clone()), + vm: vm_fd.clone(), + pci_mmio32_allocators: vec!(pci_mmio32_allocator.clone()), + pci_mmio64_allocators: vec!(pci_mmio64_allocator.clone()), + }); + let pci_segment = PciSegment::new( + 0, + 0, + pci_mmio32_allocator, + pci_mmio64_allocator, + &mut mmio_device_manager.bus, + &pci_irq_slots, + address_manager, + ).unwrap(); + let pci_config_io = Arc::new(Mutex::new(BusDevice::PioPciBus(PciConfigIo::new(Arc::clone(&pci_segment.pci_bus))))); + pio_device_manager.put_pci_bus(pci_config_io); + + (Some(pci_segment), Some(msi_interrupt_manager), Some(allocator), Some(vm_fd)) + } else { + (None, None, None, None) + }; + + pio_device_manager.register_devices(vm.fd()).unwrap(); // On aarch64, the vCPUs need to be created (i.e call KVM_CREATE_VCPU) before setting up the // IRQ chip because the `KVM_CREATE_VCPU` ioctl will return error if the IRQCHIP @@ -237,6 +519,10 @@ fn create_vmm_and_vcpus( #[cfg(target_arch = "x86_64")] pio_device_manager, acpi_device_manager, + extra_fd, + pci_segment, + msi_interrupt_manager, + allocator, }; Ok((vmm, vcpus)) @@ -282,6 +568,7 @@ pub fn build_microvm_for_boot( vm_resources.vm_config.track_dirty_pages, vm_resources.vm_config.vcpu_count, cpu_template.kvm_capabilities.clone(), + vm_resources.pci_config.as_ref().map(|x| x.enabled).unwrap_or(false), )?; #[cfg(feature = "gdb")] @@ -328,6 +615,21 @@ pub fn build_microvm_for_boot( attach_entropy_device(&mut vmm, &mut boot_cmdline, entropy, event_manager)?; } + if let Some(vfio_devices) = vm_resources.pci_config.as_ref().map(|x| x.vfio_devices.as_ref()).flatten() { + let device_fd = create_passthrough_device(vmm.vm.fd()); + let memory_slot = Arc::new(move || { + // TODO use allocator for memory slots + static mut CURRENT: u32 = 1; + unsafe { + CURRENT += 1; + CURRENT + } + }); + for vfio_device in vfio_devices { + add_vfio_device(&mut vmm, &device_fd, Path::new(&vfio_device.path), memory_slot.clone())?; + } + } + #[cfg(target_arch = "aarch64")] attach_legacy_devices_aarch64(event_manager, &mut vmm, &mut boot_cmdline).map_err(Internal)?; @@ -472,6 +774,7 @@ pub fn build_microvm_from_snapshot( vm_resources.vm_config.track_dirty_pages, vm_resources.vm_config.vcpu_count, microvm_state.vm_state.kvm_cap_modifiers.clone(), + vm_resources.pci_config.as_ref().map(|x| x.enabled).unwrap_or(false), )?; #[cfg(target_arch = "x86_64")] @@ -835,6 +1138,8 @@ pub fn configure_system_for_boot( &mut vmm.resource_allocator, &vmm.mmio_device_manager, &vmm.acpi_device_manager, + vmm.pci_segment.as_ref(), + PCI_MMCONFIG_START, vcpus, )?; } @@ -867,6 +1172,22 @@ fn attach_virtio_device( device: Arc>, 
cmdline: &mut LoaderKernelCmdline, is_vhost_user: bool, +) -> Result<(), StartMicrovmError> { + if vmm.pci_segment.is_some() { + attach_virtio_pci_device(event_manager, vmm, id, device) + } else { + attach_virtio_mmio_device(event_manager, vmm, id, device, cmdline, is_vhost_user) + } +} + +/// Attaches a VirtioDevice device to the device manager and event manager. +fn attach_virtio_mmio_device( + event_manager: &mut EventManager, + vmm: &mut Vmm, + id: String, + device: Arc>, + cmdline: &mut LoaderKernelCmdline, + is_vhost_user: bool, ) -> Result<(), StartMicrovmError> { use self::StartMicrovmError::*; @@ -886,6 +1207,67 @@ fn attach_virtio_device( .map(|_| ()) } +fn attach_virtio_pci_device( + event_manager: &mut EventManager, + vmm: &mut Vmm, + id: String, + device: Arc>, +) -> Result<(), StartMicrovmError>{ + event_manager.add_subscriber(device.clone()); + let pci_segment = vmm.pci_segment.as_ref().expect("pci should be enabled"); + let pci_segment_id = pci_segment.id; + let pci_device_bdf = pci_segment.next_device_bdf().map_err(|_| StartMicrovmError::Unknown)?; + + // Allows support for one MSI-X vector per queue. It also adds 1 + // as we need to take into account the dedicated vector to notify + // about a virtio config change. + let msix_num = (device.lock().unwrap().queues().len() + 1) as u16; + + let memory = vmm.guest_memory().clone(); + + let device_type = device.lock().unwrap().device_type(); + let virtio_pci_device = Arc::new(Mutex::new( + BusDevice::VirtioPciDevice(VirtioPciDevice::new( + id.clone(), + memory, + device, + msix_num, + vmm.msi_interrupt_manager.as_ref().expect("pci should be enabled"), + pci_device_bdf.into(), + // All device types *except* virtio block devices should be allocated a 64-bit bar + // The block devices should be given a 32-bit BAR so that they are easily accessible + // to firmware without requiring excessive identity mapping. + // The exception being if not on the default PCI segment. + pci_segment_id > 0 || device_type != virtio::TYPE_BLOCK, + None, + ) + .map_err(|_| StartMicrovmError::Unknown)?, + ))); + + add_pci_device( + virtio_pci_device.clone(), + pci_segment, + &mut vmm.mmio_device_manager, + &mut vmm.pio_device_manager, + vmm.allocator.as_ref().expect("pci should be enabled").clone(), + pci_device_bdf, + ).map_err(|_| StartMicrovmError::Unknown)?; + + let bar_addr = virtio_pci_device.lock().unwrap().virtio_pci_device_ref().unwrap().config_bar_addr(); + for (i, queue_evt) in virtio_pci_device.lock().unwrap().virtio_pci_device_ref().unwrap().virtio_device().lock().unwrap().queue_events().iter().enumerate() { + const NOTIFICATION_BAR_OFFSET: u64 = 0x6000; + const NOTIFY_OFF_MULTIPLIER: u32 = 4; // A dword per notification address. 
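// Editor's note — illustration, not part of the patch: with the constants
// above, the loop below registers one ioeventfd per queue at the virtio-pci
// notification doorbell address. A minimal sketch of that computation:
//
//     fn notify_addr(config_bar_addr: u64, queue_index: u64) -> u64 {
//         config_bar_addr + 0x6000 + queue_index * 4
//     }
//
// e.g. queue 2 of a device whose config BAR sits at 0x1000_0000 is kicked
// through the ioeventfd registered at 0x1000_6008.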
+ let notify_base = bar_addr + NOTIFICATION_BAR_OFFSET; + let io_addr = IoEventAddress::Mmio( + notify_base + i as u64 * u64::from(NOTIFY_OFF_MULTIPLIER) + ); + vmm.vm.fd().register_ioevent(queue_evt, &io_addr, NoDatamatch) + .map_err(MmioError::RegisterIoEvent)?; + } + + Ok(()) +} + pub(crate) fn attach_boot_timer_device( vmm: &mut Vmm, request_ts: TimestampUs, @@ -1030,6 +1412,7 @@ pub mod tests { use super::*; use crate::arch::DeviceType; use crate::device_manager::resources::ResourceAllocator; + use crate::devices::bus::DummyDevice; use crate::devices::virtio::block::CacheType; use crate::devices::virtio::rng::device::ENTROPY_DEV_ID; use crate::devices::virtio::vsock::{TYPE_VSOCK, VSOCK_DEV_ID}; @@ -1098,6 +1481,24 @@ pub mod tests { .unwrap() } + struct DummyDeviceRelocation; + impl DeviceRelocation for DummyDeviceRelocation { + fn move_bar( + &self, + old_base: u64, + new_base: u64, + len: u64, + _pci_dev: &mut dyn PciDevice, + _region_type: PciBarRegionType, + ) -> std::result::Result<(), io::Error> { + error!( + "Failed moving device BAR: 0x{:x}->0x{:x}(0x{:x})", + old_base, new_base, len + ); + Ok(()) + } + } + pub(crate) fn default_vmm() -> Vmm { let guest_memory = arch_mem(128 << 20); @@ -1106,9 +1507,9 @@ pub mod tests { .map_err(StartMicrovmError::Internal) .unwrap(); - let mut vm = Vm::new(vec![]).unwrap(); + let (mut vm, extra_fd) = Vm::new(vec![]).unwrap(); vm.memory_init(&guest_memory, false).unwrap(); - let mmio_device_manager = MMIODeviceManager::new(); + let mut mmio_device_manager = MMIODeviceManager::new(); let acpi_device_manager = ACPIDeviceManager::new(); #[cfg(target_arch = "x86_64")] let pio_device_manager = PortIODeviceManager::new( @@ -1136,6 +1537,54 @@ pub mod tests { setup_interrupt_controller(&mut vm, 1).unwrap(); } + let pci_mmio32_allocator = Arc::new(Mutex::new( + AddressAllocator::new(GuestAddress(MEM_32BIT_DEVICES_START), MEM_32BIT_DEVICES_SIZE).unwrap(), + )); + + // alignment 4 << 30 + let pci_mmio64_allocator = Arc::new(Mutex::new( + AddressAllocator::new( + GuestAddress(0), + mmio_address_space_size(46), + ).unwrap() + )); + + let pci_segment = PciSegment::new( + 0, + 0, + pci_mmio32_allocator, + pci_mmio64_allocator, + &mut mmio_device_manager.bus, + &[0u8; 32], + Arc::new(DummyDeviceRelocation{}), + ).unwrap(); + + let allocator = Arc::new(Mutex::new( + SystemAllocator::new( + #[cfg(target_arch = "x86_64")] + { + GuestAddress(0) + }, + #[cfg(target_arch = "x86_64")] + { + 1 << 16 + }, + GuestAddress(0), + mmio_address_space_size(46), + // GuestAddress(crate::arch::MEM_32BIT_DEVICES_START), + // crate::arch::MEM_32BIT_DEVICES_SIZE, + #[cfg(target_arch = "x86_64")] + vec![], + ) + .unwrap() + )); + let vm_fd = Arc::new(Mutex::new(extra_fd)); + let msi_interrupt_manager: Arc> = + Arc::new(MsiInterruptManager::new( + Arc::clone(&allocator), + vm_fd.clone(), + )); + Vmm { events_observer: Some(std::io::stdin()), instance_info: InstanceInfo::default(), @@ -1150,6 +1599,10 @@ pub mod tests { #[cfg(target_arch = "x86_64")] pio_device_manager, acpi_device_manager, + extra_fd: Some(vm_fd), + pci_segment: Some(pci_segment), + msi_interrupt_manager: Some(msi_interrupt_manager), + allocator: Some(allocator), } } @@ -1358,7 +1811,7 @@ pub mod tests { let guest_memory = arch_mem(128 << 20); #[allow(unused_mut)] - let mut vm = Vm::new(vec![]).unwrap(); + let (mut vm, _) = Vm::new(vec![]).unwrap(); vm.memory_init(&guest_memory, false).unwrap(); let evfd = EventFd::new(libc::EFD_NONBLOCK).unwrap(); diff --git 
a/src/vmm/src/cpu_config/x86_64/cpuid/intel/normalize.rs b/src/vmm/src/cpu_config/x86_64/cpuid/intel/normalize.rs index 74536e44241..8b3b5efdbae 100644 --- a/src/vmm/src/cpu_config/x86_64/cpuid/intel/normalize.rs +++ b/src/vmm/src/cpu_config/x86_64/cpuid/intel/normalize.rs @@ -46,8 +46,8 @@ pub enum DeterministicCacheError { /// We always use this brand string. pub const DEFAULT_BRAND_STRING: &[u8; BRAND_STRING_LENGTH] = - b"Intel(R) Xeon(R) Processor\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"; -pub const DEFAULT_BRAND_STRING_BASE: &[u8; 28] = b"Intel(R) Xeon(R) Processor @"; + b"Intel(R) Xeon(R) Platinum 8259CL CPU\0\0\0\0\0\0\0\0\0\0\0\0"; +pub const DEFAULT_BRAND_STRING_BASE: &[u8; 38] = b"Intel(R) Xeon(R) Platinum 8259CL CPU @"; // We use this 2nd implementation so we can conveniently define functions only used within // `normalize`. diff --git a/src/vmm/src/device_manager/legacy.rs b/src/vmm/src/device_manager/legacy.rs index 45842d933b2..c1354739914 100644 --- a/src/vmm/src/device_manager/legacy.rs +++ b/src/vmm/src/device_manager/legacy.rs @@ -39,6 +39,7 @@ pub struct PortIODeviceManager { pub stdio_serial: Arc>, // BusDevice::I8042Device pub i8042: Arc>, + pub pci_bus: Option>>, // Communication event on ports 1 & 3. pub com_evt_1_3: EventFdTrigger, @@ -97,12 +98,17 @@ impl PortIODeviceManager { io_bus, stdio_serial: serial, i8042, + pci_bus: None, com_evt_1_3, com_evt_2_4, kbd_evt, }) } + pub fn put_pci_bus(&mut self, pci_bus: Arc>) { + self.pci_bus = Some(pci_bus); + } + /// Register supported legacy devices. pub fn register_devices(&mut self, vm_fd: &VmFd) -> Result<(), LegacyDeviceError> { let serial_2_4 = Arc::new(Mutex::new(BusDevice::Serial(SerialDevice { @@ -125,6 +131,13 @@ impl PortIODeviceManager { ), input: None, }))); + if let Some(ref pci_bus) = self.pci_bus { + self.io_bus.insert( + pci_bus.clone(), + 0xcf8, + 0x8 + )?; + } self.io_bus.insert( self.stdio_serial.clone(), Self::SERIAL_PORT_ADDRESSES[0], @@ -244,13 +257,14 @@ impl PortIODeviceManager { #[cfg(test)] mod tests { use super::*; + use crate::devices::bus::DummyDevice; use crate::test_utils::single_region_mem; use crate::Vm; #[test] fn test_register_legacy_devices() { let guest_mem = single_region_mem(0x1000); - let mut vm = Vm::new(vec![]).unwrap(); + let (mut vm, _) = Vm::new(vec![]).unwrap(); vm.memory_init(&guest_mem, false).unwrap(); crate::builder::setup_interrupt_controller(&mut vm).unwrap(); let mut ldm = PortIODeviceManager::new( diff --git a/src/vmm/src/device_manager/mmio.rs b/src/vmm/src/device_manager/mmio.rs index 00c155abcfd..730a0078348 100644 --- a/src/vmm/src/device_manager/mmio.rs +++ b/src/vmm/src/device_manager/mmio.rs @@ -22,20 +22,20 @@ use vm_allocator::AllocPolicy; use super::resources::ResourceAllocator; #[cfg(target_arch = "aarch64")] use crate::arch::aarch64::DeviceInfoForFDT; -use crate::arch::DeviceType; +use crate::arch::{self, DeviceType}; use crate::arch::DeviceType::Virtio; #[cfg(target_arch = "aarch64")] use crate::devices::legacy::RTCDevice; use crate::devices::pseudo::BootTimer; use crate::devices::virtio::balloon::Balloon; use crate::devices::virtio::block::device::Block; -use crate::devices::virtio::device::VirtioDevice; -use crate::devices::virtio::mmio::MmioTransport; +use crate::devices::virtio::device::{VirtioDevice, VirtioInterruptType}; +use crate::devices::virtio::transport::MmioTransport; use crate::devices::virtio::net::Net; use crate::devices::virtio::rng::Entropy; use crate::devices::virtio::vsock::{Vsock, VsockUnixBackend, TYPE_VSOCK}; use 
crate::devices::virtio::{TYPE_BALLOON, TYPE_BLOCK, TYPE_NET, TYPE_RNG}; -use crate::devices::BusDevice; +use crate::devices::{BusDevice, BusError}; #[cfg(target_arch = "x86_64")] use crate::vstate::memory::GuestAddress; @@ -121,6 +121,7 @@ fn add_virtio_aml( #[derive(Debug)] pub struct MMIODeviceManager { pub(crate) bus: crate::devices::Bus, + pci_bus: Option>>, pub(crate) id_to_dev_info: HashMap<(DeviceType, String), MMIODeviceInfo>, // We create the AML byte code for every VirtIO device in the order we build // it, so that we ensure the root block device is appears first in the DSDT. @@ -137,6 +138,7 @@ impl MMIODeviceManager { /// Create a new DeviceManager handling mmio devices (virtio net, block). pub fn new() -> MMIODeviceManager { MMIODeviceManager { + pci_bus: None, bus: crate::devices::Bus::new(), id_to_dev_info: HashMap::new(), #[cfg(target_arch = "x86_64")] @@ -163,6 +165,19 @@ impl MMIODeviceManager { Ok(device_info) } + /// Register the PCI bus. + pub fn register_pci_bus(&mut self, pci_bus: Arc>) -> Result<(), MmioError> { + self.bus + .insert( + Arc::clone(&pci_bus), + arch::PCI_MMCONFIG_START, + arch::PCI_MMCONFIG_SIZE, + ) + .map_err(MmioError::BusInsert)?; + self.pci_bus = Some(pci_bus); + Ok(()) + } + /// Register a device at some MMIO address. fn register_mmio_device( &mut self, @@ -202,7 +217,9 @@ impl MMIODeviceManager { .map_err(MmioError::RegisterIoEvent)?; } vm.register_irqfd( - &locked_device.interrupt_trigger().irq_evt, + &locked_device.interrupt() + .notifier(VirtioInterruptType::Queue(0)) + .expect("mmio device should have evenfd"), device_info.irqs[0], ) .map_err(MmioError::RegisterIrqFd)?; @@ -367,12 +384,12 @@ impl MMIODeviceManager { &self, device_type: DeviceType, device_id: &str, - ) -> Option<&Mutex> { + ) -> Option>> { if let Some(device_info) = self .id_to_dev_info .get(&(device_type, device_id.to_string())) { - if let Some((_, device)) = self.bus.get_device(device_info.addr) { + if let Some((_, _, device)) = self.bus.get_device(device_info.addr) { return Some(device); } } @@ -382,7 +399,7 @@ impl MMIODeviceManager { /// Run fn for each registered device. pub fn for_each_device(&self, mut f: F) -> Result<(), E> where - F: FnMut(&DeviceType, &String, &MMIODeviceInfo, &Mutex) -> Result<(), E>, + F: FnMut(&DeviceType, &String, &MMIODeviceInfo, Arc>) -> Result<(), E>, { for ((device_type, device_id), device_info) in self.get_device_info().iter() { let bus_device = self @@ -502,7 +519,8 @@ impl MMIODeviceManager { .unwrap(); if vsock.is_activated() { info!("kick vsock {id}."); - vsock.signal_used_queue().unwrap(); + // TODO should we kick rx as well? 
+ vsock.signal_used_queue(1).unwrap(); } } TYPE_RNG => { @@ -540,7 +558,7 @@ mod tests { use vmm_sys_util::eventfd::EventFd; use super::*; - use crate::devices::virtio::device::{IrqTrigger, VirtioDevice}; + use crate::devices::virtio::device::{IrqTrigger, VirtioDevice, VirtioInterrupt}; use crate::devices::virtio::queue::Queue; use crate::devices::virtio::ActivateError; use crate::test_utils::multi_region_mem; @@ -587,7 +605,7 @@ mod tests { dummy: u32, queues: Vec, queue_evts: [EventFd; 1], - interrupt_trigger: IrqTrigger, + interrupt: Arc, } impl DummyDevice { @@ -596,7 +614,7 @@ mod tests { dummy: 0, queues: QUEUE_SIZES.iter().map(|&s| Queue::new(s)).collect(), queue_evts: [EventFd::new(libc::EFD_NONBLOCK).expect("cannot create eventFD")], - interrupt_trigger: IrqTrigger::new().expect("cannot create eventFD"), + interrupt: Arc::new(IrqTrigger::new().expect("cannot create eventFD")), } } } @@ -628,8 +646,8 @@ mod tests { &self.queue_evts } - fn interrupt_trigger(&self) -> &IrqTrigger { - &self.interrupt_trigger + fn interrupt(&self) -> Arc { + self.interrupt.clone() } fn ack_features_by_page(&mut self, page: u32, value: u32) { @@ -647,7 +665,7 @@ mod tests { let _ = data; } - fn activate(&mut self, _: GuestMemoryMmap) -> Result<(), ActivateError> { + fn activate(&mut self, _: GuestMemoryMmap, _: Option>) -> Result<(), ActivateError> { Ok(()) } @@ -661,7 +679,7 @@ mod tests { let start_addr1 = GuestAddress(0x0); let start_addr2 = GuestAddress(0x1000); let guest_mem = multi_region_mem(&[(start_addr1, 0x1000), (start_addr2, 0x1000)]); - let mut vm = Vm::new(vec![]).unwrap(); + let (mut vm, _) = Vm::new(vec![]).unwrap(); vm.memory_init(&guest_mem, false).unwrap(); let mut device_manager = MMIODeviceManager::new(); let mut resource_allocator = ResourceAllocator::new().unwrap(); @@ -690,7 +708,7 @@ mod tests { let start_addr1 = GuestAddress(0x0); let start_addr2 = GuestAddress(0x1000); let guest_mem = multi_region_mem(&[(start_addr1, 0x1000), (start_addr2, 0x1000)]); - let mut vm = Vm::new(vec![]).unwrap(); + let (mut vm, _) = Vm::new(vec![]).unwrap(); vm.memory_init(&guest_mem, false).unwrap(); let mut device_manager = MMIODeviceManager::new(); let mut resource_allocator = ResourceAllocator::new().unwrap(); @@ -744,7 +762,7 @@ mod tests { let start_addr1 = GuestAddress(0x0); let start_addr2 = GuestAddress(0x1000); let guest_mem = multi_region_mem(&[(start_addr1, 0x1000), (start_addr2, 0x1000)]); - let mut vm = Vm::new(vec![]).unwrap(); + let (mut vm, _) = Vm::new(vec![]).unwrap(); vm.memory_init(&guest_mem, false).unwrap(); let mem_clone = guest_mem.clone(); diff --git a/src/vmm/src/device_manager/persist.rs b/src/vmm/src/device_manager/persist.rs index 5773fa0ba09..134a1d3a5ae 100644 --- a/src/vmm/src/device_manager/persist.rs +++ b/src/vmm/src/device_manager/persist.rs @@ -24,7 +24,7 @@ use crate::devices::virtio::block::device::Block; use crate::devices::virtio::block::persist::{BlockConstructorArgs, BlockState}; use crate::devices::virtio::block::BlockError; use crate::devices::virtio::device::VirtioDevice; -use crate::devices::virtio::mmio::MmioTransport; +use crate::devices::virtio::transport::MmioTransport; use crate::devices::virtio::net::persist::{ NetConstructorArgs, NetPersistError as NetError, NetState, }; diff --git a/src/vmm/src/devices/bus.rs b/src/vmm/src/devices/bus.rs index 2b016d73083..c278397c3e1 100644 --- a/src/vmm/src/devices/bus.rs +++ b/src/vmm/src/devices/bus.rs @@ -7,15 +7,19 @@ //! Handles routing to devices in an address space. 
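// Editor's note — standalone sketch, not part of the patch: the lookup that
// Bus::get_device performs below is a floor search over address ranges.
// Simplified here to a plain BTreeMap keyed by base address (the real code
// keys on BusRange and holds Arc<Mutex<BusDevice>> values):
//
//     use std::collections::BTreeMap;
//
//     fn find(devices: &BTreeMap<u64, u64>, addr: u64) -> Option<(u64, u64)> {
//         // Last range starting at or below `addr`...
//         let (base, len) = devices.range(..=addr).next_back()?;
//         let offset = addr - base;
//         // ...that is long enough to contain it.
//         (offset < *len).then_some((*base, offset))
//     }
//
// A read at 0x1042 against a device mapped at [0x1000, 0x1100) resolves to
// base 0x1000, offset 0x42. After this patch the base is returned alongside
// the offset, which PCI devices with several BARs need in order to tell
// which mapping was actually hit.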
+use std::any::Any; use std::cmp::{Ord, Ordering, PartialEq, PartialOrd}; use std::collections::btree_map::BTreeMap; -use std::sync::{Arc, Mutex}; +use std::result::Result; +use std::sync::{Arc, Barrier, Mutex, RwLock}; /// Errors triggered during bus operations. #[derive(Debug, thiserror::Error, displaydoc::Display)] pub enum BusError { - /// New device overlaps with an old device. + /// The insertion failed because the new device overlapped with an old device. Overlap, + /// The relocation failed because no device was mapped at the address + MissingAddressRange, } #[derive(Debug, Copy, Clone)] @@ -47,16 +51,21 @@ impl PartialOrd for BusRange { /// only restriction is that no two devices can overlap in this address space. #[derive(Debug, Clone, Default)] pub struct Bus { - devices: BTreeMap>>, + devices: Arc>>>>, } use event_manager::{EventOps, Events, MutEventSubscriber}; +use pci::{BarReprogrammingParams, PciBarConfiguration, PciDevice, VfioPciDevice}; +use pci::device::Error as PciDeviceError; +use vm_device::Resource; +use vm_system_allocator::{AddressAllocator, SystemAllocator}; #[cfg(target_arch = "aarch64")] use super::legacy::RTCDevice; use super::legacy::{I8042Device, SerialDevice}; +use pci::{PciConfigIo, PciConfigMmio, PciRoot}; use super::pseudo::BootTimer; -use super::virtio::mmio::MmioTransport; +use super::virtio::transport::{MmioTransport, VirtioPciDevice}; #[derive(Debug)] pub enum BusDevice { @@ -66,6 +75,10 @@ pub enum BusDevice { BootTimer(BootTimer), MmioTransport(MmioTransport), Serial(SerialDevice), + PioPciBus(PciConfigIo), + MmioPciBus(PciConfigMmio), + VfioPciDevice(VfioPciDevice), + VirtioPciDevice(VirtioPciDevice), #[cfg(test)] Dummy(DummyDevice), #[cfg(test)] @@ -165,8 +178,70 @@ impl BusDevice { _ => None, } } + pub fn vfio_pci_device_ref(&self) -> Option<&VfioPciDevice> { + match self { + Self::VfioPciDevice(x) => Some(x), + _ => None, + } + } + pub fn vfio_pci_device_mut(&mut self) -> Option<&mut VfioPciDevice> { + match self { + Self::VfioPciDevice(x) => Some(x), + _ => None, + } + } + pub fn virtio_pci_device_ref(&self) -> Option<&VirtioPciDevice> { + match self { + Self::VirtioPciDevice(x) => Some(x), + _ => None, + } + } + pub fn virtio_pci_device_mut(&mut self) -> Option<&mut VirtioPciDevice> { + match self { + Self::VirtioPciDevice(x) => Some(x), + _ => None, + } + } + pub fn pci_device_ref(&self) -> Option<&dyn PciDevice> { + match self { + Self::VfioPciDevice(x) => Some(x), + Self::VirtioPciDevice(x) => Some(x), + _ => None, + } + } + pub fn pci_device_mut(&mut self) -> Option<&mut dyn PciDevice> { + match self { + Self::VfioPciDevice(x) => Some(x), + Self::VirtioPciDevice(x) => Some(x), + _ => None, + } + } + pub fn pci_config_io_ref(&self) -> Option<&PciConfigIo> { + match self { + Self::PioPciBus(x) => Some(x), + _ => None, + } + } + pub fn pci_config_io_mut(&mut self) -> Option<&mut PciConfigIo> { + match self { + Self::PioPciBus(x) => Some(x), + _ => None, + } + } + pub fn pci_config_mmio_ref(&self) -> Option<&PciConfigMmio> { + match self { + Self::MmioPciBus(x) => Some(x), + _ => None, + } + } + pub fn pci_config_mmio_mut(&mut self) -> Option<&mut PciConfigMmio> { + match self { + Self::MmioPciBus(x) => Some(x), + _ => None, + } + } - pub fn read(&mut self, offset: u64, data: &mut [u8]) { + pub fn read(&mut self, base: u64, offset: u64, data: &mut [u8]) { match self { Self::I8042Device(x) => x.bus_read(offset, data), #[cfg(target_arch = "aarch64")] @@ -174,6 +249,10 @@ impl BusDevice { Self::BootTimer(x) => x.bus_read(offset, data), 
Self::MmioTransport(x) => x.bus_read(offset, data), Self::Serial(x) => x.bus_read(offset, data), + Self::VfioPciDevice(x) => x.bus_read(base, offset, data), + Self::VirtioPciDevice(x) => x.bus_read(base, offset, data), + Self::MmioPciBus(x) => x.bus_read(base, offset, data), + Self::PioPciBus(x) => x.bus_read(base, offset, data), #[cfg(test)] Self::Dummy(x) => x.bus_read(offset, data), #[cfg(test)] @@ -181,7 +260,7 @@ impl BusDevice { } } - pub fn write(&mut self, offset: u64, data: &[u8]) { + pub fn write(&mut self, base: u64, offset: u64, data: &[u8]) { match self { Self::I8042Device(x) => x.bus_write(offset, data), #[cfg(target_arch = "aarch64")] @@ -189,6 +268,10 @@ impl BusDevice { Self::BootTimer(x) => x.bus_write(offset, data), Self::MmioTransport(x) => x.bus_write(offset, data), Self::Serial(x) => x.bus_write(offset, data), + Self::VfioPciDevice(x) => x.bus_write(base, offset, data), + Self::VirtioPciDevice(x) => x.bus_write(base, offset, data), + Self::MmioPciBus(x) => x.bus_write(base, offset, data), + Self::PioPciBus(x) => x.bus_write(base, offset, data), #[cfg(test)] Self::Dummy(x) => x.bus_write(offset, data), #[cfg(test)] @@ -197,6 +280,90 @@ impl BusDevice { } } +// TODO: hack to make pci crate compatible with firecracker BusDevices +type PciDeviceResult = Result; +impl PciDevice for BusDevice { + fn allocate_bars( + &mut self, + allocator: &Arc>, + mmio32_allocator: &mut AddressAllocator, + mmio64_allocator: &mut AddressAllocator, + resources: Option>, + ) -> PciDeviceResult> { + self.pci_device_mut() + .unwrap() + .allocate_bars(allocator, mmio32_allocator, mmio64_allocator, resources) + } + + fn free_bars( + &mut self, + allocator: &mut SystemAllocator, + mmio32_allocator: &mut AddressAllocator, + mmio64_allocator: &mut AddressAllocator, + ) -> PciDeviceResult<()> { + self.pci_device_mut() + .unwrap() + .free_bars(allocator, mmio32_allocator, mmio64_allocator) + } + + fn write_config_register( + &mut self, + reg_idx: usize, + offset: u64, + data: &[u8], + ) -> Option> { + self.pci_device_mut() + .unwrap() + .write_config_register(reg_idx, offset, data) + } + + fn read_config_register(&mut self, reg_idx: usize) -> u32 { + self.pci_device_mut() + .unwrap() + .read_config_register(reg_idx) + } + + fn detect_bar_reprogramming( + &mut self, + reg_idx: usize, + data: &[u8], + ) -> Option { + self.pci_device_mut() + .unwrap() + .detect_bar_reprogramming(reg_idx, data) + } + + fn read_bar(&mut self, base: u64, offset: u64, data: &mut [u8]) { + self.pci_device_mut() + .unwrap() + .read_bar(base, offset, data) + } + + fn write_bar(&mut self, base: u64, offset: u64, data: &[u8]) -> Option> { + self.pci_device_mut() + .unwrap() + .write_bar(base, offset, data) + } + + fn move_bar(&mut self, old_base: u64, new_base: u64) -> std::result::Result<(), std::io::Error> { + self.pci_device_mut() + .unwrap() + .move_bar(old_base, new_base) + } + + fn as_any(&mut self) -> &mut dyn Any { + self.pci_device_mut() + .unwrap() + .as_any() + } + + fn id(&self) -> Option { + self.pci_device_ref() + .unwrap() + .id() + } +} + impl MutEventSubscriber for BusDevice { fn process(&mut self, event: Events, ops: &mut EventOps) { match self { @@ -216,26 +383,25 @@ impl Bus { /// Constructs an a bus with an empty address space. 
pub fn new() -> Bus { Bus { - devices: BTreeMap::new(), + devices: Arc::new(RwLock::new(BTreeMap::new())), } } - fn first_before(&self, addr: u64) -> Option<(BusRange, &Mutex)> { + fn first_before(&self, addr: u64) -> Option<(BusRange, Arc>)> { // for when we switch to rustc 1.17: self.devices.range(..addr).iter().rev().next() - for (range, dev) in self.devices.iter().rev() { + for (range, dev) in self.devices.read().unwrap().iter().rev() { if range.0 <= addr { - return Some((*range, dev)); + return Some((*range, dev.clone())); } } None } - /// Returns the device found at some address. - pub fn get_device(&self, addr: u64) -> Option<(u64, &Mutex)> { + pub fn get_device(&self, addr: u64) -> Option<(u64, u64, Arc>)> { if let Some((BusRange(start, len), dev)) = self.first_before(addr) { let offset = addr - start; if offset < len { - return Some((offset, dev)); + return Some((start, offset, dev)); } } None @@ -243,7 +409,7 @@ impl Bus { /// Puts the given device at the given address space. pub fn insert( - &mut self, + &self, device: Arc>, base: u64, len: u64, @@ -269,22 +435,30 @@ impl Bus { } } - if self.devices.insert(BusRange(base, len), device).is_some() { + if self.devices.write().unwrap().insert(BusRange(base, len), device).is_some() { return Err(BusError::Overlap); } Ok(()) } + pub fn remove(&self, base: u64, len: u64) -> Result<(), BusError> { + let range = BusRange(base, len); + if self.devices.write().unwrap().remove(&range).is_none() { + return Err(BusError::MissingAddressRange); + } + Ok(()) + } + /// Reads data from the device that owns the range containing `addr` and puts it into `data`. /// /// Returns true on success, otherwise `data` is untouched. pub fn read(&self, addr: u64, data: &mut [u8]) -> bool { - if let Some((offset, dev)) = self.get_device(addr) { + if let Some((base, offset, dev)) = self.get_device(addr) { // OK to unwrap as lock() failing is a serious error condition and should panic. dev.lock() .expect("Failed to acquire device lock") - .read(offset, data); + .read(base, offset, data); true } else { false @@ -295,16 +469,39 @@ impl Bus { /// /// Returns true on success, otherwise `data` is untouched. pub fn write(&self, addr: u64, data: &[u8]) -> bool { - if let Some((offset, dev)) = self.get_device(addr) { + if let Some((base, offset, dev)) = self.get_device(addr) { // OK to unwrap as lock() failing is a serious error condition and should panic. dev.lock() .expect("Failed to acquire device lock") - .write(offset, data); + .write(base, offset, data); true } else { false } } + + /// Updates the address range for an existing device. 
+ pub fn update_range(
+ &self,
+ old_base: u64,
+ old_len: u64,
+ new_base: u64,
+ new_len: u64,
+ ) -> Result<(), BusError> {
+ // Retrieve the device corresponding to the range.
+ let device = if let Some((_, _, dev)) = self.get_device(old_base) {
+ dev.clone()
+ } else {
+ return Err(BusError::MissingAddressRange);
+ };
+
+ // Remove the old address range.
+ self.remove(old_base, old_len)?;
+
+ // Insert the new address range.
+ self.insert(device, new_base, new_len)
+ }
 }
 #[cfg(test)]
diff --git a/src/vmm/src/devices/legacy/rtc_pl031.rs b/src/vmm/src/devices/legacy/rtc_pl031.rs
index 15e20f81446..e0fc7aec375 100644
--- a/src/vmm/src/devices/legacy/rtc_pl031.rs
+++ b/src/vmm/src/devices/legacy/rtc_pl031.rs
@@ -77,7 +77,7 @@ impl std::ops::DerefMut for RTCDevice {
 // Implements Bus functions for AMBA PL031 RTC device
 impl RTCDevice {
- pub fn bus_read(&mut self, offset: u64, data: &mut [u8]) {
+ pub fn bus_read(&mut self, _: u64, offset: u64, data: &mut [u8]) {
 if let (Ok(offset), 4) = (u16::try_from(offset), data.len()) {
 // read() function from RTC implementation expects a slice of
 // len 4, and we just validated that this is the data length
@@ -92,7 +92,7 @@ impl RTCDevice {
 }
 }
- pub fn bus_write(&mut self, offset: u64, data: &[u8]) {
+ pub fn bus_write(&mut self, _: u64, offset: u64, data: &[u8]) {
 if let (Ok(offset), 4) = (u16::try_from(offset), data.len()) {
 // write() function from RTC implementation expects a slice of
 // len 4, and we just validated that this is the data length
diff --git a/src/vmm/src/devices/mod.rs b/src/vmm/src/devices/mod.rs
index 0ca445b6f82..eb670a0d47c 100644
--- a/src/vmm/src/devices/mod.rs
+++ b/src/vmm/src/devices/mod.rs
@@ -14,6 +14,7 @@ pub mod bus;
 pub mod legacy;
 pub mod pseudo;
 pub mod virtio;
+pub mod pci_segment;
 pub use bus::{Bus, BusDevice, BusError};
 use log::error;
diff --git a/src/vmm/src/devices/pci_segment.rs b/src/vmm/src/devices/pci_segment.rs
new file mode 100644
index 00000000000..bc1cb561405
--- /dev/null
+++ b/src/vmm/src/devices/pci_segment.rs
@@ -0,0 +1,474 @@
+// Portions Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+//
+// Portions Copyright 2017 The Chromium OS Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE-BSD-3-Clause file.
+//
+// Copyright © 2019 - 2021 Intel Corporation
+//
+// SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause
+//
+
+use std::{fmt::Debug, io, sync::{Arc, Mutex}};
+
+use acpi_tables::{aml::{self, AmlError}, Aml};
+use anyhow::{anyhow, Result};
+use pci::{DeviceRelocation, PciBarRegionType, PciBdf, PciBus, PciConfigMmio, PciDevice, PciRoot};
+#[cfg(target_arch = "x86_64")]
+use pci::{PciConfigIo, PCI_CONFIG_IO_PORT, PCI_CONFIG_IO_PORT_SIZE};
+use uuid::Uuid;
+use vm_system_allocator::{AddressAllocator, SystemAllocator};
+
+use crate::{arch::x86_64::layout, devices::BusDevice, logger::{error, info}};
+
+use super::Bus;
+
+pub(crate) struct PciSegment {
+    pub(crate) id: u16,
+    pub(crate) pci_bus: Arc<Mutex<PciBus>>,
+    pub(crate) pci_config_mmio: Arc<Mutex<BusDevice>>,
+    pub(crate) mmio_config_address: u64,
+    pub(crate) proximity_domain: u32,
+
+    #[cfg(target_arch = "x86_64")]
+    pub(crate) pci_config_io: Option<Arc<Mutex<BusDevice>>>,
+
+    // Bitmap of PCI devices to hotplug.
+    pub(crate) pci_devices_up: u32,
+    // Bitmap of PCI devices to hotunplug.
+    pub(crate) pci_devices_down: u32,
+    // List of allocated IRQs for each PCI slot.
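+    // Slot i is wired to pci_irq_slots[i]; the slots share a small pool of
+    // legacy IRQs, effectively irq = irqs[i % num_irqs]
+    // (see reserve_legacy_interrupts_for_pci_devices below).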
+    pub(crate) pci_irq_slots: [u8; 32],
+
+    // Device memory covered by this segment.
+    pub(crate) start_of_mem32_area: u64,
+    pub(crate) end_of_mem32_area: u64,
+
+    pub(crate) start_of_mem64_area: u64,
+    pub(crate) end_of_mem64_area: u64,
+
+    pub(crate) mem32_allocator: Arc<Mutex<AddressAllocator>>,
+    pub(crate) mem64_allocator: Arc<Mutex<AddressAllocator>>,
+}
+
+impl Debug for PciSegment {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("PciSegment")
+            .field("id", &self.id)
+            .finish()
+    }
+}
+
+impl PciSegment {
+    pub(crate) fn new(
+        id: u16,
+        numa_node: u32,
+        mem32_allocator: Arc<Mutex<AddressAllocator>>,
+        mem64_allocator: Arc<Mutex<AddressAllocator>>,
+        mmio_bus: &mut Bus,
+        pci_irq_slots: &[u8; 32],
+        device_relocation: Arc<dyn DeviceRelocation>,
+    ) -> Result<PciSegment> {
+        let pci_root = PciRoot::new(None);
+        let pci_bus = Arc::new(Mutex::new(PciBus::new(pci_root, device_relocation)));
+
+        let pci_config_mmio = Arc::new(Mutex::new(BusDevice::MmioPciBus(PciConfigMmio::new(
+            Arc::clone(&pci_bus),
+        ))));
+        let mmio_config_address =
+            layout::PCI_MMCONFIG_START + layout::PCI_MMIO_CONFIG_SIZE_PER_SEGMENT * id as u64;
+
+        mmio_bus
+            .insert(
+                pci_config_mmio.clone(),
+                mmio_config_address,
+                layout::PCI_MMIO_CONFIG_SIZE_PER_SEGMENT,
+            )
+            .map_err(|e| anyhow!("error adding pci bus to mmio bus {e}"))?;
+
+        let start_of_mem32_area = mem32_allocator.lock().unwrap().base().0;
+        let end_of_mem32_area = mem32_allocator.lock().unwrap().end().0;
+
+        let start_of_mem64_area = mem64_allocator.lock().unwrap().base().0;
+        let end_of_mem64_area = mem64_allocator.lock().unwrap().end().0;
+
+        let segment = PciSegment {
+            id,
+            pci_bus,
+            pci_config_mmio,
+            mmio_config_address,
+            proximity_domain: numa_node,
+            pci_devices_up: 0,
+            pci_devices_down: 0,
+            #[cfg(target_arch = "x86_64")]
+            pci_config_io: None,
+            mem32_allocator,
+            mem64_allocator,
+            start_of_mem32_area,
+            end_of_mem32_area,
+            start_of_mem64_area,
+            end_of_mem64_area,
+            pci_irq_slots: *pci_irq_slots,
+        };
+
+        info!(
+            "Adding PCI segment: id={}, PCI MMIO config address: 0x{:x}, mem32 area [0x{:x}-0x{:x}], mem64 area [0x{:x}-0x{:x}]",
+            segment.id, segment.mmio_config_address, segment.start_of_mem32_area, segment.end_of_mem32_area, segment.start_of_mem64_area, segment.end_of_mem64_area
+        );
+        Ok(segment)
+    }
+
+    // #[cfg(target_arch = "x86_64")]
+    // pub(crate) fn new_default_segment(
+    //     mem32_allocator: Arc<Mutex<AddressAllocator>>,
+    //     mem64_allocator: Arc<Mutex<AddressAllocator>>,
+    //     mmio_bus: &mut Bus,
+    //     io_bus: &mut Bus,
+    //     pci_irq_slots: &[u8; 32],
+    // ) -> Result<PciSegment> {
+    //     let mut segment = Self::new(
+    //         0,
+    //         0,
+    //         mem32_allocator,
+    //         mem64_allocator,
+    //         mmio_bus,
+    //         pci_irq_slots,
+    //     )?;
+    //     let pci_config_io = Arc::new(Mutex::new(BusDevice::PioPciBus(PciConfigIo::new(Arc::clone(&segment.pci_bus)))));
+    //
+    //     io_bus
+    //         .insert(
+    //             pci_config_io.clone(),
+    //             PCI_CONFIG_IO_PORT,
+    //             PCI_CONFIG_IO_PORT_SIZE,
+    //         )
+    //         .map_err(|e| anyhow!("error adding pci bus to pio bus {e}"))?;
+    //
+    //     segment.pci_config_io = Some(pci_config_io);
+    //
+    //     Ok(segment)
+    // }
+
+    // #[cfg(target_arch = "aarch64")]
+    // pub(crate) fn new_default_segment(
+    //     address_manager: &Arc<AddressManager>,
+    //     mem32_allocator: Arc<Mutex<AddressAllocator>>,
+    //     mem64_allocator: Arc<Mutex<AddressAllocator>>,
+    //     pci_irq_slots: &[u8; 32],
+    // ) -> DeviceManagerResult<PciSegment> {
+    //     Self::new(
+    //         0,
+    //         0,
+    //         address_manager,
+    //         mem32_allocator,
+    //         mem64_allocator,
+    //         pci_irq_slots,
+    //     )
+    // }
+
+    pub(crate) fn next_device_bdf(&self) -> Result<PciBdf> {
+        Ok(PciBdf::new(
+            self.id,
+            0,
+            self.pci_bus
+                .lock()
+                .unwrap()
+                .next_device_id()
+                .map_err(|_e| anyhow!("error getting device id"))?
+                as u8,
+            0,
+        ))
+    }
+
+    pub fn reserve_legacy_interrupts_for_pci_devices(
+        allocator: &Arc<Mutex<SystemAllocator>>,
+        pci_irq_slots: &mut [u8; 32],
+    ) -> Result<()> {
+        // Reserve 8 IRQs which will be shared across all PCI devices.
+        let num_irqs = 8;
+        let mut irqs: Vec<u8> = Vec::new();
+        for _ in 0..num_irqs {
+            irqs.push(
+                allocator
+                    .lock()
+                    .unwrap()
+                    .allocate_irq()
+                    .ok_or(anyhow!("error allocating irq"))? as u8,
+            );
+        }
+
+        // There are 32 devices on the PCI bus, let's assign them an IRQ.
+        for i in 0..32 {
+            pci_irq_slots[i] = irqs[i % num_irqs];
+        }
+
+        Ok(())
+    }
+}
+
+struct PciDevSlot {
+    device_id: u8,
+}
+
+impl Aml for PciDevSlot {
+    fn append_aml_bytes(&self, v: &mut Vec<u8>) -> Result<(), AmlError> {
+        let sun = self.device_id;
+        let adr: u32 = (self.device_id as u32) << 16;
+        aml::Device::new(
+            format!("S{:03}", self.device_id).as_str().try_into()?,
+            vec![
+                &aml::Name::new("_SUN".try_into()?, &sun)?,
+                &aml::Name::new("_ADR".try_into()?, &adr)?,
+                &aml::Method::new(
+                    "_EJ0".try_into()?,
+                    1,
+                    true,
+                    vec![&aml::MethodCall::new(
+                        "\\_SB_.PHPR.PCEJ".try_into()?,
+                        vec![&aml::Path::new("_SUN")?, &aml::Path::new("_SEG")?],
+                    )],
+                ),
+            ],
+        )
+        .append_aml_bytes(v)
+    }
+}
+
+struct PciDevSlotNotify {
+    device_id: u8,
+}
+
+impl Aml for PciDevSlotNotify {
+    fn append_aml_bytes(&self, v: &mut Vec<u8>) -> Result<(), AmlError> {
+        let device_id_mask: u32 = 1 << self.device_id;
+        let object = aml::Path::new(&format!("S{:03}", self.device_id))?;
+        aml::And::new(&aml::Local(0), &aml::Arg(0), &device_id_mask).append_aml_bytes(v)?;
+        aml::If::new(
+            &aml::Equal::new(&aml::Local(0), &device_id_mask),
+            vec![&aml::Notify::new(&object, &aml::Arg(1))],
+        )
+        .append_aml_bytes(v)
+    }
+}
+
+struct PciDevSlotMethods {}
+
+impl Aml for PciDevSlotMethods {
+    fn append_aml_bytes(&self, v: &mut Vec<u8>) -> Result<(), AmlError> {
+        let mut device_notifies = Vec::new();
+        for device_id in 0..32 {
+            device_notifies.push(PciDevSlotNotify { device_id });
+        }
+
+        let mut device_notifies_refs: Vec<&dyn Aml> = Vec::new();
+        for device_notify in device_notifies.iter() {
+            device_notifies_refs.push(device_notify);
+        }
+
+        aml::Method::new("DVNT".try_into()?, 2, true, device_notifies_refs).append_aml_bytes(v)?;
+        aml::Method::new(
+            "PCNT".try_into()?,
+            0,
+            true,
+            vec![
+                &aml::Acquire::new("\\_SB_.PHPR.BLCK".try_into()?, 0xffff),
+                &aml::Store::new(&aml::Path::new("\\_SB_.PHPR.PSEG")?, &aml::Path::new("_SEG")?),
+                &aml::MethodCall::new(
+                    "DVNT".try_into()?,
+                    vec![&aml::Path::new("\\_SB_.PHPR.PCIU")?, &aml::ONE],
+                ),
+                &aml::MethodCall::new(
+                    "DVNT".try_into()?,
+                    vec![&aml::Path::new("\\_SB_.PHPR.PCID")?, &3usize],
+                ),
+                &aml::Release::new("\\_SB_.PHPR.BLCK".try_into()?),
+            ],
+        )
+        .append_aml_bytes(v)
+    }
+}
+
+struct PciDsmMethod {}
+
+impl Aml for PciDsmMethod {
+    fn append_aml_bytes(&self, v: &mut Vec<u8>) -> Result<(), AmlError> {
+        // Refer to ACPI spec v6.3 Ch 9.1.1 and PCI Firmware spec v3.3 Ch 4.6.1
+        // _DSM (Device Specific Method), the following is the implementation in ASL.
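+        // Function 0 advertises the supported functions as a bitmask: the
+        // 0x21 returned below means functions 0 and 5 are implemented.
+        // Function 5 answering Zero is understood by the OS (per the PCI
+        // Firmware spec) as "keep the boot firmware's PCI configuration".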
+        /*
+        Method (_DSM, 4, NotSerialized)  // _DSM: Device-Specific Method
+        {
+            If ((Arg0 == ToUUID ("e5c937d0-3553-4d7a-9117-ea4d19c3434d") /* Device Labeling Interface */))
+            {
+                If ((Arg2 == Zero))
+                {
+                    Return (Buffer (One) { 0x21 })
+                }
+                If ((Arg2 == 0x05))
+                {
+                    Return (Zero)
+                }
+            }
+
+            Return (Buffer (One) { 0x00 })
+        }
+        */
+        /*
+         * As per ACPI v6.3 Ch 19.6.142, the UUID is required to be in mixed endian:
+         * Among the fields of a UUID:
+         * {d1 (8 digits)} - {d2 (4 digits)} - {d3 (4 digits)} - {d4 (16 digits)}
+         * d1 ~ d3 need to be little endian, d4 be big endian.
+         * See https://en.wikipedia.org/wiki/Universally_unique_identifier#Encoding .
+         */
+        let uuid = Uuid::parse_str("E5C937D0-3553-4D7A-9117-EA4D19C3434D").unwrap();
+        let (uuid_d1, uuid_d2, uuid_d3, uuid_d4) = uuid.as_fields();
+        let mut uuid_buf = vec![];
+        uuid_buf.extend(uuid_d1.to_le_bytes());
+        uuid_buf.extend(uuid_d2.to_le_bytes());
+        uuid_buf.extend(uuid_d3.to_le_bytes());
+        uuid_buf.extend(uuid_d4);
+        aml::Method::new(
+            "_DSM".try_into()?,
+            4,
+            false,
+            vec![
+                &aml::If::new(
+                    &aml::Equal::new(&aml::Arg(0), &aml::Buffer::new(uuid_buf)),
+                    vec![
+                        &aml::If::new(
+                            &aml::Equal::new(&aml::Arg(2), &aml::ZERO),
+                            vec![&aml::Return::new(&aml::Buffer::new(vec![0x21]))],
+                        ),
+                        &aml::If::new(
+                            &aml::Equal::new(&aml::Arg(2), &0x05u8),
+                            vec![&aml::Return::new(&aml::ZERO)],
+                        ),
+                    ],
+                ),
+                &aml::Return::new(&aml::Buffer::new(vec![0])),
+            ],
+        )
+        .append_aml_bytes(v)
+    }
+}
+
+impl Aml for PciSegment {
+    fn append_aml_bytes(&self, v: &mut Vec<u8>) -> Result<(), AmlError> {
+        let mut pci_dsdt_inner_data: Vec<&dyn Aml> = Vec::new();
+        let hid = aml::Name::new("_HID".try_into()?, &aml::EisaName::new("PNP0A08")?)?;
+        pci_dsdt_inner_data.push(&hid);
+        let cid = aml::Name::new("_CID".try_into()?, &aml::EisaName::new("PNP0A03")?)?;
+        pci_dsdt_inner_data.push(&cid);
+        let adr = aml::Name::new("_ADR".try_into()?, &aml::ZERO)?;
+        pci_dsdt_inner_data.push(&adr);
+        let seg = aml::Name::new("_SEG".try_into()?, &self.id)?;
+        pci_dsdt_inner_data.push(&seg);
+        let uid = aml::Name::new("_UID".try_into()?, &aml::ZERO)?;
+        pci_dsdt_inner_data.push(&uid);
+        let cca = aml::Name::new("_CCA".try_into()?, &aml::ONE)?;
+        pci_dsdt_inner_data.push(&cca);
+        let supp = aml::Name::new("SUPP".try_into()?, &aml::ZERO)?;
+        pci_dsdt_inner_data.push(&supp);
+
+        let proximity_domain = self.proximity_domain;
+        let pxm_return = aml::Return::new(&proximity_domain);
+        let pxm = aml::Method::new("_PXM".try_into()?, 0, false, vec![&pxm_return]);
+        pci_dsdt_inner_data.push(&pxm);
+
+        let pci_dsm = PciDsmMethod {};
+        pci_dsdt_inner_data.push(&pci_dsm);
+
+        #[allow(clippy::if_same_then_else)]
+        let crs = if self.id == 0 {
+            aml::Name::new(
+                "_CRS".try_into()?,
+                &aml::ResourceTemplate::new(vec![
+                    &aml::AddressSpace::new_bus_number(0x0u16, 0x0u16)?,
+                    #[cfg(target_arch = "x86_64")]
+                    &aml::Io::new(0xcf8, 0xcf8, 1, 0x8),
+                    &aml::Memory32Fixed::new(
+                        true,
+                        self.mmio_config_address as u32,
+                        layout::PCI_MMIO_CONFIG_SIZE_PER_SEGMENT as u32,
+                    ),
+                    &aml::AddressSpace::new_memory(
+                        aml::AddressSpaceCacheable::NotCacheable,
+                        true,
+                        self.start_of_mem32_area,
+                        self.end_of_mem32_area,
+                    )?,
+                    &aml::AddressSpace::new_memory(
+                        aml::AddressSpaceCacheable::NotCacheable,
+                        true,
+                        self.start_of_mem64_area,
+                        self.end_of_mem64_area,
+                    )?,
+                    #[cfg(target_arch = "x86_64")]
+                    &aml::AddressSpace::new_io(0u16, 0x0cf7u16)?,
+                    #[cfg(target_arch = "x86_64")]
+                    &aml::AddressSpace::new_io(0x0d00u16, 0xffffu16)?,
+                ]),
+            )?
+        } else {
+            aml::Name::new(
+                "_CRS".try_into()?,
+                &aml::ResourceTemplate::new(vec![
+                    &aml::AddressSpace::new_bus_number(0x0u16, 0x0u16)?,
+                    &aml::Memory32Fixed::new(
+                        true,
+                        self.mmio_config_address as u32,
+                        layout::PCI_MMIO_CONFIG_SIZE_PER_SEGMENT as u32,
+                    ),
+                    &aml::AddressSpace::new_memory(
+                        aml::AddressSpaceCacheable::NotCacheable,
+                        true,
+                        self.start_of_mem32_area,
+                        self.end_of_mem32_area,
+                    )?,
+                    &aml::AddressSpace::new_memory(
+                        aml::AddressSpaceCacheable::NotCacheable,
+                        true,
+                        self.start_of_mem64_area,
+                        self.end_of_mem64_area,
+                    )?,
+                ]),
+            )?
+        };
+        pci_dsdt_inner_data.push(&crs);
+
+        let mut pci_devices = Vec::new();
+        for device_id in 0..32 {
+            let pci_device = PciDevSlot { device_id };
+            pci_devices.push(pci_device);
+        }
+        for pci_device in pci_devices.iter() {
+            pci_dsdt_inner_data.push(pci_device);
+        }
+
+        let pci_device_methods = PciDevSlotMethods {};
+        pci_dsdt_inner_data.push(&pci_device_methods);
+
+        // Build PCI routing table, listing IRQs assigned to PCI devices.
+        let prt_package_list: Vec<(u32, u32)> = self
+            .pci_irq_slots
+            .iter()
+            .enumerate()
+            .map(|(i, irq)| (((((i as u32) & 0x1fu32) << 16) | 0xffffu32), *irq as u32))
+            .collect();
+        let prt_package_list: Vec<aml::Package> = prt_package_list
+            .iter()
+            .map(|(bdf, irq)| aml::Package::new(vec![bdf, &0u8, &0u8, irq]))
+            .collect();
+        let prt_package_list: Vec<&dyn Aml> = prt_package_list
+            .iter()
+            .map(|item| item as &dyn Aml)
+            .collect();
+        let prt = aml::Name::new("_PRT".try_into()?, &aml::Package::new(prt_package_list))?;
+        pci_dsdt_inner_data.push(&prt);
+
+        aml::Device::new(
+            format!("_SB_.PC{:02X}", self.id).as_str().try_into()?,
+            pci_dsdt_inner_data,
+        )
+        .append_aml_bytes(v)
+    }
+}
diff --git a/src/vmm/src/devices/virtio/balloon/device.rs b/src/vmm/src/devices/virtio/balloon/device.rs
index 697928ae9c6..f3e706719a1 100644
--- a/src/vmm/src/devices/virtio/balloon/device.rs
+++ b/src/vmm/src/devices/virtio/balloon/device.rs
@@ -2,6 +2,7 @@
 // SPDX-License-Identifier: Apache-2.0
 use std::fmt;
+use std::sync::Arc;
 use std::time::Duration;
 use log::error;
@@ -24,7 +25,7 @@ use super::{
 VIRTIO_BALLOON_S_SWAP_OUT,
 };
 use crate::devices::virtio::balloon::BalloonError;
-use crate::devices::virtio::device::{IrqTrigger, IrqType};
+use crate::devices::virtio::device::{IrqTrigger, IrqType, VirtioInterrupt, VirtioInterruptType};
 use crate::devices::virtio::gen::virtio_blk::VIRTIO_F_VERSION_1;
 use crate::logger::IncMetric;
 use crate::utils::u64_to_usize;
@@ -161,7 +162,7 @@ pub struct Balloon {
 pub(crate) queues: Vec<Queue>,
 pub(crate) queue_evts: [EventFd; BALLOON_NUM_QUEUES],
 pub(crate) device_state: DeviceState,
- pub(crate) irq_trigger: IrqTrigger,
+ pub(crate) virtio_interrupt: Option<Arc<dyn VirtioInterrupt>>,
 // Implementation specific fields.
 pub(crate) restored: bool,
@@ -188,7 +189,6 @@ impl fmt::Debug for Balloon {
 .field("queues", &self.queues)
 .field("queue_evts", &self.queue_evts)
 .field("device_state", &self.device_state)
- .field("irq_trigger", &self.irq_trigger)
 .field("restored", &self.restored)
 .field("stats_polling_interval_s", &self.stats_polling_interval_s)
 .field("stats_desc_index", &self.stats_desc_index)
@@ -242,7 +242,7 @@ impl Balloon {
 },
 queue_evts,
 queues,
- irq_trigger: IrqTrigger::new().map_err(BalloonError::EventFd)?,
+ virtio_interrupt: Some(Arc::new(IrqTrigger::new().map_err(BalloonError::EventFd)?)),
 device_state: DeviceState::Inactive,
 activate_evt: EventFd::new(libc::EFD_NONBLOCK).map_err(BalloonError::EventFd)?,
 restored,
@@ -363,7 +363,7 @@ impl Balloon {
 }
 if needs_interrupt {
- self.signal_used_queue()?;
+ self.signal_used_queue(INFLATE_INDEX)?;
 }
 Ok(())
@@ -381,7 +381,7 @@ impl Balloon {
 }
 if needs_interrupt {
- self.signal_used_queue()
+ self.signal_used_queue(DEFLATE_INDEX)
 } else {
 Ok(())
 }
@@ -425,11 +425,13 @@ impl Balloon {
 Ok(())
 }
- pub(crate) fn signal_used_queue(&self) -> Result<(), BalloonError> {
- self.irq_trigger.trigger_irq(IrqType::Vring).map_err(|err| {
- METRICS.event_fails.inc();
- BalloonError::InterruptError(err)
- })
+ pub(crate) fn signal_used_queue(&self, queue_index: usize) -> Result<(), BalloonError> {
+ self.virtio_interrupt
+ .as_ref()
+ .expect("interrupt should be initialized")
+ .trigger(VirtioInterruptType::Queue(queue_index as u16))
+ .map_err(|err| {
+ METRICS.event_fails.inc();
+ BalloonError::InterruptError(err)
+ })
 }
 /// Process device virtio queue(s).
@@ -450,7 +452,7 @@ impl Balloon {
 self.queues[STATS_INDEX]
 .add_used(index, 0)
 .map_err(BalloonError::Queue)?;
- self.signal_used_queue()
+ self.signal_used_queue(STATS_INDEX)
 } else {
 error!("Failed to update balloon stats, missing descriptor.");
 Ok(())
 }
@@ -461,8 +463,8 @@ impl Balloon {
 pub fn update_size(&mut self, amount_mib: u32) -> Result<(), BalloonError> {
 if self.is_activated() {
 self.config_space.num_pages = mib_to_pages(amount_mib)?;
- self.irq_trigger
- .trigger_irq(IrqType::Config)
+ self.virtio_interrupt
+ .as_ref()
+ .expect("interrupt should be initialized")
+ .trigger(VirtioInterruptType::Config)
 .map_err(BalloonError::InterruptError)
 } else {
 Err(BalloonError::DeviceNotActive)
 }
@@ -573,8 +575,8 @@ impl VirtioDevice for Balloon {
 &self.queue_evts
 }
- fn interrupt_trigger(&self) -> &IrqTrigger {
- &self.irq_trigger
+ fn interrupt(&self) -> Arc<dyn VirtioInterrupt> {
+ self.virtio_interrupt.as_ref().expect("interrupt should be initialized").clone()
 }
 fn read_config(&self, offset: u64, data: &mut [u8]) {
@@ -601,7 +603,8 @@ impl VirtioDevice for Balloon {
 dst.copy_from_slice(data);
 }
- fn activate(&mut self, mem: GuestMemoryMmap) -> Result<(), ActivateError> {
+ fn activate(&mut self, mem: GuestMemoryMmap, virtio_interrupt: Option<Arc<dyn VirtioInterrupt>>) -> Result<(), ActivateError> {
+ self.virtio_interrupt = virtio_interrupt.or(self.virtio_interrupt.take());
 for q in self.queues.iter_mut() {
 q.initialize(&mem)
 .map_err(ActivateError::QueueMemoryError)?;
@@ -816,7 +819,7 @@ pub(crate) mod tests {
 // Only initialize the inflate queue to demonstrate invalid request handling.
 let infq = VirtQueue::new(GuestAddress(0), &mem, 16);
 balloon.set_queue(INFLATE_INDEX, infq.create_queue());
- balloon.activate(mem.clone()).unwrap();
+ balloon.activate(mem.clone(), None).unwrap();
 // Fill the second page with non-zero bytes.
for i in 0..0x1000 { @@ -874,7 +877,7 @@ pub(crate) mod tests { let mem = default_mem(); let infq = VirtQueue::new(GuestAddress(0), &mem, 16); balloon.set_queue(INFLATE_INDEX, infq.create_queue()); - balloon.activate(mem.clone()).unwrap(); + balloon.activate(mem.clone(), None).unwrap(); // Fill the third page with non-zero bytes. for i in 0..0x1000 { @@ -944,7 +947,7 @@ pub(crate) mod tests { let mem = default_mem(); let defq = VirtQueue::new(GuestAddress(0), &mem, 16); balloon.set_queue(DEFLATE_INDEX, defq.create_queue()); - balloon.activate(mem.clone()).unwrap(); + balloon.activate(mem.clone(), None).unwrap(); let page_addr = 0x10; @@ -992,7 +995,7 @@ pub(crate) mod tests { let mem = default_mem(); let statsq = VirtQueue::new(GuestAddress(0), &mem, 16); balloon.set_queue(STATS_INDEX, statsq.create_queue()); - balloon.activate(mem.clone()).unwrap(); + balloon.activate(mem.clone(), None).unwrap(); let page_addr = 0x100; @@ -1068,7 +1071,7 @@ pub(crate) mod tests { assert!(balloon.stats_desc_index.is_some()); balloon.process_stats_timer_event().unwrap(); assert!(balloon.stats_desc_index.is_none()); - assert!(balloon.irq_trigger.has_pending_irq(IrqType::Vring)); + // assert!(balloon.irq_trigger.has_pending_irq(IrqType::Vring)); }); } } @@ -1083,7 +1086,7 @@ pub(crate) mod tests { balloon.set_queue(INFLATE_INDEX, infq.create_queue()); balloon.set_queue(DEFLATE_INDEX, defq.create_queue()); - balloon.activate(mem).unwrap(); + balloon.activate(mem, None).unwrap(); balloon.process_virtio_queues() } @@ -1091,7 +1094,7 @@ pub(crate) mod tests { fn test_update_stats_interval() { let mut balloon = Balloon::new(0, true, 0, false).unwrap(); let mem = default_mem(); - balloon.activate(mem).unwrap(); + balloon.activate(mem, None).unwrap(); assert_eq!( format!("{:?}", balloon.update_stats_polling_interval(1)), "Err(StatisticsStateChange)" @@ -1100,7 +1103,7 @@ pub(crate) mod tests { let mut balloon = Balloon::new(0, true, 1, false).unwrap(); let mem = default_mem(); - balloon.activate(mem).unwrap(); + balloon.activate(mem, None).unwrap(); assert_eq!( format!("{:?}", balloon.update_stats_polling_interval(0)), "Err(StatisticsStateChange)" diff --git a/src/vmm/src/devices/virtio/balloon/event_handler.rs b/src/vmm/src/devices/virtio/balloon/event_handler.rs index 3019d6877de..fd75f466a3b 100644 --- a/src/vmm/src/devices/virtio/balloon/event_handler.rs +++ b/src/vmm/src/devices/virtio/balloon/event_handler.rs @@ -177,7 +177,7 @@ pub mod tests { } // Now activate the device. - balloon.lock().unwrap().activate(mem.clone()).unwrap(); + balloon.lock().unwrap().activate(mem.clone(), None).unwrap(); // Process the activate event. 
let ev_count = event_manager.run_with_timeout(50).unwrap(); assert_eq!(ev_count, 1); diff --git a/src/vmm/src/devices/virtio/balloon/persist.rs b/src/vmm/src/devices/virtio/balloon/persist.rs index 4e768ddd2e2..1f051262f01 100644 --- a/src/vmm/src/devices/virtio/balloon/persist.rs +++ b/src/vmm/src/devices/virtio/balloon/persist.rs @@ -138,8 +138,8 @@ impl Persist<'_> for Balloon { FIRECRACKER_MAX_QUEUE_SIZE, ) .map_err(|_| Self::Error::QueueRestoreError)?; - balloon.irq_trigger.irq_status = - Arc::new(AtomicU32::new(state.virtio_state.interrupt_status)); + // balloon.irq_trigger.irq_status = + // Arc::new(AtomicU32::new(state.virtio_state.interrupt_status)); balloon.avail_features = state.virtio_state.avail_features; balloon.acked_features = state.virtio_state.acked_features; balloon.latest_stats = state.latest_stats.create_stats(); diff --git a/src/vmm/src/devices/virtio/balloon/test_utils.rs b/src/vmm/src/devices/virtio/balloon/test_utils.rs index 8968aa70915..a7cdbed23e6 100644 --- a/src/vmm/src/devices/virtio/balloon/test_utils.rs +++ b/src/vmm/src/devices/virtio/balloon/test_utils.rs @@ -23,7 +23,7 @@ pub fn invoke_handler_for_queue_event(b: &mut Balloon, queue_index: usize) { _ => unreachable!(), }; // Validate the queue operation finished successfully. - assert!(b.irq_trigger.has_pending_irq(IrqType::Vring)); + // assert!(b.irq_trigger.has_pending_irq(IrqType::Vring)); } pub fn set_request(queue: &VirtQueue, idx: u16, addr: u64, len: u32, flags: u16) { diff --git a/src/vmm/src/devices/virtio/block/device.rs b/src/vmm/src/devices/virtio/block/device.rs index 7399fe39a0b..5d11c6cbf41 100644 --- a/src/vmm/src/devices/virtio/block/device.rs +++ b/src/vmm/src/devices/virtio/block/device.rs @@ -1,6 +1,8 @@ // Copyright 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
 // SPDX-License-Identifier: Apache-2.0
+use std::sync::Arc;
+
 use event_manager::{EventOps, Events, MutEventSubscriber};
 use vmm_sys_util::eventfd::EventFd;
@@ -8,7 +10,7 @@ use super::persist::{BlockConstructorArgs, BlockState};
 use super::vhost_user::device::{VhostUserBlock, VhostUserBlockConfig};
 use super::virtio::device::{VirtioBlock, VirtioBlockConfig};
 use super::BlockError;
-use crate::devices::virtio::device::{IrqTrigger, VirtioDevice};
+use crate::devices::virtio::device::{IrqTrigger, VirtioDevice, VirtioInterrupt};
 use crate::devices::virtio::queue::Queue;
 use crate::devices::virtio::{ActivateError, TYPE_BLOCK};
 use crate::rate_limiter::BucketUpdate;
@@ -173,10 +175,10 @@ impl VirtioDevice for Block {
 }
 }
- fn interrupt_trigger(&self) -> &IrqTrigger {
+ fn interrupt(&self) -> Arc<dyn VirtioInterrupt> {
 match self {
- Self::Virtio(b) => &b.irq_trigger,
- Self::VhostUser(b) => &b.irq_trigger,
+ Self::Virtio(b) => b.interrupt(),
+ Self::VhostUser(b) => b.interrupt(),
 }
 }
@@ -194,10 +196,10 @@ impl VirtioDevice for Block {
 }
 }
- fn activate(&mut self, mem: GuestMemoryMmap) -> Result<(), ActivateError> {
+ fn activate(&mut self, mem: GuestMemoryMmap, virtio_interrupt: Option<Arc<dyn VirtioInterrupt>>) -> Result<(), ActivateError> {
 match self {
- Self::Virtio(b) => b.activate(mem),
- Self::VhostUser(b) => b.activate(mem),
+ Self::Virtio(b) => b.activate(mem, virtio_interrupt),
+ Self::VhostUser(b) => b.activate(mem, virtio_interrupt),
 }
 }
diff --git a/src/vmm/src/devices/virtio/block/vhost_user/device.rs b/src/vmm/src/devices/virtio/block/vhost_user/device.rs
index 62218157c8b..4065311d4ea 100644
--- a/src/vmm/src/devices/virtio/block/vhost_user/device.rs
+++ b/src/vmm/src/devices/virtio/block/vhost_user/device.rs
@@ -14,6 +14,8 @@ use vmm_sys_util::eventfd::EventFd;
 use super::{VhostUserBlockError, NUM_QUEUES, QUEUE_SIZE};
 use crate::devices::virtio::block::CacheType;
+use crate::devices::virtio::device::VirtioInterrupt;
+use crate::devices::virtio::device::VirtioInterruptType;
 use crate::devices::virtio::device::{DeviceState, IrqTrigger, IrqType, VirtioDevice};
 use crate::devices::virtio::gen::virtio_blk::{
 VIRTIO_BLK_F_FLUSH, VIRTIO_BLK_F_RO, VIRTIO_F_VERSION_1,
@@ -118,7 +120,7 @@ pub struct VhostUserBlockImpl {
 pub queues: Vec<Queue>,
 pub queue_evts: [EventFd; u64_to_usize(NUM_QUEUES)],
 pub device_state: DeviceState,
- pub irq_trigger: IrqTrigger,
+ pub virtio_interrupt: Option<Arc<dyn VirtioInterrupt>>,
 // Implementation specific fields.
 pub id: String,
@@ -144,7 +146,6 @@ impl std::fmt::Debug for VhostUserBlockImpl {
 .field("queues", &self.queues)
 .field("queue_evts", &self.queue_evts)
 .field("device_state", &self.device_state)
- .field("irq_trigger", &self.irq_trigger)
 .field("id", &self.id)
 .field("partuuid", &self.partuuid)
 .field("cache_type", &self.cache_type)
@@ -204,7 +205,7 @@ impl VhostUserBlockImpl {
 let queue_evts = [EventFd::new(libc::EFD_NONBLOCK).map_err(VhostUserBlockError::EventFd)?; u64_to_usize(NUM_QUEUES)];
 let device_state = DeviceState::Inactive;
- let irq_trigger = IrqTrigger::new().map_err(VhostUserBlockError::IrqTrigger)?;
+ let virtio_interrupt: Option<Arc<dyn VirtioInterrupt>> = Some(Arc::new(IrqTrigger::new().map_err(VhostUserBlockError::IrqTrigger)?));
 // We negotiated features with backend. Now these acked_features
 // are available for guest driver to choose from.
@@ -226,7 +227,7 @@ impl VhostUserBlockImpl {
 queues,
 queue_evts,
 device_state,
- irq_trigger,
+ virtio_interrupt,
 id: config.drive_id,
 partuuid: config.partuuid,
@@ -271,8 +272,9 @@ impl VhostUserBlockImpl {
 )
 .map_err(VhostUserBlockError::Vhost)?;
 self.config_space = new_config_space;
- self.irq_trigger
- .trigger_irq(IrqType::Config)
+ self.virtio_interrupt
+ .as_ref()
+ .expect("interrupt must be set up")
+ .trigger(VirtioInterruptType::Config)
 .map_err(VhostUserBlockError::IrqTrigger)?;
 let delta_us = get_time_us(ClockType::Monotonic) - start_time;
@@ -311,8 +313,8 @@ impl VirtioDevice for VhostUserBlock {
 &self.queue_evts
 }
- fn interrupt_trigger(&self) -> &IrqTrigger {
- &self.irq_trigger
+ fn interrupt(&self) -> Arc<dyn VirtioInterrupt> {
+ self.virtio_interrupt.as_ref().expect("interrupt must be set up").clone()
 }
 fn read_config(&self, offset: u64, data: &mut [u8]) {
@@ -331,7 +333,9 @@ impl VirtioDevice for VhostUserBlock {
 // Other block config fields are immutable.
 }
- fn activate(&mut self, mem: GuestMemoryMmap) -> Result<(), ActivateError> {
+ fn activate(&mut self, mem: GuestMemoryMmap, virtio_interrupt: Option<Arc<dyn VirtioInterrupt>>) -> Result<(), ActivateError> {
+ self.virtio_interrupt = virtio_interrupt.or(self.virtio_interrupt.take());
+
 for q in self.queues.iter_mut() {
 q.initialize(&mem)
 .map_err(ActivateError::QueueMemoryError)?;
@@ -346,7 +350,7 @@ impl VirtioDevice for VhostUserBlock {
 self.vu_handle.setup_backend(
 &mem,
 &[(0, &self.queues[0], &self.queue_evts[0])],
- &self.irq_trigger,
+ self.interrupt(),
 )
 })
 .map_err(|err| {
@@ -376,7 +380,7 @@ mod tests {
 use super::*;
 use crate::devices::virtio::block::virtio::device::FileEngineType;
- use crate::devices::virtio::mmio::VIRTIO_MMIO_INT_CONFIG;
+ use crate::devices::virtio::transport::mmio::VIRTIO_MMIO_INT_CONFIG;
 use crate::test_utils::create_tmp_socket;
 use crate::vstate::memory::{FileOffset, GuestAddress, GuestMemoryExtension};
@@ -786,7 +790,7 @@ mod tests {
 let guest_memory = GuestMemoryMmap::from_raw_regions_file(regions, false, false).unwrap();
 // During activation of the device features, memory and queues should be set and activated.
- vhost_block.activate(guest_memory).unwrap();
+ vhost_block.activate(guest_memory, None).unwrap();
 assert!(unsafe { *vhost_block.vu_handle.vu.features_are_set.get() });
 assert!(unsafe { *vhost_block.vu_handle.vu.memory_is_set.get() });
 assert!(unsafe { *vhost_block.vu_handle.vu.vring_enabled.get() });
diff --git a/src/vmm/src/devices/virtio/block/virtio/device.rs b/src/vmm/src/devices/virtio/block/virtio/device.rs
index fd352fe2539..1106280dbea 100644
--- a/src/vmm/src/devices/virtio/block/virtio/device.rs
+++ b/src/vmm/src/devices/virtio/block/virtio/device.rs
@@ -25,14 +25,14 @@ use super::{
 };
 use crate::devices::virtio::block::virtio::metrics::{BlockDeviceMetrics, BlockMetricsPerDevice};
 use crate::devices::virtio::block::CacheType;
-use crate::devices::virtio::device::{DeviceState, IrqTrigger, IrqType, VirtioDevice};
+use crate::devices::virtio::device::{DeviceState, IrqTrigger, IrqType, VirtioDevice, VirtioInterrupt, VirtioInterruptType};
 use crate::devices::virtio::gen::virtio_blk::{
 VIRTIO_BLK_F_FLUSH, VIRTIO_BLK_F_RO, VIRTIO_BLK_ID_BYTES, VIRTIO_F_VERSION_1,
 };
 use crate::devices::virtio::gen::virtio_ring::VIRTIO_RING_F_EVENT_IDX;
 use crate::devices::virtio::queue::Queue;
 use crate::devices::virtio::{ActivateError, TYPE_BLOCK};
-use crate::logger::{error, warn, IncMetric};
+use crate::logger::{error, warn, debug, IncMetric};
 use crate::rate_limiter::{BucketUpdate, RateLimiter};
 use crate::utils::u64_to_usize;
 use crate::vmm_config::drive::BlockDeviceConfig;
@@ -253,7 +253,7 @@ pub struct VirtioBlock {
 pub queues: Vec<Queue>,
 pub queue_evts: [EventFd; 1],
 pub device_state: DeviceState,
- pub irq_trigger: IrqTrigger,
+ pub virtio_interrupt: Option<Arc<dyn VirtioInterrupt>>,
 // Implementation specific fields.
 pub id: String,
@@ -322,7 +322,7 @@ impl VirtioBlock {
 queues,
 queue_evts,
 device_state: DeviceState::Inactive,
- irq_trigger: IrqTrigger::new().map_err(VirtioBlockError::IrqTrigger)?,
+ virtio_interrupt: Some(Arc::new(IrqTrigger::new().map_err(VirtioBlockError::IrqTrigger)?)),
 id: config.drive_id.clone(),
 partuuid: config.partuuid,
@@ -385,10 +385,11 @@ impl VirtioBlock {
 }
 fn add_used_descriptor(
+ queue_index: usize,
 queue: &mut Queue,
 index: u16,
 len: u32,
- irq_trigger: &IrqTrigger,
+ interrupt: Arc<dyn VirtioInterrupt>,
 block_metrics: &BlockDeviceMetrics,
 ) {
 queue.add_used(index, len).unwrap_or_else(|err| {
 error!("Failed to add available descriptor head {}: {}", index, err)
 });
 if queue.prepare_kick() {
- irq_trigger.trigger_irq(IrqType::Vring).unwrap_or_else(|_| {
+ interrupt.trigger(VirtioInterruptType::Queue(queue_index as u16)).unwrap_or_else(|_| {
 block_metrics.event_fails.inc();
 });
 }
@@ -444,10 +445,11 @@ impl VirtioBlock {
 }
 ProcessingResult::Executed(finished) => {
 Self::add_used_descriptor(
+ queue_index,
 queue,
 head.index,
 finished.num_bytes_to_mem,
- &self.irq_trigger,
+ self.virtio_interrupt.as_ref().expect("interrupt must be set up").clone(),
 &self.metrics,
 );
 }
@@ -470,7 +472,8 @@ impl VirtioBlock {
 // This is safe since we checked in the event handler that the device is activated.
 let mem = self.device_state.mem().unwrap();
- let queue = &mut self.queues[0];
+ let queue_index = 0;
+ let queue = &mut self.queues[queue_index];
 loop {
 match engine.pop(mem) {
@@ -495,10 +498,11 @@ impl VirtioBlock {
 let finished = pending.finish(mem, res, &self.metrics);
 Self::add_used_descriptor(
+ queue_index,
 queue,
 finished.desc_idx,
 finished.num_bytes_to_mem,
- &self.irq_trigger,
+ self.virtio_interrupt.as_ref().expect("interrupt must be set up").clone(),
 &self.metrics,
 );
 }
@@ -527,7 +531,7 @@ impl VirtioBlock {
 self.config_space = self.disk.virtio_block_config_space();
 // Kick the driver to pick up the changes.
- self.irq_trigger.trigger_irq(IrqType::Config).unwrap();
+ self.virtio_interrupt.as_ref().expect("interrupt must be set up").trigger(VirtioInterruptType::Config).unwrap();
 self.metrics.update_count.inc();
 Ok(())
@@ -594,8 +598,8 @@ impl VirtioDevice for VirtioBlock {
 &self.queue_evts
 }
- fn interrupt_trigger(&self) -> &IrqTrigger {
- &self.irq_trigger
+ fn interrupt(&self) -> Arc<dyn VirtioInterrupt> {
+ self.virtio_interrupt.as_ref().expect("interrupt must be set up").clone()
 }
 fn read_config(&self, offset: u64, mut data: &mut [u8]) {
@@ -629,7 +633,9 @@ impl VirtioDevice for VirtioBlock {
 dst.copy_from_slice(data);
 }
- fn activate(&mut self, mem: GuestMemoryMmap) -> Result<(), ActivateError> {
+ fn activate(&mut self, mem: GuestMemoryMmap, virtio_interrupt: Option<Arc<dyn VirtioInterrupt>>) -> Result<(), ActivateError> {
+ self.virtio_interrupt = virtio_interrupt.or(self.virtio_interrupt.take());
+
 for q in self.queues.iter_mut() {
 q.initialize(&mem)
 .map_err(ActivateError::QueueMemoryError)?;
@@ -647,6 +653,7 @@ impl VirtioDevice for VirtioBlock {
 return Err(ActivateError::EventFd);
 }
 self.device_state = DeviceState::Activated(mem);
+ debug!("VirtioBlock activated");
 Ok(())
 }
@@ -866,7 +873,7 @@ mod tests {
 let mem = default_mem();
 let vq = VirtQueue::new(GuestAddress(0), &mem, 16);
 set_queue(&mut block, 0, vq.create_queue());
- block.activate(mem.clone()).unwrap();
+ block.activate(mem.clone(), None).unwrap();
 read_blk_req_descriptors(&vq);
 let request_type_addr = GuestAddress(vq.dtable[0].addr.get());
@@ -894,7 +901,7 @@ mod tests {
 let mem = default_mem();
 let vq = VirtQueue::new(GuestAddress(0), &mem, 16);
 set_queue(&mut block, 0, vq.create_queue());
- block.activate(mem.clone()).unwrap();
+ block.activate(mem.clone(), None).unwrap();
 read_blk_req_descriptors(&vq);
 let request_type_addr = GuestAddress(vq.dtable[0].addr.get());
@@ -957,7 +964,7 @@ mod tests {
 let mem = default_mem();
 let vq = VirtQueue::new(GuestAddress(0), &mem, 16);
 set_queue(&mut block, 0, vq.create_queue());
- block.activate(mem.clone()).unwrap();
+ block.activate(mem.clone(), None).unwrap();
 read_blk_req_descriptors(&vq);
 let request_type_addr = GuestAddress(vq.dtable[0].addr.get());
@@ -1008,7 +1015,7 @@ mod tests {
 let mem = default_mem();
 let vq = VirtQueue::new(GuestAddress(0), &mem, 16);
 set_queue(&mut block, 0, vq.create_queue());
- block.activate(mem.clone()).unwrap();
+ block.activate(mem.clone(), None).unwrap();
 read_blk_req_descriptors(&vq);
 let request_type_addr = GuestAddress(vq.dtable[0].addr.get());
@@ -1040,7 +1047,7 @@ mod tests {
 let mem = default_mem();
 let vq = VirtQueue::new(GuestAddress(0), &mem, 16);
 set_queue(&mut block, 0, vq.create_queue());
- block.activate(mem.clone()).unwrap();
+ block.activate(mem.clone(), None).unwrap();
 read_blk_req_descriptors(&vq);
 vq.dtable[1].set(0xf000, 0x1000, VIRTQ_DESC_F_NEXT | VIRTQ_DESC_F_WRITE, 2);
@@ -1076,7 +1083,7 @@ mod tests {
 let mem = default_mem();
 let vq =
VirtQueue::new(GuestAddress(0), &mem, 16); set_queue(&mut block, 0, vq.create_queue()); - block.activate(mem.clone()).unwrap(); + block.activate(mem.clone(), None).unwrap(); read_blk_req_descriptors(&vq); let request_type_addr = GuestAddress(vq.dtable[0].addr.get()); @@ -1123,7 +1130,7 @@ mod tests { let mem = default_mem(); let vq = VirtQueue::new(GuestAddress(0), &mem, 16); set_queue(&mut block, 0, vq.create_queue()); - block.activate(mem.clone()).unwrap(); + block.activate(mem.clone(), None).unwrap(); read_blk_req_descriptors(&vq); let request_type_addr = GuestAddress(vq.dtable[0].addr.get()); @@ -1362,7 +1369,7 @@ mod tests { let mem = default_mem(); let vq = VirtQueue::new(GuestAddress(0), &mem, 16); set_queue(&mut block, 0, vq.create_queue()); - block.activate(mem.clone()).unwrap(); + block.activate(mem.clone(), None).unwrap(); read_blk_req_descriptors(&vq); vq.dtable[1].set(0xff00, 0x1000, VIRTQ_DESC_F_NEXT | VIRTQ_DESC_F_WRITE, 2); @@ -1403,7 +1410,7 @@ mod tests { let mem = default_mem(); let vq = VirtQueue::new(GuestAddress(0), &mem, 16); set_queue(&mut block, 0, vq.create_queue()); - block.activate(mem.clone()).unwrap(); + block.activate(mem.clone(), None).unwrap(); read_blk_req_descriptors(&vq); let request_type_addr = GuestAddress(vq.dtable[0].addr.get()); @@ -1449,7 +1456,7 @@ mod tests { let mem = default_mem(); let vq = VirtQueue::new(GuestAddress(0), &mem, 16); set_queue(&mut block, 0, vq.create_queue()); - block.activate(mem.clone()).unwrap(); + block.activate(mem.clone(), None).unwrap(); read_blk_req_descriptors(&vq); let request_type_addr = GuestAddress(vq.dtable[0].addr.get()); @@ -1572,7 +1579,7 @@ mod tests { let mem = default_mem(); let vq = VirtQueue::new(GuestAddress(0), &mem, IO_URING_NUM_ENTRIES * 4); - block.activate(mem.clone()).unwrap(); + block.activate(mem.clone(), None).unwrap(); // Run scenario that doesn't trigger FullSq BlockError: Add sq_size flush requests. add_flush_requests_batch(&mut block, &vq, IO_URING_NUM_ENTRIES); @@ -1605,7 +1612,7 @@ mod tests { let mem = default_mem(); let vq = VirtQueue::new(GuestAddress(0), &mem, IO_URING_NUM_ENTRIES * 4); - block.activate(mem.clone()).unwrap(); + block.activate(mem.clone(), None).unwrap(); // Run scenario that triggers FullCqError. Push 2 * IO_URING_NUM_ENTRIES and wait for // completion. Then try to push another entry. @@ -1634,7 +1641,7 @@ mod tests { let mem = default_mem(); let vq = VirtQueue::new(GuestAddress(0), &mem, 16); - block.activate(mem.clone()).unwrap(); + block.activate(mem.clone(), None).unwrap(); // Add a batch of flush requests. 
 add_flush_requests_batch(&mut block, &vq, 5);
@@ -1653,7 +1660,7 @@ mod tests {
 let mem = default_mem();
 let vq = VirtQueue::new(GuestAddress(0), &mem, 16);
 set_queue(&mut block, 0, vq.create_queue());
- block.activate(mem.clone()).unwrap();
+ block.activate(mem.clone(), None).unwrap();
 read_blk_req_descriptors(&vq);
 let request_type_addr = GuestAddress(vq.dtable[0].addr.get());
@@ -1722,7 +1729,7 @@ mod tests {
 let mem = default_mem();
 let vq = VirtQueue::new(GuestAddress(0), &mem, 16);
 set_queue(&mut block, 0, vq.create_queue());
- block.activate(mem.clone()).unwrap();
+ block.activate(mem.clone(), None).unwrap();
 read_blk_req_descriptors(&vq);
 let request_type_addr = GuestAddress(vq.dtable[0].addr.get());
diff --git a/src/vmm/src/devices/virtio/block/virtio/event_handler.rs b/src/vmm/src/devices/virtio/block/virtio/event_handler.rs
index 8400766e06b..52dd35838c0 100644
--- a/src/vmm/src/devices/virtio/block/virtio/event_handler.rs
+++ b/src/vmm/src/devices/virtio/block/virtio/event_handler.rs
@@ -162,7 +162,7 @@ mod tests {
 assert_eq!(ev_count, 0);
 // Now activate the device.
- block.lock().unwrap().activate(mem.clone()).unwrap();
+ block.lock().unwrap().activate(mem.clone(), None).unwrap();
 // Process the activate event.
 let ev_count = event_manager.run_with_timeout(50).unwrap();
 assert_eq!(ev_count, 1);
diff --git a/src/vmm/src/devices/virtio/block/virtio/persist.rs b/src/vmm/src/devices/virtio/block/virtio/persist.rs
index 61bffbeaa40..5955b18a0e0 100644
--- a/src/vmm/src/devices/virtio/block/virtio/persist.rs
+++ b/src/vmm/src/devices/virtio/block/virtio/persist.rs
@@ -131,7 +131,7 @@ impl Persist<'_> for VirtioBlock {
 queues,
 queue_evts,
 device_state,
- irq_trigger,
+ virtio_interrupt: Some(Arc::new(irq_trigger)),
 id: state.id.clone(),
 partuuid: state.partuuid.clone(),
diff --git a/src/vmm/src/devices/virtio/block/virtio/test_utils.rs b/src/vmm/src/devices/virtio/block/virtio/test_utils.rs
index 106da8177cd..8d902fcbd2a 100644
--- a/src/vmm/src/devices/virtio/block/virtio/test_utils.rs
+++ b/src/vmm/src/devices/virtio/block/virtio/test_utils.rs
@@ -81,9 +81,9 @@ pub fn simulate_queue_event(b: &mut VirtioBlock, maybe_expected_irq: Option<bool>) {
 if let Some(has_irq) = maybe_expected_irq {
- assert_eq!(b.irq_trigger.has_pending_irq(IrqType::Vring), has_irq);
+ // assert_eq!(b.irq_trigger.has_pending_irq(IrqType::Vring), has_irq);
 }
 }
diff --git a/src/vmm/src/devices/virtio/device.rs b/src/vmm/src/devices/virtio/device.rs
--- a/src/vmm/src/devices/virtio/device.rs
+++ b/src/vmm/src/devices/virtio/device.rs
+pub enum VirtioInterruptType {
+    Config,
+    Queue(u16),
+}
+
+pub trait VirtioInterrupt: Send + Sync {
+    fn trigger(&self, int_type: VirtioInterruptType) -> std::result::Result<(), std::io::Error>;
+    fn notifier(&self, _int_type: VirtioInterruptType) -> Option<EventFd> {
+        None
+    }
+    // TODO hack to make it backwards compatible with IrqInterrupt
+    fn status(&self) -> Arc<AtomicU32> {
+        Arc::new(AtomicU32::new(0))
+    }
+}
+
+impl Debug for dyn VirtioInterrupt {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(f, "VirtioInterrupt")
+    }
+}
 /// Enum that indicates if a VirtioDevice is inactive or has been activated
 /// and memory attached to it.
@@ -84,6 +105,23 @@ impl IrqTrigger {
 }
 }
+impl VirtioInterrupt for IrqTrigger {
+    fn trigger(&self, int_type: VirtioInterruptType) -> std::result::Result<(), std::io::Error> {
+        match int_type {
+            VirtioInterruptType::Config => self.trigger_irq(IrqType::Config),
+            VirtioInterruptType::Queue(_) => self.trigger_irq(IrqType::Vring),
+        }
+    }
+
+    fn notifier(&self, _int_type: VirtioInterruptType) -> Option<EventFd> {
+        Some(self.irq_evt.try_clone().ok()?)
+    }
+
+    fn status(&self) -> Arc<AtomicU32> {
+        self.irq_status.clone()
+    }
+}
+
 /// Trait for virtio devices to be driven by a virtio transport.
 ///
 /// The lifecycle of a virtio device is to be moved to a virtio transport, which will then query the
@@ -121,10 +159,10 @@ pub trait VirtioDevice: AsAny + Send {
 /// Returns the current device interrupt status.
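 ///
 /// With the transport split this is just a proxy for
 /// `VirtioInterrupt::status()`: the `IrqTrigger` implementation returns its
 /// real status word, while the default trait implementation hands back a
 /// dummy counter (see the TODO above), so a PCI transport is expected to
 /// override `status()`.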
 fn interrupt_status(&self) -> Arc<AtomicU32> {
- Arc::clone(&self.interrupt_trigger().irq_status)
+ self.interrupt().status().clone()
 }
- fn interrupt_trigger(&self) -> &IrqTrigger;
+ fn interrupt(&self) -> Arc<dyn VirtioInterrupt>;
 /// The set of feature bits shifted by `page * 32`.
 fn avail_features_by_page(&self, page: u32) -> u32 {
@@ -170,14 +208,14 @@ pub trait VirtioDevice: AsAny + Send {
 fn write_config(&mut self, offset: u64, data: &[u8]);
 /// Performs the formal activation for a device, which can be verified also with `is_activated`.
- fn activate(&mut self, mem: GuestMemoryMmap) -> Result<(), ActivateError>;
+ fn activate(&mut self, mem: GuestMemoryMmap, virtio_interrupt: Option<Arc<dyn VirtioInterrupt>>) -> Result<(), ActivateError>;
 /// Checks if the resources of this device are activated.
 fn is_activated(&self) -> bool;
 /// Optionally deactivates this device and returns ownership of the guest memory map, interrupt
 /// event, and queue events.
- fn reset(&mut self) -> Option<(EventFd, Vec<EventFd>)> {
+ fn reset(&mut self) -> Option<(Arc<dyn VirtioInterrupt>, Vec<EventFd>)> {
 None
 }
@@ -275,7 +313,7 @@ pub(crate) mod tests {
 todo!()
 }
- fn interrupt_trigger(&self) -> &IrqTrigger {
+ fn interrupt(&self) -> Arc<dyn VirtioInterrupt> {
 todo!()
 }
@@ -287,7 +325,7 @@ pub(crate) mod tests {
 todo!()
 }
- fn activate(&mut self, _mem: GuestMemoryMmap) -> Result<(), ActivateError> {
+ fn activate(&mut self, _mem: GuestMemoryMmap, _virtio_interrupt: Option<Arc<dyn VirtioInterrupt>>) -> Result<(), ActivateError> {
 todo!()
 }
diff --git a/src/vmm/src/devices/virtio/mod.rs b/src/vmm/src/devices/virtio/mod.rs
index 9931e1211d1..1f5bed67a7f 100644
--- a/src/vmm/src/devices/virtio/mod.rs
+++ b/src/vmm/src/devices/virtio/mod.rs
@@ -18,12 +18,12 @@ pub mod device;
 pub mod gen;
 mod iov_deque;
 pub mod iovec;
-pub mod mmio;
 pub mod net;
 pub mod persist;
 pub mod queue;
 pub mod rng;
 pub mod test_utils;
+pub mod transport;
 pub mod vhost_user;
 pub mod vhost_user_metrics;
 pub mod vsock;
diff --git a/src/vmm/src/devices/virtio/net/device.rs b/src/vmm/src/devices/virtio/net/device.rs
index 6c69bd171a7..1a1f6fb11c5 100755
--- a/src/vmm/src/devices/virtio/net/device.rs
+++ b/src/vmm/src/devices/virtio/net/device.rs
@@ -15,7 +15,7 @@ use log::error;
 use vmm_sys_util::eventfd::EventFd;
 use super::NET_QUEUE_MAX_SIZE;
-use crate::devices::virtio::device::{DeviceState, IrqTrigger, IrqType, VirtioDevice};
+use crate::devices::virtio::device::{DeviceState, IrqTrigger, IrqType, VirtioDevice, VirtioInterrupt, VirtioInterruptType};
 use crate::devices::virtio::gen::virtio_blk::VIRTIO_F_VERSION_1;
 use crate::devices::virtio::gen::virtio_net::{
 virtio_net_hdr_v1, VIRTIO_NET_F_CSUM, VIRTIO_NET_F_GUEST_CSUM, VIRTIO_NET_F_GUEST_TSO4,
@@ -248,7 +248,7 @@ pub struct Net {
 tx_frame_headers: [u8; frame_hdr_len()],
- pub(crate) irq_trigger: IrqTrigger,
+ pub(crate) virtio_interrupt: Option<Arc<dyn VirtioInterrupt>>,
 pub(crate) config_space: ConfigSpace,
 pub(crate) guest_mac: Option<MacAddr>,
@@ -312,7 +312,7 @@ impl Net {
 tx_rate_limiter,
 rx_frame_buf: [0u8; MAX_BUFFER_SIZE],
 tx_frame_headers: [0u8; frame_hdr_len()],
- irq_trigger: IrqTrigger::new().map_err(NetError::EventFd)?,
+ virtio_interrupt: Some(Arc::new(IrqTrigger::new().map_err(NetError::EventFd)?)),
 config_space,
 guest_mac,
 device_state: DeviceState::Inactive,
@@ -391,14 +391,15 @@ impl Net {
 /// https://docs.oasis-open.org/virtio/virtio/v1.1/csprd01/virtio-v1.1-csprd01.html#x1-320005
 /// 2.6.7.1 Driver Requirements: Used Buffer Notification Suppression
 fn try_signal_queue(&mut self, queue_type: NetQueue) -> Result<(), DeviceError> {
- let queue = match queue_type {
- NetQueue::Rx => &mut self.queues[RX_INDEX],
- NetQueue::Tx => &mut
 self.queues[TX_INDEX],
+ let queue_index = match queue_type {
+ NetQueue::Rx => RX_INDEX,
+ NetQueue::Tx => TX_INDEX,
 };
+ let queue = &mut self.queues[queue_index];
 if queue.prepare_kick() {
- self.irq_trigger
- .trigger_irq(IrqType::Vring)
+ self.virtio_interrupt
+ .as_ref()
+ .expect("interrupt must be set up")
+ .trigger(VirtioInterruptType::Queue(queue_index as u16))
 .map_err(|err| {
 self.metrics.event_fails.inc();
 DeviceError::FailedSignalingIrq(err)
@@ -962,8 +963,8 @@ impl VirtioDevice for Net {
 &self.queue_evts
 }
- fn interrupt_trigger(&self) -> &IrqTrigger {
- &self.irq_trigger
+ fn interrupt(&self) -> Arc<dyn VirtioInterrupt> {
+ self.virtio_interrupt.as_ref().expect("interrupt must be set up").clone()
 }
 fn read_config(&self, offset: u64, data: &mut [u8]) {
 if let Some(config_space_bytes) = self.config_space.as_slice().get(u64_to_usize(offset)..) {
@@ -993,7 +994,9 @@ impl VirtioDevice for Net {
 self.metrics.mac_address_updates.inc();
 }
- fn activate(&mut self, mem: GuestMemoryMmap) -> Result<(), ActivateError> {
+ fn activate(&mut self, mem: GuestMemoryMmap, virtio_interrupt: Option<Arc<dyn VirtioInterrupt>>) -> Result<(), ActivateError> {
+ self.virtio_interrupt = virtio_interrupt.or(self.virtio_interrupt.take());
+
 for q in self.queues.iter_mut() {
 q.initialize(&mem)
 .map_err(ActivateError::QueueMemoryError)?;
@@ -1392,7 +1395,7 @@ pub mod tests {
 // Check that the used queue has advanced.
 assert_eq!(th.rxq.used.idx.get(), 4);
- assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring));
+ // assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring));
 // Check that the invalid descriptor chains have been discarded
 th.rxq.check_used_elem(0, 0, 0);
 th.rxq.check_used_elem(1, 3, 0);
@@ -1449,7 +1452,7 @@ pub mod tests {
 assert!(th.net().rx_buffer.used_descriptors == 0);
 // Check that the used queue has advanced.
 assert_eq!(th.rxq.used.idx.get(), 1);
- assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring));
+ // assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring));
 // Check that the frame has been written successfully to the Rx descriptor chain.
 header_set_num_buffers(frame.as_mut_slice(), 1);
 th.rxq
@@ -1512,7 +1515,7 @@ pub mod tests {
 assert!(th.net().rx_buffer.used_bytes == 0);
 // Check that the used queue has advanced.
 assert_eq!(th.rxq.used.idx.get(), 2);
- assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring));
+ // assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring));
 // Check that the 1st frame was written successfully to the 1st Rx descriptor chain.
 header_set_num_buffers(frame_1.as_mut_slice(), 1);
 th.rxq
@@ -1570,7 +1573,7 @@ pub mod tests {
 assert!(th.net().rx_buffer.used_bytes == 0);
 // Check that the used queue has advanced.
 assert_eq!(th.rxq.used.idx.get(), 2);
- assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring));
+ // assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring));
 // 2 chains should be used for the packet.
 header_set_num_buffers(frame.as_mut_slice(), 2);
@@ -1635,7 +1638,7 @@ pub mod tests {
 // Check that the used queue advanced.
 assert_eq!(th.txq.used.idx.get(), 1);
- assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring));
+ // assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring));
 th.txq.check_used_elem(0, 0, 0);
 // Check that the frame was skipped.
 assert!(!tap_traffic_simulator.pop_rx_packet(&mut []));
@@ -1658,7 +1661,7 @@ pub mod tests {
 // Check that the used queue advanced.
assert_eq!(th.txq.used.idx.get(), 1); - assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); + // assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); th.txq.check_used_elem(0, 0, 0); // Check that the frame was skipped. assert!(!tap_traffic_simulator.pop_rx_packet(&mut [])); @@ -1685,7 +1688,7 @@ pub mod tests { // Check that the used queue advanced. assert_eq!(th.txq.used.idx.get(), 1); - assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); + // assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); th.txq.check_used_elem(0, 0, 0); // Check that the frame was skipped. assert!(!tap_traffic_simulator.pop_rx_packet(&mut [])); @@ -1708,7 +1711,7 @@ pub mod tests { // Check that the used queue advanced. assert_eq!(th.txq.used.idx.get(), 1); - assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); + // assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); th.txq.check_used_elem(0, 0, 0); // Check that the frame was skipped. assert!(!tap_traffic_simulator.pop_rx_packet(&mut [])); @@ -1747,7 +1750,7 @@ pub mod tests { // Check that the used queue advanced. assert_eq!(th.txq.used.idx.get(), 4); - assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); + // assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); th.txq.check_used_elem(3, 4, 0); // Check that the valid frame was sent to the tap. let mut buf = vec![0; 1000]; @@ -1778,7 +1781,7 @@ pub mod tests { // Check that the used queue advanced. assert_eq!(th.txq.used.idx.get(), 1); - assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); + // assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); th.txq.check_used_elem(0, 3, 0); // Check that the frame was sent to the tap. let mut buf = vec![0; 1000]; @@ -1807,7 +1810,7 @@ pub mod tests { // Check that the used queue advanced. assert_eq!(th.txq.used.idx.get(), 1); - assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); + // assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); th.txq.check_used_elem(0, 0, 0); } @@ -1835,7 +1838,7 @@ pub mod tests { // Check that the used queue advanced. assert_eq!(th.txq.used.idx.get(), 2); - assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); + // assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); th.txq.check_used_elem(0, 0, 0); th.txq.check_used_elem(1, 3, 0); // Check that the first frame was sent to the tap. 
@@ -2182,7 +2185,7 @@ pub mod tests { assert_eq!(th.net().metrics.rx_rate_limiter_throttled.count(), 1); assert!(th.net().rx_buffer.used_descriptors != 0); // assert that no operation actually completed (limiter blocked it) - assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); + // assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); // make sure the data is still queued for processing assert_eq!(th.rxq.used.idx.get(), 0); } @@ -2210,7 +2213,7 @@ pub mod tests { // validate the rate_limiter is no longer blocked assert!(!th.net().rx_rate_limiter.is_blocked()); // make sure the virtio queue operation completed this time - assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); + // assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); // make sure the data queue advanced assert_eq!(th.rxq.used.idx.get(), 1); th.rxq @@ -2307,14 +2310,14 @@ pub mod tests { assert!(th.net().metrics.rx_rate_limiter_throttled.count() >= 1); assert!(th.net().rx_buffer.used_descriptors != 0); // assert that no operation actually completed (limiter blocked it) - assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); + // assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); // make sure the data is still queued for processing assert_eq!(th.rxq.used.idx.get(), 0); // trigger the RX handler again, this time it should do the limiter fast path exit th.simulate_event(NetEvent::Tap); // assert that no operation actually completed, that the limiter blocked it - assert!(!&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); + // assert!(!&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); // make sure the data is still queued for processing assert_eq!(th.rxq.used.idx.get(), 0); } @@ -2327,7 +2330,7 @@ pub mod tests { { th.simulate_event(NetEvent::RxRateLimiter); // make sure the virtio queue operation completed this time - assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); + // assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); // make sure the data queue advanced assert_eq!(th.rxq.used.idx.get(), 1); th.rxq @@ -2397,7 +2400,7 @@ pub mod tests { assert_eq!(net.queue_events().len(), NET_QUEUE_SIZES.len()); // Test interrupts. - assert!(!&net.irq_trigger.has_pending_irq(IrqType::Vring)); + // assert!(!&net.irq_trigger.has_pending_irq(IrqType::Vring)); } #[test] diff --git a/src/vmm/src/devices/virtio/net/persist.rs b/src/vmm/src/devices/virtio/net/persist.rs index fb62dcb0abe..fa43256d88b 100644 --- a/src/vmm/src/devices/virtio/net/persist.rs +++ b/src/vmm/src/devices/virtio/net/persist.rs @@ -148,7 +148,7 @@ impl Persist<'_> for Net { NET_NUM_QUEUES, NET_QUEUE_MAX_SIZE, )?; - net.irq_trigger.irq_status = Arc::new(AtomicU32::new(state.virtio_state.interrupt_status)); + // net.irq_trigger.irq_status = Arc::new(AtomicU32::new(state.virtio_state.interrupt_status)); net.avail_features = state.virtio_state.avail_features; net.acked_features = state.virtio_state.acked_features; diff --git a/src/vmm/src/devices/virtio/net/test_utils.rs b/src/vmm/src/devices/virtio/net/test_utils.rs index ffe7bbc7279..14108503914 100644 --- a/src/vmm/src/devices/virtio/net/test_utils.rs +++ b/src/vmm/src/devices/virtio/net/test_utils.rs @@ -372,7 +372,7 @@ pub mod test { } pub fn activate_net(&mut self) { - self.net.lock().unwrap().activate(self.mem.clone()).unwrap(); + self.net.lock().unwrap().activate(self.mem.clone(), None).unwrap(); // Process the activate event. 
 let ev_count = self.event_manager.run_with_timeout(100).unwrap();
 assert_eq!(ev_count, 1);
@@ -449,7 +449,7 @@ pub mod test {
 old_used_descriptors + 1
 );
- assert!(&self.net().irq_trigger.has_pending_irq(IrqType::Vring));
+ // assert!(&self.net().irq_trigger.has_pending_irq(IrqType::Vring));
 frame
 }
@@ -475,7 +475,7 @@ pub mod test {
 );
 // Check that the expected frame was sent to the Rx queue eventually.
 assert_eq!(self.rxq.used.idx.get(), used_idx + 1);
- assert!(&self.net().irq_trigger.has_pending_irq(IrqType::Vring));
+ // assert!(&self.net().irq_trigger.has_pending_irq(IrqType::Vring));
 self.rxq
 .check_used_elem(used_idx, 0, expected_frame.len().try_into().unwrap());
 self.rxq.dtable[0].check_data(expected_frame);
diff --git a/src/vmm/src/devices/virtio/persist.rs b/src/vmm/src/devices/virtio/persist.rs
index 23293a25eab..27fd73670a0 100644
--- a/src/vmm/src/devices/virtio/persist.rs
+++ b/src/vmm/src/devices/virtio/persist.rs
@@ -12,7 +12,7 @@ use serde::{Deserialize, Serialize};
 use super::queue::QueueError;
 use crate::devices::virtio::device::VirtioDevice;
 use crate::devices::virtio::gen::virtio_ring::VIRTIO_RING_F_EVENT_IDX;
-use crate::devices::virtio::mmio::MmioTransport;
+use crate::devices::virtio::transport::MmioTransport;
 use crate::devices::virtio::queue::Queue;
 use crate::snapshot::Persist;
 use crate::vstate::memory::{GuestAddress, GuestMemoryMmap};
@@ -261,7 +261,7 @@ mod tests {
 use crate::devices::virtio::block::virtio::device::FileEngineType;
 use crate::devices::virtio::block::virtio::test_utils::default_block_with_path;
 use crate::devices::virtio::block::virtio::VirtioBlock;
- use crate::devices::virtio::mmio::tests::DummyDevice;
+ use crate::devices::virtio::transport::mmio::tests::DummyDevice;
 use crate::devices::virtio::net::test_utils::default_net;
 use crate::devices::virtio::net::Net;
 use crate::devices::virtio::test_utils::default_mem;
diff --git a/src/vmm/src/devices/virtio/queue.rs b/src/vmm/src/devices/virtio/queue.rs
index 39ccef1f1ee..b1d37a5a07f 100644
--- a/src/vmm/src/devices/virtio/queue.rs
+++ b/src/vmm/src/devices/virtio/queue.rs
@@ -265,7 +265,7 @@ impl Queue {
 pub fn new(max_size: u16) -> Queue {
 Queue {
 max_size,
- size: 0,
+ size: max_size,
 ready: false,
 desc_table_address: GuestAddress(0),
 avail_ring_address: GuestAddress(0),
@@ -692,6 +692,18 @@ impl Queue {
 new - used_event - Wrapping(1) < new - old
 }
+
+ pub(crate) fn reset(&mut self) {
+ self.ready = false;
+ self.size = self.max_size;
+ self.desc_table_address = GuestAddress(0);
+ self.avail_ring_address = GuestAddress(0);
+ self.used_ring_address = GuestAddress(0);
+ self.next_avail = Wrapping(0);
+ self.next_used = Wrapping(0);
+ self.num_added = Wrapping(0);
+ self.uses_notif_suppression = false;
+ }
 }
 #[cfg(kani)]
diff --git a/src/vmm/src/devices/virtio/rng/device.rs b/src/vmm/src/devices/virtio/rng/device.rs
index 96513e49b26..08f80504b3f 100644
--- a/src/vmm/src/devices/virtio/rng/device.rs
+++ b/src/vmm/src/devices/virtio/rng/device.rs
@@ -6,12 +6,13 @@ use std::sync::atomic::AtomicU32;
 use std::sync::Arc;
 use aws_lc_rs::rand;
 use vm_memory::GuestMemoryError;
 use vmm_sys_util::eventfd::EventFd;
 use super::metrics::METRICS;
 use super::{RNG_NUM_QUEUES, RNG_QUEUE};
-use crate::devices::virtio::device::{DeviceState, IrqTrigger, IrqType, VirtioDevice};
+use crate::devices::virtio::device::{DeviceState, IrqTrigger, IrqType, VirtioDevice, VirtioInterrupt, VirtioInterruptType};
 use crate::devices::virtio::gen::virtio_rng::VIRTIO_F_VERSION_1;
 use crate::devices::virtio::iov_deque::IovDequeError;
crate::devices::virtio::iov_deque::IovDequeError; use crate::devices::virtio::iovec::IoVecBufferMut; @@ -47,7 +48,7 @@ pub struct Entropy { device_state: DeviceState, pub(crate) queues: Vec<Queue>, queue_events: Vec<EventFd>, - irq_trigger: IrqTrigger, + virtio_interrupt: Option<Arc<dyn VirtioInterrupt>>, // Device specific fields rate_limiter: RateLimiter, @@ -78,7 +79,7 @@ impl Entropy { device_state: DeviceState::Inactive, queues, queue_events, - irq_trigger, + virtio_interrupt: Some(Arc::new(irq_trigger)), rate_limiter, buffer: IoVecBufferMut::new()?, }) @@ -88,9 +89,9 @@ impl Entropy { ENTROPY_DEV_ID } - fn signal_used_queue(&self) -> Result<(), DeviceError> { - self.irq_trigger - .trigger_irq(IrqType::Vring) + fn signal_used_queue(&self, queue_index: usize) -> Result<(), DeviceError> { + self.interrupt() + .trigger(VirtioInterruptType::Queue(queue_index as u16)) .map_err(DeviceError::FailedSignalingIrq) } @@ -188,7 +189,7 @@ impl Entropy { } if used_any { - self.signal_used_queue().unwrap_or_else(|err| { + self.signal_used_queue(RNG_QUEUE).unwrap_or_else(|err| { error!("entropy: {err:?}"); METRICS.entropy_event_fails.inc() }); @@ -237,9 +238,9 @@ impl Entropy { self.acked_features = features; } - pub(crate) fn set_irq_status(&mut self, status: u32) { - self.irq_trigger.irq_status = Arc::new(AtomicU32::new(status)); - } + // pub(crate) fn set_irq_status(&mut self, status: u32) { + // self.irq_trigger.irq_status = Arc::new(AtomicU32::new(status)); + // } pub(crate) fn set_activated(&mut self, mem: GuestMemoryMmap) { self.device_state = DeviceState::Activated(mem); @@ -267,8 +268,8 @@ impl VirtioDevice for Entropy { &self.queue_events } - fn interrupt_trigger(&self) -> &IrqTrigger { - &self.irq_trigger + fn interrupt(&self) -> Arc<dyn VirtioInterrupt> { + self.virtio_interrupt.as_ref().expect("interrupt must be set up").clone() } fn avail_features(&self) -> u64 { @@ -291,7 +292,9 @@ impl VirtioDevice for Entropy { self.device_state.is_activated() } - fn activate(&mut self, mem: GuestMemoryMmap) -> Result<(), ActivateError> { + fn activate(&mut self, mem: GuestMemoryMmap, virtio_interrupt: Option<Arc<dyn VirtioInterrupt>>) -> Result<(), ActivateError> { + self.virtio_interrupt = virtio_interrupt.or(self.virtio_interrupt.take()); + for q in self.queues.iter_mut() { q.initialize(&mem) .map_err(ActivateError::QueueMemoryError)?; diff --git a/src/vmm/src/devices/virtio/rng/persist.rs b/src/vmm/src/devices/virtio/rng/persist.rs index 4aa9e449344..7f6f0b3e91d 100644 --- a/src/vmm/src/devices/virtio/rng/persist.rs +++ b/src/vmm/src/devices/virtio/rng/persist.rs @@ -66,7 +66,7 @@ impl Persist<'_> for Entropy { let mut entropy = Entropy::new_with_queues(queues, rate_limiter)?; entropy.set_avail_features(state.virtio_state.avail_features); entropy.set_acked_features(state.virtio_state.acked_features); - entropy.set_irq_status(state.virtio_state.interrupt_status); + // entropy.set_irq_status(state.virtio_state.interrupt_status); if state.virtio_state.activated { entropy.set_activated(constructor_args.0); } diff --git a/src/vmm/src/devices/virtio/test_utils.rs b/src/vmm/src/devices/virtio/test_utils.rs index 9bb66db82ae..b25acc56cbc 100644 --- a/src/vmm/src/devices/virtio/test_utils.rs +++ b/src/vmm/src/devices/virtio/test_utils.rs @@ -414,7 +414,7 @@ pub(crate) mod test { /// Activate the device pub fn activate_device(&mut self, mem: &'a GuestMemoryMmap) { - self.device.lock().unwrap().activate(mem.clone()).unwrap(); + self.device.lock().unwrap().activate(mem.clone(), None).unwrap(); // Process the activate event let ev_count = self.event_manager.run_with_timeout(100).unwrap();
assert_eq!(ev_count, 1); diff --git a/src/vmm/src/devices/virtio/mmio.rs b/src/vmm/src/devices/virtio/transport/mmio.rs similarity index 97% rename from src/vmm/src/devices/virtio/mmio.rs rename to src/vmm/src/devices/virtio/transport/mmio.rs index 463d11ca2e2..58c148a9cc1 100644 --- a/src/vmm/src/devices/virtio/mmio.rs +++ b/src/vmm/src/devices/virtio/transport/mmio.rs @@ -9,7 +9,7 @@ use std::fmt::Debug; use std::sync::atomic::{AtomicU32, Ordering}; use std::sync::{Arc, Mutex, MutexGuard}; -use crate::devices::virtio::device::{IrqType, VirtioDevice}; +use crate::devices::virtio::device::{IrqType, VirtioDevice, VirtioInterruptType}; use crate::devices::virtio::device_status; use crate::devices::virtio::queue::Queue; use crate::logger::{error, warn}; @@ -187,7 +187,7 @@ impl MmioTransport { let device_activated = self.locked_device().is_activated(); if !device_activated && self.are_queues_valid() { // temporary variable needed for borrow checker - let activate_result = self.locked_device().activate(self.mem.clone()); + let activate_result = self.locked_device().activate(self.mem.clone(), None); if let Err(err) = activate_result { self.device_status |= DEVICE_NEEDS_RESET; @@ -195,8 +195,8 @@ // configuration change interrupt let _ = self .locked_device() - .interrupt_trigger() - .trigger_irq(IrqType::Config); + .interrupt() + .trigger(VirtioInterruptType::Config); error!("Failed to activate virtio device: {}", err) } @@ -373,10 +373,10 @@ #[cfg(test)] pub(crate) mod tests { use vmm_sys_util::eventfd::EventFd; use super::*; - use crate::devices::virtio::device::IrqTrigger; + use crate::devices::virtio::device::{IrqTrigger, VirtioInterrupt, VirtioInterruptType}; use crate::devices::virtio::device_status::DEVICE_NEEDS_RESET; use crate::devices::virtio::ActivateError; use crate::test_utils::single_region_mem; @@ -388,7 +389,7 @@ pub(crate) struct DummyDevice { acked_features: u64, avail_features: u64, - interrupt_trigger: IrqTrigger, + virtio_interrupt: Arc<dyn VirtioInterrupt>, queue_evts: Vec<EventFd>, queues: Vec<Queue>, device_activated: bool, @@ -401,7 +402,7 @@ DummyDevice { acked_features: 0, avail_features: 0, - interrupt_trigger: IrqTrigger::new().unwrap(), + virtio_interrupt: Arc::new(IrqTrigger::new().unwrap()), queue_evts: vec![ EventFd::new(libc::EFD_NONBLOCK).unwrap(), EventFd::new(libc::EFD_NONBLOCK).unwrap(), @@ -447,8 +448,8 @@ &self.queue_evts } - fn interrupt_trigger(&self) -> &IrqTrigger { - &self.interrupt_trigger + fn interrupt(&self) -> Arc<dyn VirtioInterrupt> { + self.virtio_interrupt.clone() } fn read_config(&self, offset: u64, data: &mut [u8]) { @@ -461,7 +462,7 @@ } } - fn activate(&mut self, _: GuestMemoryMmap) -> Result<(), ActivateError> { + fn activate(&mut self, _: GuestMemoryMmap, _: Option<Arc<dyn VirtioInterrupt>>) -> Result<(), ActivateError> { self.device_activated = true; if self.activate_should_error { Err(ActivateError::EventFd) @@ -892,8 +893,9 @@ // We actually wrote to the eventfd assert_eq!( d.locked_device() - .interrupt_trigger() - .irq_evt + .interrupt() + .notifier(VirtioInterruptType::Config) + .unwrap() .read() .unwrap(), 1 diff --git a/src/vmm/src/devices/virtio/transport/mod.rs b/src/vmm/src/devices/virtio/transport/mod.rs new file mode 100644 index 00000000000..6d9df90bc08 --- /dev/null +++ b/src/vmm/src/devices/virtio/transport/mod.rs @@ -0,0 +1,10 @@ +// Copyright © 2019 Intel Corporation +// +//
SPDX-License-Identifier: Apache-2.0 + +mod pci_common_config; +mod pci_device; +pub(crate) mod mmio; +pub use mmio::MmioTransport; +pub use pci_common_config::{VirtioPciCommonConfig, VIRTIO_PCI_COMMON_CONFIG_ID}; +pub use pci_device::{VirtioPciDevice, VirtioPciDeviceError}; diff --git a/src/vmm/src/devices/virtio/transport/pci_common_config.rs b/src/vmm/src/devices/virtio/transport/pci_common_config.rs new file mode 100644 index 00000000000..c2a45a88ec1 --- /dev/null +++ b/src/vmm/src/devices/virtio/transport/pci_common_config.rs @@ -0,0 +1,407 @@ +// Copyright 2018 The Chromium OS Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE-BSD-3-Clause file. +// +// Copyright © 2019 Intel Corporation +// +// SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause + +use std::sync::atomic::{AtomicU16, Ordering}; +use std::sync::{Arc, Mutex}; + +use byteorder::{ByteOrder, LittleEndian}; +use serde::{Deserialize, Serialize}; +use vm_memory::GuestAddress; +use crate::devices::virtio::queue::Queue; + +use crate::devices::virtio::device::VirtioDevice; + +use crate::logger::{debug, error, warn}; +pub const VIRTIO_PCI_COMMON_CONFIG_ID: &str = "virtio_pci_common_config"; + +#[derive(Clone, Serialize, Deserialize)] +pub struct VirtioPciCommonConfigState { + pub driver_status: u8, + pub config_generation: u8, + pub device_feature_select: u32, + pub driver_feature_select: u32, + pub queue_select: u16, + pub msix_config: u16, + pub msix_queues: Vec<u16>, +} + +/* The standard layout for the ring is a continuous chunk of memory which looks + * like this. We assume num is a power of 2. + * + * struct vring + * { + * // The actual descriptors (16 bytes each) + * struct vring_desc desc[num]; + * + * // A ring of available descriptor heads with free-running index. + * __virtio16 avail_flags; + * __virtio16 avail_idx; + * __virtio16 available[num]; + * __virtio16 used_event_idx; + * + * // Padding to the next align boundary. + * char pad[]; + * + * // A ring of used descriptor heads with free-running index. + * __virtio16 used_flags; + * __virtio16 used_idx; + * struct vring_used_elem used[num]; + * __virtio16 avail_event_idx; + * }; + * struct vring_desc { + * __virtio64 addr; + * __virtio32 len; + * __virtio16 flags; + * __virtio16 next; + * }; + * + * struct vring_avail { + * __virtio16 flags; + * __virtio16 idx; + * __virtio16 ring[]; + * }; + * + * // u32 is used here for ids for padding reasons. + * struct vring_used_elem { + * // Index of start of used descriptor chain.
+ * __virtio32 id; + * // Total length of the descriptor chain which was used (written to) + * __virtio32 len; + * }; + * + * Kernel header used for this reference: include/uapi/linux/virtio_ring.h + * Virtio Spec: https://docs.oasis-open.org/virtio/virtio/v1.2/csd01/virtio-v1.2-csd01.html + * + */ +const VRING_DESC_ELEMENT_SIZE: usize = 16; +const VRING_AVAIL_ELEMENT_SIZE: usize = 2; +const VRING_USED_ELEMENT_SIZE: usize = 8; +pub enum VringType { + Desc, + Avail, + Used, +} + +pub fn get_vring_size(t: VringType, queue_size: u16) -> u64 { + let (length_except_ring, element_size) = match t { + VringType::Desc => (0, VRING_DESC_ELEMENT_SIZE), + VringType::Avail => (6, VRING_AVAIL_ELEMENT_SIZE), + VringType::Used => (6, VRING_USED_ELEMENT_SIZE), + }; + (length_except_ring + element_size * queue_size as usize) as u64 +} + +/// Contains the data for reading and writing the common configuration structure of a virtio PCI +/// device. +/// +/// * Registers: +/// +/// ** About the whole device. +/// le32 device_feature_select; // 0x00 // read-write +/// le32 device_feature; // 0x04 // read-only for driver +/// le32 driver_feature_select; // 0x08 // read-write +/// le32 driver_feature; // 0x0C // read-write +/// le16 msix_config; // 0x10 // read-write +/// le16 num_queues; // 0x12 // read-only for driver +/// u8 device_status; // 0x14 // read-write (driver_status) +/// u8 config_generation; // 0x15 // read-only for driver +/// +/// ** About a specific virtqueue. +/// le16 queue_select; // 0x16 // read-write +/// le16 queue_size; // 0x18 // read-write, power of 2, or 0. +/// le16 queue_msix_vector; // 0x1A // read-write +/// le16 queue_enable; // 0x1C // read-write (Ready) +/// le16 queue_notify_off; // 0x1E // read-only for driver +/// le64 queue_desc; // 0x20 // read-write +/// le64 queue_avail; // 0x28 // read-write +/// le64 queue_used; // 0x30 // read-write +pub struct VirtioPciCommonConfig { + pub driver_status: u8, + pub config_generation: u8, + pub device_feature_select: u32, + pub driver_feature_select: u32, + pub queue_select: u16, + pub msix_config: Arc<AtomicU16>, + pub msix_queues: Arc<Mutex<Vec<u16>>>, +} + +impl VirtioPciCommonConfig { + pub fn new( + state: VirtioPciCommonConfigState + ) -> Self { + VirtioPciCommonConfig { + driver_status: state.driver_status, + config_generation: state.config_generation, + device_feature_select: state.device_feature_select, + driver_feature_select: state.driver_feature_select, + queue_select: state.queue_select, + msix_config: Arc::new(AtomicU16::new(state.msix_config)), + msix_queues: Arc::new(Mutex::new(state.msix_queues)), + } + } + + fn state(&self) -> VirtioPciCommonConfigState { + VirtioPciCommonConfigState { + driver_status: self.driver_status, + config_generation: self.config_generation, + device_feature_select: self.device_feature_select, + driver_feature_select: self.driver_feature_select, + queue_select: self.queue_select, + msix_config: self.msix_config.load(Ordering::Acquire), + msix_queues: self.msix_queues.lock().unwrap().clone(), + } + } + + pub fn read( + &mut self, + offset: u64, + data: &mut [u8], + device: Arc<Mutex<dyn VirtioDevice>>, + ) { + assert!(data.len() <= 8); + + match data.len() { + 1 => { + let v = self.read_common_config_byte(offset); + data[0] = v; + } + 2 => { + let v = self.read_common_config_word(offset, device.lock().unwrap().queues()); + LittleEndian::write_u16(data, v); + } + 4 => { + let v = self.read_common_config_dword(offset, device); + LittleEndian::write_u32(data, v); + } + 8 => { + let v = self.read_common_config_qword(offset);
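+ // All 8-byte registers in this layout are write-only for the driver, so this read always yields 0 (see read_common_config_qword below).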
LittleEndian::write_u64(data, v); + } + _ => error!("invalid data length for virtio read: len {}", data.len()), + } + } + + pub fn write( + &mut self, + offset: u64, + data: &[u8], + device: Arc<Mutex<dyn VirtioDevice>>, + ) { + assert!(data.len() <= 8); + + match data.len() { + 1 => self.write_common_config_byte(offset, data[0]), + 2 => self.write_common_config_word(offset, LittleEndian::read_u16(data), device.lock().unwrap().queues_mut()), + 4 => { + self.write_common_config_dword(offset, LittleEndian::read_u32(data), device) + } + 8 => self.write_common_config_qword(offset, LittleEndian::read_u64(data), device.lock().unwrap().queues_mut()), + _ => error!("invalid data length for virtio write: len {}", data.len()), + } + } + + fn read_common_config_byte(&self, offset: u64) -> u8 { + debug!("read_common_config_byte: offset 0x{:x}", offset); + // The driver is only allowed to do aligned, properly sized access. + match offset { + 0x14 => self.driver_status, + 0x15 => self.config_generation, + _ => { + warn!("invalid virtio config byte read: 0x{:x}", offset); + 0 + } + } + } + + fn write_common_config_byte(&mut self, offset: u64, value: u8) { + debug!("write_common_config_byte: offset 0x{:x}", offset); + match offset { + 0x14 => self.driver_status = value, + _ => { + warn!("invalid virtio config byte write: 0x{:x}", offset); + } + } + } + + fn read_common_config_word(&self, offset: u64, queues: &[Queue]) -> u16 { + debug!("read_common_config_word: offset 0x{:x}", offset); + match offset { + 0x10 => self.msix_config.load(Ordering::Acquire), + 0x12 => queues.len() as u16, // num_queues + 0x16 => self.queue_select, + 0x18 => self.with_queue(queues, |q| q.size).unwrap_or(0), + 0x1a => self.msix_queues.lock().unwrap()[self.queue_select as usize], + 0x1c => u16::from(self.with_queue(queues, |q| q.ready).unwrap_or(false)), + 0x1e => self.queue_select, // notify_off + _ => { + warn!("invalid virtio register word read: 0x{:x}", offset); + 0 + } + } + } + + fn write_common_config_word(&mut self, offset: u64, value: u16, queues: &mut [Queue]) { + debug!("write_common_config_word: offset 0x{:x}", offset); + match offset { + 0x10 => self.msix_config.store(value, Ordering::Release), + 0x16 => self.queue_select = value, + 0x18 => self.with_queue_mut(queues, |q| q.size = value), + 0x1a => self.msix_queues.lock().unwrap()[self.queue_select as usize] = value, + 0x1c => self.with_queue_mut(queues, |q| { + q.ready = value == 1; + }), + _ => { + warn!("invalid virtio register word write: 0x{:x}", offset); + } + } + } + + fn read_common_config_dword(&self, offset: u64, device: Arc<Mutex<dyn VirtioDevice>>) -> u32 { + debug!("read_common_config_dword: offset 0x{:x}", offset); + match offset { + 0x00 => self.device_feature_select, + 0x04 => { + let locked_device = device.lock().unwrap(); + // Only 64 bits of features (2 pages) are defined for now, so limit + // device_feature_select to avoid shifting by 64 or more bits.
+ if self.device_feature_select < 2 { + (locked_device.avail_features() >> (self.device_feature_select * 32)) as u32 + } else { + 0 + } + } + 0x08 => self.driver_feature_select, + _ => { + warn!("invalid virtio register dword read: 0x{:x}", offset); + 0 + } + } + } + + fn write_common_config_dword( + &mut self, + offset: u64, + value: u32, + device: Arc<Mutex<dyn VirtioDevice>>, + ) { + debug!("write_common_config_dword: offset 0x{:x}", offset); + fn hi(v: &mut GuestAddress, x: u32) { + *v = (*v & 0xffff_ffff) | (u64::from(x) << 32) + } + + fn lo(v: &mut GuestAddress, x: u32) { + *v = (*v & !0xffff_ffff) | u64::from(x) + } + + let mut locked_device = device.lock().unwrap(); + + match offset { + 0x00 => self.device_feature_select = value, + 0x08 => self.driver_feature_select = value, + 0x0c => locked_device.ack_features_by_page(self.driver_feature_select, value), + 0x20 => self.with_queue_mut(locked_device.queues_mut(), |q| lo(&mut q.desc_table_address, value)), + 0x24 => self.with_queue_mut(locked_device.queues_mut(), |q| hi(&mut q.desc_table_address, value)), + 0x28 => self.with_queue_mut(locked_device.queues_mut(), |q| lo(&mut q.avail_ring_address, value)), + 0x2c => self.with_queue_mut(locked_device.queues_mut(), |q| hi(&mut q.avail_ring_address, value)), + 0x30 => self.with_queue_mut(locked_device.queues_mut(), |q| lo(&mut q.used_ring_address, value)), + 0x34 => self.with_queue_mut(locked_device.queues_mut(), |q| hi(&mut q.used_ring_address, value)), + _ => { + warn!("invalid virtio register dword write: 0x{:x}", offset); + } + } + } + + fn read_common_config_qword(&self, _offset: u64) -> u64 { + debug!("read_common_config_qword: offset 0x{:x}", _offset); + 0 // Assume the guest has no reason to read write-only registers. + } + + fn write_common_config_qword(&mut self, offset: u64, value: u64, queues: &mut [Queue]) { + debug!("write_common_config_qword: offset 0x{:x}", offset); + + match offset { + 0x20 => self.with_queue_mut(queues, |q| q.desc_table_address.0 = value), + 0x28 => self.with_queue_mut(queues, |q| q.avail_ring_address.0 = value), + 0x30 => self.with_queue_mut(queues, |q| q.used_ring_address.0 = value), + _ => { + warn!("invalid virtio register qword write: 0x{:x}", offset); + } + } + } + + fn with_queue<U, F>(&self, queues: &[Queue], f: F) -> Option<U> + where + F: FnOnce(&Queue) -> U, + { + queues.get(self.queue_select as usize).map(f) + } + + fn with_queue_mut<F: FnOnce(&mut Queue)>(&self, queues: &mut [Queue], f: F) { + if let Some(queue) = queues.get_mut(self.queue_select as usize) { + f(queue); + } + } +} + +#[cfg(test)] +mod tests { + use crate::devices::virtio::transport::mmio::tests::DummyDevice; + + use super::*; + + + #[test] + fn write_base_regs() { + let mut regs = VirtioPciCommonConfig { + driver_status: 0xaa, + config_generation: 0x55, + device_feature_select: 0x0, + driver_feature_select: 0x0, + queue_select: 0xff, + msix_config: Arc::new(AtomicU16::new(0)), + msix_queues: Arc::new(Mutex::new(vec![0; 3])), + }; + + let dev = Arc::new(Mutex::new(DummyDevice::new())); + // Can set all bits of driver_status. + regs.write(0x14, &[0x55], dev.clone()); + let mut read_back = vec![0x00]; + regs.read(0x14, &mut read_back, dev.clone()); + assert_eq!(read_back[0], 0x55); + + // The config generation register is read only.
+ regs.write(0x15, &[0xaa], dev.clone()); + let mut read_back = vec![0x00]; + regs.read(0x15, &mut read_back, dev.clone()); + assert_eq!(read_back[0], 0x55); + + // Device features is read-only and passed through from the device. + regs.write(0x04, &[0, 0, 0, 0], dev.clone()); + let mut read_back = vec![0, 0, 0, 0]; + regs.read(0x04, &mut read_back, dev.clone()); + assert_eq!(LittleEndian::read_u32(&read_back), 0u32); + + // Feature select registers are read/write. + regs.write(0x00, &[1, 2, 3, 4], dev.clone()); + let mut read_back = vec![0, 0, 0, 0]; + regs.read(0x00, &mut read_back, dev.clone()); + assert_eq!(LittleEndian::read_u32(&read_back), 0x0403_0201); + regs.write(0x08, &[1, 2, 3, 4], dev.clone()); + let mut read_back = vec![0, 0, 0, 0]; + regs.read(0x08, &mut read_back, dev.clone()); + assert_eq!(LittleEndian::read_u32(&read_back), 0x0403_0201); + + // 'queue_select' can be read and written. + regs.write(0x16, &[0xaa, 0x55], dev.clone()); + let mut read_back = vec![0x00, 0x00]; + regs.read(0x16, &mut read_back, dev); + assert_eq!(read_back[0], 0xaa); + assert_eq!(read_back[1], 0x55); + } +} diff --git a/src/vmm/src/devices/virtio/transport/pci_device.rs b/src/vmm/src/devices/virtio/transport/pci_device.rs new file mode 100644 index 00000000000..342d8a18321 --- /dev/null +++ b/src/vmm/src/devices/virtio/transport/pci_device.rs @@ -0,0 +1,1016 @@ +// Copyright 2018 The Chromium OS Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE-BSD-3-Clause file. +// +// Copyright © 2019 Intel Corporation +// +// SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause + +use std::any::Any; +use std::cmp; +use std::fmt::{Debug, Formatter}; +use std::io::Write; +use std::sync::atomic::{AtomicBool, AtomicU16, AtomicUsize, Ordering}; +use std::sync::{Arc, Barrier, Mutex}; + +use anyhow::anyhow; +use pci::{ + BarReprogrammingParams, MsixCap, MsixConfig, PciBarConfiguration, PciBarRegionType, + PciCapability, PciCapabilityId, PciClassCode, PciConfiguration, PciDevice, PciDeviceError, + PciHeaderType, PciMassStorageSubclass, PciNetworkControllerSubclass, PciSubclass, +}; +use serde::{Deserialize, Serialize}; +use thiserror::Error; +use crate::devices::virtio::device::{VirtioDevice, VirtioInterrupt, VirtioInterruptType}; +use crate::devices::virtio::queue::Queue; +use crate::vstate::memory::GuestMemoryMmap; +use vm_system_allocator::{AddressAllocator, SystemAllocator}; +use vm_device::dma_mapping::ExternalDmaMapping; +use vm_device::interrupt::{ + InterruptIndex, InterruptManager, InterruptSourceGroup, MsiIrqGroupConfig, +}; +use vm_device::{PciBarType, Resource}; +use vm_memory::{Address, ByteValued, GuestAddress, Le32}; +use vmm_sys_util::eventfd::EventFd; + +use super::pci_common_config::VirtioPciCommonConfigState; +use crate::devices::virtio::transport::VirtioPciCommonConfig; +use crate::logger::{debug, error}; + +const DEVICE_INIT: u32 = 0x00; +const DEVICE_ACKNOWLEDGE: u32 = 0x01; +const DEVICE_DRIVER: u32 = 0x02; +const DEVICE_DRIVER_OK: u32 = 0x04; +const DEVICE_FEATURES_OK: u32 = 0x08; +const DEVICE_FAILED: u32 = 0x80; + +const VIRTIO_F_RING_INDIRECT_DESC: u32 = 28; +const VIRTIO_F_RING_EVENT_IDX: u32 = 29; +const VIRTIO_F_VERSION_1: u32 = 32; +const VIRTIO_F_IOMMU_PLATFORM: u32 = 33; +const VIRTIO_F_IN_ORDER: u32 = 35; +const VIRTIO_F_ORDER_PLATFORM: u32 = 36; +#[allow(dead_code)] +const VIRTIO_F_SR_IOV: u32 = 37; +const VIRTIO_F_NOTIFICATION_DATA: u32 = 38; + +/// Vector value used to disable MSI for a queue. 
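+/// Per the virtio spec (where it is called VIRTIO_MSI_NO_VECTOR) this is 0xffff: the driver writes it to msix_config or queue_msix_vector to leave that source without an MSI-X vector, and VirtioInterruptMsix::trigger() below silently drops events routed to it.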
+const VIRTQ_MSI_NO_VECTOR: u16 = 0xffff; + +enum PciCapabilityType { + Common = 1, + Notify = 2, + Isr = 3, + Device = 4, + Pci = 5, + SharedMemory = 8, +} + +// This offset represents the 2 bytes omitted from the VirtioPciCap structure +// as they are already handled through add_capability(). These 2 bytes are the +// fields cap_vndr (1 byte) and cap_next (1 byte) defined in the virtio spec. +const VIRTIO_PCI_CAP_OFFSET: usize = 2; + +#[allow(dead_code)] +#[repr(packed)] +#[derive(Clone, Copy, Default)] +struct VirtioPciCap { + cap_len: u8, // Generic PCI field: capability length + cfg_type: u8, // Identifies the structure. + pci_bar: u8, // Where to find it. + id: u8, // Multiple capabilities of the same type + padding: [u8; 2], // Pad to full dword. + offset: Le32, // Offset within bar. + length: Le32, // Length of the structure, in bytes. +} +// SAFETY: All members are simple numbers and any value is valid. +unsafe impl ByteValued for VirtioPciCap {} + +impl PciCapability for VirtioPciCap { + fn bytes(&self) -> &[u8] { + self.as_slice() + } + + fn id(&self) -> PciCapabilityId { + PciCapabilityId::VendorSpecific + } +} + +const VIRTIO_PCI_CAP_LEN_OFFSET: u8 = 2; + +impl VirtioPciCap { + pub fn new(cfg_type: PciCapabilityType, pci_bar: u8, offset: u32, length: u32) -> Self { + VirtioPciCap { + cap_len: (std::mem::size_of::<VirtioPciCap>() as u8) + VIRTIO_PCI_CAP_LEN_OFFSET, + cfg_type: cfg_type as u8, + pci_bar, + id: 0, + padding: [0; 2], + offset: Le32::from(offset), + length: Le32::from(length), + } + } +} + +#[allow(dead_code)] +#[repr(packed)] +#[derive(Clone, Copy, Default)] +struct VirtioPciNotifyCap { + cap: VirtioPciCap, + notify_off_multiplier: Le32, +} +// SAFETY: All members are simple numbers and any value is valid. +unsafe impl ByteValued for VirtioPciNotifyCap {} + +impl PciCapability for VirtioPciNotifyCap { + fn bytes(&self) -> &[u8] { + self.as_slice() + } + + fn id(&self) -> PciCapabilityId { + PciCapabilityId::VendorSpecific + } +} + +impl VirtioPciNotifyCap { + pub fn new( + cfg_type: PciCapabilityType, + pci_bar: u8, + offset: u32, + length: u32, + multiplier: Le32, + ) -> Self { + VirtioPciNotifyCap { + cap: VirtioPciCap { + cap_len: (std::mem::size_of::<VirtioPciNotifyCap>() as u8) + + VIRTIO_PCI_CAP_LEN_OFFSET, + cfg_type: cfg_type as u8, + pci_bar, + id: 0, + padding: [0; 2], + offset: Le32::from(offset), + length: Le32::from(length), + }, + notify_off_multiplier: multiplier, + } + } +} + +#[allow(dead_code)] +#[repr(packed)] +#[derive(Clone, Copy, Default)] +struct VirtioPciCap64 { + cap: VirtioPciCap, + offset_hi: Le32, + length_hi: Le32, +} +// SAFETY: All members are simple numbers and any value is valid.
+unsafe impl ByteValued for VirtioPciCap64 {} + +impl PciCapability for VirtioPciCap64 { + fn bytes(&self) -> &[u8] { + self.as_slice() + } + + fn id(&self) -> PciCapabilityId { + PciCapabilityId::VendorSpecific + } +} + +impl VirtioPciCap64 { + pub fn new(cfg_type: PciCapabilityType, pci_bar: u8, id: u8, offset: u64, length: u64) -> Self { + VirtioPciCap64 { + cap: VirtioPciCap { + cap_len: (std::mem::size_of::<VirtioPciCap64>() as u8) + VIRTIO_PCI_CAP_LEN_OFFSET, + cfg_type: cfg_type as u8, + pci_bar, + id, + padding: [0; 2], + offset: Le32::from(offset as u32), + length: Le32::from(length as u32), + }, + offset_hi: Le32::from((offset >> 32) as u32), + length_hi: Le32::from((length >> 32) as u32), + } + } +} + +#[allow(dead_code)] +#[repr(packed)] +#[derive(Clone, Copy, Default)] +struct VirtioPciCfgCap { + cap: VirtioPciCap, + pci_cfg_data: [u8; 4], +} +// SAFETY: All members are simple numbers and any value is valid. +unsafe impl ByteValued for VirtioPciCfgCap {} + +impl PciCapability for VirtioPciCfgCap { + fn bytes(&self) -> &[u8] { + self.as_slice() + } + + fn id(&self) -> PciCapabilityId { + PciCapabilityId::VendorSpecific + } +} + +impl VirtioPciCfgCap { + fn new() -> Self { + VirtioPciCfgCap { + cap: VirtioPciCap::new(PciCapabilityType::Pci, 0, 0, 0), + ..Default::default() + } + } +} + +#[derive(Clone, Copy, Default)] +struct VirtioPciCfgCapInfo { + offset: usize, + cap: VirtioPciCfgCap, +} + +#[allow(dead_code)] +#[derive(Copy, Clone)] +pub enum PciVirtioSubclass { + NonTransitionalBase = 0xff, +} + +impl PciSubclass for PciVirtioSubclass { + fn get_register_value(&self) -> u8 { + *self as u8 + } +} + +// Allocate one bar for the structs pointed to by the capability structures. +// As per the PCI specification, because the same BAR shares MSI-X and non +// MSI-X structures, it is recommended to use 8KiB alignment for all those +// structures. +const COMMON_CONFIG_BAR_OFFSET: u64 = 0x0000; +const COMMON_CONFIG_SIZE: u64 = 56; +const ISR_CONFIG_BAR_OFFSET: u64 = 0x2000; +const ISR_CONFIG_SIZE: u64 = 1; +const DEVICE_CONFIG_BAR_OFFSET: u64 = 0x4000; +const DEVICE_CONFIG_SIZE: u64 = 0x1000; +const NOTIFICATION_BAR_OFFSET: u64 = 0x6000; +const NOTIFICATION_SIZE: u64 = 0x1000; +const MSIX_TABLE_BAR_OFFSET: u64 = 0x8000; +// The size is 256KiB because the table can hold up to 2048 entries, with each +// entry being 128 bits (4 DWORDS). +const MSIX_TABLE_SIZE: u64 = 0x40000; +const MSIX_PBA_BAR_OFFSET: u64 = 0x48000; +// The size is 2KiB because the Pending Bit Array has one bit per vector and it +// can support up to 2048 vectors. +const MSIX_PBA_SIZE: u64 = 0x800; +// The BAR size must be a power of 2. +const CAPABILITY_BAR_SIZE: u64 = 0x80000; +const VIRTIO_COMMON_BAR_INDEX: usize = 0; +const VIRTIO_SHM_BAR_INDEX: usize = 2; + +const NOTIFY_OFF_MULTIPLIER: u32 = 4; // A dword per notification address. + +const VIRTIO_PCI_VENDOR_ID: u16 = 0x1af4; +const VIRTIO_PCI_DEVICE_ID_BASE: u16 = 0x1040; // Add to device type to get device ID.
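+// For example (illustrative): a virtio-net device (device type 1) shows up as +// vendor 0x1af4, device VIRTIO_PCI_DEVICE_ID_BASE + 1 = 0x1041, which is the +// ID the guest's modern virtio-pci driver matches on: +// let device_id: u16 = VIRTIO_PCI_DEVICE_ID_BASE + 1; // virtio-net +// assert_eq!(device_id, 0x1041);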
+ +#[derive(Serialize, Deserialize)] +struct QueueState { + max_size: u16, + size: u16, + ready: bool, + desc_table: u64, + avail_ring: u64, + used_ring: u64, +} + +#[derive(Serialize, Deserialize)] +pub struct VirtioPciDeviceState { + device_activated: bool, + queues: Vec<QueueState>, + interrupt_status: usize, + cap_pci_cfg_offset: usize, + cap_pci_cfg: Vec<u8>, +} + +#[derive(Error, Debug)] +pub enum VirtioPciDeviceError { + #[error("Failed creating VirtioPciDevice: {0}")] + CreateVirtioPciDevice(#[source] anyhow::Error), +} +pub type Result<T> = std::result::Result<T, VirtioPciDeviceError>; + +pub struct VirtioPciDevice { + id: String, + + // PCI configuration registers. + configuration: PciConfiguration, + + // virtio PCI common configuration + common_config: VirtioPciCommonConfig, + + // MSI-X config + msix_config: Option<Arc<Mutex<MsixConfig>>>, + + // Number of MSI-X vectors + msix_num: u16, + + // Virtio device reference and status + device: Arc<Mutex<dyn VirtioDevice>>, + device_activated: Arc<AtomicBool>, + + // PCI interrupts. + interrupt_status: Arc<AtomicUsize>, + virtio_interrupt: Option<Arc<dyn VirtioInterrupt>>, + interrupt_source_group: Arc<dyn InterruptSourceGroup>, + + // Guest memory + memory: GuestMemoryMmap, + + // Settings PCI BAR + settings_bar: u8, + + // Whether to use 64-bit bar location or 32-bit + use_64bit_bar: bool, + + // Add a dedicated structure to hold information about the very specific + // virtio-pci capability VIRTIO_PCI_CAP_PCI_CFG. This is needed to support + // the legacy/backward compatible mechanism of letting the guest access the + // other virtio capabilities without mapping the PCI BARs. This can be + // needed when the guest tries to early access the virtio configuration of + // a device. + cap_pci_cfg_info: VirtioPciCfgCapInfo, + + // Details of bar regions to free + bar_regions: Vec<PciBarConfiguration>, + + // Optional DMA handler + dma_handler: Option<Arc<dyn ExternalDmaMapping>>, +} + +impl Debug for VirtioPciDevice { + fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { + f.debug_struct("VirtioPciDevice") + .field("id", &self.id) + .finish() + } +} + +impl VirtioPciDevice { + /// Constructs a new PCI transport for the given virtio device.
+ #[allow(clippy::too_many_arguments)] + pub fn new( + id: String, + memory: GuestMemoryMmap, + device: Arc<Mutex<dyn VirtioDevice>>, + msix_num: u16, + interrupt_manager: &Arc<dyn InterruptManager<GroupConfig = MsiIrqGroupConfig>>, + pci_device_bdf: u32, + use_64bit_bar: bool, + dma_handler: Option<Arc<dyn ExternalDmaMapping>>, + ) -> Result<VirtioPciDevice> { + let locked_device = device.lock().unwrap(); + + let num_queues = locked_device.queues().len(); + + let pci_device_id = VIRTIO_PCI_DEVICE_ID_BASE + locked_device.device_type() as u16; + + let interrupt_source_group = interrupt_manager + .create_group(MsiIrqGroupConfig { + base: 0, + count: msix_num as InterruptIndex, + }) + .map_err(|e| { + VirtioPciDeviceError::CreateVirtioPciDevice(anyhow!( + "Failed creating MSI interrupt group: {}", + e + )) + })?; + + let (msix_config, msix_config_clone) = if msix_num > 0 { + let msix_config = Arc::new(Mutex::new( + MsixConfig::new( + msix_num, + interrupt_source_group.clone(), + pci_device_bdf, + None, + ) + .unwrap(), + )); + let msix_config_clone = msix_config.clone(); + (Some(msix_config), Some(msix_config_clone)) + } else { + (None, None) + }; + + let (class, subclass) = match locked_device.device_type() { + TYPE_NET => ( + PciClassCode::NetworkController, + &PciNetworkControllerSubclass::EthernetController as &dyn PciSubclass, + ), + TYPE_BLOCK => ( + PciClassCode::MassStorage, + &PciMassStorageSubclass::MassStorage as &dyn PciSubclass, + ), + _ => ( + PciClassCode::Other, + &PciVirtioSubclass::NonTransitionalBase as &dyn PciSubclass, + ), + }; + + let configuration = PciConfiguration::new( + VIRTIO_PCI_VENDOR_ID, + pci_device_id, + 0x1, // For modern virtio-PCI devices + class, + subclass, + None, + PciHeaderType::Device, + VIRTIO_PCI_VENDOR_ID, + pci_device_id, + msix_config_clone, + None, + ); + + let common_config = VirtioPciCommonConfig::new( + VirtioPciCommonConfigState { + driver_status: 0, + config_generation: 0, + device_feature_select: 0, + driver_feature_select: 0, + queue_select: 0, + msix_config: VIRTQ_MSI_NO_VECTOR, + msix_queues: vec![VIRTQ_MSI_NO_VECTOR; num_queues], + }, + ); + let (device_activated, interrupt_status, cap_pci_cfg_info) = (false, 0, VirtioPciCfgCapInfo::default()); + + // Dropping the MutexGuard to unlock the VirtioDevice. This is required + // in the context of a restore given the device might require some + // activation, meaning it will require locking. Dropping the lock + // prevents from a subtle deadlock.
+ std::mem::drop(locked_device); + + let mut virtio_pci_device = VirtioPciDevice { + id, + configuration, + common_config, + msix_config, + msix_num, + device, + device_activated: Arc::new(AtomicBool::new(device_activated)), + interrupt_status: Arc::new(AtomicUsize::new(interrupt_status)), + virtio_interrupt: None, + memory, + settings_bar: 0, + use_64bit_bar, + interrupt_source_group, + cap_pci_cfg_info, + bar_regions: vec![], + dma_handler, + }; + + if let Some(msix_config) = &virtio_pci_device.msix_config { + virtio_pci_device.virtio_interrupt = Some(Arc::new(VirtioInterruptMsix::new( + msix_config.clone(), + virtio_pci_device.common_config.msix_config.clone(), + virtio_pci_device.common_config.msix_queues.clone(), + virtio_pci_device.interrupt_source_group.clone(), + ))); + } + + Ok(virtio_pci_device) + } + + fn is_driver_ready(&self) -> bool { + let ready_bits = + (DEVICE_ACKNOWLEDGE | DEVICE_DRIVER | DEVICE_DRIVER_OK | DEVICE_FEATURES_OK) as u8; + self.common_config.driver_status == ready_bits + && self.common_config.driver_status & DEVICE_FAILED as u8 == 0 + } + + /// Determines if the driver has requested the device (re)init / reset itself + fn is_driver_init(&self) -> bool { + self.common_config.driver_status == DEVICE_INIT as u8 + } + + pub fn config_bar_addr(&self) -> u64 { + self.configuration.get_bar_addr(self.settings_bar as usize) + } + + fn add_pci_capabilities( + &mut self, + settings_bar: u8, + ) -> std::result::Result<(), PciDeviceError> { + // Add pointers to the different configuration structures from the PCI capabilities. + let common_cap = VirtioPciCap::new( + PciCapabilityType::Common, + settings_bar, + COMMON_CONFIG_BAR_OFFSET as u32, + COMMON_CONFIG_SIZE as u32, + ); + self.configuration + .add_capability(&common_cap) + .map_err(PciDeviceError::CapabilitiesSetup)?; + + let isr_cap = VirtioPciCap::new( + PciCapabilityType::Isr, + settings_bar, + ISR_CONFIG_BAR_OFFSET as u32, + ISR_CONFIG_SIZE as u32, + ); + self.configuration + .add_capability(&isr_cap) + .map_err(PciDeviceError::CapabilitiesSetup)?; + + // TODO(dgreid) - set based on device's configuration size? + let device_cap = VirtioPciCap::new( + PciCapabilityType::Device, + settings_bar, + DEVICE_CONFIG_BAR_OFFSET as u32, + DEVICE_CONFIG_SIZE as u32, + ); + self.configuration + .add_capability(&device_cap) + .map_err(PciDeviceError::CapabilitiesSetup)?; + + let notify_cap = VirtioPciNotifyCap::new( + PciCapabilityType::Notify, + settings_bar, + NOTIFICATION_BAR_OFFSET as u32, + NOTIFICATION_SIZE as u32, + Le32::from(NOTIFY_OFF_MULTIPLIER), + ); + self.configuration + .add_capability(¬ify_cap) + .map_err(PciDeviceError::CapabilitiesSetup)?; + + let configuration_cap = VirtioPciCfgCap::new(); + self.cap_pci_cfg_info.offset = self + .configuration + .add_capability(&configuration_cap) + .map_err(PciDeviceError::CapabilitiesSetup)? 
+ + VIRTIO_PCI_CAP_OFFSET; + self.cap_pci_cfg_info.cap = configuration_cap; + + if self.msix_config.is_some() { + let msix_cap = MsixCap::new( + settings_bar, + self.msix_num, + MSIX_TABLE_BAR_OFFSET as u32, + settings_bar, + MSIX_PBA_BAR_OFFSET as u32, + ); + self.configuration + .add_capability(&msix_cap) + .map_err(PciDeviceError::CapabilitiesSetup)?; + } + + self.settings_bar = settings_bar; + Ok(()) + } + + fn read_cap_pci_cfg(&mut self, offset: usize, mut data: &mut [u8]) { + let cap_slice = self.cap_pci_cfg_info.cap.as_slice(); + let data_len = data.len(); + let cap_len = cap_slice.len(); + if offset + data_len > cap_len { + error!("Failed to read cap_pci_cfg from config space"); + return; + } + + if offset < std::mem::size_of::<VirtioPciCap>() { + if let Some(end) = offset.checked_add(data_len) { + // This write can't fail, offset and end are checked against config_len. + data.write_all(&cap_slice[offset..cmp::min(end, cap_len)]) + .unwrap(); + } + } else { + let bar_offset: u32 = + // SAFETY: we know self.cap_pci_cfg_info.cap.cap.offset is 32bits long. + unsafe { std::mem::transmute(self.cap_pci_cfg_info.cap.cap.offset) }; + self.read_bar(0, bar_offset as u64, data) + } + } + + fn write_cap_pci_cfg(&mut self, offset: usize, data: &[u8]) -> Option<Arc<Barrier>> { + let cap_slice = self.cap_pci_cfg_info.cap.as_mut_slice(); + let data_len = data.len(); + let cap_len = cap_slice.len(); + if offset + data_len > cap_len { + error!("Failed to write cap_pci_cfg to config space"); + return None; + } + + if offset < std::mem::size_of::<VirtioPciCap>() { + let (_, right) = cap_slice.split_at_mut(offset); + right[..data_len].copy_from_slice(data); + None + } else { + let bar_offset: u32 = + // SAFETY: we know self.cap_pci_cfg_info.cap.cap.offset is 32bits long. + unsafe { std::mem::transmute(self.cap_pci_cfg_info.cap.cap.offset) }; + self.write_bar(0, bar_offset as u64, data) + } + } + + pub fn virtio_device(&self) -> Arc<Mutex<dyn VirtioDevice>> { + self.device.clone() + } + + fn needs_activation(&self) -> bool { + !self.device_activated.load(Ordering::SeqCst) && self.is_driver_ready() + } + + pub fn dma_handler(&self) -> Option<&Arc<dyn ExternalDmaMapping>> { + self.dma_handler.as_ref() + } +} + + +pub struct VirtioInterruptMsix { + msix_config: Arc<Mutex<MsixConfig>>, + config_vector: Arc<AtomicU16>, + queues_vectors: Arc<Mutex<Vec<u16>>>, + interrupt_source_group: Arc<dyn InterruptSourceGroup>, +} + +impl VirtioInterruptMsix { + pub fn new( + msix_config: Arc<Mutex<MsixConfig>>, + config_vector: Arc<AtomicU16>, + queues_vectors: Arc<Mutex<Vec<u16>>>, + interrupt_source_group: Arc<dyn InterruptSourceGroup>, + ) -> Self { + VirtioInterruptMsix { + msix_config, + config_vector, + queues_vectors, + interrupt_source_group, + } + } +} + + +impl VirtioInterrupt for VirtioInterruptMsix { + fn trigger(&self, int_type: VirtioInterruptType) -> std::result::Result<(), std::io::Error> { + let vector = match int_type { + VirtioInterruptType::Config => self.config_vector.load(Ordering::Acquire), + VirtioInterruptType::Queue(queue_index) => { + self.queues_vectors.lock().unwrap()[queue_index as usize] + } + }; + + if vector == VIRTQ_MSI_NO_VECTOR { + return Ok(()); + } + + let config = &mut self.msix_config.lock().unwrap(); + let entry = &config.table_entries[vector as usize]; + // In case the vector control register associated with the entry + // has its first bit set, this means the vector is masked and the + // device should not inject the interrupt. + // Instead, the Pending Bit Array table is updated to reflect there + // is a pending interrupt for this specific vector.
+ if config.masked() || entry.masked() { + config.set_pba_bit(vector, false); + return Ok(()); + } + + self.interrupt_source_group + .trigger(vector as InterruptIndex) + } + + fn notifier(&self, int_type: VirtioInterruptType) -> Option<EventFd> { + let vector = match int_type { + VirtioInterruptType::Config => self.config_vector.load(Ordering::Acquire), + VirtioInterruptType::Queue(queue_index) => { + self.queues_vectors.lock().unwrap()[queue_index as usize] + } + }; + + self.interrupt_source_group + .notifier(vector as InterruptIndex) + } +} + +impl PciDevice for VirtioPciDevice { + fn write_config_register( + &mut self, + reg_idx: usize, + offset: u64, + data: &[u8], + ) -> Option<Arc<Barrier>> { + // Handle the special case where the capability VIRTIO_PCI_CAP_PCI_CFG + // is accessed. This capability has a special meaning as it allows the + // guest to access other capabilities without mapping the PCI BAR. + let base = reg_idx * 4; + if base + offset as usize >= self.cap_pci_cfg_info.offset + && base + offset as usize + data.len() + <= self.cap_pci_cfg_info.offset + self.cap_pci_cfg_info.cap.bytes().len() + { + let offset = base + offset as usize - self.cap_pci_cfg_info.offset; + self.write_cap_pci_cfg(offset, data) + } else { + self.configuration + .write_config_register(reg_idx, offset, data); + None + } + } + + fn read_config_register(&mut self, reg_idx: usize) -> u32 { + // Handle the special case where the capability VIRTIO_PCI_CAP_PCI_CFG + // is accessed. This capability has a special meaning as it allows the + // guest to access other capabilities without mapping the PCI BAR. + let base = reg_idx * 4; + if base >= self.cap_pci_cfg_info.offset + && base + 4 <= self.cap_pci_cfg_info.offset + self.cap_pci_cfg_info.cap.bytes().len() + { + let offset = base - self.cap_pci_cfg_info.offset; + let mut data = [0u8; 4]; + self.read_cap_pci_cfg(offset, &mut data); + u32::from_le_bytes(data) + } else { + self.configuration.read_reg(reg_idx) + } + } + + fn detect_bar_reprogramming( + &mut self, + reg_idx: usize, + data: &[u8], + ) -> Option<BarReprogrammingParams> { + self.configuration.detect_bar_reprogramming(reg_idx, data) + } + + fn allocate_bars( + &mut self, + _allocator: &Arc<Mutex<SystemAllocator>>, + mmio32_allocator: &mut AddressAllocator, + mmio64_allocator: &mut AddressAllocator, + resources: Option<Vec<Resource>>, + ) -> std::result::Result<Vec<PciBarConfiguration>, PciDeviceError> { + let mut bars = Vec::new(); + let device_clone = self.device.clone(); + let device = device_clone.lock().unwrap(); + + let mut settings_bar_addr = None; + let mut use_64bit_bar = self.use_64bit_bar; + let restoring = resources.is_some(); + if let Some(resources) = resources { + for resource in resources { + if let Resource::PciBar { + index, base, type_, .. + } = resource + { + if index == VIRTIO_COMMON_BAR_INDEX { + settings_bar_addr = Some(GuestAddress(base)); + use_64bit_bar = match type_ { + PciBarType::Io => { + return Err(PciDeviceError::InvalidResource(resource)) + } + PciBarType::Mmio32 => false, + PciBarType::Mmio64 => true, + }; + break; + } + } + } + // Error out if no resource was matching the BAR id. + if settings_bar_addr.is_none() { + return Err(PciDeviceError::MissingResource); + } + } + + // Allocate the virtio-pci capability BAR.
+ // See http://docs.oasis-open.org/virtio/virtio/v1.0/cs04/virtio-v1.0-cs04.html#x1-740004 + let (virtio_pci_bar_addr, region_type) = if use_64bit_bar { + let region_type = PciBarRegionType::Memory64BitRegion; + let addr = mmio64_allocator + .allocate( + settings_bar_addr, + CAPABILITY_BAR_SIZE, + Some(CAPABILITY_BAR_SIZE), + ) + .ok_or(PciDeviceError::IoAllocationFailed(CAPABILITY_BAR_SIZE))?; + (addr, region_type) + } else { + let region_type = PciBarRegionType::Memory32BitRegion; + let addr = mmio32_allocator + .allocate( + settings_bar_addr, + CAPABILITY_BAR_SIZE, + Some(CAPABILITY_BAR_SIZE), + ) + .ok_or(PciDeviceError::IoAllocationFailed(CAPABILITY_BAR_SIZE))?; + (addr, region_type) + }; + + let bar = PciBarConfiguration::default() + .set_index(VIRTIO_COMMON_BAR_INDEX) + .set_address(virtio_pci_bar_addr.raw_value()) + .set_size(CAPABILITY_BAR_SIZE) + .set_region_type(region_type); + + // The creation of the PCI BAR and its associated capabilities must + // happen only during the creation of a brand new VM. When a VM is + // restored from a known state, the BARs are already created with the + // right content, therefore we don't need to go through this codepath. + if !restoring { + self.configuration.add_pci_bar(&bar).map_err(|e| { + PciDeviceError::IoRegistrationFailed(virtio_pci_bar_addr.raw_value(), e) + })?; + + // Once the BARs are allocated, the capabilities can be added to the PCI configuration. + self.add_pci_capabilities(VIRTIO_COMMON_BAR_INDEX as u8)?; + } + + bars.push(bar); + + self.bar_regions.clone_from(&bars); + + Ok(bars) + } + + fn free_bars( + &mut self, + _allocator: &mut SystemAllocator, + mmio32_allocator: &mut AddressAllocator, + mmio64_allocator: &mut AddressAllocator, + ) -> std::result::Result<(), PciDeviceError> { + for bar in self.bar_regions.drain(..) { + match bar.region_type() { + PciBarRegionType::Memory32BitRegion => { + mmio32_allocator.free(GuestAddress(bar.addr()), bar.size()); + } + PciBarRegionType::Memory64BitRegion => { + mmio64_allocator.free(GuestAddress(bar.addr()), bar.size()); + } + _ => error!("Unexpected PCI bar type"), + } + } + Ok(()) + } + + fn move_bar( + &mut self, + old_base: u64, + new_base: u64, + ) -> std::result::Result<(), std::io::Error> { + // We only update our idea of the bar in order to support free_bars() above. + // The majority of the reallocation is done inside DeviceManager. + for bar in self.bar_regions.iter_mut() { + if bar.addr() == old_base { + *bar = bar.set_address(new_base); + } + } + + Ok(()) + } + + fn read_bar(&mut self, _base: u64, offset: u64, data: &mut [u8]) { + match offset { + o if o < COMMON_CONFIG_BAR_OFFSET + COMMON_CONFIG_SIZE => self.common_config.read( + o - COMMON_CONFIG_BAR_OFFSET, + data, + self.device.clone(), + ), + o if (ISR_CONFIG_BAR_OFFSET..ISR_CONFIG_BAR_OFFSET + ISR_CONFIG_SIZE).contains(&o) => { + if let Some(v) = data.get_mut(0) { + // Reading this register resets it to 0. + *v = self.interrupt_status.swap(0, Ordering::AcqRel) as u8; + } + } + o if (DEVICE_CONFIG_BAR_OFFSET..DEVICE_CONFIG_BAR_OFFSET + DEVICE_CONFIG_SIZE) + .contains(&o) => + { + let device = self.device.lock().unwrap(); + device.read_config(o - DEVICE_CONFIG_BAR_OFFSET, data); + } + o if (NOTIFICATION_BAR_OFFSET..NOTIFICATION_BAR_OFFSET + NOTIFICATION_SIZE) + .contains(&o) => + { + // Handled with ioeventfds. 
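+ // The guest only ever writes this region; KVM completes those writes through the registered ioeventfds, so a read has no data to return.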
+ } + o if (MSIX_TABLE_BAR_OFFSET..MSIX_TABLE_BAR_OFFSET + MSIX_TABLE_SIZE).contains(&o) => { + if let Some(msix_config) = &self.msix_config { + msix_config + .lock() + .unwrap() + .read_table(o - MSIX_TABLE_BAR_OFFSET, data); + } + } + o if (MSIX_PBA_BAR_OFFSET..MSIX_PBA_BAR_OFFSET + MSIX_PBA_SIZE).contains(&o) => { + if let Some(msix_config) = &self.msix_config { + msix_config + .lock() + .unwrap() + .read_pba(o - MSIX_PBA_BAR_OFFSET, data); + } + } + _ => (), + } + } + + fn write_bar(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> { + match offset { + o if o < COMMON_CONFIG_BAR_OFFSET + COMMON_CONFIG_SIZE => self.common_config.write( + o - COMMON_CONFIG_BAR_OFFSET, + data, + self.device.clone(), + ), + o if (ISR_CONFIG_BAR_OFFSET..ISR_CONFIG_BAR_OFFSET + ISR_CONFIG_SIZE).contains(&o) => { + if let Some(v) = data.first() { + self.interrupt_status + .fetch_and(!(*v as usize), Ordering::AcqRel); + } + } + o if (DEVICE_CONFIG_BAR_OFFSET..DEVICE_CONFIG_BAR_OFFSET + DEVICE_CONFIG_SIZE) + .contains(&o) => + { + let mut device = self.device.lock().unwrap(); + device.write_config(o - DEVICE_CONFIG_BAR_OFFSET, data); + } + o if (NOTIFICATION_BAR_OFFSET..NOTIFICATION_BAR_OFFSET + NOTIFICATION_SIZE) + .contains(&o) => + { + #[cfg(feature = "sev_snp")] + for (_event, _addr) in self.ioeventfds(_base) { + if _addr == _base + offset { + _event.write(1).unwrap(); + } + } + // Handled with ioeventfds. + #[cfg(not(feature = "sev_snp"))] + error!("Unexpected write to notification BAR: offset = 0x{:x}", o); + + } + o if (MSIX_TABLE_BAR_OFFSET..MSIX_TABLE_BAR_OFFSET + MSIX_TABLE_SIZE).contains(&o) => { + if let Some(msix_config) = &self.msix_config { + msix_config + .lock() + .unwrap() + .write_table(o - MSIX_TABLE_BAR_OFFSET, data); + } + } + o if (MSIX_PBA_BAR_OFFSET..MSIX_PBA_BAR_OFFSET + MSIX_PBA_SIZE).contains(&o) => { + if let Some(msix_config) = &self.msix_config { + msix_config + .lock() + .unwrap() + .write_pba(o - MSIX_PBA_BAR_OFFSET, data); + } + } + _ => (), + }; + + // Try and activate the device if the driver status has changed + if self.needs_activation() { + debug!("Activating device"); + self.virtio_device().lock().unwrap().activate(self.memory.clone(), self.virtio_interrupt.as_ref().map(Arc::clone)) + .unwrap_or_else(|err| error!("Error activating device: {err:?}")); + } else { + debug!("Device doesn't need activation"); + } + + // Device has been reset by the driver + if self.device_activated.load(Ordering::SeqCst) && self.is_driver_init() { + let mut device = self.device.lock().unwrap(); + let reset_result = device.reset(); + match reset_result { + Some((virtio_interrupt, mut _queue_evts)) => { + // Upon reset the device returns its interrupt EventFD + self.virtio_interrupt = Some(virtio_interrupt); + self.device_activated.store(false, Ordering::SeqCst); + + // Reset queue readiness (changes queue_enable), queue sizes + // and selected_queue as per spec for reset + self.virtio_device().lock().unwrap().queues_mut().iter_mut().for_each(Queue::reset); + self.common_config.queue_select = 0; + } + None => { + error!("Attempt to reset device when not implemented in underlying device"); + self.common_config.driver_status = DEVICE_FAILED as u8; + } + } + } + + None + } + + fn as_any(&mut self) -> &mut dyn Any { + self + } + + fn id(&self) -> Option<String> { + Some(self.id.clone()) + } +} + +impl VirtioPciDevice { + pub fn bus_read(&mut self, base: u64, offset: u64, data: &mut [u8]) { + self.read_bar(base, offset, data) + } + + pub fn bus_write(&mut self, base: u64, offset: u64, data: &[u8])
{ + self.write_bar(base, offset, data); + } +} \ No newline at end of file diff --git a/src/vmm/src/devices/virtio/vhost_user.rs b/src/vmm/src/devices/virtio/vhost_user.rs index ad86c9942af..49f1b7b525a 100644 --- a/src/vmm/src/devices/virtio/vhost_user.rs +++ b/src/vmm/src/devices/virtio/vhost_user.rs @@ -6,6 +6,7 @@ use std::os::fd::AsRawFd; use std::os::unix::net::UnixStream; +use std::sync::Arc; use vhost::vhost_user::message::*; use vhost::vhost_user::{Frontend, VhostUserFrontend}; @@ -17,6 +18,8 @@ use crate::devices::virtio::device::IrqTrigger; use crate::devices::virtio::queue::Queue; use crate::vstate::memory::GuestMemoryMmap; +use super::device::{VirtioInterrupt, VirtioInterruptType}; + /// vhost-user error. #[derive(Debug, thiserror::Error, displaydoc::Display)] pub enum VhostUserError { @@ -400,7 +403,7 @@ impl VhostUserHandleImpl { &mut self, mem: &GuestMemoryMmap, queues: &[(usize, &Queue, &EventFd)], - irq_trigger: &IrqTrigger, + interrupt: Arc<dyn VirtioInterrupt>, ) -> Result<(), VhostUserError> { // Provide the memory table to the backend. self.update_mem_table(mem)?; @@ -442,7 +445,7 @@ impl VhostUserHandleImpl { // No matter the queue, we set irq_evt for signaling the guest that buffers were // consumed. self.vu - .set_vring_call(*queue_index, &irq_trigger.irq_evt) + .set_vring_call(*queue_index, &interrupt.notifier(VirtioInterruptType::Queue(*queue_index as u16)).expect("vring irq should be initialized")) .map_err(VhostUserError::VhostUserSetVringCall)?; self.vu @@ -895,11 +898,11 @@ mod tests { queue.initialize(&guest_memory).unwrap(); let event_fd = EventFd::new(0).unwrap(); - let irq_trigger = IrqTrigger::new().unwrap(); + let interrupt = Arc::new(IrqTrigger::new().unwrap()); let queues = [(0, &queue, &event_fd)]; - vuh.setup_backend(&guest_memory, &queues, &irq_trigger) + vuh.setup_backend(&guest_memory, &queues, interrupt.clone()) .unwrap(); // VhostUserHandleImpl should correctly send memory and queues information to @@ -923,7 +926,7 @@ mod tests { log_addr: None, }, base: queue.avail_ring_idx_get(), - call: irq_trigger.irq_evt.as_raw_fd(), + call: interrupt.notifier(VirtioInterruptType::Queue(0)).expect("vring irq should be initialized").as_raw_fd(), kick: event_fd.as_raw_fd(), enable: true, }; diff --git a/src/vmm/src/devices/virtio/vsock/device.rs b/src/vmm/src/devices/virtio/vsock/device.rs index bf438aca99f..9d305559924 100644 --- a/src/vmm/src/devices/virtio/vsock/device.rs +++ b/src/vmm/src/devices/virtio/vsock/device.rs @@ -21,6 +21,8 @@ //! - a backend FD.
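+//! Used-buffer notifications are raised through the device's VirtioInterrupt handle: an IrqTrigger on the MMIO transport, or a per-queue MSI-X vector on the PCI transport.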
use std::fmt::Debug; +use std::sync::Arc; use log::{error, warn}; use vmm_sys_util::eventfd::EventFd; @@ -29,7 +30,7 @@ use super::super::super::DeviceError; use super::defs::uapi; use super::packet::{VsockPacketRx, VsockPacketTx, VSOCK_PKT_HDR_SIZE}; use super::{defs, VsockBackend}; -use crate::devices::virtio::device::{DeviceState, IrqTrigger, IrqType, VirtioDevice}; +use crate::devices::virtio::device::{DeviceState, IrqTrigger, IrqType, VirtioDevice, VirtioInterrupt, VirtioInterruptType}; use crate::devices::virtio::queue::Queue as VirtQueue; use crate::devices::virtio::vsock::metrics::METRICS; use crate::devices::virtio::vsock::VsockError; @@ -60,7 +61,7 @@ pub struct Vsock { pub(crate) backend: B, pub(crate) avail_features: u64, pub(crate) acked_features: u64, - pub(crate) irq_trigger: IrqTrigger, + pub(crate) virtio_interrupt: Option<Arc<dyn VirtioInterrupt>>, // This EventFd is the only one initially registered for a vsock device, and is used to convert // a VirtioDevice::activate call into an EventHandler read event which allows the other events // (queue and backend related) to be registered post virtio device activation. That's @@ -101,7 +102,7 @@ where backend, avail_features: AVAIL_FEATURES, acked_features: 0, - irq_trigger: IrqTrigger::new().map_err(VsockError::EventFd)?, + virtio_interrupt: Some(Arc::new(IrqTrigger::new().map_err(VsockError::EventFd)?)), activate_evt: EventFd::new(libc::EFD_NONBLOCK).map_err(VsockError::EventFd)?, device_state: DeviceState::Inactive, rx_packet: VsockPacketRx::new()?, @@ -135,9 +136,9 @@ where /// Signal the guest driver that we've used some virtio buffers that it had previously made /// available. - pub fn signal_used_queue(&self) -> Result<(), DeviceError> { - self.irq_trigger - .trigger_irq(IrqType::Vring) + pub fn signal_used_queue(&self, queue_index: usize) -> Result<(), DeviceError> { + self.virtio_interrupt.as_ref().expect("interrupt should be setup") + .trigger(VirtioInterruptType::Queue(queue_index as u16)) .map_err(DeviceError::FailedSignalingIrq) } @@ -257,7 +258,7 @@ where error!("Failed to add used descriptor {}: {}", head.index, err); }); - self.signal_used_queue()?; + self.signal_used_queue(EVQ_INDEX)?; Ok(()) } @@ -295,8 +296,8 @@ where &self.queue_events } - fn interrupt_trigger(&self) -> &IrqTrigger { - &self.irq_trigger + fn interrupt(&self) -> Arc<dyn VirtioInterrupt> { + self.virtio_interrupt.as_ref().expect("interrupt must be set up").clone() } fn read_config(&self, offset: u64, data: &mut [u8]) { @@ -328,7 +329,9 @@ where ); } - fn activate(&mut self, mem: GuestMemoryMmap) -> Result<(), ActivateError> { + fn activate(&mut self, mem: GuestMemoryMmap, virtio_interrupt: Option<Arc<dyn VirtioInterrupt>>) -> Result<(), ActivateError> { + self.virtio_interrupt = virtio_interrupt.or(self.virtio_interrupt.take()); + for q in self.queues.iter_mut() { q.initialize(&mem) .map_err(ActivateError::QueueMemoryError)?; @@ -430,6 +433,6 @@ mod tests { // } // Test a correct activation.
- ctx.device.activate(ctx.mem.clone()).unwrap(); + ctx.device.activate(ctx.mem.clone(), None).unwrap(); } } diff --git a/src/vmm/src/devices/virtio/vsock/event_handler.rs b/src/vmm/src/devices/virtio/vsock/event_handler.rs index 40e75d1a9f5..e91ac707d3d 100755 --- a/src/vmm/src/devices/virtio/vsock/event_handler.rs +++ b/src/vmm/src/devices/virtio/vsock/event_handler.rs @@ -191,8 +191,16 @@ where Self::PROCESS_NOTIFY_BACKEND => raise_irq = self.notify_backend(evset), _ => warn!("Unexpected vsock event received: {:?}", source), } + let queue_index = match source { + Self::PROCESS_RXQ => RXQ_INDEX, + Self::PROCESS_TXQ => TXQ_INDEX, + Self::PROCESS_EVQ => EVQ_INDEX, + // TODO: a backend notification could complete either tx or rx processing. + Self::PROCESS_NOTIFY_BACKEND => TXQ_INDEX, + _ => RXQ_INDEX, + }; if raise_irq { - self.signal_used_queue().unwrap_or_default(); + self.signal_used_queue(queue_index).unwrap_or_default(); } } else { warn!( @@ -236,7 +245,7 @@ mod tests { { let test_ctx = TestContext::new(); let mut ctx = test_ctx.create_event_handler_context(); - ctx.mock_activate(test_ctx.mem.clone()); + ctx.mock_activate(test_ctx.mem.clone(), None); ctx.device.backend.set_pending_rx(false); ctx.signal_txq_event(); @@ -253,7 +262,7 @@ { let test_ctx = TestContext::new(); let mut ctx = test_ctx.create_event_handler_context(); - ctx.mock_activate(test_ctx.mem.clone()); + ctx.mock_activate(test_ctx.mem.clone(), None); ctx.device.backend.set_pending_rx(true); ctx.signal_txq_event(); @@ -269,7 +278,7 @@ { let test_ctx = TestContext::new(); let mut ctx = test_ctx.create_event_handler_context(); - ctx.mock_activate(test_ctx.mem.clone()); + ctx.mock_activate(test_ctx.mem.clone(), None); ctx.device.backend.set_pending_rx(false); ctx.device.backend.set_tx_err(Some(VsockError::NoData)); @@ -285,7 +294,7 @@ { let test_ctx = TestContext::new(); let mut ctx = test_ctx.create_event_handler_context(); - ctx.mock_activate(test_ctx.mem.clone()); + ctx.mock_activate(test_ctx.mem.clone(), None); // Invalidate the descriptor chain, by setting its length to 0. ctx.guest_txvq.dtable[0].len.set(0); @@ -302,7 +311,7 @@ { let test_ctx = TestContext::new(); let mut ctx = test_ctx.create_event_handler_context(); - ctx.mock_activate(test_ctx.mem.clone()); + ctx.mock_activate(test_ctx.mem.clone(), None); assert!(!ctx.device.handle_txq_event(EventSet::IN)); } @@ -317,7 +326,7 @@ { let test_ctx = TestContext::new(); let mut ctx = test_ctx.create_event_handler_context(); - ctx.mock_activate(test_ctx.mem.clone()); + ctx.mock_activate(test_ctx.mem.clone(), None); ctx.device.backend.set_pending_rx(true); ctx.device.backend.set_rx_err(Some(VsockError::NoData)); @@ -334,7 +343,7 @@ { let test_ctx = TestContext::new(); let mut ctx = test_ctx.create_event_handler_context(); - ctx.mock_activate(test_ctx.mem.clone()); + ctx.mock_activate(test_ctx.mem.clone(), None); ctx.device.backend.set_pending_rx(true); ctx.signal_rxq_event(); @@ -347,7 +356,7 @@ { let test_ctx = TestContext::new(); let mut ctx = test_ctx.create_event_handler_context(); - ctx.mock_activate(test_ctx.mem.clone()); + ctx.mock_activate(test_ctx.mem.clone(), None); // Invalidate the descriptor chain, by setting its length to 0.
ctx.guest_rxvq.dtable[0].len.set(0); @@ -363,7 +372,7 @@ mod tests { { let test_ctx = TestContext::new(); let mut ctx = test_ctx.create_event_handler_context(); - ctx.mock_activate(test_ctx.mem.clone()); + ctx.mock_activate(test_ctx.mem.clone(), None); ctx.device.backend.set_pending_rx(false); assert!(!ctx.device.handle_rxq_event(EventSet::IN)); } @@ -388,7 +397,7 @@ mod tests { { let test_ctx = TestContext::new(); let mut ctx = test_ctx.create_event_handler_context(); - ctx.mock_activate(test_ctx.mem.clone()); + ctx.mock_activate(test_ctx.mem.clone(), None); ctx.device.backend.set_pending_rx(true); ctx.device.notify_backend(EventSet::IN); @@ -407,7 +416,7 @@ mod tests { { let test_ctx = TestContext::new(); let mut ctx = test_ctx.create_event_handler_context(); - ctx.mock_activate(test_ctx.mem.clone()); + ctx.mock_activate(test_ctx.mem.clone(), None); ctx.device.backend.set_pending_rx(false); ctx.device.notify_backend(EventSet::IN); @@ -568,7 +577,7 @@ mod tests { vsock .lock() .unwrap() - .activate(test_ctx.mem.clone()) + .activate(test_ctx.mem.clone(), None) .unwrap(); // Process the activate event. let ev_count = event_manager.run_with_timeout(50).unwrap(); diff --git a/src/vmm/src/devices/virtio/vsock/persist.rs b/src/vmm/src/devices/virtio/vsock/persist.rs index dce545fd68d..2b8a3109df7 100644 --- a/src/vmm/src/devices/virtio/vsock/persist.rs +++ b/src/vmm/src/devices/virtio/vsock/persist.rs @@ -121,8 +121,8 @@ where vsock.acked_features = state.virtio_state.acked_features; vsock.avail_features = state.virtio_state.avail_features; - vsock.irq_trigger.irq_status = - Arc::new(AtomicU32::new(state.virtio_state.interrupt_status)); + // vsock.irq_trigger.irq_status = + // Arc::new(AtomicU32::new(state.virtio_state.interrupt_status)); vsock.device_state = if state.virtio_state.activated { DeviceState::Activated(constructor_args.mem) } else { diff --git a/src/vmm/src/devices/virtio/vsock/test_utils.rs b/src/vmm/src/devices/virtio/vsock/test_utils.rs index 4a5fdb2c941..40f8285275d 100644 --- a/src/vmm/src/devices/virtio/vsock/test_utils.rs +++ b/src/vmm/src/devices/virtio/vsock/test_utils.rs @@ -5,12 +5,13 @@ #![doc(hidden)] use std::os::unix::io::{AsRawFd, RawFd}; +use std::sync::Arc; use vmm_sys_util::epoll::EventSet; use vmm_sys_util::eventfd::EventFd; use super::packet::{VsockPacketRx, VsockPacketTx}; -use crate::devices::virtio::device::VirtioDevice; +use crate::devices::virtio::device::{VirtioDevice, VirtioInterrupt}; use crate::devices::virtio::queue::{VIRTQ_DESC_F_NEXT, VIRTQ_DESC_F_WRITE}; use crate::devices::virtio::test_utils::VirtQueue as GuestQ; use crate::devices::virtio::vsock::device::{RXQ_INDEX, TXQ_INDEX}; @@ -191,9 +192,9 @@ pub struct EventHandlerContext<'a> { } impl<'a> EventHandlerContext<'a> { - pub fn mock_activate(&mut self, mem: GuestMemoryMmap) { + pub fn mock_activate(&mut self, mem: GuestMemoryMmap, interrupt: Option>) { // Artificially activate the device. 
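The `VirtioInterrupt` trait and `VirtioInterruptType` enum that the vsock changes rely on are introduced in `device.rs`, which is not part of this excerpt. A minimal sketch of the shape implied by the call sites above (`trigger(VirtioInterruptType::Queue(..))`, `Arc<dyn VirtioInterrupt>`), offered as an assumption rather than the PR's actual definition:

```rust
use std::io;

use vmm_sys_util::eventfd::EventFd;

/// Which interrupt a virtio device wants delivered to the guest.
#[derive(Clone, Copy, Debug)]
pub enum VirtioInterruptType {
    /// Configuration-change notification.
    Config,
    /// Used-ring notification for the queue with this index.
    Queue(u16),
}

/// Transport-agnostic interrupt delivery: MMIO can back this with a single
/// legacy line, while PCI can back it with one MSI-X vector per queue.
pub trait VirtioInterrupt: Send + Sync {
    /// Inject the interrupt described by `int_type` into the guest.
    fn trigger(&self, int_type: VirtioInterruptType) -> Result<(), io::Error>;

    /// Eventfd that KVM can signal directly for this interrupt (irqfd fast
    /// path), if the transport exposes one.
    fn notifier(&self, _int_type: VirtioInterruptType) -> Option<EventFd> {
        None
    }
}
```

This is why `signal_used_queue` now takes a queue index: a legacy trigger ignores it, but an MSI-X backed implementation uses it to pick the vector.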
diff --git a/src/vmm/src/devices/virtio/vsock/test_utils.rs b/src/vmm/src/devices/virtio/vsock/test_utils.rs
index 4a5fdb2c941..40f8285275d 100644
--- a/src/vmm/src/devices/virtio/vsock/test_utils.rs
+++ b/src/vmm/src/devices/virtio/vsock/test_utils.rs
@@ -5,12 +5,13 @@
 #![doc(hidden)]
 
 use std::os::unix::io::{AsRawFd, RawFd};
+use std::sync::Arc;
 
 use vmm_sys_util::epoll::EventSet;
 use vmm_sys_util::eventfd::EventFd;
 
 use super::packet::{VsockPacketRx, VsockPacketTx};
-use crate::devices::virtio::device::VirtioDevice;
+use crate::devices::virtio::device::{VirtioDevice, VirtioInterrupt};
 use crate::devices::virtio::queue::{VIRTQ_DESC_F_NEXT, VIRTQ_DESC_F_WRITE};
 use crate::devices::virtio::test_utils::VirtQueue as GuestQ;
 use crate::devices::virtio::vsock::device::{RXQ_INDEX, TXQ_INDEX};
@@ -191,9 +192,9 @@ pub struct EventHandlerContext<'a> {
 }
 
 impl<'a> EventHandlerContext<'a> {
-    pub fn mock_activate(&mut self, mem: GuestMemoryMmap) {
+    pub fn mock_activate(&mut self, mem: GuestMemoryMmap, interrupt: Option<Arc<dyn VirtioInterrupt>>) {
         // Artificially activate the device.
-        self.device.activate(mem).unwrap();
+        self.device.activate(mem, interrupt).unwrap();
     }
 
     pub fn signal_txq_event(&mut self) {
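For `Some(Arc::new(IrqTrigger::new()?))` in the vsock constructor to type-check as an `Option<Arc<dyn VirtioInterrupt>>`, the existing `IrqTrigger` must implement the new trait somewhere in this PR. A plausible compatibility impl, assuming the trait shape sketched earlier (the real one would live in `device.rs`, not shown):

```rust
// Hypothetical bridge from the legacy MMIO interrupt to the new trait.
// `IrqTrigger`, `IrqType::Vring` and `IrqType::Config` are existing
// Firecracker items; the impl itself is an assumption, not part of the diff.
impl VirtioInterrupt for IrqTrigger {
    fn trigger(&self, int_type: VirtioInterruptType) -> Result<(), std::io::Error> {
        match int_type {
            // Legacy MMIO folds all queue notifications onto one line, so the
            // queue index is irrelevant here; it only matters for MSI-X.
            VirtioInterruptType::Queue(_) => self.trigger_irq(IrqType::Vring),
            VirtioInterruptType::Config => self.trigger_irq(IrqType::Config),
        }
    }
}
```

This keeps every non-PCI device working unchanged while letting the PCI transport substitute an MSI-X backed implementation at `activate` time.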
diff --git a/src/vmm/src/interrupt.rs b/src/vmm/src/interrupt.rs
new file mode 100644
index 00000000000..e8b1e80516e
--- /dev/null
+++ b/src/vmm/src/interrupt.rs
@@ -0,0 +1,425 @@
+// Copyright © 2019 Intel Corporation
+//
+// SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause
+//
+
+use std::collections::HashMap;
+use std::fmt::{Debug, Formatter};
+use std::io;
+use std::sync::atomic::{AtomicBool, Ordering};
+use std::sync::{Arc, Mutex};
+
+use kvm_bindings::{
+    kvm_irq_routing, kvm_irq_routing_entry, KVM_IRQCHIP_IOAPIC, KVM_IRQ_ROUTING_IRQCHIP,
+    KVM_IRQ_ROUTING_MSI, KVM_MSI_VALID_DEVID,
+};
+use kvm_ioctls::VmFd;
+use vm_device::interrupt::{
+    InterruptIndex, InterruptManager, InterruptSourceConfig, InterruptSourceGroup,
+    MsiIrqGroupConfig,
+};
+use vm_system_allocator::SystemAllocator;
+use vmm_sys_util::eventfd::EventFd;
+
+use super::*;
+
+/// Reuse std::io::Result to simplify interoperability among crates.
+pub type Result<T> = std::io::Result<T>;
+
+struct InterruptRoute {
+    gsi: u32,
+    irq_fd: EventFd,
+    registered: AtomicBool,
+}
+
+impl InterruptRoute {
+    pub fn new(allocator: &mut SystemAllocator) -> Result<Self> {
+        let irq_fd = EventFd::new(libc::EFD_NONBLOCK)?;
+        let gsi = allocator
+            .allocate_gsi()
+            .ok_or_else(|| io::Error::new(io::ErrorKind::Other, "Failed allocating new GSI"))?;
+
+        Ok(InterruptRoute {
+            gsi,
+            irq_fd,
+            registered: AtomicBool::new(false),
+        })
+    }
+
+    pub fn enable(&self, vm: &VmFd) -> Result<()> {
+        if !self.registered.load(Ordering::Acquire) {
+            vm.register_irqfd(&self.irq_fd, self.gsi).map_err(|e| {
+                io::Error::new(
+                    io::ErrorKind::Other,
+                    format!("Failed registering irq_fd: {e}"),
+                )
+            })?;
+
+            // Update internals to track the irq_fd as "registered".
+            self.registered.store(true, Ordering::Release);
+        }
+
+        Ok(())
+    }
+
+    pub fn disable(&self, vm: &VmFd) -> Result<()> {
+        if self.registered.load(Ordering::Acquire) {
+            vm.unregister_irqfd(&self.irq_fd, self.gsi).map_err(|e| {
+                io::Error::new(
+                    io::ErrorKind::Other,
+                    format!("Failed unregistering irq_fd: {e}"),
+                )
+            })?;
+
+            // Update internals to track the irq_fd as "unregistered".
+            self.registered.store(false, Ordering::Release);
+        }
+
+        Ok(())
+    }
+
+    pub fn trigger(&self) -> Result<()> {
+        self.irq_fd.write(1)
+    }
+
+    pub fn notifier(&self) -> Option<EventFd> {
+        Some(
+            self.irq_fd
+                .try_clone()
+                .expect("Failed cloning interrupt's EventFd"),
+        )
+    }
+}
+
+pub struct RoutingEntry {
+    route: IrqRoutingEntry,
+    masked: bool,
+}
+
+pub struct MsiInterruptGroup {
+    vm: Arc<Mutex<VmFd>>,
+    gsi_msi_routes: Arc<Mutex<HashMap<u32, RoutingEntry>>>,
+    irq_routes: HashMap<InterruptIndex, InterruptRoute>,
+}
+
+impl MsiInterruptGroup {
+    fn new(
+        vm: Arc<Mutex<VmFd>>,
+        gsi_msi_routes: Arc<Mutex<HashMap<u32, RoutingEntry>>>,
+        irq_routes: HashMap<InterruptIndex, InterruptRoute>,
+    ) -> Self {
+        MsiInterruptGroup {
+            vm,
+            gsi_msi_routes,
+            irq_routes,
+        }
+    }
+
+    fn set_gsi_routes(&self, routes: &HashMap<u32, RoutingEntry>) -> Result<()> {
+        let mut entry_vec: Vec<kvm_irq_routing_entry> = Vec::new();
+
+        // Keep the identity routing for the 24 IOAPIC pins so that legacy
+        // interrupts keep working alongside the MSI routes.
+        for i in 0..24 {
+            let mut kvm_route = kvm_irq_routing_entry {
+                gsi: i,
+                type_: KVM_IRQ_ROUTING_IRQCHIP,
+                ..Default::default()
+            };
+
+            kvm_route.u.irqchip.irqchip = KVM_IRQCHIP_IOAPIC;
+            kvm_route.u.irqchip.pin = i;
+
+            entry_vec.push(kvm_route);
+        }
+
+        for (_, entry) in routes.iter() {
+            if entry.masked {
+                continue;
+            }
+            entry_vec.push(entry.route);
+        }
+
+        let mut irq_routing =
+            vec_with_array_field::<kvm_irq_routing, kvm_irq_routing_entry>(entry_vec.len());
+        irq_routing[0].nr = entry_vec.len() as u32;
+        irq_routing[0].flags = 0;
+
+        // SAFETY: irq_routing[0] was allocated with enough trailing space for
+        // `entry_vec.len()` entries, contiguous with the header.
+        unsafe {
+            let entries_slice: &mut [kvm_irq_routing_entry] =
+                irq_routing[0].entries.as_mut_slice(entry_vec.len());
+            entries_slice.copy_from_slice(&entry_vec);
+        }
+
+        self.vm
+            .lock()
+            .expect("Poisoned VmFd lock")
+            .set_gsi_routing(&irq_routing[0])
+            .map_err(|e| {
+                io::Error::new(
+                    io::ErrorKind::Other,
+                    format!("Failed setting GSI routing: {e}"),
+                )
+            })
+    }
+}
+
+impl InterruptSourceGroup for MsiInterruptGroup {
+    fn enable(&self) -> Result<()> {
+        for (_, route) in self.irq_routes.iter() {
+            route.enable(&self.vm.lock().expect("Poisoned lock"))?;
+        }
+
+        Ok(())
+    }
+
+    fn disable(&self) -> Result<()> {
+        for (_, route) in self.irq_routes.iter() {
+            route.disable(&self.vm.lock().expect("Poisoned lock"))?;
+        }
+
+        Ok(())
+    }
+
+    fn trigger(&self, index: InterruptIndex) -> Result<()> {
+        if let Some(route) = self.irq_routes.get(&index) {
+            return route.trigger();
+        }
+
+        Err(io::Error::new(
+            io::ErrorKind::Other,
+            format!("trigger: Invalid interrupt index {index}"),
+        ))
+    }
+
+    fn notifier(&self, index: InterruptIndex) -> Option<EventFd> {
+        if let Some(route) = self.irq_routes.get(&index) {
+            return route.notifier();
+        }
+
+        None
+    }
+
+    fn update(
+        &self,
+        index: InterruptIndex,
+        config: InterruptSourceConfig,
+        masked: bool,
+        set_gsi: bool,
+    ) -> Result<()> {
+        if let Some(route) = self.irq_routes.get(&index) {
+            let entry = RoutingEntry::make_entry(route.gsi, &config)?;
+
+            // When masking an MSI IRQ, entry.masked is set to true and the GSI
+            // is not passed to KVM through KVM_SET_GSI_ROUTING. It is therefore
+            // required to call disable() (which deassigns KVM_IRQFD) before
+            // set_gsi_routes() to avoid a kernel panic (see #3827).
+            if masked {
+                route.disable(&self.vm.lock().unwrap())?;
+            }
+
+            let mut routes = self.gsi_msi_routes.lock().unwrap();
+            routes.insert(route.gsi, *entry);
+            if set_gsi {
+                self.set_gsi_routes(&routes)?;
+            }
+
+            // Assign KVM_IRQFD after KVM_SET_GSI_ROUTING to avoid a panic on
+            // kernels that do not have commit a80ced6ea514
+            // (KVM: SVM: fix panic on out-of-bounds guest IRQ).
+            if !masked {
+                route.enable(&self.vm.lock().unwrap())?;
+            }
+
+            return Ok(());
+        }
+
+        Err(io::Error::new(
+            io::ErrorKind::Other,
+            format!("update: Invalid interrupt index {index}"),
+        ))
+    }
+
+    fn set_gsi(&self) -> Result<()> {
+        let routes = self.gsi_msi_routes.lock().unwrap();
+        self.set_gsi_routes(&routes)
+    }
+}
+
+pub struct MsiInterruptManager {
+    allocator: Arc<Mutex<SystemAllocator>>,
+    vm: Arc<Mutex<VmFd>>,
+    gsi_msi_routes: Arc<Mutex<HashMap<u32, RoutingEntry>>>,
+}
+
+impl Debug for MsiInterruptManager {
+    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+        // TODO: expose more internal state once it stabilizes.
+        f.debug_struct("MsiInterruptManager").finish()
+    }
+}
+
+impl MsiInterruptManager {
+    pub fn new(allocator: Arc<Mutex<SystemAllocator>>, vm: Arc<Mutex<VmFd>>) -> Self {
+        // Create a list of GSIs shared by all PCI devices. This way, we can
+        // maintain the full list of used GSIs, preventing one device from
+        // overriding interrupt settings of another one.
+        let gsi_msi_routes = Arc::new(Mutex::new(HashMap::new()));
+
+        MsiInterruptManager {
+            allocator,
+            vm,
+            gsi_msi_routes,
+        }
+    }
+}
+
+impl InterruptManager for MsiInterruptManager {
+    type GroupConfig = MsiIrqGroupConfig;
+
+    fn create_group(&self, config: Self::GroupConfig) -> Result<Arc<dyn InterruptSourceGroup>> {
+        let mut allocator = self.allocator.lock().unwrap();
+        let mut irq_routes: HashMap<InterruptIndex, InterruptRoute> =
+            HashMap::with_capacity(config.count as usize);
+        for i in config.base..config.base + config.count {
+            irq_routes.insert(i, InterruptRoute::new(&mut allocator)?);
+        }
+
+        Ok(Arc::new(MsiInterruptGroup::new(
+            self.vm.clone(),
+            self.gsi_msi_routes.clone(),
+            irq_routes,
+        )))
+    }
+
+    fn destroy_group(&self, _group: Arc<dyn InterruptSourceGroup>) -> Result<()> {
+        Ok(())
+    }
+}
+
+type KvmRoutingEntry = RoutingEntry;
+pub type KvmMsiInterruptManager = MsiInterruptManager;
+
+impl KvmRoutingEntry {
+    pub fn make_entry(gsi: u32, config: &InterruptSourceConfig) -> Result<Box<Self>> {
+        if let InterruptSourceConfig::MsiIrq(cfg) = &config {
+            let mut kvm_route = kvm_irq_routing_entry {
+                gsi,
+                type_: KVM_IRQ_ROUTING_MSI,
+                ..Default::default()
+            };
+
+            kvm_route.u.msi.address_lo = cfg.low_addr;
+            kvm_route.u.msi.address_hi = cfg.high_addr;
+            kvm_route.u.msi.data = cfg.data;
+
+            kvm_route.flags = KVM_MSI_VALID_DEVID;
+            kvm_route.u.msi.__bindgen_anon_1.devid = cfg.devid;
+
+            let kvm_entry = KvmRoutingEntry {
+                route: kvm_route,
+                masked: false,
+            };
+
+            return Ok(Box::new(kvm_entry));
+        } else if let InterruptSourceConfig::LegacyIrq(cfg) = &config {
+            let mut kvm_route = kvm_irq_routing_entry {
+                gsi,
+                type_: KVM_IRQ_ROUTING_IRQCHIP,
+                ..Default::default()
+            };
+            kvm_route.u.irqchip.irqchip = cfg.irqchip;
+            kvm_route.u.irqchip.pin = cfg.pin;
+            let kvm_entry = KvmRoutingEntry {
+                route: kvm_route,
+                masked: false,
+            };
+
+            return Ok(Box::new(kvm_entry));
+        }
+
+        Err(io::Error::new(
+            io::ErrorKind::Other,
+            "Interrupt config type not supported",
+        ))
+    }
+}
+
+#[cfg(target_arch = "aarch64")]
+#[cfg(test)]
+mod tests {
+    use arch::aarch64::gic::kvm::{create_gic, save_pending_tables};
+    use arch::aarch64::gic::{
+        get_dist_regs, get_icc_regs, get_redist_regs, set_dist_regs, set_icc_regs, set_redist_regs,
+    };
+
+    #[test]
+    fn test_create_gic() {
+        let hv = hypervisor::new().unwrap();
+        let vm = hv.create_vm().unwrap();
+
+        assert!(create_gic(&vm, 1).is_ok());
+    }
+
+    #[test]
+    fn test_get_set_dist_regs() {
+        let hv = hypervisor::new().unwrap();
+        let vm = hv.create_vm().unwrap();
+        let _ = vm.create_vcpu(0, None).unwrap();
+        let gic = create_gic(&vm, 1).expect("Cannot create gic");
+
+        let res = get_dist_regs(gic.device());
+        assert!(res.is_ok());
+        let state = res.unwrap();
+        assert_eq!(state.len(), 649);
+
+        let res = set_dist_regs(gic.device(), &state);
+        assert!(res.is_ok());
+    }
+
+    #[test]
+    fn test_get_set_redist_regs() {
+        let hv = hypervisor::new().unwrap();
+        let vm = hv.create_vm().unwrap();
+        let _ = vm.create_vcpu(0, None).unwrap();
+        let gic = create_gic(&vm, 1).expect("Cannot create gic");
+
+        let gicr_typer = vec![123];
+        let res = get_redist_regs(gic.device(), &gicr_typer);
+        assert!(res.is_ok());
+        let state = res.unwrap();
+        println!("{}", state.len());
+        assert_eq!(state.len(), 24);
+
+        assert!(set_redist_regs(gic.device(), &gicr_typer, &state).is_ok());
+    }
+
+    #[test]
+    fn test_get_set_icc_regs() {
+        let hv = hypervisor::new().unwrap();
+        let vm = hv.create_vm().unwrap();
+        let _ = vm.create_vcpu(0, None).unwrap();
+        let gic = create_gic(&vm, 1).expect("Cannot create gic");
+
+        let gicr_typer = vec![123];
+        let res = get_icc_regs(gic.device(), &gicr_typer);
+        assert!(res.is_ok());
+        let state = res.unwrap();
+        println!("{}", state.len());
+        assert_eq!(state.len(), 9);
+
+        assert!(set_icc_regs(gic.device(), &gicr_typer, &state).is_ok());
+    }
+
+    #[test]
+    fn test_save_pending_tables() {
+        let hv = hypervisor::new().unwrap();
+        let vm = hv.create_vm().unwrap();
+        let _ = vm.create_vcpu(0, None).unwrap();
+        let gic = create_gic(&vm, 1).expect("Cannot create gic");
+
+        assert!(save_pending_tables(gic.device()).is_ok());
+    }
+}
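To see how the pieces of `interrupt.rs` fit together, here is a sketch of how a PCI device's MSI-X vectors could be wired through the new manager. It assumes `MsiIrqGroupConfig { base, count }` as used by `create_group` above; the real call sites live in the device managers, outside this excerpt:

```rust
use std::sync::{Arc, Mutex};

use kvm_ioctls::VmFd;
use vm_device::interrupt::{InterruptManager, MsiIrqGroupConfig};
use vm_system_allocator::SystemAllocator;
use vmm::interrupt::MsiInterruptManager;

fn wire_msix_group(
    allocator: Arc<Mutex<SystemAllocator>>,
    vm: Arc<Mutex<VmFd>>,
) -> std::io::Result<()> {
    let manager = MsiInterruptManager::new(allocator, vm);

    // One group per device: e.g. two MSI-X vectors for a two-queue device,
    // indexed from 0 within the group. Each vector gets its own GSI + irqfd.
    let group = manager.create_group(MsiIrqGroupConfig { base: 0, count: 2 })?;

    // Registers each route's irqfd with KVM (KVM_IRQFD).
    group.enable()?;

    // Inject vector 0 by writing to its eventfd.
    group.trigger(0)?;
    Ok(())
}
```

The shared `gsi_msi_routes` map is the important design choice: every group created by the same manager contributes to one global routing table, so a single `KVM_SET_GSI_ROUTING` call always reflects all devices.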
diff --git a/src/vmm/src/lib.rs b/src/vmm/src/lib.rs
index c80f004e789..3b4007063a3 100644
--- a/src/vmm/src/lib.rs
+++ b/src/vmm/src/lib.rs
@@ -110,6 +110,8 @@ pub mod utils;
 pub mod vmm_config;
 /// Module with virtual state structs.
 pub mod vstate;
+/// Module for MSI interrupt management (TODO: still experimental).
+pub mod interrupt;
 
 use std::collections::HashMap;
 use std::io;
@@ -121,9 +123,17 @@
 use device_manager::acpi::ACPIDeviceManager;
 use device_manager::resources::ResourceAllocator;
 use devices::acpi::vmgenid::VmGenIdError;
+use devices::pci_segment::PciSegment;
+use devices::virtio::transport::VirtioPciDevice;
+use devices::Bus;
 use event_manager::{EventManager as BaseEventManager, EventOps, Events, MutEventSubscriber};
+use kvm_ioctls::{IoEventAddress, NoDatamatch, VmFd};
+use pci::{DeviceRelocation, PciBarRegionType, PciDevice};
 use seccompiler::BpfProgram;
 use userfaultfd::Uffd;
+use vm_device::interrupt::{InterruptManager, MsiIrqGroupConfig};
+use vm_memory::{GuestAddress, GuestUsize};
+use vm_system_allocator::{AddressAllocator, SystemAllocator};
 use vmm_sys_util::epoll::EventSet;
 use vmm_sys_util::eventfd::EventFd;
 use vmm_sys_util::terminal::Terminal;
@@ -153,6 +163,7 @@ use crate::vstate::memory::{
 use crate::vstate::vcpu::VcpuState;
 pub use crate::vstate::vcpu::{Vcpu, VcpuConfig, VcpuEvent, VcpuHandle, VcpuResponse};
 pub use crate::vstate::vm::Vm;
+use kvm_bindings::{kvm_irq_routing, kvm_irq_routing_entry as IrqRoutingEntry};
 
 /// Shorthand type for the EventManager flavour used by Firecracker.
 pub type EventManager = BaseEventManager<Arc<Mutex<dyn MutEventSubscriber>>>;
@@ -261,6 +272,8 @@
     VmmObserverTeardown(vmm_sys_util::errno::Error),
     /// VMGenID error: {0}
     VMGenID(#[from] VmGenIdError),
+    /// Unknown error
+    Unknown,
 }
 
 /// Shorthand type for KVM dirty page bitmap.
@@ -324,6 +337,12 @@ pub struct Vmm {
     #[cfg(target_arch = "x86_64")]
     pio_device_manager: PortIODeviceManager,
     acpi_device_manager: ACPIDeviceManager,
+
+    // PCI-related state.
+    extra_fd: Option<Arc<Mutex<VmFd>>>,
+    pci_segment: Option<PciSegment>,
+    msi_interrupt_manager: Option<Arc<KvmMsiInterruptManager>>,
+    allocator: Option<Arc<Mutex<SystemAllocator>>>,
 }
 
 impl Vmm {
@@ -347,7 +366,7 @@
         &self,
         device_type: DeviceType,
         device_id: &str,
-    ) -> Option<&Mutex<BusDevice>> {
+    ) -> Option<Arc<Mutex<BusDevice>>> {
         self.mmio_device_manager.get_device(device_type, device_id)
     }
 
@@ -857,6 +876,38 @@
     }
 }
 
+use std::mem::size_of;
+
+// Returns a `Vec<T>` with a size in bytes at least as large as `size_in_bytes`.
+fn vec_with_size_in_bytes<T: Default>(size_in_bytes: usize) -> Vec<T> {
+    let rounded_size = (size_in_bytes + size_of::<T>() - 1) / size_of::<T>();
+    let mut v = Vec::with_capacity(rounded_size);
+    v.resize_with(rounded_size, T::default);
+    v
+}
+
+// The kvm API has many structs that resemble the following `Foo` structure:
+//
+// ```
+// #[repr(C)]
+// struct Foo {
+//    some_data: u32,
+//    entries: __IncompleteArrayField<__u32>,
+// }
+// ```
+//
+// In order to allocate such a structure, `size_of::<Foo>()` would be too small because it would
+// not include any space for `entries`. To make the allocation large enough while still being
+// aligned for `Foo`, a `Vec<Foo>` is created. Only the first element of `Vec<Foo>` would actually
+// be used as a `Foo`. The remaining memory in the `Vec<Foo>` is for `entries`, which must be
+// contiguous with `Foo`. This function is used to make the `Vec<Foo>` with enough space for
+// `count` entries.
+/// Helper function to create a `Vec` with room for `count` trailing entries.
+pub fn vec_with_array_field<T: Default, F>(count: usize) -> Vec<T> {
+    let element_space = count * size_of::<F>();
+    let vec_size_bytes = size_of::<T>() + element_space;
+    vec_with_size_in_bytes(vec_size_bytes)
+}
+
 /// Process the content of the MPIDR_EL1 register in order to be able to pass it to KVM
 ///
 /// The kernel expects to find the four affinity levels of the MPIDR in the first 32 bits of the
@@ -968,3 +1019,137 @@ impl MutEventSubscriber for Vmm {
         }
     }
 }
+
+struct AddressManager {
+    pub(crate) allocator: Arc<Mutex<SystemAllocator>>,
+    #[cfg(target_arch = "x86_64")]
+    pub(crate) io_bus: Arc<Bus>,
+    pub(crate) mmio_bus: Arc<Bus>,
+    pub(crate) vm: Arc<Mutex<VmFd>>,
+    pci_mmio32_allocators: Vec<Arc<Mutex<AddressAllocator>>>,
+    pci_mmio64_allocators: Vec<Arc<Mutex<AddressAllocator>>>,
+}
+
+// TODO: implement this in a more granular way.
+impl DeviceRelocation for AddressManager {
+    fn move_bar(
+        &self,
+        old_base: u64,
+        new_base: u64,
+        len: u64,
+        pci_dev: &mut dyn PciDevice,
+        region_type: PciBarRegionType,
+    ) -> std::result::Result<(), std::io::Error> {
+        match region_type {
+            PciBarRegionType::IoRegion => {
+                #[cfg(target_arch = "x86_64")]
+                {
+                    // Update system allocator.
+                    self.allocator
+                        .lock()
+                        .unwrap()
+                        .free_io_addresses(GuestAddress(old_base), len as GuestUsize);
+
+                    self.allocator
+                        .lock()
+                        .unwrap()
+                        .allocate_io_addresses(
+                            Some(GuestAddress(new_base)),
+                            len as GuestUsize,
+                            None,
+                        )
+                        .ok_or_else(|| {
+                            io::Error::new(io::ErrorKind::Other, "failed allocating new IO range")
+                        })?;
+
+                    // Update PIO bus.
+                    self.io_bus
+                        .update_range(old_base, len, new_base, len)
+                        .map_err(|e| io::Error::new(io::ErrorKind::Other, e))?;
+                }
+                #[cfg(target_arch = "aarch64")]
+                error!("I/O region is not supported");
+            }
+            PciBarRegionType::Memory32BitRegion | PciBarRegionType::Memory64BitRegion => {
+                let allocators = self
+                    .pci_mmio32_allocators
+                    .iter()
+                    .chain(self.pci_mmio64_allocators.iter());
+
+                // Find the allocator the old BAR was allocated from and free
+                // the old range there.
+                for allocator in allocators.clone() {
+                    let allocator_base = allocator.lock().unwrap().base();
+                    let allocator_end = allocator.lock().unwrap().end();
+
+                    if old_base >= allocator_base.0 && old_base <= allocator_end.0 {
+                        allocator
+                            .lock()
+                            .unwrap()
+                            .free(GuestAddress(old_base), len as GuestUsize);
+                        break;
+                    }
+                }
+
+                // Then reserve the new range from the allocator that covers it.
+                for allocator in allocators {
+                    let allocator_base = allocator.lock().unwrap().base();
+                    let allocator_end = allocator.lock().unwrap().end();
+
+                    if new_base >= allocator_base.0 && new_base <= allocator_end.0 {
+                        allocator
+                            .lock()
+                            .unwrap()
+                            .allocate(Some(GuestAddress(new_base)), len as GuestUsize, Some(len))
+                            .ok_or_else(|| {
+                                io::Error::new(
+                                    io::ErrorKind::Other,
+                                    "failed allocating new MMIO range",
+                                )
+                            })?;
+
+                        break;
+                    }
+                }
+
+                // Update MMIO bus.
+                self.mmio_bus
+                    .update_range(old_base, len, new_base, len)
+                    .map_err(|e| io::Error::new(io::ErrorKind::Other, e))?;
+            }
+        }
+
+        let any_dev = pci_dev.as_any();
+        if let Some(virtio_pci_dev) = any_dev.downcast_ref::<VirtioPciDevice>() {
+            let bar_addr = virtio_pci_dev.config_bar_addr();
+            if bar_addr == new_base {
+                const NOTIFICATION_BAR_OFFSET: u64 = 0x6000;
+                const NOTIFY_OFF_MULTIPLIER: u32 = 4; // A dword per notification address.
+
+                // Moving the config BAR also moves the queue notification
+                // addresses, so every ioeventfd must be re-registered.
+                let device = virtio_pci_dev.virtio_device();
+                let old_notify_base = old_base + NOTIFICATION_BAR_OFFSET;
+                for (i, queue_evt) in device.lock().unwrap().queue_events().iter().enumerate() {
+                    let addr = old_notify_base + i as u64 * u64::from(NOTIFY_OFF_MULTIPLIER);
+                    let io_addr = IoEventAddress::Mmio(addr);
+                    self.vm
+                        .lock()
+                        .unwrap()
+                        .unregister_ioevent(queue_evt, &io_addr, NoDatamatch)
+                        .map_err(|e| {
+                            io::Error::new(
+                                io::ErrorKind::Other,
+                                format!("failed to unregister ioevent: {e:?}"),
+                            )
+                        })?;
+                }
+                let new_notify_base = new_base + NOTIFICATION_BAR_OFFSET;
+                for (i, queue_evt) in device.lock().unwrap().queue_events().iter().enumerate() {
+                    let addr = new_notify_base + i as u64 * u64::from(NOTIFY_OFF_MULTIPLIER);
+                    let io_addr = IoEventAddress::Mmio(addr);
+                    self.vm
+                        .lock()
+                        .unwrap()
+                        .register_ioevent(queue_evt, &io_addr, NoDatamatch)
+                        .map_err(|e| {
+                            io::Error::new(
+                                io::ErrorKind::Other,
+                                format!("failed to register ioevent: {e:?}"),
+                            )
+                        })?;
+                }
+            }
+        }
+
+        pci_dev.move_bar(old_base, new_base)
+    }
+}
\ No newline at end of file
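The comment block above documents the flexible-array pattern in the abstract; here is how it plays out concretely for the GSI routing table that `interrupt.rs` builds. This is an illustrative helper, not code from the PR (it assumes `vec_with_array_field` is reachable from the `vmm` crate root, as the `pub fn` above suggests):

```rust
use kvm_bindings::{kvm_irq_routing, kvm_irq_routing_entry};
use vmm::vec_with_array_field;

/// Build a `kvm_irq_routing` header followed by `entries`, in one allocation.
fn routing_with(entries: &[kvm_irq_routing_entry]) -> Vec<kvm_irq_routing> {
    // One header element, plus enough trailing bytes for all entries.
    let mut routing =
        vec_with_array_field::<kvm_irq_routing, kvm_irq_routing_entry>(entries.len());
    routing[0].nr = entries.len() as u32;
    // SAFETY: the allocation above reserved room for exactly `entries.len()`
    // trailing entries, contiguous with the header element.
    unsafe {
        routing[0]
            .entries
            .as_mut_slice(entries.len())
            .copy_from_slice(entries);
    }
    routing
}
```

Only `routing[0]` is ever used as a `kvm_irq_routing`; the rest of the `Vec`'s memory exists purely to hold the trailing entries, which is exactly what `set_gsi_routes` relies on when it passes `&irq_routing[0]` to `KVM_SET_GSI_ROUTING`.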
diff --git a/src/vmm/src/resources.rs b/src/vmm/src/resources.rs
index a4d15641975..190cd1debe1 100644
--- a/src/vmm/src/resources.rs
+++ b/src/vmm/src/resources.rs
@@ -28,6 +28,7 @@
 use crate::vmm_config::metrics::{init_metrics, MetricsConfig, MetricsConfigError};
 use crate::vmm_config::mmds::{MmdsConfig, MmdsConfigError};
 use crate::vmm_config::net::*;
+use crate::vmm_config::pci::PciConfig;
 use crate::vmm_config::vsock::*;
 use crate::vstate::memory::{GuestMemoryExtension, GuestMemoryMmap, MemoryError};
 
 /// Errors encountered when configuring microVM resources.
@@ -86,6 +87,11 @@
     vsock_device: Option<VsockDeviceConfig>,
     #[serde(rename = "entropy")]
     entropy_device: Option<EntropyDeviceConfig>,
+    #[cfg(feature = "gdb")]
+    #[serde(rename = "gdb-socket")]
+    gdb_socket_addr: Option<String>,
+    #[serde(rename = "pci")]
+    pci_config: Option<PciConfig>,
 }
 
 /// A data structure that encapsulates the device configurations
@@ -114,6 +120,11 @@
     pub mmds_size_limit: usize,
     /// Whether or not to load boot timer device.
     pub boot_timer: bool,
+    #[cfg(feature = "gdb")]
+    /// Configures the location of the GDB socket.
+    pub gdb_socket_addr: Option<String>,
+    /// Optional PCI configuration; `None` leaves the PCI transport disabled.
+    pub pci_config: Option<PciConfig>,
 }
 
 impl VmResources {
@@ -168,6 +178,10 @@
             resources.set_balloon_device(balloon_config)?;
         }
 
+        if let Some(pci_config) = vmm_config.pci_config {
+            resources.pci_config = Some(pci_config);
+        }
+
         // Init the data store from file, if present.
         if let Some(data) = metadata_json {
             resources.locked_mmds_or_default().put_data(
@@ -521,6 +536,9 @@
             net_devices: resources.net_builder.configs(),
             vsock_device: resources.vsock.config(),
             entropy_device: resources.entropy.config(),
+            #[cfg(feature = "gdb")]
+            gdb_socket_addr: resources.gdb_socket_addr.clone(),
+            pci_config: resources.pci_config.clone(), // TODO: snapshot-restore support
         }
     }
 }
@@ -630,6 +648,9 @@ mod tests {
             boot_timer: false,
             mmds_size_limit: HTTP_MAX_PAYLOAD_SIZE,
             entropy: Default::default(),
+            #[cfg(feature = "gdb")]
+            gdb_socket_addr: None,
+            pci_config: None,
         }
     }
diff --git a/src/vmm/src/vmm_config/mod.rs b/src/vmm/src/vmm_config/mod.rs
index c7afc5fc65f..0e355fe2502 100644
--- a/src/vmm/src/vmm_config/mod.rs
+++ b/src/vmm/src/vmm_config/mod.rs
@@ -30,6 +30,8 @@ pub mod metrics;
 pub mod mmds;
 /// Wrapper for configuring the network devices attached to the microVM.
 pub mod net;
+/// Wrapper for configuring the PCI transport and VFIO devices.
+pub mod pci;
 /// Wrapper for configuring microVM snapshots and the microVM state.
 pub mod snapshot;
 /// Wrapper for configuring the vsock devices attached to the microVM.
diff --git a/src/vmm/src/vmm_config/pci.rs b/src/vmm/src/vmm_config/pci.rs
new file mode 100644
index 00000000000..4bd4ecd67d1
--- /dev/null
+++ b/src/vmm/src/vmm_config/pci.rs
@@ -0,0 +1,17 @@
+use serde::{Deserialize, Serialize};
+
+/// Configuration of the PCI transport for the microVM.
+#[derive(Clone, Debug, PartialEq, Eq, Deserialize, Serialize)]
+pub struct PciConfig {
+    /// Whether the PCI transport is enabled.
+    pub enabled: bool,
+    /// VFIO devices to pass through to the guest.
+    pub vfio_devices: Option<Vec<VfioDeviceConfig>>,
+}
+
+/// Configuration of a VFIO device passed through to the guest.
+#[derive(Clone, Debug, PartialEq, Eq, Deserialize, Serialize)]
+pub struct VfioDeviceConfig {
+    /// sysfs path of the device.
+    pub path: String,
+}
\ No newline at end of file
diff --git a/src/vmm/src/vstate/vcpu/mod.rs b/src/vmm/src/vstate/vcpu/mod.rs
index cb63afa4579..c68f289fb82 100644
--- a/src/vmm/src/vstate/vcpu/mod.rs
+++ b/src/vmm/src/vstate/vcpu/mod.rs
@@ -528,7 +528,9 @@ fn handle_kvm_exit(
         VcpuExit::MmioRead(addr, data) => {
             if let Some(mmio_bus) = &peripherals.mmio_bus {
                 let _metric = METRICS.vcpu.exit_mmio_read_agg.record_latency_metrics();
-                mmio_bus.read(addr, data);
+                if !mmio_bus.read(addr, data) {
+                    error!("Unhandled MMIO read at {:#x}", addr);
+                }
                 METRICS.vcpu.exit_mmio_read.inc();
             }
             Ok(VcpuEmulation::Handled)
@@ -536,7 +538,9 @@
         VcpuExit::MmioWrite(addr, data) => {
             if let Some(mmio_bus) = &peripherals.mmio_bus {
                 let _metric = METRICS.vcpu.exit_mmio_write_agg.record_latency_metrics();
-                mmio_bus.write(addr, data);
+                if !mmio_bus.write(addr, data) {
+                    error!("Unhandled MMIO write at {:#x}", addr);
+                }
                 METRICS.vcpu.exit_mmio_write.inc();
             }
             Ok(VcpuEmulation::Handled)
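Tying the new `PciConfig` section above to the API surface: the field names and serde attributes in `vmm_config/pci.rs` imply a config shape like the one exercised below. This is an illustrative round-trip test, not part of the PR, and it assumes `serde_json` is available as a dev-dependency:

```rust
use vmm::vmm_config::pci::PciConfig;

#[test]
fn parse_pci_section() {
    // The `"pci"` top-level key comes from the #[serde(rename = "pci")]
    // attribute in resources.rs; only the section body is parsed here.
    let json = r#"{
        "enabled": true,
        "vfio_devices": [ { "path": "/sys/bus/pci/devices/0000:01:00.0" } ]
    }"#;
    let cfg: PciConfig = serde_json::from_str(json).unwrap();
    assert!(cfg.enabled);
    assert_eq!(
        cfg.vfio_devices.unwrap()[0].path,
        "/sys/bus/pci/devices/0000:01:00.0"
    );
}
```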
read {:x}", addr); + } METRICS.vcpu.exit_io_in.inc(); } Ok(VcpuEmulation::Handled) } VcpuExit::IoOut(addr, data) => { if let Some(pio_bus) = &self.pio_bus { - let _metric = METRICS.vcpu.exit_io_out_agg.record_latency_metrics(); pio_bus.write(u64::from(addr), data); METRICS.vcpu.exit_io_out.inc(); } @@ -1171,7 +1172,7 @@ mod tests { // Regression test for #4666 let kvm = Kvm::new().unwrap(); - let vm = Vm::new(Vec::new()).unwrap(); + let (vm, _) = Vm::new(Vec::new()).unwrap(); let vcpu = KvmVcpu::new(0, &vm).unwrap(); // The list of supported MSR indices, in the order they were returned by KVM diff --git a/src/vmm/src/vstate/vm.rs b/src/vmm/src/vstate/vm.rs index 0f72abcf68f..98e9dac7c2d 100644 --- a/src/vmm/src/vstate/vm.rs +++ b/src/vmm/src/vstate/vm.rs @@ -17,6 +17,7 @@ use kvm_bindings::{ use kvm_bindings::{kvm_userspace_memory_region, KVM_API_VERSION, KVM_MEM_LOG_DIRTY_PAGES}; use kvm_ioctls::{Kvm, VmFd}; use serde::{Deserialize, Serialize}; +use std::os::unix::io::AsRawFd; #[cfg(target_arch = "aarch64")] use crate::arch::aarch64::gic::GICDevice; @@ -132,7 +133,7 @@ pub struct Vm { /// Contains Vm functions that are usable across CPU architectures impl Vm { /// Constructs a new `Vm` using the given `Kvm` instance. - pub fn new(kvm_cap_modifiers: Vec) -> Result { + pub fn new(kvm_cap_modifiers: Vec) -> Result<(Self, VmFd), VmError> { let kvm = Kvm::new().map_err(VmError::Kvm)?; // Check that KVM has the correct version. @@ -150,16 +151,10 @@ impl Vm { // Create fd for interacting with kvm-vm specific functions. let vm_fd = kvm.create_vm().map_err(VmError::VmFd)?; - #[cfg(target_arch = "aarch64")] - { - Ok(Vm { - fd: vm_fd, - max_memslots, - kvm_cap_modifiers, - irqchip_handle: None, - }) - } - + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + let supported_cpuid = kvm + .get_supported_cpuid(KVM_MAX_CPUID_ENTRIES) + .map_err(VmError::VmFd)?; #[cfg(target_arch = "x86_64")] { let supported_cpuid = kvm @@ -167,13 +162,19 @@ impl Vm { .map_err(VmError::VmFd)?; let msrs_to_save = crate::arch::x86_64::msr::get_msrs_to_save(&kvm)?; - Ok(Vm { - fd: vm_fd, - max_memslots, - kvm_cap_modifiers, - supported_cpuid, - msrs_to_save, - }) + let rawfd = unsafe { libc::dup(vm_fd.as_raw_fd()) }; + let extra_fd = unsafe { kvm.create_vmfd_from_rawfd(rawfd).unwrap() }; + + Ok(( + Vm { + fd: vm_fd, + max_memslots, + kvm_cap_modifiers, + supported_cpuid, + msrs_to_save, + }, + extra_fd + )) } } @@ -468,6 +469,7 @@ impl fmt::Debug for VmState { #[cfg(test)] pub(crate) mod tests { use super::*; + use crate::cpu_config::templates::KvmCapability; #[cfg(target_arch = "x86_64")] use crate::snapshot::Snapshot; use crate::test_utils::single_region_mem; @@ -477,7 +479,7 @@ pub(crate) mod tests { pub(crate) fn setup_vm(mem_size: usize) -> (Vm, GuestMemoryMmap) { let gm = single_region_mem(mem_size); - let vm = Vm::new(vec![]).expect("Cannot create new vm"); + let (vm, _) = Vm::new(vec![]).expect("Cannot create new vm"); vm.memory_init(&gm, false).unwrap(); (vm, gm) @@ -509,7 +511,7 @@ pub(crate) mod tests { #[test] fn test_vm_memory_init() { - let vm = Vm::new(vec![]).expect("Cannot create new vm"); + let (vm, _) = Vm::new(vec![]).expect("Cannot create new vm"); // Create valid memory region and test that the initialization is successful. 
let gm = single_region_mem(0x1000); @@ -519,7 +521,7 @@ pub(crate) mod tests { #[cfg(target_arch = "x86_64")] #[test] fn test_vm_save_restore_state() { - let vm = Vm::new(vec![]).expect("new vm failed"); + let (vm, _) = Vm::new(vec![]).expect("new vm failed"); // Irqchips, clock and pitstate are not configured so trying to save state should fail. vm.save_state().unwrap_err(); @@ -587,7 +589,7 @@ pub(crate) mod tests { #[test] fn test_set_kvm_memory_regions() { - let vm = Vm::new(vec![]).expect("Cannot create new vm"); + let (vm, _) = Vm::new(vec![]).expect("Cannot create new vm"); let gm = single_region_mem(0x1000); let res = vm.set_kvm_memory_regions(&gm, false);