From 7dd75ac4e4454807bb6f1ea70f2d47a376101cf9 Mon Sep 17 00:00:00 2001 From: Andrei Sandu Date: Mon, 12 Apr 2021 16:18:36 +0300 Subject: [PATCH 01/22] rebase: Experimenting with PCI passthrough original: I've taken PCI emulation and vfio code from Cloud hypervisor and glued it to the Firecracker PIO/MMIO bus to passthrough a hardcoded GPU. Works only on g4dn.metal hosts with T4 GPUs. Guest NVIDIA drivers load successfully and tooling like nvidia-smi works but could not get CUDA to run. Signed-off-by: Andrei Sandu --- Cargo.lock | 130 +- src/pci/Cargo.toml | 22 + src/pci/src/bus.rs | 470 +++++ src/pci/src/configuration.rs | 1008 ++++++++++++ src/pci/src/device.rs | 116 ++ src/pci/src/lib.rs | 42 + src/pci/src/msi.rs | 236 ++++ src/pci/src/msix.rs | 453 ++++++ src/pci/src/vfio.rs | 1233 +++++++++++++++++ src/vm-device/Cargo.toml | 16 + src/vm-device/src/bus.rs | 362 +++++ src/vm-device/src/dma_mapping/mod.rs | 17 + src/vm-device/src/dma_mapping/vfio.rs | 73 + src/vm-device/src/interrupt/mod.rs | 197 +++ src/vm-device/src/lib.rs | 54 + src/vm-system-allocator/Cargo.toml | 9 + src/vm-system-allocator/src/address.rs | 393 ++++++ .../src/arch/aarch64/layout.rs | 84 ++ .../src/arch/aarch64/mod.rs | 11 + src/vm-system-allocator/src/arch/mod.rs | 19 + .../src/arch/x86_64/layout.rs | 9 + .../src/arch/x86_64/mod.rs | 9 + src/vm-system-allocator/src/gsi.rs | 108 ++ src/vm-system-allocator/src/lib.rs | 25 + src/vm-system-allocator/src/system.rs | 162 +++ src/vmm/Cargo.toml | 5 + src/vmm/src/arch/mod.rs | 6 +- src/vmm/src/arch/x86_64/layout.rs | 19 + src/vmm/src/builder.rs | 234 +++- .../x86_64/cpuid/intel/normalize.rs | 4 +- src/vmm/src/device_manager/legacy.rs | 12 +- src/vmm/src/device_manager/mmio.rs | 27 +- src/vmm/src/devices/bus.rs | 93 +- src/vmm/src/devices/legacy/rtc_pl031.rs | 4 +- src/vmm/src/devices/mod.rs | 2 + src/vmm/src/devices/pci.rs | 493 +++++++ src/vmm/src/interrupt.rs | 431 ++++++ src/vmm/src/lib.rs | 34 + src/vmm/src/vstate/vcpu/mod.rs | 8 +-
src/vmm/src/vstate/vcpu/x86_64.rs | 7 +- src/vmm/src/vstate/vm.rs | 46 +- 41 files changed, 6604 insertions(+), 79 deletions(-) create mode 100644 src/pci/Cargo.toml create mode 100644 src/pci/src/bus.rs create mode 100644 src/pci/src/configuration.rs create mode 100644 src/pci/src/device.rs create mode 100644 src/pci/src/lib.rs create mode 100644 src/pci/src/msi.rs create mode 100644 src/pci/src/msix.rs create mode 100644 src/pci/src/vfio.rs create mode 100644 src/vm-device/Cargo.toml create mode 100644 src/vm-device/src/bus.rs create mode 100644 src/vm-device/src/dma_mapping/mod.rs create mode 100644 src/vm-device/src/dma_mapping/vfio.rs create mode 100644 src/vm-device/src/interrupt/mod.rs create mode 100644 src/vm-device/src/lib.rs create mode 100644 src/vm-system-allocator/Cargo.toml create mode 100644 src/vm-system-allocator/src/address.rs create mode 100644 src/vm-system-allocator/src/arch/aarch64/layout.rs create mode 100644 src/vm-system-allocator/src/arch/aarch64/mod.rs create mode 100644 src/vm-system-allocator/src/arch/mod.rs create mode 100644 src/vm-system-allocator/src/arch/x86_64/layout.rs create mode 100644 src/vm-system-allocator/src/arch/x86_64/mod.rs create mode 100644 src/vm-system-allocator/src/gsi.rs create mode 100644 src/vm-system-allocator/src/lib.rs create mode 100644 src/vm-system-allocator/src/system.rs create mode 100644 src/vmm/src/devices/pci.rs create mode 100644 src/vmm/src/interrupt.rs diff --git a/Cargo.lock b/Cargo.lock index 45324e6a5c3..5a643c7c622 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -9,7 +9,7 @@ dependencies = [ "displaydoc", "thiserror", "vm-memory", - "zerocopy 0.8.2", + "zerocopy 0.8.4", ] [[package]] @@ -111,6 +111,12 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "anyhow" +version = "1.0.89" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "86fdf8605db99b54d3cd748a44c6d04df638eb5dafb219b135d0149bd0db01f6" + [[package]] name = "arrayvec" version = "0.7.6" @@ -129,7 
+135,7 @@ version = "0.12.12" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ec4795bbabc13643a8b3532184041ab41dec5740046aa15734428219cb9a0bfc" dependencies = [ - "bindgen 0.69.4", + "bindgen 0.69.5", "cmake", "dunce", "fs_extra", @@ -157,7 +163,7 @@ version = "0.22.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "df7a4168111d7eb622a31b214057b8509c0a7e1794f44c546d742330dc793972" dependencies = [ - "bindgen 0.69.4", + "bindgen 0.69.5", "cc", "cmake", "dunce", @@ -203,14 +209,14 @@ dependencies = [ [[package]] name = "bindgen" -version = "0.69.4" +version = "0.69.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a00dc851838a2120612785d195287475a3ac45514741da670b735818822129a0" +checksum = "271383c67ccabffb7381723dea0672a673f292304fcb45c01cc648c7a8d58088" dependencies = [ "bitflags 2.6.0", "cexpr", "clang-sys", - "itertools 0.10.5", + "itertools 0.12.1", "lazy_static", "lazycell", "log", @@ -260,9 +266,9 @@ checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" [[package]] name = "cc" -version = "1.1.28" +version = "1.1.29" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2e80e3b6a3ab07840e1cae9b0666a63970dc28e8ed5ffbcdacbfc760c281bfc1" +checksum = "58e804ac3194a48bb129643eb1d62fcc20d18c6b8c181704489353d13120bcd1" dependencies = [ "jobserver", "libc", @@ -340,9 +346,9 @@ dependencies = [ [[package]] name = "clap" -version = "4.5.19" +version = "4.5.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7be5744db7978a28d9df86a214130d106a89ce49644cbc4e3f0c22c3fba30615" +checksum = "b97f376d85a664d5837dbae44bf546e6477a679ff6610010f17276f686d867e8" dependencies = [ "clap_builder", "clap_derive", @@ -359,9 +365,9 @@ dependencies = [ [[package]] name = "clap_builder" -version = "4.5.19" +version = "4.5.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"a5fbc17d3ef8278f55b282b2a2e75ae6f6c7d4bb70ed3d0382375104bfafdb4b" +checksum = "19bc80abd44e4bed93ca373a0704ccbd1b710dc5749406201bb018272808dc54" dependencies = [ "anstream", "anstyle", @@ -649,9 +655,9 @@ dependencies = [ [[package]] name = "gdbstub_arch" -version = "0.3.0" +version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4e3b1357bd3203fc09a6601327ae0ab38865d14231d0b65d3143f5762cc7977d" +checksum = "328a9e9425db13770d0d11de6332a608854266e44c53d12776be7b4aa427e3de" dependencies = [ "gdbstub", "num-traits", @@ -782,6 +788,15 @@ dependencies = [ "either", ] +[[package]] +name = "itertools" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569" +dependencies = [ + "either", +] + [[package]] name = "itertools" version = "0.13.0" @@ -1027,6 +1042,24 @@ version = "1.0.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a" +[[package]] +name = "pci" +version = "0.1.0" +dependencies = [ + "anyhow", + "byteorder", + "kvm-bindings", + "kvm-ioctls", + "libc", + "log", + "vfio-bindings 0.2.0", + "vfio-ioctls", + "vm-device", + "vm-memory", + "vm-system-allocator", + "vmm-sys-util", +] + [[package]] name = "peeking_take_while" version = "0.1.2" @@ -1518,6 +1551,36 @@ version = "0.9.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" +[[package]] +name = "vfio-bindings" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4a21f546f2bda37f5a8cfb138c87f95b8e34d2d78d6a7a92ba3785f4e08604a7" +dependencies = [ + "vmm-sys-util", +] + +[[package]] +name = "vfio-bindings" +version = "0.4.0" +source = "git+https://github.com/rust-vmm/vfio?branch=main#a51a4746b0d317bfc21fa49d40f9287f3b8137fd" + +[[package]] 
+name = "vfio-ioctls" +version = "0.2.0" +source = "git+https://github.com/rust-vmm/vfio?branch=main#a51a4746b0d317bfc21fa49d40f9287f3b8137fd" +dependencies = [ + "byteorder", + "kvm-bindings", + "kvm-ioctls", + "libc", + "log", + "thiserror", + "vfio-bindings 0.4.0", + "vm-memory", + "vmm-sys-util", +] + [[package]] name = "vhost" version = "0.12.0" @@ -1541,6 +1604,20 @@ dependencies = [ "thiserror", ] +[[package]] +name = "vm-device" +version = "0.1.0" +dependencies = [ + "anyhow", + "serde", + "serde_derive", + "serde_json", + "thiserror", + "vfio-ioctls", + "vm-memory", + "vmm-sys-util", +] + [[package]] name = "vm-fdt" version = "0.3.0" @@ -1564,6 +1641,14 @@ version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3428ee25acbfc75ed14600f2043876e0889cbd57c39dd441191417377cdceda0" +[[package]] +name = "vm-system-allocator" +version = "0.1.0" +dependencies = [ + "libc", + "vm-memory", +] + [[package]] name = "vmm" version = "0.1.0" @@ -1575,6 +1660,7 @@ dependencies = [ "base64", "bincode", "bitflags 2.6.0", + "byteorder", "crc64", "criterion", "derive_more", @@ -1593,6 +1679,7 @@ dependencies = [ "log-instrument", "memfd", "micro_http", + "pci", "proptest", "seccompiler", "semver", @@ -1603,13 +1690,16 @@ dependencies = [ "timerfd", "userfaultfd", "utils", + "vfio-ioctls", "vhost", "vm-allocator", + "vm-device", "vm-fdt", "vm-memory", "vm-superio", + "vm-system-allocator", "vmm-sys-util", - "zerocopy 0.8.2", + "zerocopy 0.8.4", ] [[package]] @@ -1786,11 +1876,11 @@ dependencies = [ [[package]] name = "zerocopy" -version = "0.8.2" +version = "0.8.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fdf8d0ac51277f0e70d6dcb000b7bfa817968d66df9f5772e731a1d1bc6fc5c6" +checksum = "f39ef66148c23d1ab5acda9ae26d65b88050b79e2ef638e8b560f869d991775c" dependencies = [ - "zerocopy-derive 0.8.2", + "zerocopy-derive 0.8.4", ] [[package]] @@ -1806,9 +1896,9 @@ dependencies = [ [[package]] name = 
"zerocopy-derive" -version = "0.8.2" +version = "0.8.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2cf1fea9437ee18b719f41c597b00c1745d7ff77184daf6ac8c61110a0115161" +checksum = "88ac5bbf101d2213edf0a2ee03242f5fa15be9907123e13dc770e21a0b5b670e" dependencies = [ "proc-macro2", "quote", diff --git a/src/pci/Cargo.toml b/src/pci/Cargo.toml new file mode 100644 index 00000000000..23b0403a53b --- /dev/null +++ b/src/pci/Cargo.toml @@ -0,0 +1,22 @@ +[package] +name = "pci" +version = "0.1.0" +authors = ["Samuel Ortiz "] +edition = "2018" + +[dependencies] +anyhow = "1.0" +byteorder = "1.4.3" +vmm-sys-util = ">=0.3.1" +libc = ">=0.2.39" +log = { version = "0.4.22", features = ["std", "serde"] } +vm-memory = { version = "0.15.0", features = ["backend-mmap", "backend-bitmap"] } +vfio-ioctls = { git = "https://github.com/rust-vmm/vfio", branch = "main" } +kvm-bindings = { version = "0.9.1", features = ["fam-wrappers"] } +kvm-ioctls = "0.18.0" +vm-device = { path = "../vm-device"} +vm-system-allocator = { path = "../vm-system-allocator" } + +[dependencies.vfio-bindings] +version = "0.2.0" +features = ["fam-wrappers"] \ No newline at end of file diff --git a/src/pci/src/bus.rs b/src/pci/src/bus.rs new file mode 100644 index 00000000000..c454c4adc30 --- /dev/null +++ b/src/pci/src/bus.rs @@ -0,0 +1,470 @@ +// Copyright 2018 The Chromium OS Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE-BSD-3-Clause file. 
+ +use crate::configuration::{ + PciBarRegionType, PciBridgeSubclass, PciClassCode, PciConfiguration, PciHeaderType, +}; +use crate::device::PciDevice; +use byteorder::{ByteOrder, LittleEndian}; +use log::error; +use std::any::Any; +use std::collections::HashMap; +use std::sync::{Arc, Barrier, Mutex}; +use vm_memory::{Address, GuestAddress, GuestUsize}; + +const VENDOR_ID_INTEL: u16 = 0x8086; +const DEVICE_ID_INTEL_VIRT_PCIE_HOST: u16 = 0x0d57; +const NUM_DEVICE_IDS: usize = 32; + +/// Errors for device manager. +#[derive(Debug)] +pub enum PciRootError { + /// Could not allocate an IRQ number. + AllocateIrq, + /// Could not find an available device slot on the PCI bus. + NoPciDeviceSlotAvailable, + /// Invalid PCI device identifier provided. + InvalidPciDeviceSlot(usize), + /// Valid PCI device identifier but already used. + AlreadyInUsePciDeviceSlot(usize), +} +pub type Result = std::result::Result; + +/// Emulates the PCI Root bridge device. +pub struct PciRoot { + /// Configuration space. + config: PciConfiguration, +} + +impl PciRoot { + /// Create an empty PCI root bridge. + pub fn new(config: Option) -> Self { + if let Some(config) = config { + PciRoot { config } + } else { + PciRoot { + config: PciConfiguration::new( + VENDOR_ID_INTEL, + DEVICE_ID_INTEL_VIRT_PCIE_HOST, + 0, + PciClassCode::BridgeDevice, + &PciBridgeSubclass::HostBridge, + None, + PciHeaderType::Device, + 0, + 0, + None, + ), + } + } + } +} + +impl BusDevice for PciRoot {} + +impl PciDevice for PciRoot { + fn write_config_register( + &mut self, + reg_idx: usize, + offset: u64, + data: &[u8], + ) -> Option> { + self.config.write_config_register(reg_idx, offset, data); + None + } + + fn read_config_register(&mut self, reg_idx: usize) -> u32 { + self.config.read_reg(reg_idx) + } + + fn as_any(&mut self) -> &mut dyn Any { + self + } +} + +pub struct PciBus { + /// Devices attached to this bus. + /// Device 0 is host bridge. 
+ devices: HashMap>>, + device_ids: Vec, +} + +impl PciBus { + pub fn new(pci_root: PciRoot) -> Self { + let mut devices: HashMap>> = HashMap::new(); + let mut device_ids: Vec = vec![false; NUM_DEVICE_IDS]; + + devices.insert(0, Arc::new(Mutex::new(pci_root))); + device_ids[0] = true; + + PciBus { + devices, + device_ids, + } + } + + pub fn register_mapping( + &self, + dev: Arc>, + io_bus: &mut Bus, + mmio_bus: &mut Bus, + bars: Vec<(GuestAddress, GuestUsize, PciBarRegionType)>, + ) -> Result<()> { + for (address, size, type_) in bars { + match type_ { + PciBarRegionType::IoRegion => { + io_bus + .insert(dev.clone(), address.raw_value(), size) + .unwrap(); + error!("cannot register bus mappings {:x} {:x} IO", address.0, size); + } + PciBarRegionType::Memory32BitRegion | PciBarRegionType::Memory64BitRegion => { + error!("Registering bus mappings {:x} {:x}", address.0, size); + mmio_bus + .insert(dev.clone(), address.raw_value(), size) + .unwrap(); + } + } + } + Ok(()) + } + + pub fn add_device( + &mut self, + pci_device_bdf: u32, + device: Arc>, + ) -> Result<()> { + self.devices.insert(pci_device_bdf >> 3, device); + Ok(()) + } + + pub fn remove_by_device(&mut self, device: &Arc>) -> Result<()> { + self.devices.retain(|_, dev| !Arc::ptr_eq(dev, device)); + Ok(()) + } + + pub fn next_device_id(&mut self) -> Result { + for (idx, device_id) in self.device_ids.iter_mut().enumerate() { + if !(*device_id) { + *device_id = true; + return Ok(idx as u32); + } + } + + Err(PciRootError::NoPciDeviceSlotAvailable) + } + + pub fn get_device_id(&mut self, id: usize) -> Result<()> { + if id < NUM_DEVICE_IDS { + if !self.device_ids[id] { + self.device_ids[id] = true; + Ok(()) + } else { + Err(PciRootError::AlreadyInUsePciDeviceSlot(id)) + } + } else { + Err(PciRootError::InvalidPciDeviceSlot(id)) + } + } + + pub fn put_device_id(&mut self, id: usize) -> Result<()> { + if id < NUM_DEVICE_IDS { + self.device_ids[id] = false; + Ok(()) + } else { + 
Err(PciRootError::InvalidPciDeviceSlot(id)) + } + } +} + +pub struct PciConfigIo { + /// Config space register. + config_address: u32, + pci_bus: Arc>, +} + +impl PciConfigIo { + pub fn new(pci_bus: Arc>) -> Self { + PciConfigIo { + pci_bus, + config_address: 0, + } + } + + pub fn config_space_read(&self) -> u32 { + let enabled = (self.config_address & 0x8000_0000) != 0; + if !enabled { + return 0xffff_ffff; + } + + let (bus, device, function, register) = + parse_io_config_address(self.config_address & !0x8000_0000); + + error!( + "config space read {}:{}:{} reg {}", + bus, device, function, register + ); + + // Only support one bus. + if bus != 0 { + return 0xffff_ffff; + } + + // Don't support multi-function devices. + if function > 0 { + return 0xffff_ffff; + } + + self.pci_bus + .lock() + .unwrap() + .devices + .get(&(device as u32)) + .map_or(0xffff_ffff, |d| { + d.lock().unwrap().read_config_register(register) + }) + } + + pub fn config_space_write(&mut self, offset: u64, data: &[u8]) -> Option> { + if offset as usize + data.len() > 4 { + return None; + } + + let enabled = (self.config_address & 0x8000_0000) != 0; + if !enabled { + return None; + } + + let (bus, device, _function, register) = + parse_io_config_address(self.config_address & !0x8000_0000); + + // Only support one bus. 
+ if bus != 0 { + return None; + } + + let pci_bus = self.pci_bus.lock().unwrap(); + if let Some(d) = pci_bus.devices.get(&(device as u32)) { + let mut device = d.lock().unwrap(); + + if let Some(params) = device.detect_bar_reprogramming(register, data) { + // if let Err(e) = pci_bus.device_reloc.move_bar( + // params.old_base, + // params.new_base, + // params.len, + // device.deref_mut(), + // params.region_type, + // ) { + // error!( + // "Failed moving device BAR: {}: 0x{:x}->0x{:x}(0x{:x})", + // e, params.old_base, params.new_base, params.len + // ); + // } + error!( + "Failed moving device BAR: 0x{:x}->0x{:x}(0x{:x})", + params.old_base, params.new_base, params.len + ); + } + // Update the register value + device.write_config_register(register, offset, data) + } else { + None + } + } + + fn set_config_address(&mut self, offset: u64, data: &[u8]) { + if offset as usize + data.len() > 4 { + return; + } + let (mask, value): (u32, u32) = match data.len() { + 1 => ( + 0x0000_00ff << (offset * 8), + u32::from(data[0]) << (offset * 8), + ), + 2 => ( + 0x0000_ffff << (offset * 16), + (u32::from(data[1]) << 8 | u32::from(data[0])) << (offset * 16), + ), + 4 => (0xffff_ffff, LittleEndian::read_u32(data)), + _ => return, + }; + self.config_address = (self.config_address & !mask) | value; + } +} + +impl BusDevice for PciConfigIo { + fn read(&mut self, _: u64, offset: u64, data: &mut [u8]) { + // `offset` is relative to 0xcf8 + let value = match offset { + 0..=3 => self.config_address, + 4..=7 => self.config_space_read(), + _ => 0xffff_ffff, + }; + + // Only allow reads to the register boundary. 
+ let start = offset as usize % 4; + let end = start + data.len(); + if end <= 4 { + for i in start..end { + data[i - start] = (value >> (i * 8)) as u8; + } + } else { + for d in data { + *d = 0xff; + } + } + } + + fn write(&mut self, _: u64, offset: u64, data: &[u8]) { + // `offset` is relative to 0xcf8 + match offset { + o @ 0..=3 => { + self.set_config_address(o, data); + } + o @ 4..=7 => { + self.config_space_write(o - 4, data); + } + _ => {} + } + } +} + +/// Emulates PCI memory-mapped configuration access mechanism. +pub struct PciConfigMmio { + pci_bus: Arc>, +} + +impl PciConfigMmio { + pub fn new(pci_bus: Arc>) -> Self { + PciConfigMmio { pci_bus } + } + + fn config_space_read(&self, config_address: u32) -> u32 { + let (bus, device, _function, register) = parse_mmio_config_address(config_address); + + // Only support one bus. + if bus != 0 { + return 0xffff_ffff; + } + + self.pci_bus + .lock() + .unwrap() + .devices + .get(&(device as u32)) + .map_or(0xffff_ffff, |d| { + d.lock().unwrap().read_config_register(register) + }) + } + + fn config_space_write(&mut self, config_address: u32, offset: u64, data: &[u8]) { + if offset as usize + data.len() > 4 { + return; + } + + let (bus, device, _function, register) = parse_mmio_config_address(config_address); + + // Only support one bus. 
+ if bus != 0 { + return; + } + + let pci_bus = self.pci_bus.lock().unwrap(); + if let Some(d) = pci_bus.devices.get(&(device as u32)) { + let mut device = d.lock().unwrap(); + + if let Some(params) = device.detect_bar_reprogramming(register, data) { + // if let Err(e) = pci_bus.device_reloc.move_bar( + // params.old_base, + // params.new_base, + // params.len, + // device.deref_mut(), + // params.region_type, + // ) { + // error!( + // "Failed moving device BAR: {}: 0x{:x}->0x{:x}(0x{:x})", + // e, params.old_base, params.new_base, params.len + // ); + // } + error!( + "Failed moving device BAR: 0x{:x}->0x{:x}(0x{:x})", + params.old_base, params.new_base, params.len + ); + } + + // Update the register value + device.write_config_register(register, offset, data); + } + } +} + +impl BusDevice for PciConfigMmio { + fn read(&mut self, _: u64, offset: u64, data: &mut [u8]) { + // Only allow reads to the register boundary. + let start = offset as usize % 4; + let end = start + data.len(); + if end > 4 || offset > u64::from(u32::max_value()) { + for d in data { + *d = 0xff; + } + return; + } + + let value = self.config_space_read(offset as u32); + for i in start..end { + data[i - start] = (value >> (i * 8)) as u8; + } + } + + fn bus_write(&mut self, _: u64, offset: u64, data: &[u8]) { + if offset > u64::from(u32::max_value()) { + return; + } + self.config_space_write(offset as u32, offset % 4, data); + } +} + +fn shift_and_mask(value: u32, offset: usize, mask: u32) -> usize { + ((value >> offset) & mask) as usize +} + +// Parse the MMIO address offset to a (bus, device, function, register) tuple. +// See section 7.2.2 PCI Express Enhanced Configuration Access Mechanism (ECAM) +// from the Pci Express Base Specification Revision 5.0 Version 1.0. 
+fn parse_mmio_config_address(config_address: u32) -> (usize, usize, usize, usize) { + const BUS_NUMBER_OFFSET: usize = 20; + const BUS_NUMBER_MASK: u32 = 0x00ff; + const DEVICE_NUMBER_OFFSET: usize = 15; + const DEVICE_NUMBER_MASK: u32 = 0x1f; + const FUNCTION_NUMBER_OFFSET: usize = 12; + const FUNCTION_NUMBER_MASK: u32 = 0x07; + const REGISTER_NUMBER_OFFSET: usize = 2; + const REGISTER_NUMBER_MASK: u32 = 0x3ff; + + ( + shift_and_mask(config_address, BUS_NUMBER_OFFSET, BUS_NUMBER_MASK), + shift_and_mask(config_address, DEVICE_NUMBER_OFFSET, DEVICE_NUMBER_MASK), + shift_and_mask(config_address, FUNCTION_NUMBER_OFFSET, FUNCTION_NUMBER_MASK), + shift_and_mask(config_address, REGISTER_NUMBER_OFFSET, REGISTER_NUMBER_MASK), + ) +} + +// Parse the CONFIG_ADDRESS register to a (bus, device, function, register) tuple. +fn parse_io_config_address(config_address: u32) -> (usize, usize, usize, usize) { + const BUS_NUMBER_OFFSET: usize = 16; + const BUS_NUMBER_MASK: u32 = 0x00ff; + const DEVICE_NUMBER_OFFSET: usize = 11; + const DEVICE_NUMBER_MASK: u32 = 0x1f; + const FUNCTION_NUMBER_OFFSET: usize = 8; + const FUNCTION_NUMBER_MASK: u32 = 0x07; + const REGISTER_NUMBER_OFFSET: usize = 2; + const REGISTER_NUMBER_MASK: u32 = 0x3f; + + ( + shift_and_mask(config_address, BUS_NUMBER_OFFSET, BUS_NUMBER_MASK), + shift_and_mask(config_address, DEVICE_NUMBER_OFFSET, DEVICE_NUMBER_MASK), + shift_and_mask(config_address, FUNCTION_NUMBER_OFFSET, FUNCTION_NUMBER_MASK), + shift_and_mask(config_address, REGISTER_NUMBER_OFFSET, REGISTER_NUMBER_MASK), + ) +} diff --git a/src/pci/src/configuration.rs b/src/pci/src/configuration.rs new file mode 100644 index 00000000000..48753c1ef9a --- /dev/null +++ b/src/pci/src/configuration.rs @@ -0,0 +1,1008 @@ +// Copyright 2018 The Chromium OS Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE-BSD-3-Clause file. 
+ +use crate::{BarReprogrammingParams, MsixConfig, PciInterruptPin}; +use byteorder::{ByteOrder, LittleEndian}; +use std::fmt::{self, Display}; +use std::sync::{Arc, Mutex}; +use log::{debug, warn}; + +// The number of 32bit registers in the config space, 4096 bytes. +const NUM_CONFIGURATION_REGISTERS: usize = 1024; + +const STATUS_REG: usize = 1; +const STATUS_REG_CAPABILITIES_USED_MASK: u32 = 0x0010_0000; +const BAR0_REG: usize = 4; +const ROM_BAR_REG: usize = 12; +const BAR_IO_ADDR_MASK: u32 = 0xffff_fffc; +const BAR_MEM_ADDR_MASK: u32 = 0xffff_fff0; +const ROM_BAR_ADDR_MASK: u32 = 0xffff_f800; +const NUM_BAR_REGS: usize = 6; +const CAPABILITY_LIST_HEAD_OFFSET: usize = 0x34; +pub const FIRST_CAPABILITY_OFFSET: usize = 0x40; +pub const CAPABILITY_MAX_OFFSET: usize = 192; + +const INTERRUPT_LINE_PIN_REG: usize = 15; + +/// Represents the types of PCI headers allowed in the configuration registers. +#[derive(Copy, Clone)] +pub enum PciHeaderType { + Device, + Bridge, +} + +/// Classes of PCI nodes. +#[allow(dead_code)] +#[derive(Copy, Clone)] +pub enum PciClassCode { + TooOld, + MassStorage, + NetworkController, + DisplayController, + MultimediaController, + MemoryController, + BridgeDevice, + SimpleCommunicationController, + BaseSystemPeripheral, + InputDevice, + DockingStation, + Processor, + SerialBusController, + WirelessController, + IntelligentIoController, + EncryptionController, + DataAcquisitionSignalProcessing, + Other = 0xff, +} + +impl PciClassCode { + pub fn get_register_value(self) -> u8 { + self as u8 + } +} + +/// A PCI subclass. Each class in `PciClassCode` can specify a unique set of subclasses. This trait +/// is implemented by each subclass. It allows use of a trait object to generate configurations. +pub trait PciSubclass { + /// Convert this subclass to the value used in the PCI specification. + fn get_register_value(&self) -> u8; +} + +/// Subclasses of the MultimediaController class. 
+#[allow(dead_code)] +#[derive(Copy, Clone)] +pub enum PciMultimediaSubclass { + VideoController = 0x00, + AudioController = 0x01, + TelephonyDevice = 0x02, + AudioDevice = 0x03, + Other = 0x80, +} + +impl PciSubclass for PciMultimediaSubclass { + fn get_register_value(&self) -> u8 { + *self as u8 + } +} + +/// Subclasses of the BridgeDevice +#[allow(dead_code)] +#[derive(Copy, Clone)] +pub enum PciBridgeSubclass { + HostBridge = 0x00, + IsaBridge = 0x01, + EisaBridge = 0x02, + McaBridge = 0x03, + PciToPciBridge = 0x04, + PcmciaBridge = 0x05, + NuBusBridge = 0x06, + CardBusBridge = 0x07, + RacEwayBridge = 0x08, + PciToPciSemiTransparentBridge = 0x09, + InfiniBrandToPciHostBridge = 0x0a, + OtherBridgeDevice = 0x80, +} + +impl PciSubclass for PciBridgeSubclass { + fn get_register_value(&self) -> u8 { + *self as u8 + } +} + +/// Subclass of the SerialBus +#[allow(dead_code)] +#[derive(Copy, Clone)] +pub enum PciSerialBusSubClass { + Firewire = 0x00, + Accessbus = 0x01, + Ssa = 0x02, + Usb = 0x03, +} + +impl PciSubclass for PciSerialBusSubClass { + fn get_register_value(&self) -> u8 { + *self as u8 + } +} + +/// Mass Storage Sub Classes +#[allow(dead_code)] +#[derive(Copy, Clone)] +pub enum PciMassStorageSubclass { + ScsiStorage = 0x00, + IdeInterface = 0x01, + FloppyController = 0x02, + IpiController = 0x03, + RaidController = 0x04, + AtaController = 0x05, + SataController = 0x06, + SerialScsiController = 0x07, + NvmController = 0x08, + MassStorage = 0x80, +} + +impl PciSubclass for PciMassStorageSubclass { + fn get_register_value(&self) -> u8 { + *self as u8 + } +} + +/// Network Controller Sub Classes +#[allow(dead_code)] +#[derive(Copy, Clone)] +pub enum PciNetworkControllerSubclass { + EthernetController = 0x00, + TokenRingController = 0x01, + FddiController = 0x02, + AtmController = 0x03, + IsdnController = 0x04, + WorldFipController = 0x05, + PicmgController = 0x06, + InfinibandController = 0x07, + FabricController = 0x08, + NetworkController = 0x80, +} + +impl 
PciSubclass for PciNetworkControllerSubclass { + fn get_register_value(&self) -> u8 { + *self as u8 + } +} + +/// A PCI class programming interface. Each combination of `PciClassCode` and +/// `PciSubclass` can specify a set of register-level programming interfaces. +/// This trait is implemented by each programming interface. +/// It allows use of a trait object to generate configurations. +pub trait PciProgrammingInterface { + /// Convert this programming interface to the value used in the PCI specification. + fn get_register_value(&self) -> u8; +} + +/// Types of PCI capabilities. +#[derive(PartialEq, Copy, Clone)] +#[allow(dead_code)] +#[allow(non_camel_case_types)] +#[repr(C)] +pub enum PciCapabilityId { + ListId = 0, + PowerManagement = 0x01, + AcceleratedGraphicsPort = 0x02, + VitalProductData = 0x03, + SlotIdentification = 0x04, + MessageSignalledInterrupts = 0x05, + CompactPciHotSwap = 0x06, + PciX = 0x07, + HyperTransport = 0x08, + VendorSpecific = 0x09, + Debugport = 0x0A, + CompactPciCentralResourceControl = 0x0B, + PciStandardHotPlugController = 0x0C, + BridgeSubsystemVendorDeviceId = 0x0D, + AgpTargetPciPcibridge = 0x0E, + SecureDevice = 0x0F, + PciExpress = 0x10, + MsiX = 0x11, + SataDataIndexConf = 0x12, + PciAdvancedFeatures = 0x13, + PciEnhancedAllocation = 0x14, +} + +impl From for PciCapabilityId { + fn from(c: u8) -> Self { + match c { + 0 => PciCapabilityId::ListId, + 0x01 => PciCapabilityId::PowerManagement, + 0x02 => PciCapabilityId::AcceleratedGraphicsPort, + 0x03 => PciCapabilityId::VitalProductData, + 0x04 => PciCapabilityId::SlotIdentification, + 0x05 => PciCapabilityId::MessageSignalledInterrupts, + 0x06 => PciCapabilityId::CompactPciHotSwap, + 0x07 => PciCapabilityId::PciX, + 0x08 => PciCapabilityId::HyperTransport, + 0x09 => PciCapabilityId::VendorSpecific, + 0x0A => PciCapabilityId::Debugport, + 0x0B => PciCapabilityId::CompactPciCentralResourceControl, + 0x0C => PciCapabilityId::PciStandardHotPlugController, + 0x0D => 
PciCapabilityId::BridgeSubsystemVendorDeviceId, + 0x0E => PciCapabilityId::AgpTargetPciPcibridge, + 0x0F => PciCapabilityId::SecureDevice, + 0x10 => PciCapabilityId::PciExpress, + 0x11 => PciCapabilityId::MsiX, + 0x12 => PciCapabilityId::SataDataIndexConf, + 0x13 => PciCapabilityId::PciAdvancedFeatures, + 0x14 => PciCapabilityId::PciEnhancedAllocation, + _ => PciCapabilityId::ListId, + } + } +} + +/// A PCI capability list. Devices can optionally specify capabilities in their configuration space. +pub trait PciCapability { + fn bytes(&self) -> &[u8]; + fn id(&self) -> PciCapabilityId; +} + +fn encode_32_bits_bar_size(bar_size: u32) -> Option { + if bar_size > 0 { + return Some(!(bar_size - 1)); + } + None +} + +fn decode_32_bits_bar_size(bar_size: u32) -> Option { + if bar_size > 0 { + return Some(!bar_size + 1); + } + None +} + +fn encode_64_bits_bar_size(bar_size: u64) -> Option<(u32, u32)> { + if bar_size > 0 { + let result = !(bar_size - 1); + let result_hi = (result >> 32) as u32; + let result_lo = (result & 0xffff_ffff) as u32; + return Some((result_hi, result_lo)); + } + None +} + +fn decode_64_bits_bar_size(bar_size_hi: u32, bar_size_lo: u32) -> Option { + let bar_size: u64 = ((bar_size_hi as u64) << 32) | (bar_size_lo as u64); + if bar_size > 0 { + return Some(!bar_size + 1); + } + None +} + +#[derive(Default, Clone, Copy)] +struct PciBar { + addr: u32, + size: u32, + used: bool, + r#type: Option, +} + +/// Contains the configuration space of a PCI node. +/// See the [specification](https://en.wikipedia.org/wiki/PCI_configuration_space). +/// The configuration space is accessed with DWORD reads and writes from the guest. +pub struct PciConfiguration { + registers: [u32; NUM_CONFIGURATION_REGISTERS], + writable_bits: [u32; NUM_CONFIGURATION_REGISTERS], // writable bits for each register. + bars: [PciBar; NUM_BAR_REGS], + rom_bar_addr: u32, + rom_bar_size: u32, + rom_bar_used: bool, + // Contains the byte offset and size of the last capability. 
+ last_capability: Option<(usize, usize)>, + msix_cap_reg_idx: Option, + msix_config: Option>>, +} + +/// See pci_regs.h in kernel +#[derive(Copy, Clone, PartialEq)] +pub enum PciBarRegionType { + Memory32BitRegion = 0, + IoRegion = 0x01, + Memory64BitRegion = 0x04, +} + +#[derive(Copy, Clone)] +pub enum PciBarPrefetchable { + NotPrefetchable = 0, + Prefetchable = 0x08, +} + +#[derive(Copy, Clone)] +pub struct PciBarConfiguration { + addr: u64, + size: u64, + reg_idx: usize, + region_type: PciBarRegionType, + prefetchable: PciBarPrefetchable, +} + +#[derive(Debug)] +pub enum Error { + BarAddressInvalid(u64, u64), + BarInUse(usize), + BarInUse64(usize), + BarInvalid(usize), + BarInvalid64(usize), + BarSizeInvalid(u64), + CapabilityEmpty, + CapabilityLengthInvalid(usize), + CapabilitySpaceFull(usize), + Decode32BarSize, + Decode64BarSize, + Encode32BarSize, + Encode64BarSize, + RomBarAddressInvalid(u64, u64), + RomBarInUse(usize), + RomBarInvalid(usize), + RomBarSizeInvalid(u64), +} +pub type Result = std::result::Result; + +impl std::error::Error for Error {} + +impl Display for Error { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + use self::Error::*; + match self { + BarAddressInvalid(a, s) => write!(f, "address {} size {} too big", a, s), + BarInUse(b) => write!(f, "bar {} already used", b), + BarInUse64(b) => write!(f, "64bit bar {} already used(requires two regs)", b), + BarInvalid(b) => write!(f, "bar {} invalid, max {}", b, NUM_BAR_REGS - 1), + BarInvalid64(b) => write!( + f, + "64bitbar {} invalid, requires two regs, max {}", + b, + NUM_BAR_REGS - 1 + ), + BarSizeInvalid(s) => write!(f, "bar address {} not a power of two", s), + CapabilityEmpty => write!(f, "empty capabilities are invalid"), + CapabilityLengthInvalid(l) => write!(f, "Invalid capability length {}", l), + CapabilitySpaceFull(s) => write!(f, "capability of size {} doesn't fit", s), + Decode32BarSize => write!(f, "failed to decode 32 bits BAR size"), + Decode64BarSize => write!(f, 
"failed to decode 64 bits BAR size"), + Encode32BarSize => write!(f, "failed to encode 32 bits BAR size"), + Encode64BarSize => write!(f, "failed to encode 64 bits BAR size"), + RomBarAddressInvalid(a, s) => write!(f, "address {} size {} too big", a, s), + RomBarInUse(b) => write!(f, "rom bar {} already used", b), + RomBarInvalid(b) => write!(f, "rom bar {} invalid, max {}", b, NUM_BAR_REGS - 1), + RomBarSizeInvalid(s) => write!(f, "rom bar address {} not a power of two", s), + } + } +} + +impl PciConfiguration { + #[allow(clippy::too_many_arguments)] + pub fn new( + vendor_id: u16, + device_id: u16, + revision_id: u8, + class_code: PciClassCode, + subclass: &dyn PciSubclass, + programming_interface: Option<&dyn PciProgrammingInterface>, + header_type: PciHeaderType, + subsystem_vendor_id: u16, + subsystem_id: u16, + msix_config: Option>>, + ) -> Self { + let mut registers = [0u32; NUM_CONFIGURATION_REGISTERS]; + let mut writable_bits = [0u32; NUM_CONFIGURATION_REGISTERS]; + registers[0] = u32::from(device_id) << 16 | u32::from(vendor_id); + // TODO(dverkamp): Status should be write-1-to-clear + writable_bits[1] = 0x0000_ffff; // Status (r/o), command (r/w) + let pi = if let Some(pi) = programming_interface { + pi.get_register_value() + } else { + 0 + }; + registers[2] = u32::from(class_code.get_register_value()) << 24 + | u32::from(subclass.get_register_value()) << 16 + | u32::from(pi) << 8 + | u32::from(revision_id); + writable_bits[3] = 0x0000_00ff; // Cacheline size (r/w) + match header_type { + PciHeaderType::Device => { + registers[3] = 0x0000_0000; // Header type 0 (device) + writable_bits[15] = 0x0000_00ff; // Interrupt line (r/w) + } + PciHeaderType::Bridge => { + registers[3] = 0x0001_0000; // Header type 1 (bridge) + writable_bits[9] = 0xfff0_fff0; // Memory base and limit + writable_bits[15] = 0xffff_00ff; // Bridge control (r/w), interrupt line (r/w) + } + }; + registers[11] = u32::from(subsystem_id) << 16 | u32::from(subsystem_vendor_id); + let bars = 
[PciBar::default(); NUM_BAR_REGS]; + + PciConfiguration { + registers, + writable_bits, + bars, + rom_bar_addr: 0, + rom_bar_size: 0, + rom_bar_used: false, + last_capability: None, + msix_cap_reg_idx: None, + msix_config, + } + } + + /// Reads a 32bit register from `reg_idx` in the register map. + pub fn read_reg(&self, reg_idx: usize) -> u32 { + *(self.registers.get(reg_idx).unwrap_or(&0xffff_ffff)) + } + + /// Writes a 32bit register to `reg_idx` in the register map. + pub fn write_reg(&mut self, reg_idx: usize, value: u32) { + let mut mask = self.writable_bits[reg_idx]; + + if (BAR0_REG..BAR0_REG + NUM_BAR_REGS).contains(®_idx) { + // Handle very specific case where the BAR is being written with + // all 1's to retrieve the BAR size during next BAR reading. + if value == 0xffff_ffff { + mask &= self.bars[reg_idx - 4].size; + } + } else if reg_idx == ROM_BAR_REG { + // Handle very specific case where the BAR is being written with + // all 1's on bits 31-11 to retrieve the BAR size during next BAR + // reading. + if value & ROM_BAR_ADDR_MASK == ROM_BAR_ADDR_MASK { + mask &= self.rom_bar_size; + } + } + + if let Some(r) = self.registers.get_mut(reg_idx) { + *r = (*r & !self.writable_bits[reg_idx]) | (value & mask); + } else { + warn!("bad PCI register write {}", reg_idx); + } + } + + /// Writes a 16bit word to `offset`. `offset` must be 16bit aligned. + pub fn write_word(&mut self, offset: usize, value: u16) { + let shift = match offset % 4 { + 0 => 0, + 2 => 16, + _ => { + warn!("bad PCI config write offset {}", offset); + return; + } + }; + let reg_idx = offset / 4; + + if let Some(r) = self.registers.get_mut(reg_idx) { + let writable_mask = self.writable_bits[reg_idx]; + let mask = (0xffffu32 << shift) & writable_mask; + let shifted_value = (u32::from(value) << shift) & writable_mask; + *r = *r & !mask | shifted_value; + } else { + warn!("bad PCI config write offset {}", offset); + } + } + + /// Writes a byte to `offset`. 
+ pub fn write_byte(&mut self, offset: usize, value: u8) { + self.write_byte_internal(offset, value, true); + } + + /// Writes a byte to `offset`, optionally enforcing read-only bits. + fn write_byte_internal(&mut self, offset: usize, value: u8, apply_writable_mask: bool) { + let shift = (offset % 4) * 8; + let reg_idx = offset / 4; + + if let Some(r) = self.registers.get_mut(reg_idx) { + let writable_mask = if apply_writable_mask { + self.writable_bits[reg_idx] + } else { + 0xffff_ffff + }; + let mask = (0xffu32 << shift) & writable_mask; + let shifted_value = (u32::from(value) << shift) & writable_mask; + *r = *r & !mask | shifted_value; + } else { + warn!("bad PCI config write offset {}", offset); + } + } + + /// Adds a region specified by `config`. Configures the specified BAR(s) to + /// report this region and size to the guest kernel. Enforces a few constraints + /// (i.e, region size must be power of two, register not already used). Returns 'None' on + /// failure all, `Some(BarIndex)` on success. + pub fn add_pci_bar(&mut self, config: &PciBarConfiguration) -> Result { + if self.bars[config.reg_idx].used { + return Err(Error::BarInUse(config.reg_idx)); + } + + if config.size.count_ones() != 1 { + return Err(Error::BarSizeInvalid(config.size)); + } + + if config.reg_idx >= NUM_BAR_REGS { + return Err(Error::BarInvalid(config.reg_idx)); + } + + let bar_idx = BAR0_REG + config.reg_idx; + let end_addr = config + .addr + .checked_add(config.size - 1) + .ok_or(Error::BarAddressInvalid(config.addr, config.size))?; + match config.region_type { + PciBarRegionType::Memory32BitRegion | PciBarRegionType::IoRegion => { + if end_addr > u64::from(u32::max_value()) { + return Err(Error::BarAddressInvalid(config.addr, config.size)); + } + + // Encode the BAR size as expected by the software running in + // the guest. 
+ self.bars[config.reg_idx].size = + encode_32_bits_bar_size(config.size as u32).ok_or(Error::Encode32BarSize)?; + } + PciBarRegionType::Memory64BitRegion => { + if config.reg_idx + 1 >= NUM_BAR_REGS { + return Err(Error::BarInvalid64(config.reg_idx)); + } + + if end_addr > u64::max_value() { + return Err(Error::BarAddressInvalid(config.addr, config.size)); + } + + if self.bars[config.reg_idx + 1].used { + return Err(Error::BarInUse64(config.reg_idx)); + } + + // Encode the BAR size as expected by the software running in + // the guest. + let (bar_size_hi, bar_size_lo) = + encode_64_bits_bar_size(config.size).ok_or(Error::Encode64BarSize)?; + + self.registers[bar_idx + 1] = (config.addr >> 32) as u32; + self.writable_bits[bar_idx + 1] = 0xffff_ffff; + self.bars[config.reg_idx + 1].addr = self.registers[bar_idx + 1]; + self.bars[config.reg_idx].size = bar_size_lo; + self.bars[config.reg_idx + 1].size = bar_size_hi; + self.bars[config.reg_idx + 1].used = true; + } + } + + let (mask, lower_bits) = match config.region_type { + PciBarRegionType::Memory32BitRegion | PciBarRegionType::Memory64BitRegion => ( + BAR_MEM_ADDR_MASK, + config.prefetchable as u32 | config.region_type as u32, + ), + PciBarRegionType::IoRegion => (BAR_IO_ADDR_MASK, config.region_type as u32), + }; + + self.registers[bar_idx] = ((config.addr as u32) & mask) | lower_bits; + self.writable_bits[bar_idx] = mask; + self.bars[config.reg_idx].addr = self.registers[bar_idx]; + self.bars[config.reg_idx].used = true; + self.bars[config.reg_idx].r#type = Some(config.region_type); + Ok(config.reg_idx) + } + + /// Adds rom expansion BAR. 
+ pub fn add_pci_rom_bar(&mut self, config: &PciBarConfiguration, active: u32) -> Result { + if self.rom_bar_used { + return Err(Error::RomBarInUse(config.reg_idx)); + } + + if config.size.count_ones() != 1 { + return Err(Error::RomBarSizeInvalid(config.size)); + } + + if config.reg_idx != ROM_BAR_REG { + return Err(Error::RomBarInvalid(config.reg_idx)); + } + + let end_addr = config + .addr + .checked_add(config.size - 1) + .ok_or(Error::RomBarAddressInvalid(config.addr, config.size))?; + + if end_addr > u64::from(u32::max_value()) { + return Err(Error::RomBarAddressInvalid(config.addr, config.size)); + } + + self.registers[config.reg_idx] = (config.addr as u32) | active; + self.writable_bits[config.reg_idx] = ROM_BAR_ADDR_MASK; + self.rom_bar_addr = self.registers[config.reg_idx]; + self.rom_bar_size = + encode_32_bits_bar_size(config.size as u32).ok_or(Error::Encode32BarSize)?; + self.rom_bar_used = true; + Ok(config.reg_idx) + } + + /// Returns the address of the given BAR region. + pub fn get_bar_addr(&self, bar_num: usize) -> u64 { + let bar_idx = BAR0_REG + bar_num; + + let mut addr = u64::from(self.bars[bar_num].addr & self.writable_bits[bar_idx]); + + if let Some(bar_type) = self.bars[bar_num].r#type { + if bar_type == PciBarRegionType::Memory64BitRegion { + addr |= u64::from(self.bars[bar_num + 1].addr) << 32; + } + } + + addr + } + + /// Configures the IRQ line and pin used by this device. + pub fn set_irq(&mut self, line: u8, pin: PciInterruptPin) { + // `pin` is 1-based in the pci config space. + let pin_idx = (pin as u32) + 1; + self.registers[INTERRUPT_LINE_PIN_REG] = (self.registers[INTERRUPT_LINE_PIN_REG] + & 0xffff_0000) + | (pin_idx << 8) + | u32::from(line); + } + + /// Adds the capability `cap_data` to the list of capabilities. + /// `cap_data` should include the two-byte PCI capability header (type, next), + /// but not populate it. Correct values will be generated automatically based + /// on `cap_data.id()`. 
+ pub fn add_capability(&mut self, cap_data: &dyn PciCapability) -> Result { + let total_len = cap_data.bytes().len(); + // Check that the length is valid. + if cap_data.bytes().is_empty() { + return Err(Error::CapabilityEmpty); + } + let (cap_offset, tail_offset) = match self.last_capability { + Some((offset, len)) => (Self::next_dword(offset, len), offset + 1), + None => (FIRST_CAPABILITY_OFFSET, CAPABILITY_LIST_HEAD_OFFSET), + }; + let end_offset = cap_offset + .checked_add(total_len) + .ok_or(Error::CapabilitySpaceFull(total_len))?; + if end_offset > CAPABILITY_MAX_OFFSET { + return Err(Error::CapabilitySpaceFull(total_len)); + } + self.registers[STATUS_REG] |= STATUS_REG_CAPABILITIES_USED_MASK; + self.write_byte_internal(tail_offset, cap_offset as u8, false); + self.write_byte_internal(cap_offset, cap_data.id() as u8, false); + self.write_byte_internal(cap_offset + 1, 0, false); // Next pointer. + for (i, byte) in cap_data.bytes().iter().enumerate() { + self.write_byte_internal(cap_offset + i + 2, *byte, false); + } + self.last_capability = Some((cap_offset, total_len)); + + if cap_data.id() == PciCapabilityId::MsiX { + self.msix_cap_reg_idx = Some(cap_offset / 4); + } + + Ok(cap_offset) + } + + // Find the next aligned offset after the one given. 
+ fn next_dword(offset: usize, len: usize) -> usize { + let next = offset + len; + (next + 3) & !3 + } + + pub fn write_config_register(&mut self, reg_idx: usize, offset: u64, data: &[u8]) { + if offset as usize + data.len() > 4 { + return; + } + + // Handle potential write to MSI-X message control register + if let Some(msix_cap_reg_idx) = self.msix_cap_reg_idx { + if let Some(msix_config) = &self.msix_config { + if msix_cap_reg_idx == reg_idx && offset == 2 && data.len() == 2 { + msix_config + .lock() + .unwrap() + .set_msg_ctl(LittleEndian::read_u16(data)); + } + } + } + + match data.len() { + 1 => self.write_byte(reg_idx * 4 + offset as usize, data[0]), + 2 => self.write_word( + reg_idx * 4 + offset as usize, + u16::from(data[0]) | u16::from(data[1]) << 8, + ), + 4 => self.write_reg(reg_idx, LittleEndian::read_u32(data)), + _ => (), + } + } + + pub fn read_config_register(&self, reg_idx: usize) -> u32 { + self.read_reg(reg_idx) + } + + pub fn detect_bar_reprogramming( + &mut self, + reg_idx: usize, + data: &[u8], + ) -> Option { + if data.len() != 4 { + return None; + } + + let value = LittleEndian::read_u32(data); + + let mask = self.writable_bits[reg_idx]; + if (BAR0_REG..BAR0_REG + NUM_BAR_REGS).contains(®_idx) { + let bar_idx = reg_idx - 4; + if (value & mask) != (self.bars[bar_idx].addr & mask) { + // Handle special case where the address being written is + // different from the address initially provided. This is a + // BAR reprogramming case which needs to be properly caught. + if let Some(bar_type) = self.bars[bar_idx].r#type { + match bar_type { + PciBarRegionType::Memory64BitRegion => {} + _ => { + // Ignore the case where the BAR size is being + // asked for. 
+ if value == 0xffff_ffff { + return None; + } + + debug!( + "DETECT BAR REPROG: current 0x{:x}, new 0x{:x}", + self.registers[reg_idx], value + ); + let old_base = u64::from(self.bars[bar_idx].addr & mask); + let new_base = u64::from(value & mask); + let len = u64::from( + decode_32_bits_bar_size(self.bars[bar_idx].size) + .ok_or(Error::Decode32BarSize) + .unwrap(), + ); + let region_type = bar_type; + + self.bars[bar_idx].addr = value; + + return Some(BarReprogrammingParams { + old_base, + new_base, + len, + region_type, + }); + } + } + } else if (reg_idx > BAR0_REG) + && (self.registers[reg_idx - 1] & self.writable_bits[reg_idx - 1]) + != (self.bars[bar_idx - 1].addr & self.writable_bits[reg_idx - 1]) + { + // Ignore the case where the BAR size is being asked for. + // Because we are in the 64bits case here, we have to check + // if the lower 32bits of the current BAR have already been + // asked for the BAR size too. + if value == 0xffff_ffff + && self.registers[reg_idx - 1] & self.writable_bits[reg_idx - 1] + == self.bars[bar_idx - 1].size & self.writable_bits[reg_idx - 1] + { + return None; + } + + debug!( + "DETECT BAR REPROG: current 0x{:x}, new 0x{:x}", + self.registers[reg_idx], value + ); + let old_base = u64::from(self.bars[bar_idx].addr & mask) << 32 + | u64::from(self.bars[bar_idx - 1].addr & self.writable_bits[reg_idx - 1]); + let new_base = u64::from(value & mask) << 32 + | u64::from(self.registers[reg_idx - 1] & self.writable_bits[reg_idx - 1]); + let len = decode_64_bits_bar_size( + self.bars[bar_idx].size, + self.bars[bar_idx - 1].size, + ) + .ok_or(Error::Decode64BarSize) + .unwrap(); + let region_type = PciBarRegionType::Memory64BitRegion; + + self.bars[bar_idx].addr = value; + self.bars[bar_idx - 1].addr = self.registers[reg_idx - 1]; + + return Some(BarReprogrammingParams { + old_base, + new_base, + len, + region_type, + }); + } + } + } else if reg_idx == ROM_BAR_REG && (value & mask) != (self.rom_bar_addr & mask) { + // Ignore the case where 
the BAR size is being asked for. + if value & ROM_BAR_ADDR_MASK == ROM_BAR_ADDR_MASK { + return None; + } + + debug!( + "DETECT ROM BAR REPROG: current 0x{:x}, new 0x{:x}", + self.registers[reg_idx], value + ); + let old_base = u64::from(self.rom_bar_addr & mask); + let new_base = u64::from(value & mask); + let len = u64::from( + decode_32_bits_bar_size(self.rom_bar_size) + .ok_or(Error::Decode32BarSize) + .unwrap(), + ); + let region_type = PciBarRegionType::Memory32BitRegion; + + self.rom_bar_addr = value; + + return Some(BarReprogrammingParams { + old_base, + new_base, + len, + region_type, + }); + } + + None + } +} + +impl Default for PciBarConfiguration { + fn default() -> Self { + PciBarConfiguration { + reg_idx: 0, + addr: 0, + size: 0, + region_type: PciBarRegionType::Memory64BitRegion, + prefetchable: PciBarPrefetchable::NotPrefetchable, + } + } +} + +impl PciBarConfiguration { + pub fn new( + reg_idx: usize, + size: u64, + region_type: PciBarRegionType, + prefetchable: PciBarPrefetchable, + ) -> Self { + PciBarConfiguration { + reg_idx, + addr: 0, + size, + region_type, + prefetchable, + } + } + + pub fn set_register_index(mut self, reg_idx: usize) -> Self { + self.reg_idx = reg_idx; + self + } + + pub fn set_address(mut self, addr: u64) -> Self { + self.addr = addr; + self + } + + pub fn set_size(mut self, size: u64) -> Self { + self.size = size; + self + } + + pub fn get_size(&self) -> u64 { + self.size + } + + pub fn set_region_type(mut self, region_type: PciBarRegionType) -> Self { + self.region_type = region_type; + self + } + pub fn set_prefetch(mut self, prefetchable: PciBarPrefetchable) -> Self { + self.prefetchable = prefetchable; + self + } +} + +#[cfg(test)] +mod tests { + use vm_memory::ByteValued; + + use super::*; + + #[repr(packed)] + #[derive(Clone, Copy, Default)] + #[allow(dead_code)] + struct TestCap { + len: u8, + foo: u8, + } + + // It is safe to implement BytesValued; all members are simple numbers and any value is valid. 
+ unsafe impl ByteValued for TestCap {} + + impl PciCapability for TestCap { + fn bytes(&self) -> &[u8] { + self.as_slice() + } + + fn id(&self) -> PciCapabilityId { + PciCapabilityId::VendorSpecific + } + } + + #[test] + fn add_capability() { + let mut cfg = PciConfiguration::new( + 0x1234, + 0x5678, + 0x1, + PciClassCode::MultimediaController, + &PciMultimediaSubclass::AudioController, + None, + PciHeaderType::Device, + 0xABCD, + 0x2468, + None, + ); + + // Add two capabilities with different contents. + let cap1 = TestCap { len: 4, foo: 0xAA }; + let cap1_offset = cfg.add_capability(&cap1).unwrap(); + assert_eq!(cap1_offset % 4, 0); + + let cap2 = TestCap { + len: 0x04, + foo: 0x55, + }; + let cap2_offset = cfg.add_capability(&cap2).unwrap(); + assert_eq!(cap2_offset % 4, 0); + + // The capability list head should be pointing to cap1. + let cap_ptr = cfg.read_reg(CAPABILITY_LIST_HEAD_OFFSET / 4) & 0xFF; + assert_eq!(cap1_offset, cap_ptr as usize); + + // Verify the contents of the capabilities. 
+ let cap1_data = cfg.read_reg(cap1_offset / 4); + assert_eq!(cap1_data & 0xFF, 0x09); // capability ID + assert_eq!((cap1_data >> 8) & 0xFF, cap2_offset as u32); // next capability pointer + assert_eq!((cap1_data >> 16) & 0xFF, 0x04); // cap1.len + assert_eq!((cap1_data >> 24) & 0xFF, 0xAA); // cap1.foo + + let cap2_data = cfg.read_reg(cap2_offset / 4); + assert_eq!(cap2_data & 0xFF, 0x09); // capability ID + assert_eq!((cap2_data >> 8) & 0xFF, 0x00); // next capability pointer + assert_eq!((cap2_data >> 16) & 0xFF, 0x04); // cap2.len + assert_eq!((cap2_data >> 24) & 0xFF, 0x55); // cap2.foo + } + + #[derive(Copy, Clone)] + enum TestPi { + Test = 0x5a, + } + + impl PciProgrammingInterface for TestPi { + fn get_register_value(&self) -> u8 { + *self as u8 + } + } + + #[test] + fn class_code() { + let cfg = PciConfiguration::new( + 0x1234, + 0x5678, + 0x1, + PciClassCode::MultimediaController, + &PciMultimediaSubclass::AudioController, + Some(&TestPi::Test), + PciHeaderType::Device, + 0xABCD, + 0x2468, + None, + ); + + let class_reg = cfg.read_reg(2); + let class_code = (class_reg >> 24) & 0xFF; + let subclass = (class_reg >> 16) & 0xFF; + let prog_if = (class_reg >> 8) & 0xFF; + assert_eq!(class_code, 0x04); + assert_eq!(subclass, 0x01); + assert_eq!(prog_if, 0x5a); + } +} diff --git a/src/pci/src/device.rs b/src/pci/src/device.rs new file mode 100644 index 00000000000..1e9ef45fe5b --- /dev/null +++ b/src/pci/src/device.rs @@ -0,0 +1,116 @@ +// Copyright 2018 The Chromium OS Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE-BSD-3-Clause file. + +use crate::configuration::{self, PciBarRegionType}; +use std::any::Any; +use std::fmt::{self, Display}; +use std::sync::{Arc, Barrier}; +use std::{self, io, result}; +use vm_memory::{GuestAddress, GuestUsize}; +use vm_system_allocator::SystemAllocator; + +#[derive(Debug)] +pub enum Error { + /// Setup of the device capabilities failed. 
+ CapabilitiesSetup(configuration::Error), + /// Allocating space for an IO BAR failed. + IoAllocationFailed(u64), + /// Registering an IO BAR failed. + IoRegistrationFailed(u64, configuration::Error), +} +pub type Result = std::result::Result; + +impl Display for Error { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + use self::Error::*; + + match self { + CapabilitiesSetup(e) => write!(f, "failed to add capability {}", e), + IoAllocationFailed(size) => { + write!(f, "failed to allocate space for an IO BAR, size={}", size) + } + IoRegistrationFailed(addr, e) => { + write!(f, "failed to register an IO BAR, addr={} err={}", addr, e) + } + } + } +} + +#[derive(Clone, Copy)] +pub struct BarReprogrammingParams { + pub old_base: u64, + pub new_base: u64, + pub len: u64, + pub region_type: PciBarRegionType, +} + +pub trait PciDevice { + /// Allocates the needed PCI BARs space using the `allocate` function which takes a size and + /// returns an address. Returns a Vec of (GuestAddress, GuestUsize) tuples. + fn allocate_bars( + &mut self, + _allocator: &mut SystemAllocator, + ) -> Result> { + Ok(Vec::new()) + } + + /// Frees the PCI BARs previously allocated with a call to allocate_bars(). + fn free_bars(&mut self, _allocator: &mut SystemAllocator) -> Result<()> { + Ok(()) + } + + /// Sets a register in the configuration space. + /// * `reg_idx` - The index of the config register to modify. + /// * `offset` - Offset in to the register. + fn write_config_register( + &mut self, + reg_idx: usize, + offset: u64, + data: &[u8], + ) -> Option>; + /// Gets a register from the configuration space. + /// * `reg_idx` - The index of the config register to read. + fn read_config_register(&mut self, reg_idx: usize) -> u32; + + /// Reads from a BAR region mapped in to the device. + /// * `addr` - The guest address inside the BAR. + /// * `data` - Filled with the data from `addr`. 
+ fn read_bar(&mut self, _base: u64, _offset: u64, _data: &mut [u8]) {} + /// Writes to a BAR region mapped in to the device. + /// * `addr` - The guest address inside the BAR. + /// * `data` - The data to write. + fn write_bar(&mut self, _base: u64, _offset: u64, _data: &[u8]) -> Option> { + None + } + /// Relocates the BAR to a different address in guest address space. + fn move_bar(&mut self, _old_base: u64, _new_base: u64) -> result::Result<(), io::Error> { + Ok(()) + } + /// Provides a mutable reference to the Any trait. This is useful to let + /// the caller have access to the underlying type behind the trait. + fn as_any(&mut self) -> &mut dyn Any; + /// Detects if a BAR is being reprogrammed. + fn detect_bar_reprogramming( + &mut self, + _reg_idx: usize, + _data: &[u8], + ) -> Option { + None + } +} + +/// This trait defines a set of functions which can be triggered whenever a +/// PCI device is modified in any way. +pub trait DeviceRelocation: Send + Sync { + /// The BAR needs to be moved to a different location in the guest address + /// space. This follows a decision from the software running in the guest. + fn move_bar( + &self, + old_base: u64, + new_base: u64, + len: u64, + pci_dev: &mut dyn PciDevice, + region_type: PciBarRegionType, + ) -> result::Result<(), io::Error>; +} diff --git a/src/pci/src/lib.rs b/src/pci/src/lib.rs new file mode 100644 index 00000000000..e95b1dd03a5 --- /dev/null +++ b/src/pci/src/lib.rs @@ -0,0 +1,42 @@ +// Copyright 2018 The Chromium OS Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE-BSD-3-Clause file. + +//! Implements pci devices and busses. 
+ +// mod bus; +pub mod configuration; +pub mod device; +pub mod msi; +pub mod msix; +pub mod vfio; + +// pub use self::bus::{PciBus, PciConfigIo, PciConfigMmio, PciRoot, PciRootError}; +pub use self::device::{ + BarReprogrammingParams, DeviceRelocation, Error as PciDeviceError, PciDevice, +}; + +pub use self::configuration::{ + PciBarConfiguration, PciBarPrefetchable, PciBarRegionType, PciCapability, PciCapabilityId, + PciClassCode, PciConfiguration, PciHeaderType, PciMassStorageSubclass, + PciNetworkControllerSubclass, PciProgrammingInterface, PciSerialBusSubClass, PciSubclass, +}; + +pub use self::msi::{msi_num_enabled_vectors, MsiCap, MsiConfig}; +pub use self::msix::{MsixCap, MsixConfig, MsixTableEntry, MSIX_TABLE_ENTRY_SIZE}; + +pub use self::vfio::{VfioPciDevice, VfioPciError}; +/// PCI has four interrupt pins A->D. +#[derive(Copy, Clone)] +pub enum PciInterruptPin { + IntA, + IntB, + IntC, + IntD, +} + +impl PciInterruptPin { + pub fn to_mask(self) -> u32 { + self as u32 + } +} diff --git a/src/pci/src/msi.rs b/src/pci/src/msi.rs new file mode 100644 index 00000000000..0485f4c2f71 --- /dev/null +++ b/src/pci/src/msi.rs @@ -0,0 +1,236 @@ +// Copyright © 2019 Intel Corporation +// +// SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause +// + +extern crate byteorder; +extern crate vm_memory; + +use byteorder::{ByteOrder, LittleEndian}; +use std::sync::Arc; +use vm_device::interrupt::{ + InterruptIndex, InterruptSourceConfig, InterruptSourceGroup, MsiIrqSourceConfig, +}; + +use log::error; + +// MSI control masks +const MSI_CTL_ENABLE: u16 = 0x1; +const MSI_CTL_MULTI_MSG_ENABLE: u16 = 0x70; +const MSI_CTL_64_BITS: u16 = 0x80; +const MSI_CTL_PER_VECTOR: u16 = 0x100; + +// MSI message offsets +const MSI_MSG_CTL_OFFSET: u64 = 0x2; +const MSI_MSG_ADDR_LO_OFFSET: u64 = 0x4; + +// MSI message masks +const MSI_MSG_ADDR_LO_MASK: u32 = 0xffff_fffc; + +pub fn msi_num_enabled_vectors(msg_ctl: u16) -> usize { + let field = (msg_ctl >> 4) & 0x7; + + if field > 5 { + return 
0; + } + + 1 << field +} + +#[derive(Clone, Copy, Default)] +pub struct MsiCap { + // Message Control Register + // 0: MSI enable. + // 3-1; Multiple message capable. + // 6-4: Multiple message enable. + // 7: 64 bits address capable. + // 8: Per-vector masking capable. + // 15-9: Reserved. + pub msg_ctl: u16, + // Message Address (LSB) + // 1-0: Reserved. + // 31-2: Message address. + pub msg_addr_lo: u32, + // Message Upper Address (MSB) + // 31-0: Message address. + pub msg_addr_hi: u32, + // Message Data + // 15-0: Message data. + pub msg_data: u16, + // Mask Bits + // 31-0: Mask bits. + pub mask_bits: u32, + // Pending Bits + // 31-0: Pending bits. + pub pending_bits: u32, +} + +impl MsiCap { + fn addr_64_bits(&self) -> bool { + self.msg_ctl & MSI_CTL_64_BITS == MSI_CTL_64_BITS + } + + fn per_vector_mask(&self) -> bool { + self.msg_ctl & MSI_CTL_PER_VECTOR == MSI_CTL_PER_VECTOR + } + + fn enabled(&self) -> bool { + self.msg_ctl & MSI_CTL_ENABLE == MSI_CTL_ENABLE + } + + fn num_enabled_vectors(&self) -> usize { + msi_num_enabled_vectors(self.msg_ctl) + } + + fn vector_masked(&self, vector: usize) -> bool { + if !self.per_vector_mask() { + return false; + } + + (self.mask_bits >> vector) & 0x1 == 0x1 + } + + fn size(&self) -> u64 { + let mut size: u64 = 0xa; + + if self.addr_64_bits() { + size += 0x4; + } + if self.per_vector_mask() { + size += 0xa; + } + + size + } + + fn update(&mut self, offset: u64, data: &[u8]) { + // Calculate message data offset depending on the address being 32 or + // 64 bits. + // Calculate upper address offset if the address is 64 bits. + // Calculate mask bits offset based on the address being 32 or 64 bits + // and based on the per vector masking being enabled or not. 
+ let (msg_data_offset, addr_hi_offset, mask_bits_offset): (u64, Option, Option) = + if self.addr_64_bits() { + let mask_bits = if self.per_vector_mask() { + Some(0x10) + } else { + None + }; + (0xc, Some(0x8), mask_bits) + } else { + let mask_bits = if self.per_vector_mask() { + Some(0xc) + } else { + None + }; + (0x8, None, mask_bits) + }; + + // Update cache without overriding the read-only bits. + match data.len() { + 2 => { + let value = LittleEndian::read_u16(data); + match offset { + MSI_MSG_CTL_OFFSET => { + self.msg_ctl = (self.msg_ctl & !(MSI_CTL_ENABLE | MSI_CTL_MULTI_MSG_ENABLE)) + | (value & (MSI_CTL_ENABLE | MSI_CTL_MULTI_MSG_ENABLE)) + } + x if x == msg_data_offset => self.msg_data = value, + _ => error!("invalid offset"), + } + } + 4 => { + let value = LittleEndian::read_u32(data); + match offset { + 0x0 => { + self.msg_ctl = (self.msg_ctl & !(MSI_CTL_ENABLE | MSI_CTL_MULTI_MSG_ENABLE)) + | ((value >> 16) as u16 & (MSI_CTL_ENABLE | MSI_CTL_MULTI_MSG_ENABLE)) + } + MSI_MSG_ADDR_LO_OFFSET => self.msg_addr_lo = value & MSI_MSG_ADDR_LO_MASK, + x if x == msg_data_offset => self.msg_data = value as u16, + x if addr_hi_offset.is_some() && x == addr_hi_offset.unwrap() => { + self.msg_addr_hi = value + } + x if mask_bits_offset.is_some() && x == mask_bits_offset.unwrap() => { + self.mask_bits = value + } + _ => error!("invalid offset"), + } + } + _ => error!("invalid data length"), + } + } +} + +pub struct MsiConfig { + cap: MsiCap, + interrupt_source_group: Arc>, +} + +impl MsiConfig { + pub fn new(msg_ctl: u16, interrupt_source_group: Arc>) -> Self { + let cap = MsiCap { + msg_ctl, + ..Default::default() + }; + + MsiConfig { + cap, + interrupt_source_group, + } + } + + pub fn enabled(&self) -> bool { + self.cap.enabled() + } + + pub fn size(&self) -> u64 { + self.cap.size() + } + + pub fn num_enabled_vectors(&self) -> usize { + self.cap.num_enabled_vectors() + } + + pub fn update(&mut self, offset: u64, data: &[u8]) { + let old_enabled = 
self.cap.enabled(); + + self.cap.update(offset, data); + + if self.cap.enabled() { + for idx in 0..self.num_enabled_vectors() { + let config = MsiIrqSourceConfig { + high_addr: self.cap.msg_addr_hi, + low_addr: self.cap.msg_addr_lo, + data: self.cap.msg_data as u32, + devid: 0, + }; + + if let Err(e) = self + .interrupt_source_group + .update(idx as InterruptIndex, InterruptSourceConfig::MsiIrq(config)) + { + error!("Failed updating vector: {:?}", e); + } + + if self.cap.vector_masked(idx) { + if let Err(e) = self.interrupt_source_group.mask(idx as InterruptIndex) { + error!("Failed masking vector: {:?}", e); + } + } else if let Err(e) = self.interrupt_source_group.unmask(idx as InterruptIndex) { + error!("Failed unmasking vector: {:?}", e); + } + } + + if !old_enabled { + if let Err(e) = self.interrupt_source_group.enable() { + error!("Failed enabling irq_fd: {:?}", e); + } + } + } else if old_enabled { + if let Err(e) = self.interrupt_source_group.disable() { + error!("Failed disabling irq_fd: {:?}", e); + } + } + } +} diff --git a/src/pci/src/msix.rs b/src/pci/src/msix.rs new file mode 100644 index 00000000000..d5d7eb6bec6 --- /dev/null +++ b/src/pci/src/msix.rs @@ -0,0 +1,453 @@ +// Copyright © 2019 Intel Corporation +// +// SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause +// + +extern crate byteorder; +extern crate vm_memory; + +use crate::{PciCapability, PciCapabilityId}; +use byteorder::{ByteOrder, LittleEndian}; +use std::sync::Arc; +use vm_device::interrupt::{ + InterruptIndex, InterruptSourceConfig, InterruptSourceGroup, MsiIrqSourceConfig, +}; +use vm_memory::ByteValued; + +use log::{debug, error}; + +const MAX_MSIX_VECTORS_PER_DEVICE: u16 = 2048; +const MSIX_TABLE_ENTRIES_MODULO: u64 = 16; +const MSIX_PBA_ENTRIES_MODULO: u64 = 8; +const BITS_PER_PBA_ENTRY: usize = 64; +const FUNCTION_MASK_BIT: u8 = 14; +const MSIX_ENABLE_BIT: u8 = 15; +const FUNCTION_MASK_MASK: u16 = (1 << FUNCTION_MASK_BIT) as u16; +const MSIX_ENABLE_MASK: u16 = (1 << 
MSIX_ENABLE_BIT) as u16; +pub const MSIX_TABLE_ENTRY_SIZE: usize = 16; + +#[derive(Debug, Clone)] +pub struct MsixTableEntry { + pub msg_addr_lo: u32, + pub msg_addr_hi: u32, + pub msg_data: u32, + pub vector_ctl: u32, +} + +impl MsixTableEntry { + pub fn masked(&self) -> bool { + self.vector_ctl & 0x1 == 0x1 + } +} + +impl Default for MsixTableEntry { + fn default() -> Self { + MsixTableEntry { + msg_addr_lo: 0, + msg_addr_hi: 0, + msg_data: 0, + vector_ctl: 0x1, + } + } +} + +pub struct MsixConfig { + pub table_entries: Vec, + pub pba_entries: Vec, + pub devid: u32, + interrupt_source_group: Arc>, + masked: bool, + enabled: bool, +} + +impl MsixConfig { + pub fn new( + msix_vectors: u16, + interrupt_source_group: Arc>, + devid: u32, + ) -> Self { + assert!(msix_vectors <= MAX_MSIX_VECTORS_PER_DEVICE); + + let mut table_entries: Vec = Vec::new(); + table_entries.resize_with(msix_vectors as usize, Default::default); + let mut pba_entries: Vec = Vec::new(); + let num_pba_entries: usize = ((msix_vectors as usize) / BITS_PER_PBA_ENTRY) + 1; + pba_entries.resize_with(num_pba_entries, Default::default); + + MsixConfig { + table_entries, + pba_entries, + devid, + interrupt_source_group, + masked: true, + enabled: false, + } + } + + pub fn masked(&self) -> bool { + self.masked + } + + pub fn enabled(&self) -> bool { + self.enabled + } + + pub fn set_msg_ctl(&mut self, reg: u16) { + let old_masked = self.masked; + let old_enabled = self.enabled; + + self.masked = ((reg >> FUNCTION_MASK_BIT) & 1u16) == 1u16; + self.enabled = ((reg >> MSIX_ENABLE_BIT) & 1u16) == 1u16; + + // Update interrupt routing + if old_masked != self.masked || old_enabled != self.enabled { + if self.enabled && !self.masked { + for (idx, table_entry) in self.table_entries.iter().enumerate() { + let config = MsiIrqSourceConfig { + high_addr: table_entry.msg_addr_hi, + low_addr: table_entry.msg_addr_lo, + data: table_entry.msg_data, + devid: self.devid, + }; + + if let Err(e) = self + 
.interrupt_source_group + .update(idx as InterruptIndex, InterruptSourceConfig::MsiIrq(config)) + { + error!("Failed updating vector: {:?}", e); + } + + if table_entry.masked() { + if let Err(e) = self.interrupt_source_group.mask(idx as InterruptIndex) { + error!("Failed masking vector: {:?}", e); + } + } else if let Err(e) = self.interrupt_source_group.unmask(idx as InterruptIndex) + { + error!("Failed unmasking vector: {:?}", e); + } + } + } else if old_enabled || !old_masked { + if let Err(e) = self.interrupt_source_group.disable() { + error!("Failed disabling irq_fd: {:?}", e); + } + } + } + + // If the Function Mask bit was set, and has just been cleared, it's + // important to go through the entire PBA to check if there was any + // pending MSI-X message to inject, given that the vector is not + // masked. + if old_masked && !self.masked { + for (index, entry) in self.table_entries.clone().iter().enumerate() { + if !entry.masked() && self.get_pba_bit(index as u16) == 1 { + self.inject_msix_and_clear_pba(index); + } + } + } + } + + pub fn read_table(&self, offset: u64, data: &mut [u8]) { + assert!((data.len() == 4 || data.len() == 8)); + + let index: usize = (offset / MSIX_TABLE_ENTRIES_MODULO) as usize; + let modulo_offset = offset % MSIX_TABLE_ENTRIES_MODULO; + + match data.len() { + 4 => { + let value = match modulo_offset { + 0x0 => self.table_entries[index].msg_addr_lo, + 0x4 => self.table_entries[index].msg_addr_hi, + 0x8 => self.table_entries[index].msg_data, + 0xc => self.table_entries[index].vector_ctl, + _ => { + error!("invalid offset"); + 0 + } + }; + + debug!("MSI_R TABLE offset 0x{:x} data 0x{:x}", offset, value); + LittleEndian::write_u32(data, value); + } + 8 => { + let value = match modulo_offset { + 0x0 => { + (u64::from(self.table_entries[index].msg_addr_hi) << 32) + | u64::from(self.table_entries[index].msg_addr_lo) + } + 0x8 => { + (u64::from(self.table_entries[index].vector_ctl) << 32) + | u64::from(self.table_entries[index].msg_data) + } 
+ _ => { + error!("invalid offset"); + 0 + } + }; + + debug!("MSI_R TABLE offset 0x{:x} data 0x{:x}", offset, value); + LittleEndian::write_u64(data, value); + } + _ => { + error!("invalid data length"); + } + } + } + + pub fn write_table(&mut self, offset: u64, data: &[u8]) { + assert!((data.len() == 4 || data.len() == 8)); + + let index: usize = (offset / MSIX_TABLE_ENTRIES_MODULO) as usize; + let modulo_offset = offset % MSIX_TABLE_ENTRIES_MODULO; + + // Store the value of the entry before modification + let mut old_entry: Option = None; + + match data.len() { + 4 => { + let value = LittleEndian::read_u32(data); + match modulo_offset { + 0x0 => self.table_entries[index].msg_addr_lo = value, + 0x4 => self.table_entries[index].msg_addr_hi = value, + 0x8 => self.table_entries[index].msg_data = value, + 0xc => { + old_entry = Some(self.table_entries[index].clone()); + self.table_entries[index].vector_ctl = value; + } + _ => error!("invalid offset"), + }; + + debug!("MSI_W TABLE offset 0x{:x} data 0x{:x}", offset, value); + } + 8 => { + let value = LittleEndian::read_u64(data); + match modulo_offset { + 0x0 => { + self.table_entries[index].msg_addr_lo = (value & 0xffff_ffffu64) as u32; + self.table_entries[index].msg_addr_hi = (value >> 32) as u32; + } + 0x8 => { + old_entry = Some(self.table_entries[index].clone()); + self.table_entries[index].msg_data = (value & 0xffff_ffffu64) as u32; + self.table_entries[index].vector_ctl = (value >> 32) as u32; + } + _ => error!("invalid offset"), + }; + + debug!("MSI_W TABLE offset 0x{:x} data 0x{:x}", offset, value); + } + _ => error!("invalid data length"), + }; + + // Update interrupt routes + if self.enabled && !self.masked { + let table_entry = &self.table_entries[index]; + + let config = MsiIrqSourceConfig { + high_addr: table_entry.msg_addr_hi, + low_addr: table_entry.msg_addr_lo, + data: table_entry.msg_data, + devid: self.devid, + }; + + if let Err(e) = self.interrupt_source_group.update( + index as InterruptIndex, + 
InterruptSourceConfig::MsiIrq(config), + ) { + error!("Failed updating vector: {:?}", e); + } + + if table_entry.masked() { + if let Err(e) = self.interrupt_source_group.mask(index as InterruptIndex) { + error!("Failed masking vector: {:?}", e); + } + } else if let Err(e) = self.interrupt_source_group.unmask(index as InterruptIndex) { + error!("Failed unmasking vector: {:?}", e); + } + } + + // After the MSI-X table entry has been updated, it is necessary to + // check if the vector control masking bit has changed. In case the + // bit has been flipped from 1 to 0, we need to inject a MSI message + // if the corresponding pending bit from the PBA is set. Once the MSI + // has been injected, the pending bit in the PBA needs to be cleared. + // All of this is valid only if MSI-X has not been masked for the whole + // device. + if let Some(old_entry) = old_entry { + // Check if bit has been flipped + if !self.masked() + && old_entry.masked() + && !self.table_entries[index].masked() + && self.get_pba_bit(index as u16) == 1 + { + self.inject_msix_and_clear_pba(index); + } + } + } + + pub fn read_pba(&mut self, offset: u64, data: &mut [u8]) { + assert!((data.len() == 4 || data.len() == 8)); + + let index: usize = (offset / MSIX_PBA_ENTRIES_MODULO) as usize; + let modulo_offset = offset % MSIX_PBA_ENTRIES_MODULO; + + match data.len() { + 4 => { + let value: u32 = match modulo_offset { + 0x0 => (self.pba_entries[index] & 0xffff_ffffu64) as u32, + 0x4 => (self.pba_entries[index] >> 32) as u32, + _ => { + error!("invalid offset"); + 0 + } + }; + + debug!("MSI_R PBA offset 0x{:x} data 0x{:x}", offset, value); + LittleEndian::write_u32(data, value); + } + 8 => { + let value: u64 = match modulo_offset { + 0x0 => self.pba_entries[index], + _ => { + error!("invalid offset"); + 0 + } + }; + + debug!("MSI_R PBA offset 0x{:x} data 0x{:x}", offset, value); + LittleEndian::write_u64(data, value); + } + _ => { + error!("invalid data length"); + } + } + } + + pub fn write_pba(&mut self, 
_offset: u64, _data: &[u8]) { + error!("Pending Bit Array is read only"); + } + + pub fn set_pba_bit(&mut self, vector: u16, reset: bool) { + assert!(vector < MAX_MSIX_VECTORS_PER_DEVICE); + + let index: usize = (vector as usize) / BITS_PER_PBA_ENTRY; + let shift: usize = (vector as usize) % BITS_PER_PBA_ENTRY; + let mut mask: u64 = (1 << shift) as u64; + + if reset { + mask = !mask; + self.pba_entries[index] &= mask; + } else { + self.pba_entries[index] |= mask; + } + } + + fn get_pba_bit(&self, vector: u16) -> u8 { + assert!(vector < MAX_MSIX_VECTORS_PER_DEVICE); + + let index: usize = (vector as usize) / BITS_PER_PBA_ENTRY; + let shift: usize = (vector as usize) % BITS_PER_PBA_ENTRY; + + ((self.pba_entries[index] >> shift) & 0x0000_0001u64) as u8 + } + + fn inject_msix_and_clear_pba(&mut self, vector: usize) { + // Inject the MSI message + match self + .interrupt_source_group + .trigger(vector as InterruptIndex) + { + Ok(_) => debug!("MSI-X injected on vector control flip"), + Err(e) => error!("failed to inject MSI-X: {}", e), + } + + // Clear the bit from PBA + self.set_pba_bit(vector as u16, true); + } +} + +#[allow(dead_code)] +#[repr(packed)] +#[derive(Clone, Copy, Default)] +pub struct MsixCap { + // Message Control Register + // 10-0: MSI-X Table size + // 13-11: Reserved + // 14: Mask. Mask all MSI-X when set. + // 15: Enable. Enable all MSI-X when set. + pub msg_ctl: u16, + // Table. Contains the offset and the BAR indicator (BIR) + // 2-0: Table BAR indicator (BIR). Can be 0 to 5. + // 31-3: Table offset in the BAR pointed by the BIR. + pub table: u32, + // Pending Bit Array. Contains the offset and the BAR indicator (BIR) + // 2-0: PBA BAR indicator (BIR). Can be 0 to 5. + // 31-3: PBA offset in the BAR pointed by the BIR. + pub pba: u32, +} + +// It is safe to implement ByteValued. All members are simple numbers and any value is valid. 
+unsafe impl ByteValued for MsixCap {} + +impl PciCapability for MsixCap { + fn bytes(&self) -> &[u8] { + self.as_slice() + } + + fn id(&self) -> PciCapabilityId { + PciCapabilityId::MsiX + } +} + +impl MsixCap { + pub fn new( + table_pci_bar: u8, + table_size: u16, + table_off: u32, + pba_pci_bar: u8, + pba_off: u32, + ) -> Self { + assert!(table_size < MAX_MSIX_VECTORS_PER_DEVICE); + + // Set the table size and enable MSI-X. + let msg_ctl: u16 = 0x8000u16 + table_size - 1; + + MsixCap { + msg_ctl, + table: (table_off & 0xffff_fff8u32) | u32::from(table_pci_bar & 0x7u8), + pba: (pba_off & 0xffff_fff8u32) | u32::from(pba_pci_bar & 0x7u8), + } + } + + pub fn set_msg_ctl(&mut self, data: u16) { + self.msg_ctl = (self.msg_ctl & !(FUNCTION_MASK_MASK | MSIX_ENABLE_MASK)) + | (data & (FUNCTION_MASK_MASK | MSIX_ENABLE_MASK)); + } + + pub fn masked(&self) -> bool { + (self.msg_ctl >> FUNCTION_MASK_BIT) & 0x1 == 0x1 + } + + pub fn enabled(&self) -> bool { + (self.msg_ctl >> MSIX_ENABLE_BIT) & 0x1 == 0x1 + } + + pub fn table_offset(&self) -> u32 { + self.table & 0xffff_fff8 + } + + pub fn pba_offset(&self) -> u32 { + self.pba & 0xffff_fff8 + } + + pub fn table_bir(&self) -> u32 { + self.table & 0x7 + } + + pub fn pba_bir(&self) -> u32 { + self.pba & 0x7 + } + + pub fn table_size(&self) -> u16 { + (self.msg_ctl & 0x7ff) + 1 + } +} diff --git a/src/pci/src/vfio.rs b/src/pci/src/vfio.rs new file mode 100644 index 00000000000..7f30cd54482 --- /dev/null +++ b/src/pci/src/vfio.rs @@ -0,0 +1,1233 @@ +// Copyright © 2019 Intel Corporation +// +// SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause +// +use crate::{ + msi_num_enabled_vectors, BarReprogrammingParams, MsiConfig, MsixCap, MsixConfig, + PciBarConfiguration, PciBarRegionType, PciCapabilityId, PciClassCode, PciConfiguration, + PciDevice, PciDeviceError, PciHeaderType, PciSubclass, MSIX_TABLE_ENTRY_SIZE, +}; + +use byteorder::{ByteOrder, LittleEndian}; +use kvm_ioctls::VmFd; +use std::any::Any; +use std::fmt::{Debug, 
Formatter}; +use std::os::unix::io::AsRawFd; +use std::ptr::null_mut; +use std::sync::{Arc, Barrier, Mutex}; +use std::{fmt, io, result}; +use vfio_bindings::bindings::vfio::*; +use vfio_ioctls::{VfioContainer, VfioDevice, VfioError}; + +use vm_device::interrupt::{ + InterruptIndex, InterruptManager, InterruptSourceGroup, + MsiIrqGroupConfig +}; + +use vm_memory::{Address, GuestAddress, GuestUsize}; +use vmm_sys_util::eventfd::EventFd; +use vm_system_allocator::SystemAllocator; + +pub use kvm_bindings::kvm_userspace_memory_region as MemoryRegion; + +use log::error; + +#[derive(Debug)] +pub enum VfioPciError { + AllocateGsi, + DmaMap(VfioError), + DmaUnmap(VfioError), + EnableIntx(VfioError), + EnableMsi(VfioError), + EnableMsix(VfioError), + EventFd(io::Error), + InterruptSourceGroupCreate(io::Error), + // IrqFd(hypervisor::HypervisorVmError), + MapRegionGuest(anyhow::Error), + MissingNotifier, + MsiNotConfigured, + MsixNotConfigured, + NewVfioPciDevice, +} +pub type Result = std::result::Result; + +impl fmt::Display for VfioPciError { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + VfioPciError::AllocateGsi => write!(f, "failed to allocate GSI"), + VfioPciError::DmaMap(e) => write!(f, "failed to DMA map: {}", e), + VfioPciError::DmaUnmap(e) => write!(f, "failed to DMA unmap: {}", e), + VfioPciError::EnableIntx(e) => write!(f, "failed to enable INTx: {}", e), + VfioPciError::EnableMsi(e) => write!(f, "failed to enable MSI: {}", e), + VfioPciError::EnableMsix(e) => write!(f, "failed to enable MSI-X: {}", e), + VfioPciError::EventFd(e) => write!(f, "failed to create eventfd: {}", e), + VfioPciError::InterruptSourceGroupCreate(e) => { + write!(f, "failed to create interrupt source group: {}", e) + } + VfioPciError::MapRegionGuest(e) => { + write!(f, "failed to map VFIO PCI region into guest: {}", e) + } + VfioPciError::MissingNotifier => write!(f, "failed to notifier's eventfd"), + VfioPciError::MsiNotConfigured => write!(f, "MSI interrupt 
not yet configured"), + VfioPciError::MsixNotConfigured => write!(f, "MSI-X interrupt not yet configured"), + VfioPciError::NewVfioPciDevice => write!(f, "failed to create VFIO PCI device"), + } + } +} + +#[derive(Copy, Clone)] +enum PciVfioSubclass { + VfioSubclass = 0xff, +} + +impl PciSubclass for PciVfioSubclass { + fn get_register_value(&self) -> u8 { + *self as u8 + } +} + +enum InterruptUpdateAction { + EnableMsi, + DisableMsi, + EnableMsix, + DisableMsix, +} + +struct VfioIntx { + interrupt_source_group: Arc>, + enabled: bool, +} + +struct VfioMsi { + cfg: MsiConfig, + cap_offset: u32, + interrupt_source_group: Arc>, +} + +impl VfioMsi { + fn update(&mut self, offset: u64, data: &[u8]) -> Option { + let old_enabled = self.cfg.enabled(); + + self.cfg.update(offset, data); + + let new_enabled = self.cfg.enabled(); + + if !old_enabled && new_enabled { + return Some(InterruptUpdateAction::EnableMsi); + } + + if old_enabled && !new_enabled { + return Some(InterruptUpdateAction::DisableMsi); + } + + None + } +} + +struct VfioMsix { + bar: MsixConfig, + cap: MsixCap, + cap_offset: u32, + interrupt_source_group: Arc>, +} + +impl VfioMsix { + fn update(&mut self, offset: u64, data: &[u8]) -> Option { + let old_enabled = self.bar.enabled(); + + // Update "Message Control" word + if offset == 2 && data.len() == 2 { + self.bar.set_msg_ctl(LittleEndian::read_u16(data)); + } + + let new_enabled = self.bar.enabled(); + + if !old_enabled && new_enabled { + return Some(InterruptUpdateAction::EnableMsix); + } + + if old_enabled && !new_enabled { + return Some(InterruptUpdateAction::DisableMsix); + } + + None + } + + fn table_accessed(&self, bar_index: u32, offset: u64) -> bool { + let table_offset: u64 = u64::from(self.cap.table_offset()); + let table_size: u64 = u64::from(self.cap.table_size()) * (MSIX_TABLE_ENTRY_SIZE as u64); + let table_bir: u32 = self.cap.table_bir(); + + bar_index == table_bir && offset >= table_offset && offset < table_offset + table_size + } +} + 
+struct Interrupt { + intx: Option, + msi: Option, + msix: Option, +} + +impl Interrupt { + fn update_msi(&mut self, offset: u64, data: &[u8]) -> Option { + if let Some(ref mut msi) = &mut self.msi { + let action = msi.update(offset, data); + return action; + } + + None + } + + fn update_msix(&mut self, offset: u64, data: &[u8]) -> Option { + if let Some(ref mut msix) = &mut self.msix { + let action = msix.update(offset, data); + return action; + } + + None + } + + fn accessed(&self, offset: u64) -> Option<(PciCapabilityId, u64)> { + if let Some(msi) = &self.msi { + if offset >= u64::from(msi.cap_offset) + && offset < u64::from(msi.cap_offset) + msi.cfg.size() + { + return Some(( + PciCapabilityId::MessageSignalledInterrupts, + u64::from(msi.cap_offset), + )); + } + } + + if let Some(msix) = &self.msix { + if offset == u64::from(msix.cap_offset) { + return Some((PciCapabilityId::MsiX, u64::from(msix.cap_offset))); + } + } + + None + } + + fn msix_table_accessed(&self, bar_index: u32, offset: u64) -> bool { + if let Some(msix) = &self.msix { + return msix.table_accessed(bar_index, offset); + } + + false + } + + fn msix_write_table(&mut self, offset: u64, data: &[u8]) { + if let Some(ref mut msix) = &mut self.msix { + let offset = offset - u64::from(msix.cap.table_offset()); + msix.bar.write_table(offset, data) + } + } + + fn msix_read_table(&self, offset: u64, data: &mut [u8]) { + if let Some(msix) = &self.msix { + let offset = offset - u64::from(msix.cap.table_offset()); + msix.bar.read_table(offset, data) + } + } + + fn intx_in_use(&self) -> bool { + if let Some(intx) = &self.intx { + return intx.enabled; + } + + false + } +} + + +#[derive(Copy, Clone)] +pub struct MmioRegion { + pub start: GuestAddress, + pub length: GuestUsize, + type_: PciBarRegionType, + index: u32, + mem_slot: Option, + pub host_addr: Option, + mmap_size: Option, +} + +struct VfioPciConfig { + device: Arc, +} + +impl VfioPciConfig { + fn new(device: Arc) -> Self { + VfioPciConfig { device } + 
} + + fn read_config_byte(&self, offset: u32) -> u8 { + let mut data: [u8; 1] = [0]; + self.device + .region_read(VFIO_PCI_CONFIG_REGION_INDEX, data.as_mut(), offset.into()); + + data[0] + } + + fn read_config_word(&self, offset: u32) -> u16 { + let mut data: [u8; 2] = [0, 0]; + self.device + .region_read(VFIO_PCI_CONFIG_REGION_INDEX, data.as_mut(), offset.into()); + + u16::from_le_bytes(data) + } + + fn read_config_dword(&self, offset: u32) -> u32 { + let mut data: [u8; 4] = [0, 0, 0, 0]; + self.device + .region_read(VFIO_PCI_CONFIG_REGION_INDEX, data.as_mut(), offset.into()); + + u32::from_le_bytes(data) + } + + fn write_config_dword(&self, buf: u32, offset: u32) { + let data: [u8; 4] = buf.to_le_bytes(); + self.device + .region_write(VFIO_PCI_CONFIG_REGION_INDEX, &data, offset.into()) + } +} + +/// VfioPciDevice represents a VFIO PCI device. +/// This structure implements the BusDevice and PciDevice traits. +/// +/// A VfioPciDevice is bound to a VfioDevice and is also a PCI device. +/// The VMM creates a VfioDevice, then assigns it to a VfioPciDevice, +/// which then gets added to the PCI bus. 
+pub struct VfioPciDevice { + vm: Arc>, + device: Arc, + container: Arc, + vfio_pci_configuration: VfioPciConfig, + configuration: PciConfiguration, + mmio_regions: Vec, + interrupt: Interrupt, + iommu_attached: bool, +} + +impl Debug for VfioPciDevice { + fn fmt(&self, f: &mut Formatter) -> fmt::Result { + f.debug_struct("VfioPciDevice") + .finish() + } +} + +impl VfioPciDevice { + /// Constructs a new Vfio Pci device for the given Vfio device + pub fn new( + vm: Arc>, + device: VfioDevice, + container: Arc, + msi_interrupt_manager: &Arc>, + legacy_interrupt_group: Option>>, + iommu_attached: bool, + ) -> Result { + let device = Arc::new(device); + device.reset(); + + let configuration = PciConfiguration::new( + 0, + 0, + 0, + PciClassCode::Other, + &PciVfioSubclass::VfioSubclass, + None, + PciHeaderType::Device, + 0, + 0, + None, + ); + + let vfio_pci_configuration = VfioPciConfig::new(Arc::clone(&device)); + + let mut vfio_pci_device = VfioPciDevice { + vm: Arc::clone(&vm), + device, + container, + configuration, + vfio_pci_configuration, + mmio_regions: Vec::new(), + interrupt: Interrupt { + intx: None, + msi: None, + msix: None, + }, + iommu_attached, + }; + + vfio_pci_device.parse_capabilities(msi_interrupt_manager); + + vfio_pci_device.initialize_legacy_interrupt(legacy_interrupt_group)?; + + Ok(vfio_pci_device) + } + + fn enable_intx(&mut self) -> Result<()> { + if let Some(intx) = &mut self.interrupt.intx { + if !intx.enabled { + if let Some(eventfd) = intx.interrupt_source_group.notifier(0) { + self.device + .enable_irq(VFIO_PCI_INTX_IRQ_INDEX, vec![&eventfd]) + .map_err(VfioPciError::EnableIntx)?; + + intx.enabled = true; + } else { + return Err(VfioPciError::MissingNotifier); + } + } + } + + Ok(()) + } + + fn disable_intx(&mut self) { + if let Some(intx) = &mut self.interrupt.intx { + if intx.enabled { + if let Err(e) = self.device.disable_irq(VFIO_PCI_INTX_IRQ_INDEX) { + error!("Could not disable INTx: {}", e); + } else { + intx.enabled = false; + } + 
} + } + } + + fn enable_msi(&self) -> Result<()> { + if let Some(msi) = &self.interrupt.msi { + let mut irq_fds: Vec = Vec::new(); + for i in 0..msi.cfg.num_enabled_vectors() { + if let Some(eventfd) = msi.interrupt_source_group.notifier(i as InterruptIndex) { + irq_fds.push(eventfd); + } else { + return Err(VfioPciError::MissingNotifier); + } + } + + self.device + .enable_msi(irq_fds.iter().collect()) + .map_err(VfioPciError::EnableMsi)?; + } + + Ok(()) + } + + fn disable_msi(&self) { + if let Err(e) = self.device.disable_msi() { + error!("Could not disable MSI: {}", e); + } + } + + fn enable_msix(&self) -> Result<()> { + if let Some(msix) = &self.interrupt.msix { + let mut irq_fds: Vec = Vec::new(); + for i in 0..msix.bar.table_entries.len() { + if let Some(eventfd) = msix.interrupt_source_group.notifier(i as InterruptIndex) { + irq_fds.push(eventfd); + } else { + return Err(VfioPciError::MissingNotifier); + } + } + + self.device + .enable_msix(irq_fds.iter().collect()) + .map_err(VfioPciError::EnableMsix)?; + } + + Ok(()) + } + + fn disable_msix(&self) { + if let Err(e) = self.device.disable_msix() { + error!("Could not disable MSI-X: {}", e); + } + } + + fn initialize_legacy_interrupt( + &mut self, + legacy_interrupt_group: Option>>, + ) -> Result<()> { + if let Some(irq_info) = self.device.get_irq_info(VFIO_PCI_INTX_IRQ_INDEX) { + if irq_info.count == 0 { + error!("Device does not want legacy IRQ"); + // A count of 0 means the INTx IRQ is not supported, therefore + // it shouldn't be initialized. 
+ return Ok(()); + } + } + if let Some(interrupt_source_group) = legacy_interrupt_group { + self.interrupt.intx = Some( + VfioIntx { + interrupt_source_group, + enabled: false, + }); + } + + self.enable_intx()?; + + Ok(()) + } + + fn parse_msix_capabilities( + &mut self, + cap: u8, + interrupt_manager: &Arc>, + ) { + let msg_ctl = self + .vfio_pci_configuration + .read_config_word((cap + 2).into()); + + let table = self + .vfio_pci_configuration + .read_config_dword((cap + 4).into()); + + let pba = self + .vfio_pci_configuration + .read_config_dword((cap + 8).into()); + + let msix_cap = MsixCap { + msg_ctl, + table, + pba, + }; + + let interrupt_source_group = interrupt_manager + .create_group(MsiIrqGroupConfig { + base: 0, + count: msix_cap.table_size() as InterruptIndex, + }) + .unwrap(); + + let msix_config = MsixConfig::new(msix_cap.table_size(), interrupt_source_group.clone(), 0); + + self.interrupt.msix = Some(VfioMsix { + bar: msix_config, + cap: msix_cap, + cap_offset: cap.into(), + interrupt_source_group, + }); + } + + fn parse_msi_capabilities( + &mut self, + cap: u8, + interrupt_manager: &Arc>, + ) { + let msg_ctl = self + .vfio_pci_configuration + .read_config_word((cap + 2).into()); + + let interrupt_source_group = interrupt_manager + .create_group(MsiIrqGroupConfig { + base: 0, + count: msi_num_enabled_vectors(msg_ctl) as InterruptIndex, + }) + .unwrap(); + + let msi_config = MsiConfig::new(msg_ctl, interrupt_source_group.clone()); + + self.interrupt.msi = Some(VfioMsi { + cfg: msi_config, + cap_offset: cap.into(), + interrupt_source_group, + }); + } + + fn parse_capabilities( + &mut self, + interrupt_manager: &Arc>, + ) { + let mut cap_next = self + .vfio_pci_configuration + .read_config_byte(PCI_CONFIG_CAPABILITY_OFFSET); + + while cap_next != 0 { + let cap_id = self + .vfio_pci_configuration + .read_config_byte(cap_next.into()); + + match PciCapabilityId::from(cap_id) { + PciCapabilityId::MessageSignalledInterrupts => { + if let Some(irq_info) = 
self.device.get_irq_info(VFIO_PCI_MSI_IRQ_INDEX) { + if irq_info.count > 0 { + // Parse capability only if the VFIO device + // supports MSI. + self.parse_msi_capabilities(cap_next, interrupt_manager); + } + } + } + PciCapabilityId::MsiX => { + if let Some(irq_info) = self.device.get_irq_info(VFIO_PCI_MSIX_IRQ_INDEX) { + if irq_info.count > 0 { + // Parse capability only if the VFIO device + // supports MSI-X. + self.parse_msix_capabilities(cap_next, interrupt_manager); + } + } + } + _ => {} + }; + + cap_next = self + .vfio_pci_configuration + .read_config_byte((cap_next + 1).into()); + } + } + + fn update_msi_capabilities(&mut self, offset: u64, data: &[u8]) -> Result<()> { + match self.interrupt.update_msi(offset, data) { + Some(InterruptUpdateAction::EnableMsi) => { + // Disable INTx before we can enable MSI + self.disable_intx(); + self.enable_msi()?; + } + Some(InterruptUpdateAction::DisableMsi) => { + // Fallback onto INTx when disabling MSI + self.disable_msi(); + self.enable_intx()?; + } + _ => {} + } + + Ok(()) + } + + fn update_msix_capabilities(&mut self, offset: u64, data: &[u8]) -> Result<()> { + match self.interrupt.update_msix(offset, data) { + Some(InterruptUpdateAction::EnableMsix) => { + // Disable INTx before we can enable MSI-X + self.disable_intx(); + self.enable_msix()?; + + error!("MSIX enabled.") + } + Some(InterruptUpdateAction::DisableMsix) => { + // Fallback onto INTx when disabling MSI-X + self.disable_msix(); + self.enable_intx()?; + } + _ => {} + } + + Ok(()) + } + + fn find_region(&self, addr: u64) -> Option { + for region in self.mmio_regions.iter() { + // error!("Finding region {:x} vs {:x} len {:x}", addr, region.start.raw_value(), region.length); + if addr >= region.start.raw_value() + && addr < region.start.unchecked_add(region.length).raw_value() + { + return Some(*region); + } + } + None + } + + fn make_user_memory_region( + slot: u32, + guest_phys_addr: u64, + memory_size: u64, + userspace_addr: u64, + readonly: bool, + 
log_dirty_pages: bool, + ) -> MemoryRegion { + use kvm_bindings::{KVM_MEM_LOG_DIRTY_PAGES, KVM_MEM_READONLY}; + MemoryRegion { + slot, + guest_phys_addr, + memory_size, + userspace_addr, + flags: if readonly { KVM_MEM_READONLY } else { 0 } + | if log_dirty_pages { + KVM_MEM_LOG_DIRTY_PAGES + } else { + 0 + }, + } + } + /// Map MMIO regions into the guest, and avoid VM exits when the guest tries + /// to reach those regions. + /// + /// # Arguments + /// + /// * `vm` - The VM object. It is used to set the VFIO MMIO regions + /// as user memory regions. + /// * `mem_slot` - The closure to return a memory slot. + pub fn map_mmio_regions(&mut self) -> Result<()> { + let fd = self.device.as_raw_fd(); + let mut slot = 2; + + error!("Mmap mmio regions count {}", self.mmio_regions.len()); + for region in self.mmio_regions.iter_mut() { + // We want to skip the mapping of the BAR containing the MSI-X + // table even if it is mappable. The reason is we need to trap + // any access to the MSI-X table and update the GSI routing + // accordingly. 
+ if let Some(msix) = &self.interrupt.msix { + if region.index == msix.cap.table_bir() || region.index == msix.cap.pba_bir() { + continue; + } + } + let region_flags = self.device.get_region_flags(region.index); + if region_flags & VFIO_REGION_INFO_FLAG_MMAP != 0 { + let mut prot = 0; + if region_flags & VFIO_REGION_INFO_FLAG_READ != 0 { + prot |= libc::PROT_READ; + } + if region_flags & VFIO_REGION_INFO_FLAG_WRITE != 0 { + prot |= libc::PROT_WRITE; + } + let mmap_offset = self.device.get_region_offset(region.index); + let mmap_size = self.device.get_region_size(region.index); + + let offset = self.device.get_region_offset(region.index) + mmap_offset; + error!( + "VFIO region {}, offset {:x}, size {:x}", + region.index, offset, mmap_size + ); + let host_addr = unsafe { + libc::mmap( + null_mut(), + mmap_size as usize, + prot, + libc::MAP_SHARED, + fd, + offset as libc::off_t, + ) + }; + + if host_addr == libc::MAP_FAILED { + error!( + "Could not mmap regions, error:{}", + io::Error::last_os_error() + ); + continue; + } + + error!( + "Mmap slot {} gpa {:x} size {} hva {:x}", + slot, + region.start.raw_value() + mmap_offset, + mmap_size as u64, + host_addr as u64 + ); + + let mem_region = Self::make_user_memory_region( + slot, + region.start.raw_value() + mmap_offset, + mmap_size as u64, + host_addr as u64, + false, + false, + ); + + unsafe { + self.vm.lock().expect("Poisoned lock") + .set_user_memory_region(mem_region) + .map_err(|e| VfioPciError::MapRegionGuest(e.into()))?; + } + + // self.container.vfio_dma_map( + // region.start.raw_value() + mmap_offset, + // mmap_size, + // host_addr as u64 + // ).unwrap(); + + // Update the region with memory mapped info. 
+ region.mem_slot = Some(slot); + region.host_addr = Some(host_addr as u64); + region.mmap_size = Some(mmap_size as usize); + + slot += 1; + } + } + + Ok(()) + } + + pub fn unmap_mmio_regions(&mut self) { + for region in self.mmio_regions.iter() { + if let (Some(host_addr), Some(mmap_size), Some(mem_slot)) = + (region.host_addr, region.mmap_size, region.mem_slot) + { + let mmap_offset = self.device.get_region_offset(region.index); + + // Remove region + let r = Self::make_user_memory_region( + mem_slot, + region.start.raw_value() + mmap_offset, + 0, + host_addr as u64, + false, + false, + ); + + if let Err(e) = unsafe { self.vm.lock().expect("Poisoned lock").set_user_memory_region(r) } { + error!("Could not remove the userspace memory region: {}", e); + } + + let ret = unsafe { libc::munmap(host_addr as *mut libc::c_void, mmap_size) }; + if ret != 0 { + error!( + "Could not unmap region {}, error:{}", + region.index, + io::Error::last_os_error() + ); + } + } + } + } + + pub fn dma_map(&self, iova: u64, size: u64, user_addr: u64) -> Result<()> { + if !self.iommu_attached { + self.container + .vfio_dma_map(iova, size, user_addr) + .map_err(VfioPciError::DmaMap)?; + } + Ok(()) + } + + pub fn dma_unmap(&self, iova: u64, size: u64) -> Result<()> { + if !self.iommu_attached { + self.container + .vfio_dma_unmap(iova, size) + .map_err(VfioPciError::DmaUnmap)?; + } + + Ok(()) + } + + pub fn mmio_regions(&self) -> Vec { + self.mmio_regions.clone() + } + + pub fn bus_read(&mut self, base: u64, offset: u64, data: &mut [u8]) { + self.read_bar(base, offset, data); + } + + pub fn bus_write(&mut self, base: u64, offset: u64, data: &[u8]) { + self.write_bar(base, offset, data); + } +} + +impl Drop for VfioPciDevice { + fn drop(&mut self) { + self.unmap_mmio_regions(); + + if self.interrupt.intx_in_use() { + self.disable_intx(); + } + } +} + + +// First BAR offset in the PCI config space. 
+const PCI_CONFIG_BAR_OFFSET: u32 = 0x10; +// Capability register offset in the PCI config space. +const PCI_CONFIG_CAPABILITY_OFFSET: u32 = 0x34; +// IO BAR when first BAR bit is 1. +const PCI_CONFIG_IO_BAR: u32 = 0x1; +// Memory BAR flags (lower 4 bits). +const PCI_CONFIG_MEMORY_BAR_FLAG_MASK: u32 = 0xf; +// 64-bit memory bar flag. +const PCI_CONFIG_MEMORY_BAR_64BIT: u32 = 0x4; +// PCI config register size (4 bytes). +const PCI_CONFIG_REGISTER_SIZE: usize = 4; +// Number of BARs for a PCI device +const BAR_NUMS: usize = 6; +// PCI Header Type register index +const PCI_HEADER_TYPE_REG_INDEX: usize = 3; +// First BAR register index +const PCI_CONFIG_BAR0_INDEX: usize = 4; +// PCI ROM expansion BAR register index +const PCI_ROM_EXP_BAR_INDEX: usize = 12; + +impl PciDevice for VfioPciDevice { + fn allocate_bars( + &mut self, + allocator: &mut SystemAllocator, + ) -> std::result::Result, PciDeviceError> + { + let mut ranges = Vec::new(); + let mut bar_id = VFIO_PCI_BAR0_REGION_INDEX as u32; + + // Going through all regular regions to compute the BAR size. + // We're not saving the BAR address to restore it, because we + // are going to allocate a guest address for each BAR and write + // that new address back. + while bar_id < VFIO_PCI_CONFIG_REGION_INDEX { + let mut lsb_size: u32 = 0xffff_ffff; + let mut msb_size = 0; + let mut region_size: u64; + let bar_addr: GuestAddress; + + // Read the BAR size (Starts by all 1s to the BAR) + let bar_offset = if bar_id == VFIO_PCI_ROM_REGION_INDEX { + (PCI_ROM_EXP_BAR_INDEX * 4) as u32 + } else { + PCI_CONFIG_BAR_OFFSET + bar_id * 4 + }; + + self.vfio_pci_configuration + .write_config_dword(lsb_size, bar_offset); + lsb_size = self.vfio_pci_configuration.read_config_dword(bar_offset); + + // We've just read the BAR size back. Or at least its LSB. + let lsb_flag = lsb_size & PCI_CONFIG_MEMORY_BAR_FLAG_MASK; + + if lsb_size == 0 { + bar_id += 1; + continue; + } + + // Is this an IO BAR? 
+ let io_bar = if bar_id != VFIO_PCI_ROM_REGION_INDEX { + matches!(lsb_flag & PCI_CONFIG_IO_BAR, PCI_CONFIG_IO_BAR) + } else { + false + }; + + // Is this a 64-bit BAR? + let is_64bit_bar = if bar_id != VFIO_PCI_ROM_REGION_INDEX { + matches!( + lsb_flag & PCI_CONFIG_MEMORY_BAR_64BIT, + PCI_CONFIG_MEMORY_BAR_64BIT + ) + } else { + false + }; + + // By default, the region type is 32 bits memory BAR. + let mut region_type = PciBarRegionType::Memory32BitRegion; + + if io_bar { + #[cfg(target_arch = "x86_64")] + { + // IO BAR + region_type = PciBarRegionType::IoRegion; + + // Clear first bit. + lsb_size &= 0xffff_fffc; + + // Find the first bit that's set to 1. + let first_bit = lsb_size.trailing_zeros(); + region_size = 2u64.pow(first_bit); + // We need to allocate a guest PIO address range for that BAR. + // The address needs to be 4 bytes aligned. + bar_addr = allocator + .allocate_io_addresses(None, region_size, Some(0x4)) + .ok_or(PciDeviceError::IoAllocationFailed(region_size))?; + } + #[cfg(target_arch = "aarch64")] + unimplemented!() + } else { + if is_64bit_bar { + // 64 bits Memory BAR + region_type = PciBarRegionType::Memory64BitRegion; + + msb_size = 0xffff_ffff; + let msb_bar_offset: u32 = PCI_CONFIG_BAR_OFFSET + (bar_id + 1) * 4; + + self.vfio_pci_configuration + .write_config_dword(msb_size, msb_bar_offset); + + msb_size = self + .vfio_pci_configuration + .read_config_dword(msb_bar_offset); + } + + // Clear the first four bytes from our LSB. + lsb_size &= 0xffff_fff0; + + region_size = u64::from(msb_size); + region_size <<= 32; + region_size |= u64::from(lsb_size); + + // Find the first that's set to 1. + let first_bit = region_size.trailing_zeros(); + region_size = 2u64.pow(first_bit); + + // We need to allocate a guest MMIO address range for that BAR. + // In case the BAR is mappable directly, this means it might be + // set as user memory region, which expects to deal with 4K + // pages. Therefore, the alignment has to be set accordingly. 
+ let bar_alignment = if (bar_id == VFIO_PCI_ROM_REGION_INDEX) + || (self.device.get_region_flags(bar_id) & VFIO_REGION_INFO_FLAG_MMAP != 0) + { + // 4K alignment + 0x1000 + } else { + // Default 16 bytes alignment + 0x10 + }; + if is_64bit_bar { + bar_addr = allocator + .allocate_mmio_addresses(None, region_size, Some(bar_alignment)) + .ok_or(PciDeviceError::IoAllocationFailed(region_size))?; + } else { + bar_addr = allocator + .allocate_mmio_hole_addresses(None, region_size, Some(bar_alignment)) + .ok_or(PciDeviceError::IoAllocationFailed(region_size))?; + } + } + + let reg_idx = if bar_id == VFIO_PCI_ROM_REGION_INDEX { + PCI_ROM_EXP_BAR_INDEX + } else { + bar_id as usize + }; + + // We can now build our BAR configuration block. + let config = PciBarConfiguration::default() + .set_register_index(reg_idx) + .set_address(bar_addr.raw_value()) + .set_size(region_size) + .set_region_type(region_type); + + if bar_id == VFIO_PCI_ROM_REGION_INDEX { + self.configuration + .add_pci_rom_bar(&config, lsb_flag & 0x1) + .map_err(|e| PciDeviceError::IoRegistrationFailed(bar_addr.raw_value(), e))?; + } else { + self.configuration + .add_pci_bar(&config) + .map_err(|e| PciDeviceError::IoRegistrationFailed(bar_addr.raw_value(), e))?; + } + + error!("Bar addr: {:?}", bar_addr); + ranges.push((bar_addr, region_size, region_type)); + self.mmio_regions.push(MmioRegion { + start: bar_addr, + length: region_size, + type_: region_type, + index: bar_id as u32, + mem_slot: None, + host_addr: None, + mmap_size: None, + }); + + + bar_id += 1; + if is_64bit_bar { + bar_id += 1; + } + } + + Ok(ranges) + } + + fn free_bars( + &mut self, + allocator: &mut SystemAllocator, + ) -> std::result::Result<(), PciDeviceError> { + for region in self.mmio_regions.iter() { + match region.type_ { + PciBarRegionType::IoRegion => { + #[cfg(target_arch = "x86_64")] + allocator.free_io_addresses(region.start, region.length); + #[cfg(target_arch = "aarch64")] + error!("I/O region is not supported"); + } + 
PciBarRegionType::Memory32BitRegion => { + allocator.free_mmio_hole_addresses(region.start, region.length); + } + PciBarRegionType::Memory64BitRegion => { + allocator.free_mmio_addresses(region.start, region.length); + } + } + } + Ok(()) + } + + fn write_config_register( + &mut self, + reg_idx: usize, + offset: u64, + data: &[u8], + ) -> Option> { + // When the guest wants to write to a BAR, we trap it into + // our local configuration space. We're not reprogramming + // VFIO device. + if (PCI_CONFIG_BAR0_INDEX..PCI_CONFIG_BAR0_INDEX + BAR_NUMS).contains(®_idx) + || reg_idx == PCI_ROM_EXP_BAR_INDEX + { + // We keep our local cache updated with the BARs. + // We'll read it back from there when the guest is asking + // for BARs (see read_config_register()). + self.configuration + .write_config_register(reg_idx, offset, data); + return None; + } + + let reg = (reg_idx * PCI_CONFIG_REGISTER_SIZE) as u64; + + // If the MSI or MSI-X capabilities are accessed, we need to + // update our local cache accordingly. + // Depending on how the capabilities are modified, this could + // trigger a VFIO MSI or MSI-X toggle. + if let Some((cap_id, cap_base)) = self.interrupt.accessed(reg) { + let cap_offset: u64 = reg - cap_base + offset; + match cap_id { + PciCapabilityId::MessageSignalledInterrupts => { + if let Err(e) = self.update_msi_capabilities(cap_offset, data) { + error!("Could not update MSI capabilities: {}", e); + } + } + PciCapabilityId::MsiX => { + if let Err(e) = self.update_msix_capabilities(cap_offset, data) { + error!("Could not update MSI-X capabilities: {}", e); + } + } + _ => {} + } + } + + // Make sure to write to the device's PCI config space after MSI/MSI-X + // interrupts have been enabled/disabled. In case of MSI, when the + // interrupts are enabled through VFIO (using VFIO_DEVICE_SET_IRQS), + // the MSI Enable bit in the MSI capability structure found in the PCI + // config space is disabled by default. 
That's why when the guest is + // enabling this bit, we first need to enable the MSI interrupts with + // VFIO through VFIO_DEVICE_SET_IRQS ioctl, and only after we can write + // to the device region to update the MSI Enable bit. + self.device + .region_write(VFIO_PCI_CONFIG_REGION_INDEX, data, reg + offset); + + None + } + + fn read_config_register(&mut self, reg_idx: usize) -> u32 { + // When reading the BARs, we trap it and return what comes + // from our local configuration space. We want the guest to + // use that and not the VFIO device BARs as it does not map + // with the guest address space. + if (PCI_CONFIG_BAR0_INDEX..PCI_CONFIG_BAR0_INDEX + BAR_NUMS).contains(®_idx) + || reg_idx == PCI_ROM_EXP_BAR_INDEX + { + return self.configuration.read_reg(reg_idx); + } + + // Since we don't support passing multi-functions devices, we should + // mask the multi-function bit, bit 7 of the Header Type byte on the + // register 3. + let mask = if reg_idx == PCI_HEADER_TYPE_REG_INDEX { + 0xff7f_ffffu32 + } else { + 0xffff_ffffu32 + }; + + // The config register read comes from the VFIO device itself. + self.vfio_pci_configuration + .read_config_dword((reg_idx * 4) as u32) + & mask + } + + fn detect_bar_reprogramming( + &mut self, + reg_idx: usize, + data: &[u8], + ) -> Option { + self.configuration.detect_bar_reprogramming(reg_idx, data) + } + + fn read_bar(&mut self, base: u64, offset: u64, data: &mut [u8]) { + let addr = base + offset; + if let Some(region) = self.find_region(addr) { + let offset = addr - region.start.raw_value(); + + if self.interrupt.msix_table_accessed(region.index, offset) { + self.interrupt.msix_read_table(offset, data); + } else { + self.device.region_read(region.index, data, offset); + } + } + + // INTx EOI + // The guest reading from the BAR potentially means the interrupt has + // been received and can be acknowledged. 
+ if self.interrupt.intx_in_use() { + if let Err(e) = self.device.unmask_irq(VFIO_PCI_INTX_IRQ_INDEX) { + error!("Failed unmasking INTx IRQ: {}", e); + } + } + } + + fn write_bar(&mut self, base: u64, offset: u64, data: &[u8]) -> Option> { + let addr = base + offset; + if let Some(region) = self.find_region(addr) { + let offset = addr - region.start.raw_value(); + + // If the MSI-X table is written to, we need to update our cache. + if self.interrupt.msix_table_accessed(region.index, offset) { + self.interrupt.msix_write_table(offset, data); + } else { + self.device.region_write(region.index, data, offset); + } + } + + // INTx EOI + // The guest writing to the BAR potentially means the interrupt has + // been received and can be acknowledged. + if self.interrupt.intx_in_use() { + if let Err(e) = self.device.unmask_irq(VFIO_PCI_INTX_IRQ_INDEX) { + error!("Failed unmasking INTx IRQ: {}", e); + } + } + + None + } + + fn move_bar(&mut self, old_base: u64, new_base: u64) -> result::Result<(), io::Error> { + for region in self.mmio_regions.iter_mut() { + if region.start.raw_value() == old_base { + region.start = GuestAddress(new_base); + + if let Some(mem_slot) = region.mem_slot { + if let Some(host_addr) = region.host_addr { + let mmap_offset = self.device.get_region_offset(region.index); + let mmap_size = self.device.get_region_size(region.index); + + // Remove old region + let old_mem_region = Self::make_user_memory_region( + mem_slot, + old_base + mmap_offset, + 0, + host_addr as u64, + false, + false, + ); + + unsafe { self.vm.lock().expect("Poisoned lock") + .set_user_memory_region(old_mem_region) + .map_err(|e| io::Error::new(io::ErrorKind::Other, e))?; + } + + // Insert new region + let new_mem_region = Self::make_user_memory_region( + mem_slot, + new_base + mmap_offset, + mmap_size as u64, + host_addr as u64, + false, + false, + ); + + unsafe { self.vm.lock().expect("Poisoned lock") + .set_user_memory_region(new_mem_region) + .map_err(|e| 
io::Error::new(io::ErrorKind::Other, e))?; + } + } + } + } + } + + Ok(()) + } + + fn as_any(&mut self) -> &mut dyn Any { + self + } +} \ No newline at end of file diff --git a/src/vm-device/Cargo.toml b/src/vm-device/Cargo.toml new file mode 100644 index 00000000000..7c9d1864b7a --- /dev/null +++ b/src/vm-device/Cargo.toml @@ -0,0 +1,16 @@ +[package] +name = "vm-device" +version = "0.1.0" +authors = ["The Cloud Hypervisor Authors"] +edition = "2018" + +[dependencies] +anyhow = "1.0" +thiserror = "1.0" +serde = {version = ">=1.0.27", features = ["rc"] } +serde_derive = ">=1.0.27" +serde_json = ">=1.0.9" +vfio-ioctls = { git = "https://github.com/rust-vmm/vfio", branch = "main" } +vm-memory = { version = "0.15.0", features = ["backend-mmap"] } +vmm-sys-util = ">=0.3.1" + diff --git a/src/vm-device/src/bus.rs b/src/vm-device/src/bus.rs new file mode 100644 index 00000000000..3388ee20514 --- /dev/null +++ b/src/vm-device/src/bus.rs @@ -0,0 +1,362 @@ +// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 +// +// Portions Copyright 2017 The Chromium OS Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE-BSD-3-Clause file. + +//! Handles routing to devices in an address space. + +use std::cmp::{Ord, Ordering, PartialEq, PartialOrd}; +use std::collections::btree_map::BTreeMap; +use std::sync::{Arc, Barrier, Mutex, RwLock, Weak}; +use std::{convert, error, fmt, io, result}; + +/// Trait for devices that respond to reads or writes in an arbitrary address space. +/// +/// The device does not care where it exists in address space as each method is only given an offset +/// into its allocated portion of address space. 
+#[allow(unused_variables)] +pub trait BusDevice: Send { + /// Reads at `offset` from this device + fn read(&mut self, base: u64, offset: u64, data: &mut [u8]) {} + /// Writes at `offset` into this device + fn write(&mut self, base: u64, offset: u64, data: &[u8]) -> Option> { + None + } + /// Triggers the `irq_mask` interrupt on this device + fn interrupt(&self, irq_mask: u32) {} +} + +#[derive(Debug)] +pub enum Error { + /// The insertion failed because the new device overlapped with an old device. + Overlap, + /// Failed to operate on zero sized range. + ZeroSizedRange, + /// Failed to find address range. + MissingAddressRange, +} + +pub type Result = result::Result; + +impl fmt::Display for Error { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "bus_error: {:?}", self) + } +} + +impl error::Error for Error {} + +impl convert::From for io::Error { + fn from(e: Error) -> Self { + io::Error::new(io::ErrorKind::Other, e) + } +} + +/// Holds a base and length representing the address space occupied by a `BusDevice`. +/// +/// * base - The address at which the range start. +/// * len - The length of the range in bytes. +#[derive(Debug, Copy, Clone)] +pub struct BusRange { + pub base: u64, + pub len: u64, +} + +impl BusRange { + /// Returns true if there is overlap with the given range. + pub fn overlaps(&self, base: u64, len: u64) -> bool { + self.base < (base + len) && base < self.base + self.len + } +} + +impl Eq for BusRange {} + +impl PartialEq for BusRange { + fn eq(&self, other: &BusRange) -> bool { + self.base == other.base + } +} + +impl Ord for BusRange { + fn cmp(&self, other: &BusRange) -> Ordering { + self.base.cmp(&other.base) + } +} + +impl PartialOrd for BusRange { + fn partial_cmp(&self, other: &BusRange) -> Option { + self.base.partial_cmp(&other.base) + } +} + +/// A device container for routing reads and writes over some address space. 
+/// +/// This doesn't have any restrictions on what kind of device or address space this applies to. The +/// only restriction is that no two devices can overlap in this address space. +#[derive(Default)] +pub struct Bus { + devices: RwLock>>>, +} + +impl Bus { + /// Constructs a bus with an empty address space. + pub fn new() -> Bus { + Bus { + devices: RwLock::new(BTreeMap::new()), + } + } + + fn first_before(&self, addr: u64) -> Option<(BusRange, Arc>)> { + let devices = self.devices.read().unwrap(); + let (range, dev) = devices + .range(..=BusRange { base: addr, len: 1 }) + .rev() + .next()?; + dev.upgrade().map(|d| (*range, d.clone())) + } + + #[allow(clippy::type_complexity)] + pub fn resolve(&self, addr: u64) -> Option<(u64, u64, Arc>)> { + if let Some((range, dev)) = self.first_before(addr) { + let offset = addr - range.base; + if offset < range.len { + return Some((range.base, offset, dev)); + } + } + None + } + + /// Puts the given device at the given address space. + pub fn insert(&self, device: Arc>, base: u64, len: u64) -> Result<()> { + if len == 0 { + return Err(Error::ZeroSizedRange); + } + + // Reject all cases where the new device's range overlaps with an existing device. + if self + .devices + .read() + .unwrap() + .iter() + .any(|(range, _dev)| range.overlaps(base, len)) + { + return Err(Error::Overlap); + } + + if self + .devices + .write() + .unwrap() + .insert(BusRange { base, len }, Arc::downgrade(&device)) + .is_some() + { + return Err(Error::Overlap); + } + + Ok(()) + } + + /// Removes the device at the given address space range. + pub fn remove(&self, base: u64, len: u64) -> Result<()> { + if len == 0 { + return Err(Error::ZeroSizedRange); + } + + let bus_range = BusRange { base, len }; + + if self.devices.write().unwrap().remove(&bus_range).is_none() { + return Err(Error::MissingAddressRange); + } + + Ok(()) + } + + /// Removes all entries referencing the given device.
+ pub fn remove_by_device(&self, device: &Arc>) -> Result<()> { + let mut device_list = self.devices.write().unwrap(); + let mut remove_key_list = Vec::new(); + + for (key, value) in device_list.iter() { + if Arc::ptr_eq(&value.upgrade().unwrap(), device) { + remove_key_list.push(*key); + } + } + + for key in remove_key_list.iter() { + device_list.remove(key); + } + + Ok(()) + } + + /// Updates the address range for an existing device. + pub fn update_range( + &self, + old_base: u64, + old_len: u64, + new_base: u64, + new_len: u64, + ) -> Result<()> { + // Retrieve the device corresponding to the range + let device = if let Some((_, _, dev)) = self.resolve(old_base) { + dev.clone() + } else { + return Err(Error::MissingAddressRange); + }; + + // Remove the old address range + self.remove(old_base, old_len)?; + + // Insert the new address range + self.insert(device, new_base, new_len) + } + + /// Reads data from the device that owns the range containing `addr` and puts it into `data`. + /// + /// Returns true on success, otherwise `data` is untouched. + pub fn read(&self, addr: u64, data: &mut [u8]) -> Result<()> { + if let Some((base, offset, dev)) = self.resolve(addr) { + // OK to unwrap as lock() failing is a serious error condition and should panic. + dev.lock() + .expect("Failed to acquire device lock") + .read(base, offset, data); + Ok(()) + } else { + Err(Error::MissingAddressRange) + } + } + + /// Writes `data` to the device that owns the range containing `addr`. + /// + /// Returns true on success, otherwise `data` is untouched. + pub fn write(&self, addr: u64, data: &[u8]) -> Result>> { + if let Some((base, offset, dev)) = self.resolve(addr) { + // OK to unwrap as lock() failing is a serious error condition and should panic. 
+ Ok(dev + .lock() + .expect("Failed to acquire device lock") + .write(base, offset, data)) + } else { + Err(Error::MissingAddressRange) + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + struct DummyDevice; + impl BusDevice for DummyDevice {} + + struct ConstantDevice; + impl BusDevice for ConstantDevice { + fn read(&mut self, _base: u64, offset: u64, data: &mut [u8]) { + for (i, v) in data.iter_mut().enumerate() { + *v = (offset as u8) + (i as u8); + } + } + + fn write(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option> { + for (i, v) in data.iter().enumerate() { + assert_eq!(*v, (offset as u8) + (i as u8)) + } + + None + } + } + + #[test] + fn bus_insert() { + let bus = Bus::new(); + let dummy = Arc::new(Mutex::new(DummyDevice)); + assert!(bus.insert(dummy.clone(), 0x10, 0).is_err()); + assert!(bus.insert(dummy.clone(), 0x10, 0x10).is_ok()); + + let result = bus.insert(dummy.clone(), 0x0f, 0x10); + assert!(result.is_err()); + assert_eq!(format!("{:?}", result), "Err(Overlap)"); + + assert!(bus.insert(dummy.clone(), 0x10, 0x10).is_err()); + assert!(bus.insert(dummy.clone(), 0x10, 0x15).is_err()); + assert!(bus.insert(dummy.clone(), 0x12, 0x15).is_err()); + assert!(bus.insert(dummy.clone(), 0x12, 0x01).is_err()); + assert!(bus.insert(dummy.clone(), 0x0, 0x20).is_err()); + assert!(bus.insert(dummy.clone(), 0x20, 0x05).is_ok()); + assert!(bus.insert(dummy.clone(), 0x25, 0x05).is_ok()); + assert!(bus.insert(dummy, 0x0, 0x10).is_ok()); + } + + #[test] + #[allow(clippy::redundant_clone)] + fn bus_read_write() { + let bus = Bus::new(); + let dummy = Arc::new(Mutex::new(DummyDevice)); + assert!(bus.insert(dummy.clone(), 0x10, 0x10).is_ok()); + assert!(bus.read(0x10, &mut [0, 0, 0, 0]).is_ok()); + assert!(bus.write(0x10, &[0, 0, 0, 0]).is_ok()); + assert!(bus.read(0x11, &mut [0, 0, 0, 0]).is_ok()); + assert!(bus.write(0x11, &[0, 0, 0, 0]).is_ok()); + assert!(bus.read(0x16, &mut [0, 0, 0, 0]).is_ok()); + assert!(bus.write(0x16, &[0, 0, 0, 0]).is_ok()); + 
assert!(bus.read(0x20, &mut [0, 0, 0, 0]).is_err()); + assert!(bus.write(0x20, &[0, 0, 0, 0]).is_err()); + assert!(bus.read(0x06, &mut [0, 0, 0, 0]).is_err()); + assert!(bus.write(0x06, &[0, 0, 0, 0]).is_err()); + } + + #[test] + #[allow(clippy::redundant_clone)] + fn bus_read_write_values() { + let bus = Bus::new(); + let dummy = Arc::new(Mutex::new(ConstantDevice)); + assert!(bus.insert(dummy.clone(), 0x10, 0x10).is_ok()); + + let mut values = [0, 1, 2, 3]; + assert!(bus.read(0x10, &mut values).is_ok()); + assert_eq!(values, [0, 1, 2, 3]); + assert!(bus.write(0x10, &values).is_ok()); + assert!(bus.read(0x15, &mut values).is_ok()); + assert_eq!(values, [5, 6, 7, 8]); + assert!(bus.write(0x15, &values).is_ok()); + } + + #[test] + #[allow(clippy::redundant_clone)] + fn busrange_cmp() { + let range = BusRange { base: 0x10, len: 2 }; + assert_eq!(range, BusRange { base: 0x10, len: 3 }); + assert_eq!(range, BusRange { base: 0x10, len: 2 }); + + assert!(range < BusRange { base: 0x12, len: 1 }); + assert!(range < BusRange { base: 0x12, len: 3 }); + + assert_eq!(range, range.clone()); + + let bus = Bus::new(); + let mut data = [1, 2, 3, 4]; + let device = Arc::new(Mutex::new(DummyDevice)); + assert!(bus.insert(device.clone(), 0x10, 0x10).is_ok()); + assert!(bus.write(0x10, &data).is_ok()); + assert!(bus.read(0x10, &mut data).is_ok()); + assert_eq!(data, [1, 2, 3, 4]); + } + + #[test] + fn bus_range_overlap() { + let a = BusRange { + base: 0x1000, + len: 0x400, + }; + assert!(a.overlaps(0x1000, 0x400)); + assert!(a.overlaps(0xf00, 0x400)); + assert!(a.overlaps(0x1000, 0x01)); + assert!(a.overlaps(0xfff, 0x02)); + assert!(a.overlaps(0x1100, 0x100)); + assert!(a.overlaps(0x13ff, 0x100)); + assert!(!a.overlaps(0x1400, 0x100)); + assert!(!a.overlaps(0xf00, 0x100)); + } +} diff --git a/src/vm-device/src/dma_mapping/mod.rs b/src/vm-device/src/dma_mapping/mod.rs new file mode 100644 index 00000000000..62b5ceb1ced --- /dev/null +++ b/src/vm-device/src/dma_mapping/mod.rs @@ -0,0 
+1,17 @@ +// Copyright © 2021 Intel Corporation +// +// SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause + +pub mod vfio; + +/// Trait meant for triggering the DMA mapping update related to an external +/// device not managed fully through virtio. It is dedicated to virtio-iommu +/// in order to trigger the map update anytime the mapping is updated from the +/// guest. +pub trait ExternalDmaMapping: Send + Sync { + /// Map a memory range + fn map(&self, iova: u64, gpa: u64, size: u64) -> std::result::Result<(), std::io::Error>; + +  /// Unmap a memory range + fn unmap(&self, iova: u64, size: u64) -> std::result::Result<(), std::io::Error>; +} diff --git a/src/vm-device/src/dma_mapping/vfio.rs b/src/vm-device/src/dma_mapping/vfio.rs new file mode 100644 index 00000000000..5ed7f516887 --- /dev/null +++ b/src/vm-device/src/dma_mapping/vfio.rs @@ -0,0 +1,73 @@ +// Copyright © 2021 Intel Corporation +// +// SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause + +use crate::dma_mapping::ExternalDmaMapping; +use std::io; +use std::sync::Arc; +use vfio_ioctls::VfioContainer; +use vm_memory::{GuestAddress, GuestAddressSpace, GuestMemory}; + +/// This structure implements the ExternalDmaMapping trait. It is meant to +/// be used when the caller tries to provide a way to update the mappings +/// associated with a specific VFIO container. +pub struct VfioDmaMapping { + container: Arc, + memory: Arc, +} + +impl VfioDmaMapping { + /// Create a DmaMapping object. + /// + /// # Parameters + /// * `container`: VFIO container object. + /// * `memory`: guest memory to mmap.
+ pub fn new(container: Arc, memory: Arc) -> Self { + VfioDmaMapping { container, memory } + } +} + +impl ExternalDmaMapping for VfioDmaMapping { + fn map(&self, iova: u64, gpa: u64, size: u64) -> std::result::Result<(), io::Error> { + let mem = self.memory.memory(); + let guest_addr = GuestAddress(gpa); + let user_addr = if mem.check_range(guest_addr, size as usize) { + mem.get_host_address(guest_addr).unwrap() as u64 + } else { + return Err(io::Error::new( + io::ErrorKind::Other, + format!( + "failed to convert guest address 0x{:x} into \ + host user virtual address", + gpa + ), + )); + }; + + self.container + .vfio_dma_map(iova, size, user_addr) + .map_err(|e| { + io::Error::new( + io::ErrorKind::Other, + format!( + "failed to map memory for VFIO container, \ + iova 0x{:x}, gpa 0x{:x}, size 0x{:x}: {:?}", + iova, gpa, size, e + ), + ) + }) + } + + fn unmap(&self, iova: u64, size: u64) -> std::result::Result<(), io::Error> { + self.container.vfio_dma_unmap(iova, size).map_err(|e| { + io::Error::new( + io::ErrorKind::Other, + format!( + "failed to unmap memory for VFIO container, \ + iova 0x{:x}, size 0x{:x}: {:?}", + iova, size, e + ), + ) + }) + } +} diff --git a/src/vm-device/src/interrupt/mod.rs b/src/vm-device/src/interrupt/mod.rs new file mode 100644 index 00000000000..02fb028743c --- /dev/null +++ b/src/vm-device/src/interrupt/mod.rs @@ -0,0 +1,197 @@ +// Copyright (C) 2019 Alibaba Cloud. All rights reserved. +// Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// Copyright © 2019 Intel Corporation +// +// SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause + +//! Traits and Structs to manage interrupt sources for devices. +//! +//! In system programming, an interrupt is a signal to the processor emitted by hardware or +//! software indicating an event that needs immediate attention. An interrupt alerts the processor +//! to a high-priority condition requiring the interruption of the current code the processor is +//! executing. 
The processor responds by suspending its current activities, saving its state, and +//! executing a function called an interrupt handler (or an interrupt service routine, ISR) to deal +//! with the event. This interruption is temporary, and, after the interrupt handler finishes, +//! unless handling the interrupt has emitted a fatal error, the processor resumes normal +//! activities. +//! +//! Hardware interrupts are used by devices to communicate that they require attention from the +//! operating system, or a bare-metal program running on the CPU if there are no OSes. The act of +//! initiating a hardware interrupt is referred to as an interrupt request (IRQ). Different devices +//! are usually associated with different interrupts using a unique value associated with each +//! interrupt. This makes it possible to know which hardware device caused which interrupts. +//! These interrupt values are often called IRQ lines, or just interrupt lines. +//! +//! Nowadays, IRQ lines is not the only mechanism to deliver device interrupts to processors. +//! MSI [(Message Signaled Interrupt)](https://en.wikipedia.org/wiki/Message_Signaled_Interrupts) +//! is another commonly used alternative in-band method of signaling an interrupt, using special +//! in-band messages to replace traditional out-of-band assertion of dedicated interrupt lines. +//! While more complex to implement in a device, message signaled interrupts have some significant +//! advantages over pin-based out-of-band interrupt signaling. Message signaled interrupts are +//! supported in PCI bus since its version 2.2, and in later available PCI Express bus. Some +//! non-PCI architectures also use message signaled interrupts. +//! +//! While IRQ is a term commonly used by Operating Systems when dealing with hardware +//! interrupts, the IRQ numbers managed by OSes are independent of the ones managed by VMM. +//! For simplicity sake, the term `Interrupt Source` is used instead of IRQ to represent both +//! 
pin-based interrupts and MSI interrupts. +//! +//! A device may support multiple types of interrupts, and each type of interrupt may support one +//! or multiple interrupt sources. For example, a PCI device may support: +//! * Legacy Irq: exactly one interrupt source. +//! * PCI MSI Irq: 1,2,4,8,16,32 interrupt sources. +//! * PCI MSIx Irq: 2^n(n=0-11) interrupt sources. +//! +//! A distinct Interrupt Source Identifier (ISID) will be assigned to each interrupt source. +//! An ID allocator will be used to allocate and free Interrupt Source Identifiers for devices. +//! To decouple the vm-device crate from the ID allocator, the vm-device crate doesn't take the +//! responsibility to allocate/free Interrupt Source IDs but only makes use of assigned IDs. +//! +//! The overall flow to deal with interrupts is: +//! * The VMM creates an interrupt manager +//! * The VMM creates a device manager, passing on an reference to the interrupt manager +//! * The device manager passes on an reference to the interrupt manager to all registered devices +//! * The guest kernel loads drivers for virtual devices +//! * The guest device driver determines the type and number of interrupts needed, and update the +//! device configuration +//! * The virtual device backend requests the interrupt manager to create an interrupt group +//! according to guest configuration information + +use std::sync::Arc; +use vmm_sys_util::eventfd::EventFd; + +/// Reuse std::io::Result to simplify interoperability among crates. +pub type Result = std::io::Result; + +/// Data type to store an interrupt source identifier. +pub type InterruptIndex = u32; + +/// Configuration data for legacy interrupts. +/// +/// On x86 platforms, legacy interrupts means those interrupts routed through PICs or IOAPICs. +#[derive(Copy, Clone, Debug)] +pub struct LegacyIrqSourceConfig { + pub irqchip: u32, + pub pin: u32, +} + +/// Configuration data for MSI/MSI-X interrupts. 
+/// +/// On x86 platforms, these interrupts are vectors delivered directly to the LAPIC. +#[derive(Copy, Clone, Debug, Default)] +pub struct MsiIrqSourceConfig { + /// High address to delivery message signaled interrupt. + pub high_addr: u32, + /// Low address to delivery message signaled interrupt. + pub low_addr: u32, + /// Data to write to delivery message signaled interrupt. + pub data: u32, + /// Unique ID of the device to delivery message signaled interrupt. + pub devid: u32, +} + +/// Configuration data for an interrupt source. +#[derive(Copy, Clone, Debug)] +pub enum InterruptSourceConfig { + /// Configuration data for Legacy interrupts. + LegacyIrq(LegacyIrqSourceConfig), + /// Configuration data for PciMsi, PciMsix and generic MSI interrupts. + MsiIrq(MsiIrqSourceConfig), +} + +/// Configuration data for legacy, pin based interrupt groups. +/// +/// A legacy interrupt group only takes one irq number as its configuration. +#[derive(Copy, Clone, Debug)] +pub struct LegacyIrqGroupConfig { + /// Legacy irq number. + pub irq: InterruptIndex, +} + +/// Configuration data for MSI/MSI-X interrupt groups +/// +/// MSI/MSI-X interrupt groups are basically a set of vectors. +#[derive(Copy, Clone, Debug)] +pub struct MsiIrqGroupConfig { + /// First index of the MSI/MSI-X interrupt vectors + pub base: InterruptIndex, + /// Number of vectors in the MSI/MSI-X group. + pub count: InterruptIndex, +} + +/// Trait to manage interrupt sources for virtual device backends. +/// +/// The InterruptManager implementations should protect itself from concurrent accesses internally, +/// so it could be invoked from multi-threaded context. +pub trait InterruptManager: { + type GroupConfig; + + /// Create an [InterruptSourceGroup](trait.InterruptSourceGroup.html) object to manage + /// interrupt sources for a virtual device + /// + /// An [InterruptSourceGroup](trait.InterruptSourceGroup.html) object manages all interrupt + /// sources of the same type for a virtual device. 
+ /// + /// # Arguments + /// * interrupt_type: type of interrupt source. + /// * base: base Interrupt Source ID to be managed by the group object. + /// * count: number of Interrupt Sources to be managed by the group object. + fn create_group(&self, config: Self::GroupConfig) + -> Result>>; + + /// Destroy an [InterruptSourceGroup](trait.InterruptSourceGroup.html) object created by + /// [create_group()](trait.InterruptManager.html#tymethod.create_group). + /// + /// Assume the caller takes the responsibility to disable all interrupt sources of the group + /// before calling destroy_group(). This assumption helps to simplify InterruptSourceGroup + /// implementations. + fn destroy_group(&self, group: Arc>) -> Result<()>; +} + +pub trait InterruptSourceGroup: Send + Sync { + /// Enable the interrupt sources in the group to generate interrupts. + fn enable(&self) -> Result<()> { + // Not all interrupt sources can be enabled. + // To accommodate this, we can have a no-op here. + Ok(()) + } + + /// Disable the interrupt sources in the group to generate interrupts. + fn disable(&self) -> Result<()> { + // Not all interrupt sources can be disabled. + // To accommodate this, we can have a no-op here. + Ok(()) + } + + /// Inject an interrupt from this interrupt source into the guest. + fn trigger(&self, index: InterruptIndex) -> Result<()>; + + /// Returns an interrupt notifier from this interrupt. + /// + /// An interrupt notifier allows for external components and processes + /// to inject interrupts into a guest, by writing to the file returned + /// by this method. + #[allow(unused_variables)] + fn notifier(&self, index: InterruptIndex) -> Option; + + /// Update the interrupt source group configuration. + /// + /// # Arguments + /// * index: sub-index into the group. + /// * config: configuration data for the interrupt source. + fn update(&self, index: InterruptIndex, config: InterruptSourceConfig) -> Result<()>; + + /// Mask an interrupt from this interrupt source. 
+ fn mask(&self, _index: InterruptIndex) -> Result<()> { + // Not all interrupt sources can be disabled. + // To accommodate this, we can have a no-op here. + Ok(()) + } + + /// Unmask an interrupt from this interrupt source. + fn unmask(&self, _index: InterruptIndex) -> Result<()> { + // Not all interrupt sources can be disabled. + // To accommodate this, we can have a no-op here. + Ok(()) + } +} diff --git a/src/vm-device/src/lib.rs b/src/vm-device/src/lib.rs new file mode 100644 index 00000000000..f5977a16144 --- /dev/null +++ b/src/vm-device/src/lib.rs @@ -0,0 +1,54 @@ +// Copyright © 2020 Intel Corporation +// +// SPDX-License-Identifier: Apache-2.0 +// + +#[macro_use] +extern crate serde_derive; +extern crate vm_memory; + +use std::io; + +mod bus; +pub mod dma_mapping; +pub mod interrupt; + +pub use self::bus::{Bus, BusDevice, Error as BusError}; + +#[derive(Debug)] +pub enum Error { + IoError(io::Error), +} + +/// Type of Message Signalled Interrupt +#[derive(Copy, Clone, Debug, PartialEq, Serialize, Deserialize)] +pub enum MsiIrqType { + /// PCI MSI IRQ numbers. + PciMsi, + /// PCI MSIx IRQ numbers. + PciMsix, + /// Generic MSI IRQ numbers. + GenericMsi, +} + +/// Enumeration for device resources. +#[allow(missing_docs)] +#[derive(Clone, Debug, Serialize, Deserialize)] +pub enum Resource { + /// IO Port address range. + PioAddressRange { base: u16, size: u16 }, + /// Memory Mapped IO address range. + MmioAddressRange { base: u64, size: u64 }, + /// Legacy IRQ number. + LegacyIrq(u32), + /// Message Signaled Interrupt + MsiIrq { + ty: MsiIrqType, + base: u32, + size: u32, + }, + /// Network Interface Card MAC address. + MacAddress(String), + /// KVM memslot index. 
+ KvmMemSlot(u32), +} diff --git a/src/vm-system-allocator/Cargo.toml b/src/vm-system-allocator/Cargo.toml new file mode 100644 index 00000000000..68253138453 --- /dev/null +++ b/src/vm-system-allocator/Cargo.toml @@ -0,0 +1,9 @@ +[package] +name = "vm-system-allocator" +version = "0.1.0" +authors = ["The Chromium OS Authors"] +edition = "2018" + +[dependencies] +libc = "0.2.159" +vm-memory = "0.15.0" diff --git a/src/vm-system-allocator/src/address.rs b/src/vm-system-allocator/src/address.rs new file mode 100644 index 00000000000..30d8ec5ef17 --- /dev/null +++ b/src/vm-system-allocator/src/address.rs @@ -0,0 +1,393 @@ +// Copyright 2018 The Chromium OS Authors. All rights reserved. +// Copyright © 2019 Intel Corporation +// +// Portions Copyright 2017 The Chromium OS Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE-BSD-3-Clause file. +// +// SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause + +use std::collections::btree_map::BTreeMap; +use std::result; +use vm_memory::{Address, GuestAddress, GuestUsize}; + +#[derive(Debug)] +pub enum Error { + Overflow, + Overlap, + UnalignedAddress, +} + +pub type Result = result::Result; + +/// Manages allocating address ranges. +/// Use `AddressAllocator` whenever an address range needs to be allocated to different users. +/// +/// # Examples +/// +/// ``` +/// # use vm_allocator::AddressAllocator; +/// # use vm_memory::{Address, GuestAddress, GuestUsize}; +/// AddressAllocator::new(GuestAddress(0x1000), 0x10000).map(|mut pool| { +/// assert_eq!(pool.allocate(None, 0x110, Some(0x100)), Some(GuestAddress(0x10e00))); +/// assert_eq!(pool.allocate(None, 0x100, Some(0x100)), Some(GuestAddress(0x10d00))); +/// }); +/// ``` +#[derive(Debug, Eq, PartialEq)] +pub struct AddressAllocator { + base: GuestAddress, + end: GuestAddress, + ranges: BTreeMap, +} + +impl AddressAllocator { + /// Creates a new `AddressAllocator` for managing a range of addresses. 
+ /// Can return `None` if `base` + `size` overflows a u64. + /// + /// * `base` - The starting address of the range to manage. + /// * `size` - The size of the address range in bytes. + pub fn new(base: GuestAddress, size: GuestUsize) -> Option { + if size == 0 { + return None; + } + + let end = base.checked_add(size - 1)?; + + let mut allocator = AddressAllocator { + base, + end, + ranges: BTreeMap::new(), + }; + + // Insert the last address as a zero size range. + // This is our end of address space marker. + allocator.ranges.insert(base.checked_add(size)?, 0); + + Some(allocator) + } + + fn align_address(&self, address: GuestAddress, alignment: GuestUsize) -> GuestAddress { + let align_adjust = if address.raw_value() % alignment != 0 { + alignment - (address.raw_value() % alignment) + } else { + 0 + }; + + address.unchecked_add(align_adjust) + } + + fn available_range( + &self, + req_address: GuestAddress, + req_size: GuestUsize, + alignment: GuestUsize, + ) -> Result { + let aligned_address = self.align_address(req_address, alignment); + + // The requested address should be aligned. + if aligned_address != req_address { + return Err(Error::UnalignedAddress); + } + + // The aligned address should be within the address space range. + if aligned_address >= self.end || aligned_address < self.base { + return Err(Error::Overflow); + } + + let mut prev_end_address = self.base; + for (address, size) in self.ranges.iter() { + if aligned_address <= *address { + // Do we overlap with the previous range? + if prev_end_address > aligned_address { + return Err(Error::Overlap); + } + + // Do we have enough space? + if address + .unchecked_sub(aligned_address.raw_value()) + .raw_value() + < req_size + { + return Err(Error::Overlap); + } + + return Ok(aligned_address); + } + + prev_end_address = address.unchecked_add(*size); + } + + // We have not found a range that starts after the requested address, + // despite having a marker at the end of our range. 
+ Err(Error::Overflow) + } + + fn first_available_range( + &self, + req_size: GuestUsize, + alignment: GuestUsize, + ) -> Option { + let reversed_ranges: Vec<(&GuestAddress, &GuestUsize)> = self.ranges.iter().rev().collect(); + + for (idx, (address, _size)) in reversed_ranges.iter().enumerate() { + let next_range_idx = idx + 1; + let prev_end_address = if next_range_idx >= reversed_ranges.len() { + self.base + } else { + reversed_ranges[next_range_idx] + .0 + .unchecked_add(*(reversed_ranges[next_range_idx].1)) + }; + + // If we have enough space between this range and the previous one, + // we return the start of this range minus the requested size. + // As each new range is allocated at the end of the available address space, + // we will tend to always allocate new ranges there as well. In other words, + // ranges accumulate at the end of the address space. + if let Some(size_delta) = + address.checked_sub(self.align_address(prev_end_address, alignment).raw_value()) + { + let adjust = if alignment > 1 { alignment - 1 } else { 0 }; + if size_delta.raw_value() >= req_size { + return Some( + self.align_address(address.unchecked_sub(req_size + adjust), alignment), + ); + } + } + } + + None + } + + /// Allocates a range of addresses from the managed region. Returns `Some(allocated_address)` + /// when successful, or `None` if an area of `size` can't be allocated or if alignment isn't + /// a power of two. 
+ pub fn allocate( + &mut self, + address: Option, + size: GuestUsize, + align_size: Option, + ) -> Option { + if size == 0 { + return None; + } + + let alignment = align_size.unwrap_or(4); + if !alignment.is_power_of_two() || alignment == 0 { + return None; + } + + let new_addr = match address { + Some(req_address) => match self.available_range(req_address, size, alignment) { + Ok(addr) => addr, + Err(_) => { + return None; + } + }, + None => self.first_available_range(size, alignment)?, + }; + + self.ranges.insert(new_addr, size); + + Some(new_addr) + } + + /// Free an already allocated address range. + /// We can only free a range if it matches exactly an already allocated range. + pub fn free(&mut self, address: GuestAddress, size: GuestUsize) { + if let Some(&range_size) = self.ranges.get(&address) { + if size == range_size { + self.ranges.remove(&address); + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn new_fails_overflow() { + assert_eq!( + AddressAllocator::new(GuestAddress(u64::max_value()), 0x100), + None + ); + } + + #[test] + fn new_fails_size_zero() { + assert_eq!(AddressAllocator::new(GuestAddress(0x1000), 0), None); + } + + #[test] + fn allocate_fails_alignment_zero() { + let mut pool = AddressAllocator::new(GuestAddress(0x1000), 0x10000).unwrap(); + assert_eq!( + pool.allocate(Some(GuestAddress(0x1000)), 0x100, Some(0)), + None + ); + } + + #[test] + fn allocate_fails_alignment_non_power_of_two() { + let mut pool = AddressAllocator::new(GuestAddress(0x1000), 0x10000).unwrap(); + assert_eq!( + pool.allocate(Some(GuestAddress(0x1000)), 0x100, Some(200)), + None + ); + } + + #[test] + fn allocate_fails_not_enough_space() { + let mut pool = AddressAllocator::new(GuestAddress(0x1000), 0x1000).unwrap(); + assert_eq!( + pool.allocate(None, 0x800, Some(0x100)), + Some(GuestAddress(0x1800)) + ); + assert_eq!(pool.allocate(None, 0x900, Some(0x100)), None); + assert_eq!( + pool.allocate(None, 0x400, Some(0x100)), + 
Some(GuestAddress(0x1400)) + ); + } + + #[test] + fn allocate_alignment() { + let mut pool = AddressAllocator::new(GuestAddress(0x1000), 0x10000).unwrap(); + assert_eq!( + pool.allocate(None, 0x110, Some(0x100)), + Some(GuestAddress(0x10e00)) + ); + assert_eq!( + pool.allocate(None, 0x100, Some(0x100)), + Some(GuestAddress(0x10d00)) + ); + assert_eq!( + pool.allocate(None, 0x10, Some(0x100)), + Some(GuestAddress(0x10c00)) + ); + } + + #[test] + fn allocate_address() { + let mut pool = AddressAllocator::new(GuestAddress(0x1000), 0x1000).unwrap(); + assert_eq!( + pool.allocate(Some(GuestAddress(0x1200)), 0x800, None), + Some(GuestAddress(0x1200)) + ); + + assert_eq!( + pool.allocate(Some(GuestAddress(0x1a00)), 0x100, None), + Some(GuestAddress(0x1a00)) + ); + } + + #[test] + fn allocate_address_alignment() { + let mut pool = AddressAllocator::new(GuestAddress(0x1000), 0x1000).unwrap(); + assert_eq!( + pool.allocate(Some(GuestAddress(0x1200)), 0x800, Some(0x100)), + Some(GuestAddress(0x1200)) + ); + + // Unaligned request + assert_eq!( + pool.allocate(Some(GuestAddress(0x1210)), 0x800, Some(0x100)), + None + ); + + // Aligned request + assert_eq!( + pool.allocate(Some(GuestAddress(0x1b00)), 0x100, Some(0x100)), + Some(GuestAddress(0x1b00)) + ); + } + + #[test] + fn allocate_address_not_enough_space() { + let mut pool = AddressAllocator::new(GuestAddress(0x1000), 0x1000).unwrap(); + + // First range is [0x1200:0x1a00] + assert_eq!( + pool.allocate(Some(GuestAddress(0x1200)), 0x800, Some(0x100)), + Some(GuestAddress(0x1200)) + ); + + // Second range is [0x1c00:0x1e00] + assert_eq!( + pool.allocate(Some(GuestAddress(0x1c00)), 0x200, Some(0x100)), + Some(GuestAddress(0x1c00)) + ); + + // There is 0x200 between the first 2 ranges. 
+ // We ask for an available address but the range is too big + assert_eq!( + pool.allocate(Some(GuestAddress(0x1b00)), 0x800, Some(0x100)), + None + ); + + // We ask for an available address, with a small enough range + assert_eq!( + pool.allocate(Some(GuestAddress(0x1b00)), 0x100, Some(0x100)), + Some(GuestAddress(0x1b00)) + ); + } + + #[test] + fn allocate_address_free_and_realloc() { + let mut pool = AddressAllocator::new(GuestAddress(0x1000), 0x1000).unwrap(); + + // First range is [0x1200:0x1a00] + assert_eq!( + pool.allocate(Some(GuestAddress(0x1200)), 0x800, Some(0x100)), + Some(GuestAddress(0x1200)) + ); + + pool.free(GuestAddress(0x1200), 0x800); + + assert_eq!( + pool.allocate(Some(GuestAddress(0x1200)), 0x800, Some(0x100)), + Some(GuestAddress(0x1200)) + ); + } + + #[test] + fn allocate_address_free_fail_and_realloc() { + let mut pool = AddressAllocator::new(GuestAddress(0x1000), 0x1000).unwrap(); + + // First range is [0x1200:0x1a00] + assert_eq!( + pool.allocate(Some(GuestAddress(0x1200)), 0x800, Some(0x100)), + Some(GuestAddress(0x1200)) + ); + + // We try to free a range smaller than the allocated one. + pool.free(GuestAddress(0x1200), 0x100); + + assert_eq!( + pool.allocate(Some(GuestAddress(0x1200)), 0x800, Some(0x100)), + None + ); + } + + #[test] + fn allocate_address_fail_free_and_realloc() { + let mut pool = AddressAllocator::new(GuestAddress(0x1000), 0x1000).unwrap(); + + // First allocation fails + assert_eq!( + pool.allocate(Some(GuestAddress(0x1200)), 0x2000, Some(0x100)), + None + ); + + // We try to free a range that was not allocated. + pool.free(GuestAddress(0x1200), 0x2000); + + // Now we try an allocation that should succeed. 
+ assert_eq!( + pool.allocate(Some(GuestAddress(0x1200)), 0x800, Some(0x100)), + Some(GuestAddress(0x1200)) + ); + } +} diff --git a/src/vm-system-allocator/src/arch/aarch64/layout.rs b/src/vm-system-allocator/src/arch/aarch64/layout.rs new file mode 100644 index 00000000000..922cfbb66e6 --- /dev/null +++ b/src/vm-system-allocator/src/arch/aarch64/layout.rs @@ -0,0 +1,84 @@ +// Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +// ==== Address map in use in ARM development systems today ==== +// +// - 32-bit - - 36-bit - - 40-bit - +// 1024GB + + +-------------------+ <- 40-bit +// | | DRAM | +// ~ ~ ~ ~ +// | | | +// | | | +// | | | +// | | | +// 544GB + + +-------------------+ +// | | Hole or DRAM | +// | | | +// 512GB + + +-------------------+ +// | | Mapped | +// | | I/O | +// ~ ~ ~ ~ +// | | | +// 256GB + + +-------------------+ +// | | Reserved | +// ~ ~ ~ ~ +// | | | +// 64GB + +-----------------------+-------------------+ <- 36-bit +// | | DRAM | +// ~ ~ ~ ~ +// | | | +// | | | +// 34GB + +-----------------------+-------------------+ +// | | Hole or DRAM | +// 32GB + +-----------------------+-------------------+ +// | | Mapped I/O | +// ~ ~ ~ ~ +// | | | +// 16GB + +-----------------------+-------------------+ +// | | Reserved | +// ~ ~ ~ ~ +// 4GB +-------------------+-----------------------+-------------------+ <- 32-bit +// | 2GB of DRAM | +// | | +// 2GB +-------------------+-----------------------+-------------------+ +// | Mapped I/O | +// 1GB +-------------------+-----------------------+-------------------+ +// | ROM & RAM & I/O | +// 0GB +-------------------+-----------------------+-------------------+ 0 +// - 32-bit - - 36-bit - - 40-bit - +// +// Taken from (http://infocenter.arm.com/help/topic/com.arm.doc.den0001c/DEN0001C_principles_of_arm_memory_maps.pdf). + +/// Start of RAM on 64 bit ARM. +pub const DRAM_MEM_START: u64 = 0x8000_0000; // 2 GB. +/// The maximum RAM size. 
+pub const DRAM_MEM_MAX_SIZE: usize = 0x00FF_8000_0000; // 1024 - 2 = 1022G. + +/// Start of RAM on 64 bit ARM. +pub const SYSTEM_MEM_START: u64 = DRAM_MEM_START; + +/// This is used by ACPI device manager for acpi tables or devices like vmgenid +/// In reality, 2MBs is an overkill, but immediately after this we write the kernel +/// image, which needs to be 2MB aligned. +pub const SYSTEM_MEM_SIZE: u64 = 0x20_0000; + +/// Kernel command line maximum size. +/// As per `arch/arm64/include/uapi/asm/setup.h`. +pub const CMDLINE_MAX_SIZE: usize = 2048; + +/// Maximum size of the device tree blob as specified in https://www.kernel.org/doc/Documentation/arm64/booting.txt. +pub const FDT_MAX_SIZE: usize = 0x20_0000; + +// As per virt/kvm/arm/vgic/vgic-kvm-device.c we need +// the number of interrupts our GIC will support to be: +// * bigger than 32 +// * less than 1023 and +// * a multiple of 32. +/// The highest usable SPI on aarch64. +pub const IRQ_MAX: u32 = 128; + +/// First usable interrupt on aarch64. +pub const IRQ_BASE: u32 = 32; + +/// Below this address will reside the GIC, above this address will reside the MMIO devices. +pub const MAPPED_IO_START: u64 = 1 << 30; // 1 GB diff --git a/src/vm-system-allocator/src/arch/aarch64/mod.rs b/src/vm-system-allocator/src/arch/aarch64/mod.rs new file mode 100644 index 00000000000..6bbb87c941b --- /dev/null +++ b/src/vm-system-allocator/src/arch/aarch64/mod.rs @@ -0,0 +1,11 @@ +// Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +/// Layout for this aarch64 system. +pub mod layout; +/// Logic for configuring aarch64 registers. + +/// The start of the memory area reserved for MMIO devices. +pub const MMIO_MEM_START: u64 = layout::MAPPED_IO_START; +/// The size of the memory area reserved for MMIO devices. 
+pub const MMIO_MEM_SIZE: u64 = layout::DRAM_MEM_START - layout::MAPPED_IO_START; //>> 1GB diff --git a/src/vm-system-allocator/src/arch/mod.rs b/src/vm-system-allocator/src/arch/mod.rs new file mode 100644 index 00000000000..d2faa14ae09 --- /dev/null +++ b/src/vm-system-allocator/src/arch/mod.rs @@ -0,0 +1,19 @@ +// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +/// Module for aarch64 related functionality. +#[cfg(target_arch = "aarch64")] +pub mod aarch64; + +#[cfg(target_arch = "aarch64")] +pub use aarch64::{ + layout::CMDLINE_MAX_SIZE, layout::IRQ_BASE, layout::IRQ_MAX, layout::SYSTEM_MEM_SIZE, + layout::SYSTEM_MEM_START, +}; + +/// Module for x86_64 related functionality. +#[cfg(target_arch = "x86_64")] +pub mod x86_64; + +#[cfg(target_arch = "x86_64")] +pub use crate::arch::x86_64::{layout::IRQ_BASE, layout::IRQ_MAX,}; diff --git a/src/vm-system-allocator/src/arch/x86_64/layout.rs b/src/vm-system-allocator/src/arch/x86_64/layout.rs new file mode 100644 index 00000000000..1241508efb6 --- /dev/null +++ b/src/vm-system-allocator/src/arch/x86_64/layout.rs @@ -0,0 +1,9 @@ +// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 +// + +// Typically, on x86 systems 24 IRQs are used (0-23). +/// First usable IRQ ID for virtio device interrupts on x86_64. +pub const IRQ_BASE: u32 = 5; +/// Last usable IRQ ID for virtio device interrupts on x86_64. +pub const IRQ_MAX: u32 = 23; diff --git a/src/vm-system-allocator/src/arch/x86_64/mod.rs b/src/vm-system-allocator/src/arch/x86_64/mod.rs new file mode 100644 index 00000000000..cc401574613 --- /dev/null +++ b/src/vm-system-allocator/src/arch/x86_64/mod.rs @@ -0,0 +1,9 @@ +// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 +// +// Portions Copyright 2017 The Chromium OS Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style license that can be +// found in the THIRD-PARTY file. + +/// Layout for the x86_64 system. +pub mod layout; diff --git a/src/vm-system-allocator/src/gsi.rs b/src/vm-system-allocator/src/gsi.rs new file mode 100644 index 00000000000..ed01bbfd4cb --- /dev/null +++ b/src/vm-system-allocator/src/gsi.rs @@ -0,0 +1,108 @@ +// Copyright © 2019 Intel Corporation +// +// SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause + +#[cfg(target_arch = "x86_64")] +use std::collections::btree_map::BTreeMap; +use std::result; + +#[derive(Debug)] +pub enum Error { + Overflow, +} + +pub type Result = result::Result; + +/// GsiApic +#[cfg(target_arch = "x86_64")] +#[derive(Copy, Clone)] +pub struct GsiApic { + base: u32, + irqs: u32, +} + +#[cfg(target_arch = "x86_64")] +impl GsiApic { + /// New GSI APIC + pub fn new(base: u32, irqs: u32) -> Self { + GsiApic { base, irqs } + } +} + +/// GsiAllocator +pub struct GsiAllocator { + #[cfg(target_arch = "x86_64")] + apics: BTreeMap, + next_irq: u32, + next_gsi: u32, +} + +impl GsiAllocator { + #[cfg(target_arch = "x86_64")] + /// New GSI allocator + pub fn new(apics: Vec) -> Self { + let mut allocator = GsiAllocator { + apics: BTreeMap::new(), + next_irq: 0xffff_ffff, + next_gsi: 0, + }; + + for apic in &apics { + if apic.base < allocator.next_irq { + allocator.next_irq = apic.base; + } + + if apic.base + apic.irqs > allocator.next_gsi { + allocator.next_gsi = apic.base + apic.irqs; + } + + allocator.apics.insert(apic.base, apic.irqs); + } + + allocator + } + + #[cfg(target_arch = "aarch64")] + #[allow(clippy::new_without_default)] + /// New GSI allocator + pub fn new() -> Self { + GsiAllocator { + next_irq: arch::IRQ_BASE, + next_gsi: arch::IRQ_BASE, + } + } + + /// Allocate a GSI + pub fn allocate_gsi(&mut self) -> Result { + let gsi = self.next_gsi; + self.next_gsi = self.next_gsi.checked_add(1).ok_or(Error::Overflow)?; + Ok(gsi) + } + + #[cfg(target_arch = "x86_64")] + /// Allocate 
an IRQ + pub fn allocate_irq(&mut self) -> Result { + let mut irq: u32 = 0; + for (base, irqs) in self.apics.iter() { + // HACKHACK - This only works with 1 single IOAPIC... + if self.next_irq >= *base && self.next_irq < *base + *irqs { + irq = self.next_irq; + self.next_irq += 1; + } + } + + if irq == 0 { + return Err(Error::Overflow); + } + + Ok(irq) + } + + #[cfg(target_arch = "aarch64")] + /// Allocate an IRQ + pub fn allocate_irq(&mut self) -> Result { + let irq = self.next_irq; + self.next_irq = self.next_irq.checked_add(1).ok_or(Error::Overflow)?; + Ok(irq) + } +} diff --git a/src/vm-system-allocator/src/lib.rs b/src/vm-system-allocator/src/lib.rs new file mode 100644 index 00000000000..c911a87792a --- /dev/null +++ b/src/vm-system-allocator/src/lib.rs @@ -0,0 +1,25 @@ +// Copyright 2018 The Chromium OS Authors. All rights reserved. +// Copyright © 2019 Intel Corporation +// +// Portions Copyright 2017 The Chromium OS Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE-BSD-3-Clause file. +// +// SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause +#![deny(missing_docs)] + +//! Manages system resources that can be allocated to VMs and their devices. + +extern crate libc; +extern crate vm_memory; + +mod address; +mod gsi; +mod system; +mod arch; + +pub use crate::address::AddressAllocator; +pub use crate::gsi::GsiAllocator; +#[cfg(target_arch = "x86_64")] +pub use crate::gsi::GsiApic; +pub use crate::system::SystemAllocator; diff --git a/src/vm-system-allocator/src/system.rs b/src/vm-system-allocator/src/system.rs new file mode 100644 index 00000000000..e73ee8b4844 --- /dev/null +++ b/src/vm-system-allocator/src/system.rs @@ -0,0 +1,162 @@ +// Copyright 2018 The Chromium OS Authors. All rights reserved. +// Copyright © 2019 Intel Corporation +// +// Portions Copyright 2017 The Chromium OS Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE-BSD-3-Clause file. +// +// SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause + +use vm_memory::{GuestAddress, GuestUsize}; + +use crate::address::AddressAllocator; +use crate::gsi::GsiAllocator; +#[cfg(target_arch = "x86_64")] +use crate::gsi::GsiApic; + +use libc::{sysconf, _SC_PAGESIZE}; + +/// Safe wrapper for `sysconf(_SC_PAGESIZE)`. +#[inline(always)] +fn pagesize() -> usize { + // Trivially safe + unsafe { sysconf(_SC_PAGESIZE) as usize } +} + +/// Manages allocating system resources such as address space and interrupt numbers. +/// +/// # Example - Use the `SystemAddress` builder. +/// +/// ``` +/// # #[cfg(target_arch = "x86_64")] +/// # use vm_allocator::{GsiApic, SystemAllocator}; +/// # #[cfg(target_arch = "aarch64")] +/// # use vm_allocator::SystemAllocator; +/// # use vm_memory::{Address, GuestAddress, GuestUsize}; +/// let mut allocator = SystemAllocator::new( +/// #[cfg(target_arch = "x86_64")] GuestAddress(0x1000), +/// #[cfg(target_arch = "x86_64")] 0x10000, +/// GuestAddress(0x10000000), 0x10000000, +/// GuestAddress(0x20000000), 0x100000, +/// #[cfg(target_arch = "x86_64")] vec![GsiApic::new(5, 19)]).unwrap(); +/// #[cfg(target_arch = "x86_64")] +/// assert_eq!(allocator.allocate_irq(), Some(5)); +/// #[cfg(target_arch = "aarch64")] +/// assert_eq!(allocator.allocate_irq(), Some(0)); +/// #[cfg(target_arch = "x86_64")] +/// assert_eq!(allocator.allocate_irq(), Some(6)); +/// #[cfg(target_arch = "aarch64")] +/// assert_eq!(allocator.allocate_irq(), Some(1)); +/// assert_eq!(allocator.allocate_mmio_addresses(None, 0x1000, Some(0x1000)), Some(GuestAddress(0x1fff_f000))); +/// +/// ``` +pub struct SystemAllocator { + #[cfg(target_arch = "x86_64")] + io_address_space: AddressAllocator, + mmio_address_space: AddressAllocator, + mmio_hole_address_space: AddressAllocator, + gsi_allocator: GsiAllocator, +} + +impl SystemAllocator { + /// Creates a new 
`SystemAllocator` for managing addresses and irq numvers. + /// Can return `None` if `base` + `size` overflows a u64 + /// + /// * `io_base` - (X86) The starting address of IO memory. + /// * `io_size` - (X86) The size of IO memory. + /// * `mmio_base` - The starting address of MMIO memory. + /// * `mmio_size` - The size of MMIO memory. + /// * `mmio_hole_base` - The starting address of MMIO memory in 32-bit address space. + /// * `mmio_hole_size` - The size of MMIO memory in 32-bit address space. + /// * `apics` - (X86) Vector of APIC's. + /// + pub fn new( + #[cfg(target_arch = "x86_64")] io_base: GuestAddress, + #[cfg(target_arch = "x86_64")] io_size: GuestUsize, + mmio_base: GuestAddress, + mmio_size: GuestUsize, + mmio_hole_base: GuestAddress, + mmio_hole_size: GuestUsize, + #[cfg(target_arch = "x86_64")] apics: Vec, + ) -> Option { + Some(SystemAllocator { + #[cfg(target_arch = "x86_64")] + io_address_space: AddressAllocator::new(io_base, io_size)?, + mmio_address_space: AddressAllocator::new(mmio_base, mmio_size)?, + mmio_hole_address_space: AddressAllocator::new(mmio_hole_base, mmio_hole_size)?, + #[cfg(target_arch = "x86_64")] + gsi_allocator: GsiAllocator::new(apics), + #[cfg(target_arch = "aarch64")] + gsi_allocator: GsiAllocator::new(), + }) + } + + /// Reserves the next available system irq number. + pub fn allocate_irq(&mut self) -> Option { + self.gsi_allocator.allocate_irq().ok() + } + + /// Reserves the next available GSI. + pub fn allocate_gsi(&mut self) -> Option { + self.gsi_allocator.allocate_gsi().ok() + } + + #[cfg(target_arch = "x86_64")] + /// Reserves a section of `size` bytes of IO address space. + pub fn allocate_io_addresses( + &mut self, + address: Option, + size: GuestUsize, + align_size: Option, + ) -> Option { + self.io_address_space + .allocate(address, size, Some(align_size.unwrap_or(0x1))) + } + + /// Reserves a section of `size` bytes of MMIO address space. 
+    pub fn allocate_mmio_addresses(
+        &mut self,
+        address: Option<GuestAddress>,
+        size: GuestUsize,
+        align_size: Option<GuestUsize>,
+    ) -> Option<GuestAddress> {
+        // Default alignment is the host page size so ranges can be mmap'ed.
+        self.mmio_address_space.allocate(
+            address,
+            size,
+            Some(align_size.unwrap_or(pagesize() as u64)),
+        )
+    }
+
+    /// Reserves a section of `size` bytes of MMIO address space from the 32-bit hole.
+    pub fn allocate_mmio_hole_addresses(
+        &mut self,
+        address: Option<GuestAddress>,
+        size: GuestUsize,
+        align_size: Option<GuestUsize>,
+    ) -> Option<GuestAddress> {
+        self.mmio_hole_address_space.allocate(
+            address,
+            size,
+            Some(align_size.unwrap_or(pagesize() as u64)),
+        )
+    }
+
+    #[cfg(target_arch = "x86_64")]
+    /// Free an IO address range.
+    /// We can only free a range if it matches exactly an already allocated range.
+    pub fn free_io_addresses(&mut self, address: GuestAddress, size: GuestUsize) {
+        self.io_address_space.free(address, size)
+    }
+
+    /// Free an MMIO address range.
+    /// We can only free a range if it matches exactly an already allocated range.
+    pub fn free_mmio_addresses(&mut self, address: GuestAddress, size: GuestUsize) {
+        self.mmio_address_space.free(address, size)
+    }
+
+    /// Free an MMIO address range from the 32 bits hole.
+    /// We can only free a range if it matches exactly an already allocated range.
+ pub fn free_mmio_hole_addresses(&mut self, address: GuestAddress, size: GuestUsize) { + self.mmio_hole_address_space.free(address, size) + } +} diff --git a/src/vmm/Cargo.toml b/src/vmm/Cargo.toml index 7be2200b8b2..aaf1dc8c139 100644 --- a/src/vmm/Cargo.toml +++ b/src/vmm/Cargo.toml @@ -16,6 +16,7 @@ aws-lc-rs = { version = "1.10.0", features = ["bindgen"] } base64 = "0.22.1" bincode = "1.2.1" bitflags = "2.6.0" +byteorder = "1.4.3" crc64 = "2.0.0" derive_more = { version = "1.0.0", default-features = false, features = ["from", "display"] } displaydoc = "0.2.5" @@ -32,6 +33,7 @@ log-instrument = { path = "../log-instrument", optional = true } memfd = "0.6.3" micro_http = { git = "https://github.com/firecracker-microvm/micro-http" } +pci = { path = "../pci"} seccompiler = { path = "../seccompiler" } semver = { version = "1.0.23", features = ["serde"] } serde = { version = "1.0.210", features = ["derive", "rc"] } @@ -41,8 +43,11 @@ thiserror = "1.0.64" timerfd = "1.5.0" userfaultfd = "0.8.1" utils = { path = "../utils" } +vfio-ioctls = { git = "https://github.com/rust-vmm/vfio", branch = "main" } vhost = { version = "0.12.0", features = ["vhost-user-frontend"] } vm-allocator = "0.1.0" +vm-system-allocator = { path = "../vm-system-allocator" } +vm-device = { path = "../vm-device"} vm-memory = { version = "0.15.0", features = ["backend-mmap", "backend-bitmap"] } vm-superio = "0.8.0" vmm-sys-util = { version = "0.12.1", features = ["with-serde"] } diff --git a/src/vmm/src/arch/mod.rs b/src/vmm/src/arch/mod.rs index f5a2f98cb7c..0f944a1c05c 100644 --- a/src/vmm/src/arch/mod.rs +++ b/src/vmm/src/arch/mod.rs @@ -1,6 +1,9 @@ // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. // SPDX-License-Identifier: Apache-2.0 +#![deny(missing_docs)] +//! Implements platform specific functionality. +//! Supported platforms: x86_64 and aarch64. 
use std::fmt; use serde::{Deserialize, Serialize}; @@ -25,7 +28,8 @@ pub use crate::arch::x86_64::{ arch_memory_regions, configure_system, get_kernel_start, initrd_load_addr, layout::APIC_ADDR, layout::CMDLINE_MAX_SIZE, layout::IOAPIC_ADDR, layout::IRQ_BASE, layout::IRQ_MAX, layout::SYSTEM_MEM_SIZE, layout::SYSTEM_MEM_START, ConfigurationError, MMIO_MEM_SIZE, - MMIO_MEM_START, + MMIO_MEM_START, layout::PCI_MMCONFIG_SIZE, + layout::PCI_MMCONFIG_START, layout::MEM_32BIT_DEVICES_START, layout::MEM_32BIT_DEVICES_SIZE }; /// Types of devices that can get attached to this platform. diff --git a/src/vmm/src/arch/x86_64/layout.rs b/src/vmm/src/arch/x86_64/layout.rs index 01355b3018a..74a61149237 100644 --- a/src/vmm/src/arch/x86_64/layout.rs +++ b/src/vmm/src/arch/x86_64/layout.rs @@ -66,3 +66,22 @@ pub const SYSTEM_MEM_START: u64 = 0x9fc00; /// 257KiB is more than we need, however we reserve this space for potential future use of /// ACPI features (new tables and/or devices). pub const SYSTEM_MEM_SIZE: u64 = RSDP_ADDR - SYSTEM_MEM_START; + +// ** 32-bit reserved area (start: 3GiB, length: 1GiB) ** +/// MEM_32BIT_RESERVED_START +pub const MEM_32BIT_RESERVED_START: u64 = 0xc000_0000; + +/// MEM_32BIT_RESERVED_SIZE +pub const MEM_32BIT_RESERVED_SIZE: u64 = 1024 << 20; + +// Sub range: 32-bit PCI devices (start: 3GiB, length: 640Mib) +/// MEM_32BIT_DEVICES_START +pub const MEM_32BIT_DEVICES_START: u64 = MEM_32BIT_RESERVED_START; +/// MEM_32BIT_DEVICES_SIZE +pub const MEM_32BIT_DEVICES_SIZE: u64 = 640 << 20; + +// PCI MMCONFIG space (start: after the device space, length: 256MiB) +/// PCI_MMCONFIG_START +pub const PCI_MMCONFIG_START: u64 = MEM_32BIT_DEVICES_START + MEM_32BIT_DEVICES_SIZE; +/// PCI_MMCONFIG_SIZE +pub const PCI_MMCONFIG_SIZE: u64 = 256 << 20; \ No newline at end of file diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index 760c22a27b5..a7bd7727ec8 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -5,13 +5,17 @@ #[cfg(target_arch = 
"x86_64")] use std::convert::TryFrom; -use std::fmt::Debug; -use std::io::{self, Seek, SeekFrom}; +use std::fmt::{Debug, Display, Formatter}; +use std::io::{self, Read, Seek, SeekFrom}; +use std::os::unix::io::{AsRawFd, FromRawFd, RawFd}; +use std::path::Path; #[cfg(feature = "gdb")] use std::sync::mpsc; use std::sync::{Arc, Mutex}; use event_manager::{MutEventSubscriber, SubscriberOps}; +use kvm_bindings::{kvm_create_device, kvm_device_type_KVM_DEV_TYPE_VFIO}; +use kvm_ioctls::{DeviceFd, VmFd}; use libc::EFD_NONBLOCK; use linux_loader::cmdline::Cmdline as LoaderKernelCmdline; #[cfg(target_arch = "x86_64")] @@ -19,9 +23,14 @@ use linux_loader::loader::elf::Elf as Loader; #[cfg(target_arch = "aarch64")] use linux_loader::loader::pe::PE as Loader; use linux_loader::loader::KernelLoader; +use pci::{PciDevice, VfioPciDevice}; use seccompiler::BpfThreadMap; use userfaultfd::Uffd; use utils::time::TimestampUs; +use vfio_ioctls::{VfioContainer, VfioDevice, VfioDeviceFd}; +use vm_allocator::AddressAllocator; +use vm_system_allocator::{GsiApic, SystemAllocator}; +use vm_device::interrupt::{InterruptManager, MsiIrqGroupConfig}; use vm_memory::ReadVolatile; #[cfg(target_arch = "aarch64")] use vm_superio::Rtc; @@ -53,6 +62,7 @@ use crate::devices::legacy::serial::SerialOut; #[cfg(target_arch = "aarch64")] use crate::devices::legacy::RTCDevice; use crate::devices::legacy::{EventFdTrigger, SerialEventsWrapper, SerialWrapper}; +use crate::devices::pci::{PciBus, PciConfigIo, PciConfigMmio, PciRoot}; use crate::devices::virtio::balloon::Balloon; use crate::devices::virtio::block::device::Block; use crate::devices::virtio::device::VirtioDevice; @@ -63,7 +73,8 @@ use crate::devices::virtio::vsock::{Vsock, VsockUnixBackend}; use crate::devices::BusDevice; #[cfg(feature = "gdb")] use crate::gdb; -use crate::logger::{debug, error}; +use crate::interrupt::MsiInterruptManager; +use crate::logger::{debug, info, error}; use crate::persist::{MicrovmState, MicrovmStateError}; use 
crate::resources::VmResources; use crate::snapshot::Persist; @@ -71,7 +82,7 @@ use crate::utils::u64_to_usize; use crate::vmm_config::boot_source::BootConfig; use crate::vmm_config::instance_info::InstanceInfo; use crate::vmm_config::machine_config::{VmConfig, VmConfigError}; -use crate::vstate::memory::{GuestAddress, GuestMemory, GuestMemoryMmap}; +use crate::vstate::memory::{GuestAddress, GuestMemory, GuestMemoryMmap, GuestMemoryRegion}; use crate::vstate::vcpu::{Vcpu, VcpuConfig, VcpuError}; use crate::vstate::vm::Vm; use crate::{device_manager, EventManager, Vmm, VmmError}; @@ -151,6 +162,138 @@ impl std::convert::From for StartMicrovmError { } } +fn create_passthrough_device(vm: &VmFd) -> DeviceFd { + let mut vfio_dev = kvm_create_device { + type_: kvm_device_type_KVM_DEV_TYPE_VFIO, + fd: 0, + flags: 0, + }; + + vm.create_device(&mut vfio_dev).unwrap() +} + +fn add_vfio_device( + vm: Arc>, + fd: DeviceFd, + pci: Arc>, + dev_manager: &mut MMIODeviceManager, + pio_manager: &mut PortIODeviceManager, + interrupt_manager: Arc>, + memory: GuestMemoryMmap, + allocator: Arc> +) { + // We need to shift the device id since the 3 first bits + // are dedicated to the PCI function, and we know we don't + // do multifunction. Also, because we only support one PCI + // bus, the bus 0, we don't need to add anything to the + // global device ID. + let pci_device_bdf = pci.lock().expect("bad lock").next_device_id().unwrap() << 3; + + // Safe because we know the RawFd is valid. + // + // This dup() is mandatory to be able to give full ownership of the + // file descriptor to the DeviceFd::from_raw_fd() function later in + // the code. + // + // This is particularly needed so that VfioContainer will still have + // a valid file descriptor even if DeviceManager, and therefore the + // passthrough_device are dropped. 
In case of Drop, the file descriptor + // would be closed, but Linux would still have the duplicated file + // descriptor opened from DeviceFd, preventing from unexpected behavior + // where the VfioContainer would try to use a closed file descriptor. + let dup_device_fd = unsafe { libc::dup(fd.as_raw_fd()) }; + + // SAFETY the raw fd conversion here is safe because: + // 1. This function is only called on KVM, see the feature guard above. + // 2. When running on KVM, passthrough_device wraps around DeviceFd. + // 3. The conversion here extracts the raw fd and then turns the raw fd into a DeviceFd + // of the same (correct) type. + let vfio_container = Arc::new( + VfioContainer::new(Some(Arc::new(VfioDeviceFd::new_from_kvm(unsafe { DeviceFd::from_raw_fd(dup_device_fd) })))).unwrap(), + ); + let vfio_device = VfioDevice::new( + // T4 GPU on g4dn.metal intance. + // Path::new("/sys/bus/pci/drivers/vfio-pci/0000:18:00.0"), + Path::new("/sys/bus/pci/devices/0000:18:00.0/"), + // Path::new("/sys/bus/pci/drivers/vfio-pci/0000:bf:00.1"), + Arc::clone(&vfio_container), + ) + .unwrap(); + + + let vfio_pci_device = + BusDevice::VfioPciDevice(VfioPciDevice::new(vm, vfio_device, vfio_container.clone(), &interrupt_manager, None, false).unwrap()); + + let vfio_pci_device = Arc::new(Mutex::new(vfio_pci_device)); + let bars = vfio_pci_device + .lock() + .expect("bad lock") + .vfio_pci_device_mut() + .unwrap() + .allocate_bars(&mut allocator.lock().expect("Poisoned lock")) + .unwrap(); + + // Register DMA mapping in IOMMU. 
+ for (index, region) in memory.iter().enumerate() { + info!( + "Mapping DMA for {:x} len {:x} at hva {:x}", + region.start_addr().0, + region.len() as u64, + // memory.get_host_address(region.start_addr()).unwrap() as u64 + region.as_ptr() as u64 + ); + vfio_pci_device.lock().expect("poisoned lock") + .vfio_pci_device_ref() + .unwrap() + .dma_map( + region.start_addr().0, + region.len() as u64, + // memory.get_host_address(region.start_addr()).unwrap() as u64, + region.as_ptr() as u64 + ); + // vfio_container.vfio_dma_map( + // region.start_addr().0, + // region.len() as u64, + // memory.get_host_address(region.start_addr()).unwrap() as u64, + // ) + } + + vfio_pci_device + .lock() + .expect("bad lock") + .vfio_pci_device_mut() + .unwrap() + .map_mmio_regions() + .unwrap(); + + pci.lock() + .expect("bad lock") + .add_device(pci_device_bdf, vfio_pci_device.clone()) + .unwrap(); + + pci.lock() + .expect("bad lock") + .register_mapping( + vfio_pci_device.clone(), + #[cfg(target_arch = "x86_64")] + &mut pio_manager.io_bus, + &mut dev_manager.bus, + bars.clone(), + ) + .unwrap(); + + // Need to register bus mappings ? +} + +// The MMIO address space size is subtracted with 64k. This is done for the +// following reasons: +// - Reduce the addressable space size by at least 4k to workaround a Linux +// bug when the VMM allocates devices at the end of the addressable space +// - Windows requires the addressable space size to be 64k aligned +fn mmio_address_space_size(phys_bits: u8) -> u64 { + (1 << phys_bits) - (1 << 16) +} + #[cfg_attr(target_arch = "aarch64", allow(unused))] fn create_vmm_and_vcpus( instance_info: &InstanceInfo, @@ -165,7 +308,7 @@ fn create_vmm_and_vcpus( // Set up Kvm Vm and register memory regions. // Build custom CPU config if a custom template is provided. 
- let mut vm = Vm::new(kvm_capabilities) + let (mut vm, extra_fd) = Vm::new(kvm_capabilities) .map_err(VmmError::Vm) .map_err(StartMicrovmError::Internal)?; vm.memory_init(&guest_memory, track_dirty_pages) @@ -176,18 +319,63 @@ fn create_vmm_and_vcpus( .map_err(VmmError::EventFd) .map_err(Internal)?; + // Create a system resources allocator. + const NUM_IOAPIC_PINS: usize = 24; + const X86_64_IRQ_BASE: u32 = 5; + + let allocator = Arc::new(Mutex::new( + SystemAllocator::new( + #[cfg(target_arch = "x86_64")] + { + GuestAddress(0) + }, + #[cfg(target_arch = "x86_64")] + { + 1 << 16 + }, + GuestAddress(0), + mmio_address_space_size(46), + GuestAddress(crate::arch::MEM_32BIT_DEVICES_START), + crate::arch::MEM_32BIT_DEVICES_SIZE, + #[cfg(target_arch = "x86_64")] + vec![GsiApic::new( + X86_64_IRQ_BASE, + NUM_IOAPIC_PINS as u32 - X86_64_IRQ_BASE, + )], + ) + .unwrap() + )); + + let vm_fd = Arc::new(Mutex::new(extra_fd)); + // First we create the MSI interrupt manager, the legacy one is created + // later, after the IOAPIC device creation. + // The reason we create the MSI one first is because the IOAPIC needs it, + // and then the legacy interrupt manager needs an IOAPIC. So we're + // handling a linear dependency chain: + // msi_interrupt_manager <- IOAPIC <- legacy_interrupt_manager. + let msi_interrupt_manager: Arc> = + Arc::new(MsiInterruptManager::new( + Arc::clone(&allocator), + Arc::clone(&vm_fd), + )); + let resource_allocator = ResourceAllocator::new()?; // Instantiate the MMIO device manager. - let mmio_device_manager = MMIODeviceManager::new(); + let mut mmio_device_manager = MMIODeviceManager::new(); // Instantiate ACPI device manager. 
let acpi_device_manager = ACPIDeviceManager::new(); + let pci_root = BusDevice::PciRoot(PciRoot::new(None)); + let pci_bus = PciBus::new(pci_root); + + let pci_bus = Arc::new(Mutex::new(pci_bus)); + // For x86_64 we need to create the interrupt controller before calling `KVM_CREATE_VCPUS` // while on aarch64 we need to do it the other way around. #[cfg(target_arch = "x86_64")] - let (vcpus, pio_device_manager) = { + let (mut vcpus, pio_device_manager) = { setup_interrupt_controller(&mut vm)?; let vcpus = create_vcpus(&vm, vcpu_count, &vcpus_exit_evt).map_err(Internal)?; @@ -204,10 +392,13 @@ fn create_vmm_and_vcpus( .map_err(VmmError::EventFd) .map_err(Internal)?; + let pci_config_io = Arc::new(Mutex::new(BusDevice::PioPciBus(PciConfigIo::new(Arc::clone(&pci_bus))))); + + // create pio dev manager with legacy devices let pio_device_manager = { // TODO Remove these unwraps. - let mut pio_dev_mgr = PortIODeviceManager::new(serial_device, reset_evt).unwrap(); + let mut pio_dev_mgr = PortIODeviceManager::new(serial_device, reset_evt, pci_config_io).unwrap(); pio_dev_mgr.register_devices(vm.fd()).unwrap(); pio_dev_mgr }; @@ -215,6 +406,26 @@ fn create_vmm_and_vcpus( (vcpus, pio_device_manager) }; + // Create passthru device for a GPU. 
+ // let device_fd = create_passthrough_device(vm.fd()); + + // add_vfio_device( + // Arc::clone(&vm_fd), + // device_fd, + // Arc::clone(&pci_bus), + // &mut mmio_device_manager, + // &mut pio_device_manager, + // Arc::clone(&msi_interrupt_manager), + // guest_memory.clone(), + // Arc::clone(&allocator) + // ); + + vcpus = create_vcpus(&vm, vcpu_count, &vcpus_exit_evt).map_err(Internal)?; + let pci_config_mmio = Arc::new(Mutex::new(BusDevice::MmioPciBus(PciConfigMmio::new(Arc::clone(&pci_bus))))); + mmio_device_manager + .register_pci_bus(pci_config_mmio) + .unwrap(); + // On aarch64, the vCPUs need to be created (i.e call KVM_CREATE_VCPU) before setting up the // IRQ chip because the `KVM_CREATE_VCPU` ioctl will return error if the IRQCHIP // was already initialized. @@ -1033,6 +1244,7 @@ pub mod tests { use super::*; use crate::arch::DeviceType; use crate::device_manager::resources::ResourceAllocator; + use crate::devices::bus::DummyDevice; use crate::devices::virtio::block::CacheType; use crate::devices::virtio::rng::device::ENTROPY_DEV_ID; use crate::devices::virtio::vsock::{TYPE_VSOCK, VSOCK_DEV_ID}; @@ -1109,10 +1321,11 @@ pub mod tests { .map_err(StartMicrovmError::Internal) .unwrap(); - let mut vm = Vm::new(vec![]).unwrap(); + let (mut vm, _) = Vm::new(vec![]).unwrap(); vm.memory_init(&guest_memory, false).unwrap(); let mmio_device_manager = MMIODeviceManager::new(); let acpi_device_manager = ACPIDeviceManager::new(); + let pci_bus = Arc::new(Mutex::new(BusDevice::Dummy(DummyDevice{}))); #[cfg(target_arch = "x86_64")] let pio_device_manager = PortIODeviceManager::new( Arc::new(Mutex::new(BusDevice::Serial(SerialWrapper { @@ -1126,6 +1339,7 @@ pub mod tests { input: None, }))), EventFd::new(libc::EFD_NONBLOCK).unwrap(), + pci_bus, ) .unwrap(); @@ -1361,7 +1575,7 @@ pub mod tests { let guest_memory = arch_mem(128 << 20); #[allow(unused_mut)] - let mut vm = Vm::new(vec![]).unwrap(); + let (mut vm, _) = Vm::new(vec![]).unwrap(); vm.memory_init(&guest_memory, 
false).unwrap(); let evfd = EventFd::new(libc::EFD_NONBLOCK).unwrap(); diff --git a/src/vmm/src/cpu_config/x86_64/cpuid/intel/normalize.rs b/src/vmm/src/cpu_config/x86_64/cpuid/intel/normalize.rs index 74536e44241..8b3b5efdbae 100644 --- a/src/vmm/src/cpu_config/x86_64/cpuid/intel/normalize.rs +++ b/src/vmm/src/cpu_config/x86_64/cpuid/intel/normalize.rs @@ -46,8 +46,8 @@ pub enum DeterministicCacheError { /// We always use this brand string. pub const DEFAULT_BRAND_STRING: &[u8; BRAND_STRING_LENGTH] = - b"Intel(R) Xeon(R) Processor\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"; -pub const DEFAULT_BRAND_STRING_BASE: &[u8; 28] = b"Intel(R) Xeon(R) Processor @"; + b"Intel(R) Xeon(R) Platinum 8259CL CPU\0\0\0\0\0\0\0\0\0\0\0\0"; +pub const DEFAULT_BRAND_STRING_BASE: &[u8; 38] = b"Intel(R) Xeon(R) Platinum 8259CL CPU @"; // We use this 2nd implementation so we can conveniently define functions only used within // `normalize`. diff --git a/src/vmm/src/device_manager/legacy.rs b/src/vmm/src/device_manager/legacy.rs index dc46a5172d3..7130eb44b1f 100644 --- a/src/vmm/src/device_manager/legacy.rs +++ b/src/vmm/src/device_manager/legacy.rs @@ -38,6 +38,7 @@ pub struct PortIODeviceManager { pub stdio_serial: Arc>, // BusDevice::I8042Device pub i8042: Arc>, + pub pci_bus: Arc>, // Communication event on ports 1 & 3. 
pub com_evt_1_3: EventFdTrigger, @@ -74,6 +75,7 @@ impl PortIODeviceManager { pub fn new( serial: Arc>, i8042_reset_evfd: EventFd, + pci_bus: Arc>, ) -> Result { debug_assert!(matches!(*serial.lock().unwrap(), BusDevice::Serial(_))); let io_bus = crate::devices::Bus::new(); @@ -96,6 +98,7 @@ impl PortIODeviceManager { io_bus, stdio_serial: serial, i8042, + pci_bus, com_evt_1_3, com_evt_2_4, kbd_evt, @@ -124,6 +127,11 @@ impl PortIODeviceManager { ), input: None, }))); + self.io_bus.insert( + self.pci_bus.clone(), + 0xcf8, + 0x8 + )?; self.io_bus.insert( self.stdio_serial.clone(), Self::SERIAL_PORT_ADDRESSES[0], @@ -238,13 +246,14 @@ impl PortIODeviceManager { #[cfg(test)] mod tests { use super::*; + use crate::devices::bus::DummyDevice; use crate::test_utils::single_region_mem; use crate::Vm; #[test] fn test_register_legacy_devices() { let guest_mem = single_region_mem(0x1000); - let mut vm = Vm::new(vec![]).unwrap(); + let (mut vm, _) = Vm::new(vec![]).unwrap(); vm.memory_init(&guest_mem, false).unwrap(); crate::builder::setup_interrupt_controller(&mut vm).unwrap(); let mut ldm = PortIODeviceManager::new( @@ -259,6 +268,7 @@ mod tests { input: None, }))), EventFd::new(libc::EFD_NONBLOCK).unwrap(), + Arc::new(Mutex::new(BusDevice::Dummy(DummyDevice{}))) ) .unwrap(); ldm.register_devices(vm.fd()).unwrap(); diff --git a/src/vmm/src/device_manager/mmio.rs b/src/vmm/src/device_manager/mmio.rs index cba9047d564..9a1f2cd505c 100644 --- a/src/vmm/src/device_manager/mmio.rs +++ b/src/vmm/src/device_manager/mmio.rs @@ -22,7 +22,7 @@ use vm_allocator::AllocPolicy; use super::resources::ResourceAllocator; #[cfg(target_arch = "aarch64")] use crate::arch::aarch64::DeviceInfoForFDT; -use crate::arch::DeviceType; +use crate::arch::{self, DeviceType}; use crate::arch::DeviceType::Virtio; #[cfg(target_arch = "aarch64")] use crate::devices::legacy::RTCDevice; @@ -35,7 +35,7 @@ use crate::devices::virtio::net::Net; use crate::devices::virtio::rng::Entropy; use 
crate::devices::virtio::vsock::{Vsock, VsockUnixBackend, TYPE_VSOCK}; use crate::devices::virtio::{TYPE_BALLOON, TYPE_BLOCK, TYPE_NET, TYPE_RNG}; -use crate::devices::BusDevice; +use crate::devices::{BusDevice, BusError}; #[cfg(target_arch = "x86_64")] use crate::vstate::memory::GuestAddress; @@ -113,6 +113,7 @@ fn add_virtio_aml(dsdt_data: &mut Vec, addr: u64, len: u64, irq: u32) { #[derive(Debug)] pub struct MMIODeviceManager { pub(crate) bus: crate::devices::Bus, + pci_bus: Option>>, pub(crate) id_to_dev_info: HashMap<(DeviceType, String), MMIODeviceInfo>, // We create the AML byte code for every VirtIO device in the order we build // it, so that we ensure the root block device is appears first in the DSDT. @@ -129,6 +130,7 @@ impl MMIODeviceManager { /// Create a new DeviceManager handling mmio devices (virtio net, block). pub fn new() -> MMIODeviceManager { MMIODeviceManager { + pci_bus: None, bus: crate::devices::Bus::new(), id_to_dev_info: HashMap::new(), #[cfg(target_arch = "x86_64")] @@ -155,6 +157,19 @@ impl MMIODeviceManager { Ok(device_info) } + /// Register the PCI bus. + pub fn register_pci_bus(&mut self, pci_bus: Arc>) -> Result<(), MmioError> { + self.bus + .insert( + Arc::clone(&pci_bus), + arch::PCI_MMCONFIG_START, + arch::PCI_MMCONFIG_SIZE, + ) + .map_err(MmioError::BusInsert)?; + self.pci_bus = Some(pci_bus); + Ok(()) + } + /// Register a device at some MMIO address. 
fn register_mmio_device( &mut self, @@ -364,7 +379,7 @@ impl MMIODeviceManager { .id_to_dev_info .get(&(device_type, device_id.to_string())) { - if let Some((_, device)) = self.bus.get_device(device_info.addr) { + if let Some((_, _, device)) = self.bus.get_device(device_info.addr) { return Some(device); } } @@ -653,7 +668,7 @@ mod tests { let start_addr1 = GuestAddress(0x0); let start_addr2 = GuestAddress(0x1000); let guest_mem = multi_region_mem(&[(start_addr1, 0x1000), (start_addr2, 0x1000)]); - let mut vm = Vm::new(vec![]).unwrap(); + let (mut vm, _) = Vm::new(vec![]).unwrap(); vm.memory_init(&guest_mem, false).unwrap(); let mut device_manager = MMIODeviceManager::new(); let mut resource_allocator = ResourceAllocator::new().unwrap(); @@ -682,7 +697,7 @@ mod tests { let start_addr1 = GuestAddress(0x0); let start_addr2 = GuestAddress(0x1000); let guest_mem = multi_region_mem(&[(start_addr1, 0x1000), (start_addr2, 0x1000)]); - let mut vm = Vm::new(vec![]).unwrap(); + let (mut vm, _) = Vm::new(vec![]).unwrap(); vm.memory_init(&guest_mem, false).unwrap(); let mut device_manager = MMIODeviceManager::new(); let mut resource_allocator = ResourceAllocator::new().unwrap(); @@ -736,7 +751,7 @@ mod tests { let start_addr1 = GuestAddress(0x0); let start_addr2 = GuestAddress(0x1000); let guest_mem = multi_region_mem(&[(start_addr1, 0x1000), (start_addr2, 0x1000)]); - let mut vm = Vm::new(vec![]).unwrap(); + let (mut vm, _) = Vm::new(vec![]).unwrap(); vm.memory_init(&guest_mem, false).unwrap(); let mem_clone = guest_mem.clone(); diff --git a/src/vmm/src/devices/bus.rs b/src/vmm/src/devices/bus.rs index 2b016d73083..4a62e6e7630 100644 --- a/src/vmm/src/devices/bus.rs +++ b/src/vmm/src/devices/bus.rs @@ -14,7 +14,7 @@ use std::sync::{Arc, Mutex}; /// Errors triggered during bus operations. #[derive(Debug, thiserror::Error, displaydoc::Display)] pub enum BusError { - /// New device overlaps with an old device. 
+ /// The insertion failed because the new device overlapped with an old device. Overlap, } @@ -51,10 +51,12 @@ pub struct Bus { } use event_manager::{EventOps, Events, MutEventSubscriber}; +use pci::{PciDevice, VfioPciDevice}; #[cfg(target_arch = "aarch64")] use super::legacy::RTCDevice; use super::legacy::{I8042Device, SerialDevice}; +use super::pci::{PciConfigIo, PciConfigMmio, PciRoot}; use super::pseudo::BootTimer; use super::virtio::mmio::MmioTransport; @@ -66,6 +68,10 @@ pub enum BusDevice { BootTimer(BootTimer), MmioTransport(MmioTransport), Serial(SerialDevice), + PciRoot(PciRoot), + PioPciBus(PciConfigIo), + MmioPciBus(PciConfigMmio), + VfioPciDevice(VfioPciDevice), #[cfg(test)] Dummy(DummyDevice), #[cfg(test)] @@ -165,8 +171,68 @@ impl BusDevice { _ => None, } } + pub fn vfio_pci_device_ref(&self) -> Option<&VfioPciDevice> { + match self { + Self::VfioPciDevice(x) => Some(x), + _ => None, + } + } + pub fn vfio_pci_device_mut(&mut self) -> Option<&mut VfioPciDevice> { + match self { + Self::VfioPciDevice(x) => Some(x), + _ => None, + } + } + pub fn pci_device_ref(&self) -> Option<&dyn PciDevice> { + match self { + Self::VfioPciDevice(x) => Some(x), + _ => None, + } + } + pub fn pci_device_mut(&mut self) -> Option<&mut dyn PciDevice> { + match self { + Self::VfioPciDevice(x) => Some(x), + _ => None, + } + } + pub fn pci_config_io_ref(&self) -> Option<&PciConfigIo> { + match self { + Self::PioPciBus(x) => Some(x), + _ => None, + } + } + pub fn pci_config_io_mut(&mut self) -> Option<&mut PciConfigIo> { + match self { + Self::PioPciBus(x) => Some(x), + _ => None, + } + } + pub fn pci_config_mmio_ref(&self) -> Option<&PciConfigMmio> { + match self { + Self::MmioPciBus(x) => Some(x), + _ => None, + } + } + pub fn pci_config_mmio_mut(&mut self) -> Option<&mut PciConfigMmio> { + match self { + Self::MmioPciBus(x) => Some(x), + _ => None, + } + } + pub fn pci_root_ref(&self) -> Option<&PciRoot> { + match self { + Self::PciRoot(x) => Some(x), + _ => None, + } + } + 
pub fn pci_root_mut(&mut self) -> Option<&mut PciRoot> { + match self { + Self::PciRoot(x) => Some(x), + _ => None, + } + } - pub fn read(&mut self, offset: u64, data: &mut [u8]) { + pub fn read(&mut self, base: u64, offset: u64, data: &mut [u8]) { match self { Self::I8042Device(x) => x.bus_read(offset, data), #[cfg(target_arch = "aarch64")] @@ -174,6 +240,10 @@ impl BusDevice { Self::BootTimer(x) => x.bus_read(offset, data), Self::MmioTransport(x) => x.bus_read(offset, data), Self::Serial(x) => x.bus_read(offset, data), + Self::VfioPciDevice(x) => x.bus_read(base, offset, data), + Self::MmioPciBus(x) => x.bus_read(base, offset, data), + Self::PioPciBus(x) => x.bus_read(base, offset, data), + Self::PciRoot(x) => (), #[cfg(test)] Self::Dummy(x) => x.bus_read(offset, data), #[cfg(test)] @@ -181,7 +251,7 @@ impl BusDevice { } } - pub fn write(&mut self, offset: u64, data: &[u8]) { + pub fn write(&mut self, base: u64, offset: u64, data: &[u8]) { match self { Self::I8042Device(x) => x.bus_write(offset, data), #[cfg(target_arch = "aarch64")] @@ -189,6 +259,10 @@ impl BusDevice { Self::BootTimer(x) => x.bus_write(offset, data), Self::MmioTransport(x) => x.bus_write(offset, data), Self::Serial(x) => x.bus_write(offset, data), + Self::VfioPciDevice(x) => x.bus_write(base, offset, data), + Self::MmioPciBus(x) => x.bus_write(base, offset, data), + Self::PioPciBus(x) => x.bus_write(base, offset, data), + Self::PciRoot(x) => (), #[cfg(test)] Self::Dummy(x) => x.bus_write(offset, data), #[cfg(test)] @@ -230,12 +304,11 @@ impl Bus { None } - /// Returns the device found at some address. 
- pub fn get_device(&self, addr: u64) -> Option<(u64, &Mutex)> { + pub fn get_device(&self, addr: u64) -> Option<(u64, u64, &Mutex)> { if let Some((BusRange(start, len), dev)) = self.first_before(addr) { let offset = addr - start; if offset < len { - return Some((offset, dev)); + return Some((start, offset, dev)); } } None @@ -280,11 +353,11 @@ impl Bus { /// /// Returns true on success, otherwise `data` is untouched. pub fn read(&self, addr: u64, data: &mut [u8]) -> bool { - if let Some((offset, dev)) = self.get_device(addr) { + if let Some((base, offset, dev)) = self.get_device(addr) { // OK to unwrap as lock() failing is a serious error condition and should panic. dev.lock() .expect("Failed to acquire device lock") - .read(offset, data); + .read(base, offset, data); true } else { false @@ -295,11 +368,11 @@ impl Bus { /// /// Returns true on success, otherwise `data` is untouched. pub fn write(&self, addr: u64, data: &[u8]) -> bool { - if let Some((offset, dev)) = self.get_device(addr) { + if let Some((base, offset, dev)) = self.get_device(addr) { // OK to unwrap as lock() failing is a serious error condition and should panic. 
dev.lock() .expect("Failed to acquire device lock") - .write(offset, data); + .write(base, offset, data); true } else { false diff --git a/src/vmm/src/devices/legacy/rtc_pl031.rs b/src/vmm/src/devices/legacy/rtc_pl031.rs index 15e20f81446..e0fc7aec375 100644 --- a/src/vmm/src/devices/legacy/rtc_pl031.rs +++ b/src/vmm/src/devices/legacy/rtc_pl031.rs @@ -77,7 +77,7 @@ impl std::ops::DerefMut for RTCDevice { // Implements Bus functions for AMBA PL031 RTC device impl RTCDevice { - pub fn bus_read(&mut self, offset: u64, data: &mut [u8]) { + pub fn bus_read(&mut self, _: u64, offset: u64, data: &mut [u8]) { if let (Ok(offset), 4) = (u16::try_from(offset), data.len()) { // read() function from RTC implementation expects a slice of // len 4, and we just validated that this is the data lengt @@ -92,7 +92,7 @@ impl RTCDevice { } } - pub fn bus_write(&mut self, offset: u64, data: &[u8]) { + pub fn bus_write(&mut self, _: u64, offset: u64, data: &[u8]) { if let (Ok(offset), 4) = (u16::try_from(offset), data.len()) { // write() function from RTC implementation expects a slice of // len 4, and we just validated that this is the data length diff --git a/src/vmm/src/devices/mod.rs b/src/vmm/src/devices/mod.rs index 0ca445b6f82..656644da60c 100644 --- a/src/vmm/src/devices/mod.rs +++ b/src/vmm/src/devices/mod.rs @@ -14,6 +14,8 @@ pub mod bus; pub mod legacy; pub mod pseudo; pub mod virtio; +/// PCI Devices +pub mod pci; pub use bus::{Bus, BusDevice, BusError}; use log::error; diff --git a/src/vmm/src/devices/pci.rs b/src/vmm/src/devices/pci.rs new file mode 100644 index 00000000000..5f4536a44a1 --- /dev/null +++ b/src/vmm/src/devices/pci.rs @@ -0,0 +1,493 @@ +// Copyright 2018 The Chromium OS Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE-BSD-3-Clause file. 
+ +use pci::configuration::{ + PciBarRegionType, PciBridgeSubclass, PciClassCode, PciConfiguration, PciHeaderType, +}; +use pci::device::PciDevice; +use byteorder::{ByteOrder, LittleEndian}; +use log::error; +use std::any::Any; +use std::collections::HashMap; +use std::fmt; +use std::fmt::{Debug, Formatter}; +use std::sync::{Arc, Barrier, Mutex}; +use vm_memory::{Address, GuestAddress, GuestUsize}; + +use super::{Bus, BusDevice}; + +const VENDOR_ID_INTEL: u16 = 0x8086; +const DEVICE_ID_INTEL_VIRT_PCIE_HOST: u16 = 0x0d57; +const NUM_DEVICE_IDS: usize = 32; + +/// Errors for device manager. +#[derive(Debug)] +pub enum PciRootError { + /// Could not allocate an IRQ number. + AllocateIrq, + /// Could not find an available device slot on the PCI bus. + NoPciDeviceSlotAvailable, + /// Invalid PCI device identifier provided. + InvalidPciDeviceSlot(usize), + /// Valid PCI device identifier but already used. + AlreadyInUsePciDeviceSlot(usize), +} +pub type Result = std::result::Result; + +/// Emulates the PCI Root bridge device. +pub struct PciRoot { + /// Configuration space. + config: PciConfiguration, +} + +impl Debug for PciRoot { + fn fmt(&self, f: &mut Formatter) -> fmt::Result { + f.debug_struct("PciRoot") + .finish() + } +} + +impl PciRoot { + /// Create an empty PCI root bridge. 
+ pub fn new(config: Option) -> Self { + if let Some(config) = config { + PciRoot { config } + } else { + PciRoot { + config: PciConfiguration::new( + VENDOR_ID_INTEL, + DEVICE_ID_INTEL_VIRT_PCIE_HOST, + 0, + PciClassCode::BridgeDevice, + &PciBridgeSubclass::HostBridge, + None, + PciHeaderType::Device, + 0, + 0, + None, + ), + } + } + } +} + +impl PciDevice for PciRoot { + fn write_config_register( + &mut self, + reg_idx: usize, + offset: u64, + data: &[u8], + ) -> Option> { + self.config.write_config_register(reg_idx, offset, data); + None + } + + fn read_config_register(&mut self, reg_idx: usize) -> u32 { + self.config.read_reg(reg_idx) + } + + fn as_any(&mut self) -> &mut dyn Any { + self + } +} + +pub struct PciBus { + /// Devices attached to this bus. + /// Device 0 is host bridge. + devices: HashMap>>, + device_ids: Vec, +} + +impl PciBus { + pub fn new(pci_root: BusDevice) -> Self { + let mut devices: HashMap>> = HashMap::new(); + let mut device_ids: Vec = vec![false; NUM_DEVICE_IDS]; + + devices.insert(0, Arc::new(Mutex::new(pci_root))); + device_ids[0] = true; + + PciBus { + devices, + device_ids, + } + } + + pub fn register_mapping( + &self, + dev: Arc>, + io_bus: &mut Bus, + mmio_bus: &mut Bus, + bars: Vec<(GuestAddress, GuestUsize, PciBarRegionType)>, + ) -> Result<()> { + for (address, size, type_) in bars { + match type_ { + PciBarRegionType::IoRegion => { + io_bus + .insert(dev.clone(), address.raw_value(), size) + .unwrap(); + error!("cannot register bus mappings {:x} {:x} IO", address.0, size); + } + PciBarRegionType::Memory32BitRegion | PciBarRegionType::Memory64BitRegion => { + error!("Registering bus mappings {:x} {:x}", address.0, size); + mmio_bus + .insert(dev.clone(), address.raw_value(), size) + .unwrap(); + } + } + } + Ok(()) + } + + pub fn add_device( + &mut self, + pci_device_bdf: u32, + device: Arc>, + ) -> Result<()> { + self.devices.insert(pci_device_bdf >> 3, device); + Ok(()) + } + + pub fn remove_by_device(&mut self, device: &Arc>) 
-> Result<()> { + self.devices.retain(|_, dev| !Arc::ptr_eq(dev, device)); + Ok(()) + } + + pub fn next_device_id(&mut self) -> Result { + for (idx, device_id) in self.device_ids.iter_mut().enumerate() { + if !(*device_id) { + *device_id = true; + return Ok(idx as u32); + } + } + + Err(PciRootError::NoPciDeviceSlotAvailable) + } + + pub fn get_device_id(&mut self, id: usize) -> Result<()> { + if id < NUM_DEVICE_IDS { + if !self.device_ids[id] { + self.device_ids[id] = true; + Ok(()) + } else { + Err(PciRootError::AlreadyInUsePciDeviceSlot(id)) + } + } else { + Err(PciRootError::InvalidPciDeviceSlot(id)) + } + } + + pub fn put_device_id(&mut self, id: usize) -> Result<()> { + if id < NUM_DEVICE_IDS { + self.device_ids[id] = false; + Ok(()) + } else { + Err(PciRootError::InvalidPciDeviceSlot(id)) + } + } +} + +pub struct PciConfigIo { + /// Config space register. + config_address: u32, + pci_bus: Arc>, +} + +impl Debug for PciConfigIo { + fn fmt(&self, f: &mut Formatter) -> fmt::Result { + f.debug_struct("PciConfigIo") + .finish() + } +} + +impl PciConfigIo { + pub fn new(pci_bus: Arc>) -> Self { + PciConfigIo { + pci_bus, + config_address: 0, + } + } + + pub fn config_space_read(&self) -> u32 { + let enabled = (self.config_address & 0x8000_0000) != 0; + if !enabled { + return 0xffff_ffff; + } + + let (bus, device, function, register) = + parse_io_config_address(self.config_address & !0x8000_0000); + + error!( + "config space read {}:{}:{} reg {}", + bus, device, function, register + ); + + // Only support one bus. + if bus != 0 { + return 0xffff_ffff; + } + + // Don't support multi-function devices. 
+ if function > 0 { + return 0xffff_ffff; + } + + self.pci_bus + .lock() + .unwrap() + .devices + .get(&(device as u32)) + .map_or(0xffff_ffff, |d| { + d.lock().unwrap().pci_device_mut().unwrap().read_config_register(register) + }) + } + + pub fn config_space_write(&mut self, offset: u64, data: &[u8]) -> Option> { + if offset as usize + data.len() > 4 { + return None; + } + + let enabled = (self.config_address & 0x8000_0000) != 0; + if !enabled { + return None; + } + + let (bus, device, _function, register) = + parse_io_config_address(self.config_address & !0x8000_0000); + + // Only support one bus. + if bus != 0 { + return None; + } + + let pci_bus = self.pci_bus.lock().unwrap(); + if let Some(d) = pci_bus.devices.get(&(device as u32)) { + let mut device = d.lock().unwrap(); + + if let Some(params) = device.pci_device_mut().unwrap().detect_bar_reprogramming(register, data) { + // if let Err(e) = pci_bus.device_reloc.move_bar( + // params.old_base, + // params.new_base, + // params.len, + // device.deref_mut(), + // params.region_type, + // ) { + // error!( + // "Failed moving device BAR: {}: 0x{:x}->0x{:x}(0x{:x})", + // e, params.old_base, params.new_base, params.len + // ); + // } + error!( + "Failed moving device BAR: 0x{:x}->0x{:x}(0x{:x})", + params.old_base, params.new_base, params.len + ); + } + // Update the register value + device.pci_device_mut().unwrap().write_config_register(register, offset, data) + } else { + None + } + } + + fn set_config_address(&mut self, offset: u64, data: &[u8]) { + if offset as usize + data.len() > 4 { + return; + } + let (mask, value): (u32, u32) = match data.len() { + 1 => ( + 0x0000_00ff << (offset * 8), + u32::from(data[0]) << (offset * 8), + ), + 2 => ( + 0x0000_ffff << (offset * 16), + (u32::from(data[1]) << 8 | u32::from(data[0])) << (offset * 16), + ), + 4 => (0xffff_ffff, LittleEndian::read_u32(data)), + _ => return, + }; + self.config_address = (self.config_address & !mask) | value; + } +} + +impl PciConfigIo { + pub 
fn bus_read(&mut self, _: u64, offset: u64, data: &mut [u8]) { + // `offset` is relative to 0xcf8 + let value = match offset { + 0..=3 => self.config_address, + 4..=7 => self.config_space_read(), + _ => 0xffff_ffff, + }; + + // Only allow reads to the register boundary. + let start = offset as usize % 4; + let end = start + data.len(); + if end <= 4 { + for i in start..end { + data[i - start] = (value >> (i * 8)) as u8; + } + } else { + for d in data { + *d = 0xff; + } + } + } + + pub fn bus_write(&mut self, _: u64, offset: u64, data: &[u8]) { + // `offset` is relative to 0xcf8 + match offset { + o @ 0..=3 => { + self.set_config_address(o, data); + } + o @ 4..=7 => { + self.config_space_write(o - 4, data); + } + _ => {} + } + } +} + +/// Emulates PCI memory-mapped configuration access mechanism. +pub struct PciConfigMmio { + pci_bus: Arc>, +} + +impl Debug for PciConfigMmio { + fn fmt(&self, f: &mut Formatter) -> fmt::Result { + f.debug_struct("PciConfigMmio") + .finish() + } +} + +impl PciConfigMmio { + pub fn new(pci_bus: Arc>) -> Self { + PciConfigMmio { pci_bus } + } + + fn config_space_read(&self, config_address: u32) -> u32 { + let (bus, device, _function, register) = parse_mmio_config_address(config_address); + + // Only support one bus. + if bus != 0 { + return 0xffff_ffff; + } + + self.pci_bus + .lock() + .unwrap() + .devices + .get(&(device as u32)) + .map_or(0xffff_ffff, |d| { + d.lock().unwrap().pci_device_mut().unwrap().read_config_register(register) + }) + } + + fn config_space_write(&mut self, config_address: u32, offset: u64, data: &[u8]) { + if offset as usize + data.len() > 4 { + return; + } + + let (bus, device, _function, register) = parse_mmio_config_address(config_address); + + // Only support one bus. 
+ if bus != 0 { + return; + } + + let pci_bus = self.pci_bus.lock().unwrap(); + if let Some(d) = pci_bus.devices.get(&(device as u32)) { + let mut device = d.lock().unwrap(); + + if let Some(params) = device.pci_device_mut().unwrap().detect_bar_reprogramming(register, data) { + // if let Err(e) = pci_bus.device_reloc.move_bar( + // params.old_base, + // params.new_base, + // params.len, + // device.deref_mut(), + // params.region_type, + // ) { + // error!( + // "Failed moving device BAR: {}: 0x{:x}->0x{:x}(0x{:x})", + // e, params.old_base, params.new_base, params.len + // ); + // } + error!( + "Failed moving device BAR: 0x{:x}->0x{:x}(0x{:x})", + params.old_base, params.new_base, params.len + ); + } + + // Update the register value + device.pci_device_mut().unwrap().write_config_register(register, offset, data); + } + } +} + +impl PciConfigMmio { + pub fn bus_read(&mut self, _: u64, offset: u64, data: &mut [u8]) { + // Only allow reads to the register boundary. + let start = offset as usize % 4; + let end = start + data.len(); + if end > 4 || offset > u64::from(u32::max_value()) { + for d in data { + *d = 0xff; + } + return; + } + + let value = self.config_space_read(offset as u32); + for i in start..end { + data[i - start] = (value >> (i * 8)) as u8; + } + } + + pub fn bus_write(&mut self, _: u64, offset: u64, data: &[u8]) { + if offset > u64::from(u32::max_value()) { + return; + } + self.config_space_write(offset as u32, offset % 4, data); + } +} + +fn shift_and_mask(value: u32, offset: usize, mask: u32) -> usize { + ((value >> offset) & mask) as usize +} + +// Parse the MMIO address offset to a (bus, device, function, register) tuple. +// See section 7.2.2 PCI Express Enhanced Configuration Access Mechanism (ECAM) +// from the Pci Express Base Specification Revision 5.0 Version 1.0. 
+fn parse_mmio_config_address(config_address: u32) -> (usize, usize, usize, usize) { + const BUS_NUMBER_OFFSET: usize = 20; + const BUS_NUMBER_MASK: u32 = 0x00ff; + const DEVICE_NUMBER_OFFSET: usize = 15; + const DEVICE_NUMBER_MASK: u32 = 0x1f; + const FUNCTION_NUMBER_OFFSET: usize = 12; + const FUNCTION_NUMBER_MASK: u32 = 0x07; + const REGISTER_NUMBER_OFFSET: usize = 2; + const REGISTER_NUMBER_MASK: u32 = 0x3ff; + + ( + shift_and_mask(config_address, BUS_NUMBER_OFFSET, BUS_NUMBER_MASK), + shift_and_mask(config_address, DEVICE_NUMBER_OFFSET, DEVICE_NUMBER_MASK), + shift_and_mask(config_address, FUNCTION_NUMBER_OFFSET, FUNCTION_NUMBER_MASK), + shift_and_mask(config_address, REGISTER_NUMBER_OFFSET, REGISTER_NUMBER_MASK), + ) +} + +// Parse the CONFIG_ADDRESS register to a (bus, device, function, register) tuple. +fn parse_io_config_address(config_address: u32) -> (usize, usize, usize, usize) { + const BUS_NUMBER_OFFSET: usize = 16; + const BUS_NUMBER_MASK: u32 = 0x00ff; + const DEVICE_NUMBER_OFFSET: usize = 11; + const DEVICE_NUMBER_MASK: u32 = 0x1f; + const FUNCTION_NUMBER_OFFSET: usize = 8; + const FUNCTION_NUMBER_MASK: u32 = 0x07; + const REGISTER_NUMBER_OFFSET: usize = 2; + const REGISTER_NUMBER_MASK: u32 = 0x3f; + + ( + shift_and_mask(config_address, BUS_NUMBER_OFFSET, BUS_NUMBER_MASK), + shift_and_mask(config_address, DEVICE_NUMBER_OFFSET, DEVICE_NUMBER_MASK), + shift_and_mask(config_address, FUNCTION_NUMBER_OFFSET, FUNCTION_NUMBER_MASK), + shift_and_mask(config_address, REGISTER_NUMBER_OFFSET, REGISTER_NUMBER_MASK), + ) +} diff --git a/src/vmm/src/interrupt.rs b/src/vmm/src/interrupt.rs new file mode 100644 index 00000000000..6b82a6cfca7 --- /dev/null +++ b/src/vmm/src/interrupt.rs @@ -0,0 +1,431 @@ +// Copyright © 2019 Intel Corporation +// +// SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause +// + +// use devices::interrupt_controller::InterruptController; +// use hypervisor::IrqRoutingEntry; +use std::collections::HashMap; +use std::io; +use 
std::sync::atomic::{AtomicBool, Ordering}; +use std::sync::{Arc, Mutex}; +use vm_device::interrupt::{ + InterruptIndex, InterruptManager, InterruptSourceConfig, InterruptSourceGroup, MsiIrqGroupConfig, +}; +use kvm_ioctls::{VmFd}; + +/// Reuse std::io::Result to simplify interoperability among crates. +pub type Result = std::io::Result; + +struct InterruptRoute { + gsi: u32, + irq_fd: EventFd, + registered: AtomicBool, +} + +impl InterruptRoute { + pub fn new(allocator: &mut SystemAllocator) -> Result { + let irq_fd = EventFd::new(libc::EFD_NONBLOCK)?; + let gsi = allocator + .allocate_gsi() + .ok_or_else(|| io::Error::new(io::ErrorKind::Other, "Failed allocating new GSI"))?; + + Ok(InterruptRoute { + gsi, + irq_fd, + registered: AtomicBool::new(false), + }) + } + + pub fn enable(&self, vm: &VmFd) -> Result<()> { + if !self.registered.load(Ordering::Acquire) { + vm.register_irqfd(&self.irq_fd, self.gsi).map_err(|e| { + io::Error::new( + io::ErrorKind::Other, + format!("Failed registering irq_fd: {}", e), + ) + })?; + + // Update internals to track the irq_fd as "registered". + self.registered.store(true, Ordering::Release); + } + + Ok(()) + } + + pub fn disable(&self, vm: &VmFd) -> Result<()> { + if self.registered.load(Ordering::Acquire) { + vm.unregister_irqfd(&self.irq_fd, self.gsi).map_err(|e| { + io::Error::new( + io::ErrorKind::Other, + format!("Failed unregistering irq_fd: {}", e), + ) + })?; + + // Update internals to track the irq_fd as "unregistered". 
+ self.registered.store(false, Ordering::Release); + } + + Ok(()) + } + + pub fn trigger(&self) -> Result<()> { + self.irq_fd.write(1) + } + + pub fn notifier(&self) -> Option { + Some( + self.irq_fd + .try_clone() + .expect("Failed cloning interrupt's EventFd"), + ) + } +} + +pub struct RoutingEntry { + route: IrqRoutingEntry, + masked: bool, +} + +pub struct MsiInterruptGroup { + vm: Arc>, + gsi_msi_routes: Arc>>>, + irq_routes: HashMap, +} + +use kvm_bindings::KVM_IRQCHIP_IOAPIC; +use vm_system_allocator::SystemAllocator; + +impl MsiInterruptGroup { + fn set_gsi_routes(&self, routes: &HashMap>) -> Result<()> { + let mut entry_vec: Vec = Vec::new(); + + for i in 0..24 { + let mut kvm_route = kvm_irq_routing_entry { + gsi: i, + type_: KVM_IRQ_ROUTING_IRQCHIP, + ..Default::default() + }; + + kvm_route.u.irqchip.irqchip = KVM_IRQCHIP_IOAPIC; + kvm_route.u.irqchip.pin = i; + + entry_vec.push(kvm_route); + } + + for (_, entry) in routes.iter() { + if entry.masked { + continue; + } + entry_vec.push(entry.route); + } + + + let mut irq_routing = + vec_with_array_field::(entry_vec.len()); + irq_routing[0].nr = entry_vec.len() as u32; + irq_routing[0].flags = 0; + + unsafe { + let entries_slice: &mut [kvm_irq_routing_entry] = + irq_routing[0].entries.as_mut_slice(entry_vec.len()); + entries_slice.copy_from_slice(&entry_vec); + } + + self.vm.lock().expect("Poisoned VmFd lock").set_gsi_routing(&irq_routing[0]).map_err(|e| { + io::Error::new( + io::ErrorKind::Other, + format!("Failed setting GSI routing: {}", e), + ) + }) + } +} + +impl MsiInterruptGroup { + fn new( + vm: Arc>, + gsi_msi_routes: Arc>>>, + irq_routes: HashMap, + ) -> Self { + MsiInterruptGroup { + vm, + gsi_msi_routes, + irq_routes, + } + } +} + +impl InterruptSourceGroup for MsiInterruptGroup { + fn enable(&self) -> Result<()> { + for (_, route) in self.irq_routes.iter() { + route.enable(&self.vm.lock().expect("Poisoned lock"))?; + } + + Ok(()) + } + + fn disable(&self) -> Result<()> { + for (_, route) in 
self.irq_routes.iter() { + route.disable(&self.vm.lock().expect("Poisoned lock"))?; + } + + Ok(()) + } + + fn trigger(&self, index: InterruptIndex) -> Result<()> { + if let Some(route) = self.irq_routes.get(&index) { + return route.trigger(); + } + + Err(io::Error::new( + io::ErrorKind::Other, + format!("trigger: Invalid interrupt index {}", index), + )) + } + + fn notifier(&self, index: InterruptIndex) -> Option { + if let Some(route) = self.irq_routes.get(&index) { + return route.notifier(); + } + + None + } + + fn update(&self, index: InterruptIndex, config: InterruptSourceConfig) -> Result<()> { + if let Some(route) = self.irq_routes.get(&index) { + let entry = RoutingEntry::<_>::make_entry(route.gsi, &config)?; + let mut routes = self.gsi_msi_routes.lock().unwrap(); + routes.insert(route.gsi, *entry); + return self.set_gsi_routes(&routes); + } + + Err(io::Error::new( + io::ErrorKind::Other, + format!("update: Invalid interrupt index {}", index), + )) + } + + fn mask(&self, index: InterruptIndex) -> Result<()> { + if let Some(route) = self.irq_routes.get(&index) { + let mut routes = self.gsi_msi_routes.lock().unwrap(); + if let Some(entry) = routes.get_mut(&route.gsi) { + entry.masked = true; + } else { + return Err(io::Error::new( + io::ErrorKind::Other, + format!("mask: No existing route for interrupt index {}", index), + )); + } + self.set_gsi_routes(&routes)?; + return route.disable(&self.vm.lock().expect("Poisoned lock")); + } + + Err(io::Error::new( + io::ErrorKind::Other, + format!("mask: Invalid interrupt index {}", index), + )) + } + + fn unmask(&self, index: InterruptIndex) -> Result<()> { + if let Some(route) = self.irq_routes.get(&index) { + let mut routes = self.gsi_msi_routes.lock().unwrap(); + if let Some(entry) = routes.get_mut(&route.gsi) { + entry.masked = false; + } else { + return Err(io::Error::new( + io::ErrorKind::Other, + format!("mask: No existing route for interrupt index {}", index), + )); + } + self.set_gsi_routes(&routes)?; + return 
route.enable(&&self.vm.lock().expect("Poisoned lock")); + } + + Err(io::Error::new( + io::ErrorKind::Other, + format!("unmask: Invalid interrupt index {}", index), + )) + } +} + +pub struct MsiInterruptManager { + allocator: Arc>, + vm: Arc>, + gsi_msi_routes: Arc>>>, +} + +impl MsiInterruptManager { + pub fn new(allocator: Arc>, vm: Arc>) -> Self { + // Create a shared list of GSI that can be shared through all PCI + // devices. This way, we can maintain the full list of used GSI, + // preventing one device from overriding interrupts setting from + // another one. + let gsi_msi_routes = Arc::new(Mutex::new(HashMap::new())); + + MsiInterruptManager { + allocator, + vm, + gsi_msi_routes, + } + } +} + +impl InterruptManager for MsiInterruptManager { + type GroupConfig = MsiIrqGroupConfig; + + fn create_group( + &self, + config: Self::GroupConfig, + ) -> Result>> { + let mut allocator = self.allocator.lock().unwrap(); + let mut irq_routes: HashMap = + HashMap::with_capacity(config.count as usize); + for i in config.base..config.base + config.count { + irq_routes.insert(i, InterruptRoute::new(&mut allocator)?); + } + + Ok(Arc::new(Box::new(MsiInterruptGroup::new( + self.vm.clone(), + self.gsi_msi_routes.clone(), + irq_routes, + )))) + } + + fn destroy_group(&self, _group: Arc>) -> Result<()> { + Ok(()) + } +} + +use super::*; +use kvm_bindings::KVM_MSI_VALID_DEVID; +use kvm_bindings::{kvm_irq_routing_entry, KVM_IRQ_ROUTING_IRQCHIP, KVM_IRQ_ROUTING_MSI}; + +type KvmRoutingEntry = RoutingEntry; +pub type KvmMsiInterruptManager = MsiInterruptManager; + +impl KvmRoutingEntry { + pub fn make_entry( + gsi: u32, + config: &InterruptSourceConfig, + ) -> Result> { + if let InterruptSourceConfig::MsiIrq(cfg) = &config { + let mut kvm_route = kvm_irq_routing_entry { + gsi, + type_: KVM_IRQ_ROUTING_MSI, + ..Default::default() + }; + + kvm_route.u.msi.address_lo = cfg.low_addr; + kvm_route.u.msi.address_hi = cfg.high_addr; + kvm_route.u.msi.data = cfg.data; + + kvm_route.flags = 
KVM_MSI_VALID_DEVID; + kvm_route.u.msi.__bindgen_anon_1.devid = cfg.devid; + + let kvm_entry = KvmRoutingEntry { + route: kvm_route, + masked: false, + }; + + return Ok(Box::new(kvm_entry)); + } else if let InterruptSourceConfig::LegacyIrq(cfg) = &config { + let mut kvm_route = kvm_irq_routing_entry { + gsi, + type_: KVM_IRQ_ROUTING_IRQCHIP, + ..Default::default() + }; + kvm_route.u.irqchip.irqchip = cfg.irqchip; + kvm_route.u.irqchip.pin = cfg.pin; + let kvm_entry = KvmRoutingEntry { + route: kvm_route, + masked: false, + }; + + return Ok(Box::new(kvm_entry)); + } + + Err(io::Error::new( + io::ErrorKind::Other, + "Interrupt config type not supported", + )) + } +} + +#[cfg(target_arch = "aarch64")] +#[cfg(test)] +mod tests { + use arch::aarch64::gic::kvm::{create_gic, save_pending_tables}; + use arch::aarch64::gic::{ + get_dist_regs, get_icc_regs, get_redist_regs, set_dist_regs, set_icc_regs, set_redist_regs, + }; + + #[test] + fn test_create_gic() { + let hv = hypervisor::new().unwrap(); + let vm = hv.create_vm().unwrap(); + + assert!(create_gic(&vm, 1).is_ok()); + } + + #[test] + fn test_get_set_dist_regs() { + let hv = hypervisor::new().unwrap(); + let vm = hv.create_vm().unwrap(); + let _ = vm.create_vcpu(0, None).unwrap(); + let gic = create_gic(&vm, 1).expect("Cannot create gic"); + + let res = get_dist_regs(gic.device()); + assert!(res.is_ok()); + let state = res.unwrap(); + assert_eq!(state.len(), 649); + + let res = set_dist_regs(gic.device(), &state); + assert!(res.is_ok()); + } + + #[test] + fn test_get_set_redist_regs() { + let hv = hypervisor::new().unwrap(); + let vm = hv.create_vm().unwrap(); + let _ = vm.create_vcpu(0, None).unwrap(); + let gic = create_gic(&vm, 1).expect("Cannot create gic"); + + let mut gicr_typer = Vec::new(); + gicr_typer.push(123); + let res = get_redist_regs(gic.device(), &gicr_typer); + assert!(res.is_ok()); + let state = res.unwrap(); + println!("{}", state.len()); + assert!(state.len() == 24); + + 
assert!(set_redist_regs(gic.device(), &gicr_typer, &state).is_ok()); + } + + #[test] + fn test_get_set_icc_regs() { + let hv = hypervisor::new().unwrap(); + let vm = hv.create_vm().unwrap(); + let _ = vm.create_vcpu(0, None).unwrap(); + let gic = create_gic(&vm, 1).expect("Cannot create gic"); + + let mut gicr_typer = Vec::new(); + gicr_typer.push(123); + let res = get_icc_regs(gic.device(), &gicr_typer); + assert!(res.is_ok()); + let state = res.unwrap(); + println!("{}", state.len()); + assert!(state.len() == 9); + + assert!(set_icc_regs(gic.device(), &gicr_typer, &state).is_ok()); + } + + #[test] + fn test_save_pending_tables() { + let hv = hypervisor::new().unwrap(); + let vm = hv.create_vm().unwrap(); + let _ = vm.create_vcpu(0, None).unwrap(); + let gic = create_gic(&vm, 1).expect("Cannot create gic"); + + assert!(save_pending_tables(gic.device()).is_ok()); + } +} diff --git a/src/vmm/src/lib.rs b/src/vmm/src/lib.rs index c80f004e789..7825de4459a 100644 --- a/src/vmm/src/lib.rs +++ b/src/vmm/src/lib.rs @@ -110,6 +110,7 @@ pub mod utils; pub mod vmm_config; /// Module with virtual state structs. pub mod vstate; +pub mod interrupt; use std::collections::HashMap; use std::io; @@ -153,6 +154,7 @@ use crate::vstate::memory::{ use crate::vstate::vcpu::VcpuState; pub use crate::vstate::vcpu::{Vcpu, VcpuConfig, VcpuEvent, VcpuHandle, VcpuResponse}; pub use crate::vstate::vm::Vm; +use kvm_bindings::{kvm_irq_routing, kvm_irq_routing_entry as IrqRoutingEntry}; /// Shorthand type for the EventManager flavour used by Firecracker. pub type EventManager = BaseEventManager>>; @@ -857,6 +859,38 @@ impl Vmm { } } +// Returns a `Vec` with a size in bytes at least as large as `size_in_bytes`. 
+fn vec_with_size_in_bytes(size_in_bytes: usize) -> Vec { + let rounded_size = (size_in_bytes + size_of::() - 1) / size_of::(); + let mut v = Vec::with_capacity(rounded_size); + v.resize_with(rounded_size, T::default); + v +} + +use std::mem::size_of; +// The kvm API has many structs that resemble the following `Foo` structure: +// +// ``` +// #[repr(C)] +// struct Foo { +// some_data: u32 +// entries: __IncompleteArrayField<__u32>, +// } +// ``` +// +// In order to allocate such a structure, `size_of::()` would be too small because it would not +// include any space for `entries`. To make the allocation large enough while still being aligned +// for `Foo`, a `Vec` is created. Only the first element of `Vec` would actually be used +// as a `Foo`. The remaining memory in the `Vec` is for `entries`, which must be contiguous +// with `Foo`. This function is used to make the `Vec` with enough space for `count` entries. +/// Helper function to create Vec of specific size. +pub fn vec_with_array_field(count: usize) -> Vec { + let element_space = count * size_of::(); + let vec_size_bytes = size_of::() + element_space; + vec_with_size_in_bytes(vec_size_bytes) +} + + /// Process the content of the MPIDR_EL1 register in order to be able to pass it to KVM /// /// The kernel expects to find the four affinity levels of the MPIDR in the first 32 bits of the diff --git a/src/vmm/src/vstate/vcpu/mod.rs b/src/vmm/src/vstate/vcpu/mod.rs index 43a0946931e..b3673b2d5b2 100644 --- a/src/vmm/src/vstate/vcpu/mod.rs +++ b/src/vmm/src/vstate/vcpu/mod.rs @@ -521,7 +521,9 @@ fn handle_kvm_exit( VcpuExit::MmioRead(addr, data) => { if let Some(mmio_bus) = &peripherals.mmio_bus { let _metric = METRICS.vcpu.exit_mmio_read_agg.record_latency_metrics(); - mmio_bus.read(addr, data); + if !(mmio_bus.read(addr, data)) { + error!("Unhandled mmio read at {:x}", addr); + } METRICS.vcpu.exit_mmio_read.inc(); } Ok(VcpuEmulation::Handled) @@ -529,7 +531,9 @@ fn handle_kvm_exit( VcpuExit::MmioWrite(addr, 
data) => { if let Some(mmio_bus) = &peripherals.mmio_bus { let _metric = METRICS.vcpu.exit_mmio_write_agg.record_latency_metrics(); - mmio_bus.write(addr, data); + if !mmio_bus.write(addr, data) { + error!("Unhandled mmio write at {:x}", addr); + } METRICS.vcpu.exit_mmio_write.inc(); } Ok(VcpuEmulation::Handled) diff --git a/src/vmm/src/vstate/vcpu/x86_64.rs b/src/vmm/src/vstate/vcpu/x86_64.rs index 6fee3933435..ec1be819e1b 100644 --- a/src/vmm/src/vstate/vcpu/x86_64.rs +++ b/src/vmm/src/vstate/vcpu/x86_64.rs @@ -634,14 +634,15 @@ impl Peripherals { VcpuExit::IoIn(addr, data) => { if let Some(pio_bus) = &self.pio_bus { let _metric = METRICS.vcpu.exit_io_in_agg.record_latency_metrics(); - pio_bus.read(u64::from(addr), data); + if !pio_bus.read(u64::from(addr), data) { + error!("Unhandled PIO read {:x}", addr); + } METRICS.vcpu.exit_io_in.inc(); } Ok(VcpuEmulation::Handled) } VcpuExit::IoOut(addr, data) => { if let Some(pio_bus) = &self.pio_bus { - let _metric = METRICS.vcpu.exit_io_out_agg.record_latency_metrics(); pio_bus.write(u64::from(addr), data); METRICS.vcpu.exit_io_out.inc(); } @@ -1171,7 +1172,7 @@ mod tests { // Regression test for #4666 let kvm = Kvm::new().unwrap(); - let vm = Vm::new(Vec::new()).unwrap(); + let (vm, _) = Vm::new(Vec::new()).unwrap(); let vcpu = KvmVcpu::new(0, &vm).unwrap(); // The list of supported MSR indices, in the order they were returned by KVM diff --git a/src/vmm/src/vstate/vm.rs b/src/vmm/src/vstate/vm.rs index 0f72abcf68f..98e9dac7c2d 100644 --- a/src/vmm/src/vstate/vm.rs +++ b/src/vmm/src/vstate/vm.rs @@ -17,6 +17,7 @@ use kvm_bindings::{ use kvm_bindings::{kvm_userspace_memory_region, KVM_API_VERSION, KVM_MEM_LOG_DIRTY_PAGES}; use kvm_ioctls::{Kvm, VmFd}; use serde::{Deserialize, Serialize}; +use std::os::unix::io::AsRawFd; #[cfg(target_arch = "aarch64")] use crate::arch::aarch64::gic::GICDevice; @@ -132,7 +133,7 @@ pub struct Vm { /// Contains Vm functions that are usable across CPU architectures impl Vm { /// Constructs a 
new `Vm` using the given `Kvm` instance. - pub fn new(kvm_cap_modifiers: Vec) -> Result { + pub fn new(kvm_cap_modifiers: Vec) -> Result<(Self, VmFd), VmError> { let kvm = Kvm::new().map_err(VmError::Kvm)?; // Check that KVM has the correct version. @@ -150,16 +151,10 @@ impl Vm { // Create fd for interacting with kvm-vm specific functions. let vm_fd = kvm.create_vm().map_err(VmError::VmFd)?; - #[cfg(target_arch = "aarch64")] - { - Ok(Vm { - fd: vm_fd, - max_memslots, - kvm_cap_modifiers, - irqchip_handle: None, - }) - } - + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + let supported_cpuid = kvm + .get_supported_cpuid(KVM_MAX_CPUID_ENTRIES) + .map_err(VmError::VmFd)?; #[cfg(target_arch = "x86_64")] { let supported_cpuid = kvm @@ -167,13 +162,19 @@ impl Vm { .map_err(VmError::VmFd)?; let msrs_to_save = crate::arch::x86_64::msr::get_msrs_to_save(&kvm)?; - Ok(Vm { - fd: vm_fd, - max_memslots, - kvm_cap_modifiers, - supported_cpuid, - msrs_to_save, - }) + let rawfd = unsafe { libc::dup(vm_fd.as_raw_fd()) }; + let extra_fd = unsafe { kvm.create_vmfd_from_rawfd(rawfd).unwrap() }; + + Ok(( + Vm { + fd: vm_fd, + max_memslots, + kvm_cap_modifiers, + supported_cpuid, + msrs_to_save, + }, + extra_fd + )) } } @@ -468,6 +469,7 @@ impl fmt::Debug for VmState { #[cfg(test)] pub(crate) mod tests { use super::*; + use crate::cpu_config::templates::KvmCapability; #[cfg(target_arch = "x86_64")] use crate::snapshot::Snapshot; use crate::test_utils::single_region_mem; @@ -477,7 +479,7 @@ pub(crate) mod tests { pub(crate) fn setup_vm(mem_size: usize) -> (Vm, GuestMemoryMmap) { let gm = single_region_mem(mem_size); - let vm = Vm::new(vec![]).expect("Cannot create new vm"); + let (vm, _) = Vm::new(vec![]).expect("Cannot create new vm"); vm.memory_init(&gm, false).unwrap(); (vm, gm) @@ -509,7 +511,7 @@ pub(crate) mod tests { #[test] fn test_vm_memory_init() { - let vm = Vm::new(vec![]).expect("Cannot create new vm"); + let (vm, _) = Vm::new(vec![]).expect("Cannot create new 
vm"); // Create valid memory region and test that the initialization is successful. let gm = single_region_mem(0x1000); @@ -519,7 +521,7 @@ pub(crate) mod tests { #[cfg(target_arch = "x86_64")] #[test] fn test_vm_save_restore_state() { - let vm = Vm::new(vec![]).expect("new vm failed"); + let (vm, _) = Vm::new(vec![]).expect("new vm failed"); // Irqchips, clock and pitstate are not configured so trying to save state should fail. vm.save_state().unwrap_err(); @@ -587,7 +589,7 @@ pub(crate) mod tests { #[test] fn test_set_kvm_memory_regions() { - let vm = Vm::new(vec![]).expect("Cannot create new vm"); + let (vm, _) = Vm::new(vec![]).expect("Cannot create new vm"); let gm = single_region_mem(0x1000); let res = vm.set_kvm_memory_regions(&gm, false); From 57d71f566791acfe2cc8a5524bac79c949213a4b Mon Sep 17 00:00:00 2001 From: Riccardo Mancini Date: Tue, 15 Oct 2024 14:31:30 +0100 Subject: [PATCH 02/22] enable vfio --- src/vmm/src/builder.rs | 28 ++++++++++++++-------------- src/vmm/src/devices/bus.rs | 2 ++ 2 files changed, 16 insertions(+), 14 deletions(-) diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index a7bd7727ec8..e37c6de0ddf 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -375,7 +375,7 @@ fn create_vmm_and_vcpus( // For x86_64 we need to create the interrupt controller before calling `KVM_CREATE_VCPUS` // while on aarch64 we need to do it the other way around. #[cfg(target_arch = "x86_64")] - let (mut vcpus, pio_device_manager) = { + let (vcpus, mut pio_device_manager) = { setup_interrupt_controller(&mut vm)?; let vcpus = create_vcpus(&vm, vcpu_count, &vcpus_exit_evt).map_err(Internal)?; @@ -407,20 +407,20 @@ fn create_vmm_and_vcpus( }; // Create passthru device for a GPU. 
- // let device_fd = create_passthrough_device(vm.fd()); - - // add_vfio_device( - // Arc::clone(&vm_fd), - // device_fd, - // Arc::clone(&pci_bus), - // &mut mmio_device_manager, - // &mut pio_device_manager, - // Arc::clone(&msi_interrupt_manager), - // guest_memory.clone(), - // Arc::clone(&allocator) - // ); + let device_fd = create_passthrough_device(vm.fd()); + + add_vfio_device( + Arc::clone(&vm_fd), + device_fd, + Arc::clone(&pci_bus), + &mut mmio_device_manager, + &mut pio_device_manager, + Arc::clone(&msi_interrupt_manager), + guest_memory.clone(), + Arc::clone(&allocator) + ); + - vcpus = create_vcpus(&vm, vcpu_count, &vcpus_exit_evt).map_err(Internal)?; let pci_config_mmio = Arc::new(Mutex::new(BusDevice::MmioPciBus(PciConfigMmio::new(Arc::clone(&pci_bus))))); mmio_device_manager .register_pci_bus(pci_config_mmio) diff --git a/src/vmm/src/devices/bus.rs b/src/vmm/src/devices/bus.rs index 4a62e6e7630..25e1d239694 100644 --- a/src/vmm/src/devices/bus.rs +++ b/src/vmm/src/devices/bus.rs @@ -186,12 +186,14 @@ impl BusDevice { pub fn pci_device_ref(&self) -> Option<&dyn PciDevice> { match self { Self::VfioPciDevice(x) => Some(x), + Self::PciRoot(x) => Some(x), _ => None, } } pub fn pci_device_mut(&mut self) -> Option<&mut dyn PciDevice> { match self { Self::VfioPciDevice(x) => Some(x), + Self::PciRoot(x) => Some(x), _ => None, } } From 1fd997112a8ea0fba25b978dd29a9b6032131f6f Mon Sep 17 00:00:00 2001 From: Riccardo Mancini Date: Tue, 15 Oct 2024 17:39:17 +0100 Subject: [PATCH 03/22] try to rebase onto latest cloud-hypervisor crates --- Cargo.lock | 50 +- src/pci/Cargo.toml | 22 +- src/pci/src/bus.rs | 161 +- src/pci/src/configuration.rs | 611 +++-- src/pci/src/device.rs | 69 +- src/pci/src/lib.rs | 160 +- src/pci/src/msi.rs | 100 +- src/pci/src/msix.rs | 209 +- src/pci/src/vfio.rs | 2173 +++++++++++------ src/vm-device/Cargo.toml | 18 +- src/vm-device/src/bus.rs | 135 +- src/vm-device/src/dma_mapping/mod.rs | 4 +- src/vm-device/src/dma_mapping/vfio.rs | 73 - 
src/vm-device/src/interrupt/mod.rs | 34 +- src/vm-device/src/lib.rs | 30 +- src/vm-system-allocator/Cargo.toml | 6 +- src/vm-system-allocator/src/address.rs | 16 +- .../src/arch/aarch64/layout.rs | 84 - .../src/arch/aarch64/mod.rs | 11 - src/vm-system-allocator/src/arch/mod.rs | 19 - .../src/arch/x86_64/layout.rs | 9 - .../src/arch/x86_64/mod.rs | 9 - src/vm-system-allocator/src/gsi.rs | 8 +- src/vm-system-allocator/src/lib.rs | 6 +- src/vm-system-allocator/src/page_size.rs | 38 + src/vm-system-allocator/src/system.rs | 74 +- src/vmm/src/builder.rs | 51 +- src/vmm/src/devices/pci.rs | 77 +- src/vmm/src/interrupt.rs | 95 +- 29 files changed, 2780 insertions(+), 1572 deletions(-) delete mode 100644 src/vm-device/src/dma_mapping/vfio.rs delete mode 100644 src/vm-system-allocator/src/arch/aarch64/layout.rs delete mode 100644 src/vm-system-allocator/src/arch/aarch64/mod.rs delete mode 100644 src/vm-system-allocator/src/arch/mod.rs delete mode 100644 src/vm-system-allocator/src/arch/x86_64/layout.rs delete mode 100644 src/vm-system-allocator/src/arch/x86_64/mod.rs create mode 100644 src/vm-system-allocator/src/page_size.rs diff --git a/Cargo.lock b/Cargo.lock index 5a643c7c622..7931eeaabda 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -975,6 +975,30 @@ version = "1.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c9be0862c1b3f26a88803c4a49de6889c10e608b3ee9344e6ef5b45fb37ad3d1" +[[package]] +name = "mshv-bindings" +version = "0.3.0" +source = "git+https://github.com/rust-vmm/mshv?tag=v0.3.0#fda05380ea4c68b807996299d5ffb2854ca6d01d" +dependencies = [ + "libc", + "num_enum", + "serde", + "serde_derive", + "vmm-sys-util", + "zerocopy 0.7.35", +] + +[[package]] +name = "mshv-ioctls" +version = "0.3.0" +source = "git+https://github.com/rust-vmm/mshv?tag=v0.3.0#fda05380ea4c68b807996299d5ffb2854ca6d01d" +dependencies = [ + "libc", + "mshv-bindings", + "thiserror", + "vmm-sys-util", +] + [[package]] name = "nix" version = "0.27.1" @@ -1018,6 
+1042,26 @@ dependencies = [ "libm", ] +[[package]] +name = "num_enum" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4e613fc340b2220f734a8595782c551f1250e969d87d3be1ae0579e8d4065179" +dependencies = [ + "num_enum_derive", +] + +[[package]] +name = "num_enum_derive" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af1844ef2428cc3e1cb900be36181049ef3d3193c63e43026cfe202983b27a56" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "once_cell" version = "1.20.2" @@ -1052,6 +1096,8 @@ dependencies = [ "kvm-ioctls", "libc", "log", + "serde", + "thiserror", "vfio-bindings 0.2.0", "vfio-ioctls", "vm-device", @@ -1575,6 +1621,8 @@ dependencies = [ "kvm-ioctls", "libc", "log", + "mshv-bindings", + "mshv-ioctls", "thiserror", "vfio-bindings 0.4.0", "vm-memory", @@ -1610,8 +1658,6 @@ version = "0.1.0" dependencies = [ "anyhow", "serde", - "serde_derive", - "serde_json", "thiserror", "vfio-ioctls", "vm-memory", diff --git a/src/pci/Cargo.toml b/src/pci/Cargo.toml index 23b0403a53b..72698500ca4 100644 --- a/src/pci/Cargo.toml +++ b/src/pci/Cargo.toml @@ -1,22 +1,26 @@ [package] +authors = ["Samuel Ortiz "] +edition = "2021" name = "pci" version = "0.1.0" -authors = ["Samuel Ortiz "] -edition = "2018" + +[features] +default = [] +kvm = ["vfio-ioctls/kvm"] +mshv = ["vfio-ioctls/mshv"] [dependencies] -anyhow = "1.0" -byteorder = "1.4.3" +anyhow = "1.0.87" +byteorder = "1.5.0" vmm-sys-util = ">=0.3.1" -libc = ">=0.2.39" +libc = "0.2.158" log = { version = "0.4.22", features = ["std", "serde"] } vm-memory = { version = "0.15.0", features = ["backend-mmap", "backend-bitmap"] } vfio-ioctls = { git = "https://github.com/rust-vmm/vfio", branch = "main" } +serde = { version = "1.0.208", features = ["derive"] } +thiserror = "1.0.62" +vfio-bindings = { version = "0.2.0", features = ["fam-wrappers"] } kvm-bindings = { version = "0.9.1", features = ["fam-wrappers"] 
} kvm-ioctls = "0.18.0" vm-device = { path = "../vm-device"} vm-system-allocator = { path = "../vm-system-allocator" } - -[dependencies.vfio-bindings] -version = "0.2.0" -features = ["fam-wrappers"] \ No newline at end of file diff --git a/src/pci/src/bus.rs b/src/pci/src/bus.rs index c454c4adc30..906d8f683dc 100644 --- a/src/pci/src/bus.rs +++ b/src/pci/src/bus.rs @@ -1,17 +1,22 @@ // Copyright 2018 The Chromium OS Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE-BSD-3-Clause file. +// +// SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause -use crate::configuration::{ - PciBarRegionType, PciBridgeSubclass, PciClassCode, PciConfiguration, PciHeaderType, -}; -use crate::device::PciDevice; -use byteorder::{ByteOrder, LittleEndian}; -use log::error; use std::any::Any; use std::collections::HashMap; +use std::ops::DerefMut; use std::sync::{Arc, Barrier, Mutex}; -use vm_memory::{Address, GuestAddress, GuestUsize}; + +use byteorder::{ByteOrder, LittleEndian}; +use vm_device::{Bus, BusDevice, BusDeviceSync}; + +use crate::configuration::{ + PciBarRegionType, PciBridgeSubclass, PciClassCode, PciConfiguration, PciHeaderType, +}; +use crate::device::{DeviceRelocation, Error as PciDeviceError, PciDevice}; +use crate::PciBarConfiguration; const VENDOR_ID_INTEL: u16 = 0x8086; const DEVICE_ID_INTEL_VIRT_PCIE_HOST: u16 = 0x0d57; @@ -20,8 +25,14 @@ const NUM_DEVICE_IDS: usize = 32; /// Errors for device manager. #[derive(Debug)] pub enum PciRootError { + /// Could not allocate device address space for the device. + AllocateDeviceAddrs(PciDeviceError), /// Could not allocate an IRQ number. AllocateIrq, + /// Could not add a device to the port io bus. + PioInsert(vm_device::BusError), + /// Could not add a device to the mmio bus. + MmioInsert(vm_device::BusError), /// Could not find an available device slot on the PCI bus. NoPciDeviceSlotAvailable, /// Invalid PCI device identifier provided. 
@@ -55,6 +66,7 @@ impl PciRoot { 0, 0, None, + None, ), } } @@ -81,17 +93,22 @@ impl PciDevice for PciRoot { fn as_any(&mut self) -> &mut dyn Any { self } + + fn id(&self) -> Option { + None + } } pub struct PciBus { /// Devices attached to this bus. /// Device 0 is host bridge. devices: HashMap>>, + device_reloc: Arc, device_ids: Vec, } impl PciBus { - pub fn new(pci_root: PciRoot) -> Self { + pub fn new(pci_root: PciRoot, device_reloc: Arc) -> Self { let mut devices: HashMap>> = HashMap::new(); let mut device_ids: Vec = vec![false; NUM_DEVICE_IDS]; @@ -100,42 +117,40 @@ impl PciBus { PciBus { devices, + device_reloc, device_ids, } } pub fn register_mapping( &self, - dev: Arc>, - io_bus: &mut Bus, - mmio_bus: &mut Bus, - bars: Vec<(GuestAddress, GuestUsize, PciBarRegionType)>, + dev: Arc, + #[cfg(target_arch = "x86_64")] io_bus: &Bus, + mmio_bus: &Bus, + bars: Vec, ) -> Result<()> { - for (address, size, type_) in bars { - match type_ { + for bar in bars { + match bar.region_type() { PciBarRegionType::IoRegion => { + #[cfg(target_arch = "x86_64")] io_bus - .insert(dev.clone(), address.raw_value(), size) - .unwrap(); - error!("cannot register bus mappings {:x} {:x} IO", address.0, size); + .insert(dev.clone(), bar.addr(), bar.size()) + .map_err(PciRootError::PioInsert)?; + #[cfg(not(target_arch = "x86_64"))] + error!("I/O region is not supported"); } PciBarRegionType::Memory32BitRegion | PciBarRegionType::Memory64BitRegion => { - error!("Registering bus mappings {:x} {:x}", address.0, size); mmio_bus - .insert(dev.clone(), address.raw_value(), size) - .unwrap(); + .insert(dev.clone(), bar.addr(), bar.size()) + .map_err(PciRootError::MmioInsert)?; } } } Ok(()) } - pub fn add_device( - &mut self, - pci_device_bdf: u32, - device: Arc>, - ) -> Result<()> { - self.devices.insert(pci_device_bdf >> 3, device); + pub fn add_device(&mut self, device_id: u32, device: Arc>) -> Result<()> { + self.devices.insert(device_id, device); Ok(()) } @@ -187,8 +202,8 @@ pub struct 
PciConfigIo { impl PciConfigIo { pub fn new(pci_bus: Arc>) -> Self { PciConfigIo { - pci_bus, config_address: 0, + pci_bus, } } @@ -201,11 +216,6 @@ impl PciConfigIo { let (bus, device, function, register) = parse_io_config_address(self.config_address & !0x8000_0000); - error!( - "config space read {}:{}:{} reg {}", - bus, device, function, register - ); - // Only support one bus. if bus != 0 { return 0xffff_ffff; @@ -217,6 +227,7 @@ impl PciConfigIo { } self.pci_bus + .as_ref() .lock() .unwrap() .devices @@ -244,28 +255,27 @@ impl PciConfigIo { return None; } - let pci_bus = self.pci_bus.lock().unwrap(); + let pci_bus = self.pci_bus.as_ref().lock().unwrap(); if let Some(d) = pci_bus.devices.get(&(device as u32)) { let mut device = d.lock().unwrap(); + // Find out if one of the device's BAR is being reprogrammed, and + // reprogram it if needed. if let Some(params) = device.detect_bar_reprogramming(register, data) { - // if let Err(e) = pci_bus.device_reloc.move_bar( - // params.old_base, - // params.new_base, - // params.len, - // device.deref_mut(), - // params.region_type, - // ) { - // error!( - // "Failed moving device BAR: {}: 0x{:x}->0x{:x}(0x{:x})", - // e, params.old_base, params.new_base, params.len - // ); - // } - error!( - "Failed moving device BAR: 0x{:x}->0x{:x}(0x{:x})", - params.old_base, params.new_base, params.len - ); + if let Err(e) = pci_bus.device_reloc.move_bar( + params.old_base, + params.new_base, + params.len, + device.deref_mut(), + params.region_type, + ) { + error!( + "Failed moving device BAR: {}: 0x{:x}->0x{:x}(0x{:x})", + e, params.old_base, params.new_base, params.len + ); + } } + // Update the register value device.write_config_register(register, offset, data) } else { @@ -294,7 +304,7 @@ impl PciConfigIo { } impl BusDevice for PciConfigIo { - fn read(&mut self, _: u64, offset: u64, data: &mut [u8]) { + fn read(&mut self, _base: u64, offset: u64, data: &mut [u8]) { // `offset` is relative to 0xcf8 let value = match offset { 0..=3 
=> self.config_address, @@ -316,16 +326,15 @@ impl BusDevice for PciConfigIo { } } - fn write(&mut self, _: u64, offset: u64, data: &[u8]) { + fn write(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option> { // `offset` is relative to 0xcf8 match offset { o @ 0..=3 => { self.set_config_address(o, data); + None } - o @ 4..=7 => { - self.config_space_write(o - 4, data); - } - _ => {} + o @ 4..=7 => self.config_space_write(o - 4, data), + _ => None, } } } @@ -374,23 +383,21 @@ impl PciConfigMmio { if let Some(d) = pci_bus.devices.get(&(device as u32)) { let mut device = d.lock().unwrap(); + // Find out if one of the device's BAR is being reprogrammed, and + // reprogram it if needed. if let Some(params) = device.detect_bar_reprogramming(register, data) { - // if let Err(e) = pci_bus.device_reloc.move_bar( - // params.old_base, - // params.new_base, - // params.len, - // device.deref_mut(), - // params.region_type, - // ) { - // error!( - // "Failed moving device BAR: {}: 0x{:x}->0x{:x}(0x{:x})", - // e, params.old_base, params.new_base, params.len - // ); - // } - error!( - "Failed moving device BAR: 0x{:x}->0x{:x}(0x{:x})", - params.old_base, params.new_base, params.len - ); + if let Err(e) = pci_bus.device_reloc.move_bar( + params.old_base, + params.new_base, + params.len, + device.deref_mut(), + params.region_type, + ) { + error!( + "Failed moving device BAR: {}: 0x{:x}->0x{:x}(0x{:x})", + e, params.old_base, params.new_base, params.len + ); + } } // Update the register value @@ -400,11 +407,11 @@ impl PciConfigMmio { } impl BusDevice for PciConfigMmio { - fn read(&mut self, _: u64, offset: u64, data: &mut [u8]) { + fn read(&mut self, _base: u64, offset: u64, data: &mut [u8]) { // Only allow reads to the register boundary. 
let start = offset as usize % 4; let end = start + data.len(); - if end > 4 || offset > u64::from(u32::max_value()) { + if end > 4 || offset > u64::from(u32::MAX) { for d in data { *d = 0xff; } @@ -417,11 +424,13 @@ impl BusDevice for PciConfigMmio { } } - fn bus_write(&mut self, _: u64, offset: u64, data: &[u8]) { - if offset > u64::from(u32::max_value()) { - return; + fn write(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option> { + if offset > u64::from(u32::MAX) { + return None; } self.config_space_write(offset as u32, offset % 4, data); + + None } } diff --git a/src/pci/src/configuration.rs b/src/pci/src/configuration.rs index 48753c1ef9a..38210a37bc3 100644 --- a/src/pci/src/configuration.rs +++ b/src/pci/src/configuration.rs @@ -1,12 +1,18 @@ // Copyright 2018 The Chromium OS Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE-BSD-3-Clause file. +// +// SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause -use crate::{BarReprogrammingParams, MsixConfig, PciInterruptPin}; -use byteorder::{ByteOrder, LittleEndian}; use std::fmt::{self, Display}; use std::sync::{Arc, Mutex}; -use log::{debug, warn}; + +use byteorder::{ByteOrder, LittleEndian}; +use serde::{Deserialize, Serialize}; +use vm_device::PciBarType; + +use crate::device::BarReprogrammingParams; +use crate::{MsixConfig, PciInterruptPin}; // The number of 32bit registers in the config space, 4096 bytes. 
const NUM_CONFIGURATION_REGISTERS: usize = 1024; @@ -15,16 +21,21 @@ const STATUS_REG: usize = 1; const STATUS_REG_CAPABILITIES_USED_MASK: u32 = 0x0010_0000; const BAR0_REG: usize = 4; const ROM_BAR_REG: usize = 12; +const ROM_BAR_IDX: usize = 6; const BAR_IO_ADDR_MASK: u32 = 0xffff_fffc; const BAR_MEM_ADDR_MASK: u32 = 0xffff_fff0; const ROM_BAR_ADDR_MASK: u32 = 0xffff_f800; +const MSI_CAPABILITY_REGISTER_MASK: u32 = 0x0071_0000; +const MSIX_CAPABILITY_REGISTER_MASK: u32 = 0xc000_0000; const NUM_BAR_REGS: usize = 6; const CAPABILITY_LIST_HEAD_OFFSET: usize = 0x34; -pub const FIRST_CAPABILITY_OFFSET: usize = 0x40; -pub const CAPABILITY_MAX_OFFSET: usize = 192; +const FIRST_CAPABILITY_OFFSET: usize = 0x40; +const CAPABILITY_MAX_OFFSET: usize = 192; const INTERRUPT_LINE_PIN_REG: usize = 15; +pub const PCI_CONFIGURATION_ID: &str = "pci_configuration"; + /// Represents the types of PCI headers allowed in the configuration registers. #[derive(Copy, Clone)] pub enum PciHeaderType { @@ -170,8 +181,10 @@ impl PciSubclass for PciNetworkControllerSubclass { } } -/// A PCI class programming interface. Each combination of `PciClassCode` and -/// `PciSubclass` can specify a set of register-level programming interfaces. +/// Trait to define a PCI class programming interface +/// +/// Each combination of `PciClassCode` and `PciSubclass` can specify a +/// set of register-level programming interfaces. /// This trait is implemented by each programming interface. /// It allows use of a trait object to generate configurations. pub trait PciProgrammingInterface { @@ -180,10 +193,10 @@ pub trait PciProgrammingInterface { } /// Types of PCI capabilities. -#[derive(PartialEq, Copy, Clone)] +#[derive(PartialEq, Eq, Copy, Clone)] #[allow(dead_code)] #[allow(non_camel_case_types)] -#[repr(C)] +#[repr(u8)] pub enum PciCapabilityId { ListId = 0, PowerManagement = 0x01, @@ -237,6 +250,118 @@ impl From for PciCapabilityId { } } +/// Types of PCI Express capabilities. 
+#[derive(PartialEq, Eq, Copy, Clone, Debug)] +#[allow(dead_code)] +#[repr(u16)] +pub enum PciExpressCapabilityId { + NullCapability = 0x0000, + AdvancedErrorReporting = 0x0001, + VirtualChannelMultiFunctionVirtualChannelNotPresent = 0x0002, + DeviceSerialNumber = 0x0003, + PowerBudgeting = 0x0004, + RootComplexLinkDeclaration = 0x0005, + RootComplexInternalLinkControl = 0x0006, + RootComplexEventCollectorEndpointAssociation = 0x0007, + MultiFunctionVirtualChannel = 0x0008, + VirtualChannelMultiFunctionVirtualChannelPresent = 0x0009, + RootComplexRegisterBlock = 0x000a, + VendorSpecificExtendedCapability = 0x000b, + ConfigurationAccessCorrelation = 0x000c, + AccessControlServices = 0x000d, + AlternativeRoutingIdentificationInterpretation = 0x000e, + AddressTranslationServices = 0x000f, + SingleRootIoVirtualization = 0x0010, + DeprecatedMultiRootIoVirtualization = 0x0011, + Multicast = 0x0012, + PageRequestInterface = 0x0013, + ReservedForAmd = 0x0014, + ResizeableBar = 0x0015, + DynamicPowerAllocation = 0x0016, + ThpRequester = 0x0017, + LatencyToleranceReporting = 0x0018, + SecondaryPciExpress = 0x0019, + ProtocolMultiplexing = 0x001a, + ProcessAddressSpaceId = 0x001b, + LnRequester = 0x001c, + DownstreamPortContainment = 0x001d, + L1PmSubstates = 0x001e, + PrecisionTimeMeasurement = 0x001f, + PciExpressOverMphy = 0x0020, + FRSQueueing = 0x0021, + ReadinessTimeReporting = 0x0022, + DesignatedVendorSpecificExtendedCapability = 0x0023, + VfResizeableBar = 0x0024, + DataLinkFeature = 0x0025, + PhysicalLayerSixteenGts = 0x0026, + LaneMarginingAtTheReceiver = 0x0027, + HierarchyId = 0x0028, + NativePcieEnclosureManagement = 0x0029, + PhysicalLayerThirtyTwoGts = 0x002a, + AlternateProtocol = 0x002b, + SystemFirmwareIntermediary = 0x002c, + ShadowFunctions = 0x002d, + DataObjectExchange = 0x002e, + Reserved = 0x002f, + ExtendedCapabilitiesAbsence = 0xffff, +} + +impl From for PciExpressCapabilityId { + fn from(c: u16) -> Self { + match c { + 0x0000 => 
PciExpressCapabilityId::NullCapability, + 0x0001 => PciExpressCapabilityId::AdvancedErrorReporting, + 0x0002 => PciExpressCapabilityId::VirtualChannelMultiFunctionVirtualChannelNotPresent, + 0x0003 => PciExpressCapabilityId::DeviceSerialNumber, + 0x0004 => PciExpressCapabilityId::PowerBudgeting, + 0x0005 => PciExpressCapabilityId::RootComplexLinkDeclaration, + 0x0006 => PciExpressCapabilityId::RootComplexInternalLinkControl, + 0x0007 => PciExpressCapabilityId::RootComplexEventCollectorEndpointAssociation, + 0x0008 => PciExpressCapabilityId::MultiFunctionVirtualChannel, + 0x0009 => PciExpressCapabilityId::VirtualChannelMultiFunctionVirtualChannelPresent, + 0x000a => PciExpressCapabilityId::RootComplexRegisterBlock, + 0x000b => PciExpressCapabilityId::VendorSpecificExtendedCapability, + 0x000c => PciExpressCapabilityId::ConfigurationAccessCorrelation, + 0x000d => PciExpressCapabilityId::AccessControlServices, + 0x000e => PciExpressCapabilityId::AlternativeRoutingIdentificationInterpretation, + 0x000f => PciExpressCapabilityId::AddressTranslationServices, + 0x0010 => PciExpressCapabilityId::SingleRootIoVirtualization, + 0x0011 => PciExpressCapabilityId::DeprecatedMultiRootIoVirtualization, + 0x0012 => PciExpressCapabilityId::Multicast, + 0x0013 => PciExpressCapabilityId::PageRequestInterface, + 0x0014 => PciExpressCapabilityId::ReservedForAmd, + 0x0015 => PciExpressCapabilityId::ResizeableBar, + 0x0016 => PciExpressCapabilityId::DynamicPowerAllocation, + 0x0017 => PciExpressCapabilityId::ThpRequester, + 0x0018 => PciExpressCapabilityId::LatencyToleranceReporting, + 0x0019 => PciExpressCapabilityId::SecondaryPciExpress, + 0x001a => PciExpressCapabilityId::ProtocolMultiplexing, + 0x001b => PciExpressCapabilityId::ProcessAddressSpaceId, + 0x001c => PciExpressCapabilityId::LnRequester, + 0x001d => PciExpressCapabilityId::DownstreamPortContainment, + 0x001e => PciExpressCapabilityId::L1PmSubstates, + 0x001f => PciExpressCapabilityId::PrecisionTimeMeasurement, + 0x0020 => 
PciExpressCapabilityId::PciExpressOverMphy, + 0x0021 => PciExpressCapabilityId::FRSQueueing, + 0x0022 => PciExpressCapabilityId::ReadinessTimeReporting, + 0x0023 => PciExpressCapabilityId::DesignatedVendorSpecificExtendedCapability, + 0x0024 => PciExpressCapabilityId::VfResizeableBar, + 0x0025 => PciExpressCapabilityId::DataLinkFeature, + 0x0026 => PciExpressCapabilityId::PhysicalLayerSixteenGts, + 0x0027 => PciExpressCapabilityId::LaneMarginingAtTheReceiver, + 0x0028 => PciExpressCapabilityId::HierarchyId, + 0x0029 => PciExpressCapabilityId::NativePcieEnclosureManagement, + 0x002a => PciExpressCapabilityId::PhysicalLayerThirtyTwoGts, + 0x002b => PciExpressCapabilityId::AlternateProtocol, + 0x002c => PciExpressCapabilityId::SystemFirmwareIntermediary, + 0x002d => PciExpressCapabilityId::ShadowFunctions, + 0x002e => PciExpressCapabilityId::DataObjectExchange, + 0xffff => PciExpressCapabilityId::ExtendedCapabilitiesAbsence, + _ => PciExpressCapabilityId::Reserved, + } + } +} + /// A PCI capability list. Devices can optionally specify capabilities in their configuration space. pub trait PciCapability { fn bytes(&self) -> &[u8]; @@ -275,7 +400,7 @@ fn decode_64_bits_bar_size(bar_size_hi: u32, bar_size_lo: u32) -> Option { None } -#[derive(Default, Clone, Copy)] +#[derive(Debug, Default, Clone, Copy, Serialize, Deserialize)] struct PciBar { addr: u32, size: u32, @@ -283,7 +408,20 @@ struct PciBar { r#type: Option, } +#[derive(Serialize, Deserialize)] +pub struct PciConfigurationState { + registers: Vec, + writable_bits: Vec, + bars: Vec, + rom_bar_addr: u32, + rom_bar_size: u32, + rom_bar_used: bool, + last_capability: Option<(usize, usize)>, + msix_cap_reg_idx: Option, +} + /// Contains the configuration space of a PCI node. +/// /// See the [specification](https://en.wikipedia.org/wiki/PCI_configuration_space). /// The configuration space is accessed with DWORD reads and writes from the guest. 
pub struct PciConfiguration { @@ -300,24 +438,53 @@ pub struct PciConfiguration { } /// See pci_regs.h in kernel -#[derive(Copy, Clone, PartialEq)] +#[derive(Copy, Clone, PartialEq, Eq, Serialize, Deserialize, Debug)] pub enum PciBarRegionType { Memory32BitRegion = 0, IoRegion = 0x01, Memory64BitRegion = 0x04, } +impl From for PciBarRegionType { + fn from(type_: PciBarType) -> Self { + match type_ { + PciBarType::Io => PciBarRegionType::IoRegion, + PciBarType::Mmio32 => PciBarRegionType::Memory32BitRegion, + PciBarType::Mmio64 => PciBarRegionType::Memory64BitRegion, + } + } +} + +impl From for PciBarType { + fn from(val: PciBarRegionType) -> Self { + match val { + PciBarRegionType::IoRegion => PciBarType::Io, + PciBarRegionType::Memory32BitRegion => PciBarType::Mmio32, + PciBarRegionType::Memory64BitRegion => PciBarType::Mmio64, + } + } +} + #[derive(Copy, Clone)] pub enum PciBarPrefetchable { NotPrefetchable = 0, Prefetchable = 0x08, } +impl From for bool { + fn from(val: PciBarPrefetchable) -> Self { + match val { + PciBarPrefetchable::NotPrefetchable => false, + PciBarPrefetchable::Prefetchable => true, + } + } +} + #[derive(Copy, Clone)] pub struct PciBarConfiguration { addr: u64, size: u64, - reg_idx: usize, + idx: usize, region_type: PciBarRegionType, prefetchable: PciBarPrefetchable, } @@ -350,9 +517,9 @@ impl Display for Error { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { use self::Error::*; match self { - BarAddressInvalid(a, s) => write!(f, "address {} size {} too big", a, s), - BarInUse(b) => write!(f, "bar {} already used", b), - BarInUse64(b) => write!(f, "64bit bar {} already used(requires two regs)", b), + BarAddressInvalid(a, s) => write!(f, "address {a} size {s} too big"), + BarInUse(b) => write!(f, "bar {b} already used"), + BarInUse64(b) => write!(f, "64bit bar {b} already used(requires two regs)"), BarInvalid(b) => write!(f, "bar {} invalid, max {}", b, NUM_BAR_REGS - 1), BarInvalid64(b) => write!( f, @@ -360,18 +527,18 @@ impl 
Display for Error { b, NUM_BAR_REGS - 1 ), - BarSizeInvalid(s) => write!(f, "bar address {} not a power of two", s), + BarSizeInvalid(s) => write!(f, "bar address {s} not a power of two"), CapabilityEmpty => write!(f, "empty capabilities are invalid"), - CapabilityLengthInvalid(l) => write!(f, "Invalid capability length {}", l), - CapabilitySpaceFull(s) => write!(f, "capability of size {} doesn't fit", s), + CapabilityLengthInvalid(l) => write!(f, "Invalid capability length {l}"), + CapabilitySpaceFull(s) => write!(f, "capability of size {s} doesn't fit"), Decode32BarSize => write!(f, "failed to decode 32 bits BAR size"), Decode64BarSize => write!(f, "failed to decode 64 bits BAR size"), Encode32BarSize => write!(f, "failed to encode 32 bits BAR size"), Encode64BarSize => write!(f, "failed to encode 64 bits BAR size"), - RomBarAddressInvalid(a, s) => write!(f, "address {} size {} too big", a, s), - RomBarInUse(b) => write!(f, "rom bar {} already used", b), + RomBarAddressInvalid(a, s) => write!(f, "address {a} size {s} too big"), + RomBarInUse(b) => write!(f, "rom bar {b} already used"), RomBarInvalid(b) => write!(f, "rom bar {} invalid, max {}", b, NUM_BAR_REGS - 1), - RomBarSizeInvalid(s) => write!(f, "rom bar address {} not a power of two", s), + RomBarSizeInvalid(s) => write!(f, "rom bar address {s} not a power of two"), } } } @@ -389,49 +556,95 @@ impl PciConfiguration { subsystem_vendor_id: u16, subsystem_id: u16, msix_config: Option>>, + state: Option, ) -> Self { - let mut registers = [0u32; NUM_CONFIGURATION_REGISTERS]; - let mut writable_bits = [0u32; NUM_CONFIGURATION_REGISTERS]; - registers[0] = u32::from(device_id) << 16 | u32::from(vendor_id); - // TODO(dverkamp): Status should be write-1-to-clear - writable_bits[1] = 0x0000_ffff; // Status (r/o), command (r/w) - let pi = if let Some(pi) = programming_interface { - pi.get_register_value() + let ( + registers, + writable_bits, + bars, + rom_bar_addr, + rom_bar_size, + rom_bar_used, + last_capability, + 
msix_cap_reg_idx, + ) = if let Some(state) = state { + ( + state.registers.try_into().unwrap(), + state.writable_bits.try_into().unwrap(), + state.bars.try_into().unwrap(), + state.rom_bar_addr, + state.rom_bar_size, + state.rom_bar_used, + state.last_capability, + state.msix_cap_reg_idx, + ) } else { - 0 - }; - registers[2] = u32::from(class_code.get_register_value()) << 24 - | u32::from(subclass.get_register_value()) << 16 - | u32::from(pi) << 8 - | u32::from(revision_id); - writable_bits[3] = 0x0000_00ff; // Cacheline size (r/w) - match header_type { - PciHeaderType::Device => { - registers[3] = 0x0000_0000; // Header type 0 (device) - writable_bits[15] = 0x0000_00ff; // Interrupt line (r/w) - } - PciHeaderType::Bridge => { - registers[3] = 0x0001_0000; // Header type 1 (bridge) - writable_bits[9] = 0xfff0_fff0; // Memory base and limit - writable_bits[15] = 0xffff_00ff; // Bridge control (r/w), interrupt line (r/w) - } + let mut registers = [0u32; NUM_CONFIGURATION_REGISTERS]; + let mut writable_bits = [0u32; NUM_CONFIGURATION_REGISTERS]; + registers[0] = u32::from(device_id) << 16 | u32::from(vendor_id); + // TODO(dverkamp): Status should be write-1-to-clear + writable_bits[1] = 0x0000_ffff; // Status (r/o), command (r/w) + let pi = if let Some(pi) = programming_interface { + pi.get_register_value() + } else { + 0 + }; + registers[2] = u32::from(class_code.get_register_value()) << 24 + | u32::from(subclass.get_register_value()) << 16 + | u32::from(pi) << 8 + | u32::from(revision_id); + writable_bits[3] = 0x0000_00ff; // Cacheline size (r/w) + match header_type { + PciHeaderType::Device => { + registers[3] = 0x0000_0000; // Header type 0 (device) + writable_bits[15] = 0x0000_00ff; // Interrupt line (r/w) + } + PciHeaderType::Bridge => { + registers[3] = 0x0001_0000; // Header type 1 (bridge) + writable_bits[9] = 0xfff0_fff0; // Memory base and limit + writable_bits[15] = 0xffff_00ff; // Bridge control (r/w), interrupt line (r/w) + } + }; + registers[11] = 
u32::from(subsystem_id) << 16 | u32::from(subsystem_vendor_id); + + ( + registers, + writable_bits, + [PciBar::default(); NUM_BAR_REGS], + 0, + 0, + false, + None, + None, + ) }; - registers[11] = u32::from(subsystem_id) << 16 | u32::from(subsystem_vendor_id); - let bars = [PciBar::default(); NUM_BAR_REGS]; PciConfiguration { registers, writable_bits, bars, - rom_bar_addr: 0, - rom_bar_size: 0, - rom_bar_used: false, - last_capability: None, - msix_cap_reg_idx: None, + rom_bar_addr, + rom_bar_size, + rom_bar_used, + last_capability, + msix_cap_reg_idx, msix_config, } } + fn state(&self) -> PciConfigurationState { + PciConfigurationState { + registers: self.registers.to_vec(), + writable_bits: self.writable_bits.to_vec(), + bars: self.bars.to_vec(), + rom_bar_addr: self.rom_bar_addr, + rom_bar_size: self.rom_bar_size, + rom_bar_used: self.rom_bar_used, + last_capability: self.last_capability, + msix_cap_reg_idx: self.msix_cap_reg_idx, + } + } + /// Reads a 32bit register from `reg_idx` in the register map. pub fn read_reg(&self, reg_idx: usize) -> u32 { *(self.registers.get(reg_idx).unwrap_or(&0xffff_ffff)) @@ -511,48 +724,45 @@ impl PciConfiguration { /// Adds a region specified by `config`. Configures the specified BAR(s) to /// report this region and size to the guest kernel. Enforces a few constraints - /// (i.e, region size must be power of two, register not already used). Returns 'None' on - /// failure all, `Some(BarIndex)` on success. - pub fn add_pci_bar(&mut self, config: &PciBarConfiguration) -> Result { - if self.bars[config.reg_idx].used { - return Err(Error::BarInUse(config.reg_idx)); + /// (i.e, region size must be power of two, register not already used). 
+ pub fn add_pci_bar(&mut self, config: &PciBarConfiguration) -> Result<()> { + let bar_idx = config.idx; + let reg_idx = BAR0_REG + bar_idx; + + if self.bars[bar_idx].used { + return Err(Error::BarInUse(bar_idx)); } if config.size.count_ones() != 1 { return Err(Error::BarSizeInvalid(config.size)); } - if config.reg_idx >= NUM_BAR_REGS { - return Err(Error::BarInvalid(config.reg_idx)); + if bar_idx >= NUM_BAR_REGS { + return Err(Error::BarInvalid(bar_idx)); } - let bar_idx = BAR0_REG + config.reg_idx; let end_addr = config .addr .checked_add(config.size - 1) .ok_or(Error::BarAddressInvalid(config.addr, config.size))?; match config.region_type { PciBarRegionType::Memory32BitRegion | PciBarRegionType::IoRegion => { - if end_addr > u64::from(u32::max_value()) { + if end_addr > u64::from(u32::MAX) { return Err(Error::BarAddressInvalid(config.addr, config.size)); } // Encode the BAR size as expected by the software running in // the guest. - self.bars[config.reg_idx].size = + self.bars[bar_idx].size = encode_32_bits_bar_size(config.size as u32).ok_or(Error::Encode32BarSize)?; } PciBarRegionType::Memory64BitRegion => { - if config.reg_idx + 1 >= NUM_BAR_REGS { - return Err(Error::BarInvalid64(config.reg_idx)); + if bar_idx + 1 >= NUM_BAR_REGS { + return Err(Error::BarInvalid64(bar_idx)); } - if end_addr > u64::max_value() { - return Err(Error::BarAddressInvalid(config.addr, config.size)); - } - - if self.bars[config.reg_idx + 1].used { - return Err(Error::BarInUse64(config.reg_idx)); + if self.bars[bar_idx + 1].used { + return Err(Error::BarInUse64(bar_idx)); } // Encode the BAR size as expected by the software running in @@ -560,12 +770,12 @@ impl PciConfiguration { let (bar_size_hi, bar_size_lo) = encode_64_bits_bar_size(config.size).ok_or(Error::Encode64BarSize)?; - self.registers[bar_idx + 1] = (config.addr >> 32) as u32; - self.writable_bits[bar_idx + 1] = 0xffff_ffff; - self.bars[config.reg_idx + 1].addr = self.registers[bar_idx + 1]; - 
self.bars[config.reg_idx].size = bar_size_lo; - self.bars[config.reg_idx + 1].size = bar_size_hi; - self.bars[config.reg_idx + 1].used = true; + self.registers[reg_idx + 1] = (config.addr >> 32) as u32; + self.writable_bits[reg_idx + 1] = 0xffff_ffff; + self.bars[bar_idx + 1].addr = self.registers[reg_idx + 1]; + self.bars[bar_idx].size = bar_size_lo; + self.bars[bar_idx + 1].size = bar_size_hi; + self.bars[bar_idx + 1].used = true; } } @@ -577,26 +787,30 @@ impl PciConfiguration { PciBarRegionType::IoRegion => (BAR_IO_ADDR_MASK, config.region_type as u32), }; - self.registers[bar_idx] = ((config.addr as u32) & mask) | lower_bits; - self.writable_bits[bar_idx] = mask; - self.bars[config.reg_idx].addr = self.registers[bar_idx]; - self.bars[config.reg_idx].used = true; - self.bars[config.reg_idx].r#type = Some(config.region_type); - Ok(config.reg_idx) + self.registers[reg_idx] = ((config.addr as u32) & mask) | lower_bits; + self.writable_bits[reg_idx] = mask; + self.bars[bar_idx].addr = self.registers[reg_idx]; + self.bars[bar_idx].used = true; + self.bars[bar_idx].r#type = Some(config.region_type); + + Ok(()) } /// Adds rom expansion BAR. 
- pub fn add_pci_rom_bar(&mut self, config: &PciBarConfiguration, active: u32) -> Result { + pub fn add_pci_rom_bar(&mut self, config: &PciBarConfiguration, active: u32) -> Result<()> { + let bar_idx = config.idx; + let reg_idx = ROM_BAR_REG; + if self.rom_bar_used { - return Err(Error::RomBarInUse(config.reg_idx)); + return Err(Error::RomBarInUse(bar_idx)); } if config.size.count_ones() != 1 { return Err(Error::RomBarSizeInvalid(config.size)); } - if config.reg_idx != ROM_BAR_REG { - return Err(Error::RomBarInvalid(config.reg_idx)); + if bar_idx != ROM_BAR_IDX { + return Err(Error::RomBarInvalid(bar_idx)); } let end_addr = config @@ -604,17 +818,18 @@ impl PciConfiguration { .checked_add(config.size - 1) .ok_or(Error::RomBarAddressInvalid(config.addr, config.size))?; - if end_addr > u64::from(u32::max_value()) { + if end_addr > u64::from(u32::MAX) { return Err(Error::RomBarAddressInvalid(config.addr, config.size)); } - self.registers[config.reg_idx] = (config.addr as u32) | active; - self.writable_bits[config.reg_idx] = ROM_BAR_ADDR_MASK; - self.rom_bar_addr = self.registers[config.reg_idx]; + self.registers[reg_idx] = (config.addr as u32) | active; + self.writable_bits[reg_idx] = ROM_BAR_ADDR_MASK; + self.rom_bar_addr = self.registers[reg_idx]; self.rom_bar_size = encode_32_bits_bar_size(config.size as u32).ok_or(Error::Encode32BarSize)?; self.rom_bar_used = true; - Ok(config.reg_idx) + + Ok(()) } /// Returns the address of the given BAR region. 
@@ -671,8 +886,15 @@ impl PciConfiguration { } self.last_capability = Some((cap_offset, total_len)); - if cap_data.id() == PciCapabilityId::MsiX { - self.msix_cap_reg_idx = Some(cap_offset / 4); + match cap_data.id() { + PciCapabilityId::MessageSignalledInterrupts => { + self.writable_bits[cap_offset / 4] = MSI_CAPABILITY_REGISTER_MASK; + } + PciCapabilityId::MsiX => { + self.msix_cap_reg_idx = Some(cap_offset / 4); + self.writable_bits[self.msix_cap_reg_idx.unwrap()] = MSIX_CAPABILITY_REGISTER_MASK; + } + _ => {} } Ok(cap_offset) @@ -697,6 +919,11 @@ impl PciConfiguration { .lock() .unwrap() .set_msg_ctl(LittleEndian::read_u16(data)); + } else if msix_cap_reg_idx == reg_idx && offset == 0 && data.len() == 4 { + msix_config + .lock() + .unwrap() + .set_msg_ctl((LittleEndian::read_u32(data) >> 16) as u16); } } } @@ -729,85 +956,77 @@ impl PciConfiguration { let mask = self.writable_bits[reg_idx]; if (BAR0_REG..BAR0_REG + NUM_BAR_REGS).contains(®_idx) { + // Ignore the case where the BAR size is being asked for. + if value == 0xffff_ffff { + return None; + } + let bar_idx = reg_idx - 4; - if (value & mask) != (self.bars[bar_idx].addr & mask) { - // Handle special case where the address being written is - // different from the address initially provided. This is a - // BAR reprogramming case which needs to be properly caught. - if let Some(bar_type) = self.bars[bar_idx].r#type { - match bar_type { - PciBarRegionType::Memory64BitRegion => {} - _ => { - // Ignore the case where the BAR size is being - // asked for. 
- if value == 0xffff_ffff { - return None; - } - - debug!( - "DETECT BAR REPROG: current 0x{:x}, new 0x{:x}", - self.registers[reg_idx], value - ); - let old_base = u64::from(self.bars[bar_idx].addr & mask); - let new_base = u64::from(value & mask); - let len = u64::from( - decode_32_bits_bar_size(self.bars[bar_idx].size) - .ok_or(Error::Decode32BarSize) - .unwrap(), - ); - let region_type = bar_type; - - self.bars[bar_idx].addr = value; - - return Some(BarReprogrammingParams { - old_base, - new_base, - len, - region_type, - }); - } - } - } else if (reg_idx > BAR0_REG) - && (self.registers[reg_idx - 1] & self.writable_bits[reg_idx - 1]) - != (self.bars[bar_idx - 1].addr & self.writable_bits[reg_idx - 1]) - { - // Ignore the case where the BAR size is being asked for. - // Because we are in the 64bits case here, we have to check - // if the lower 32bits of the current BAR have already been - // asked for the BAR size too. - if value == 0xffff_ffff - && self.registers[reg_idx - 1] & self.writable_bits[reg_idx - 1] - == self.bars[bar_idx - 1].size & self.writable_bits[reg_idx - 1] - { - return None; - } - - debug!( - "DETECT BAR REPROG: current 0x{:x}, new 0x{:x}", - self.registers[reg_idx], value - ); - let old_base = u64::from(self.bars[bar_idx].addr & mask) << 32 - | u64::from(self.bars[bar_idx - 1].addr & self.writable_bits[reg_idx - 1]); - let new_base = u64::from(value & mask) << 32 - | u64::from(self.registers[reg_idx - 1] & self.writable_bits[reg_idx - 1]); - let len = decode_64_bits_bar_size( - self.bars[bar_idx].size, - self.bars[bar_idx - 1].size, - ) - .ok_or(Error::Decode64BarSize) - .unwrap(); - let region_type = PciBarRegionType::Memory64BitRegion; - - self.bars[bar_idx].addr = value; - self.bars[bar_idx - 1].addr = self.registers[reg_idx - 1]; - - return Some(BarReprogrammingParams { - old_base, - new_base, - len, - region_type, - }); + // Handle special case where the address being written is + // different from the address initially provided. 
This is a + // BAR reprogramming case which needs to be properly caught. + if let Some(bar_type) = self.bars[bar_idx].r#type { + // In case of 64 bits memory BAR, we don't do anything until + // the upper BAR is modified, otherwise we would be moving the + // BAR to a wrong location in memory. + if bar_type == PciBarRegionType::Memory64BitRegion { + return None; } + + // Ignore the case where the value is unchanged. + if (value & mask) == (self.bars[bar_idx].addr & mask) { + return None; + } + + info!( + "Detected BAR reprogramming: (BAR {}) 0x{:x}->0x{:x}", + reg_idx, self.registers[reg_idx], value + ); + let old_base = u64::from(self.bars[bar_idx].addr & mask); + let new_base = u64::from(value & mask); + let len = u64::from( + decode_32_bits_bar_size(self.bars[bar_idx].size) + .ok_or(Error::Decode32BarSize) + .unwrap(), + ); + let region_type = bar_type; + + self.bars[bar_idx].addr = value; + + return Some(BarReprogrammingParams { + old_base, + new_base, + len, + region_type, + }); + } else if (reg_idx > BAR0_REG) + && ((self.registers[reg_idx - 1] & self.writable_bits[reg_idx - 1]) + != (self.bars[bar_idx - 1].addr & self.writable_bits[reg_idx - 1]) + || (value & mask) != (self.bars[bar_idx].addr & mask)) + { + info!( + "Detected BAR reprogramming: (BAR {}) 0x{:x}->0x{:x}", + reg_idx, self.registers[reg_idx], value + ); + let old_base = u64::from(self.bars[bar_idx].addr & mask) << 32 + | u64::from(self.bars[bar_idx - 1].addr & self.writable_bits[reg_idx - 1]); + let new_base = u64::from(value & mask) << 32 + | u64::from(self.registers[reg_idx - 1] & self.writable_bits[reg_idx - 1]); + let len = + decode_64_bits_bar_size(self.bars[bar_idx].size, self.bars[bar_idx - 1].size) + .ok_or(Error::Decode64BarSize) + .unwrap(); + let region_type = PciBarRegionType::Memory64BitRegion; + + self.bars[bar_idx].addr = value; + self.bars[bar_idx - 1].addr = self.registers[reg_idx - 1]; + + return Some(BarReprogrammingParams { + old_base, + new_base, + len, + region_type, + }); 
} } else if reg_idx == ROM_BAR_REG && (value & mask) != (self.rom_bar_addr & mask) { // Ignore the case where the BAR size is being asked for. @@ -815,9 +1034,9 @@ impl PciConfiguration { return None; } - debug!( - "DETECT ROM BAR REPROG: current 0x{:x}, new 0x{:x}", - self.registers[reg_idx], value + info!( + "Detected ROM BAR reprogramming: (BAR {}) 0x{:x}->0x{:x}", + reg_idx, self.registers[reg_idx], value ); let old_base = u64::from(self.rom_bar_addr & mask); let new_base = u64::from(value & mask); @@ -845,7 +1064,7 @@ impl PciConfiguration { impl Default for PciBarConfiguration { fn default() -> Self { PciBarConfiguration { - reg_idx: 0, + idx: 0, addr: 0, size: 0, region_type: PciBarRegionType::Memory64BitRegion, @@ -856,13 +1075,13 @@ impl Default for PciBarConfiguration { impl PciBarConfiguration { pub fn new( - reg_idx: usize, + idx: usize, size: u64, region_type: PciBarRegionType, prefetchable: PciBarPrefetchable, ) -> Self { PciBarConfiguration { - reg_idx, + idx, addr: 0, size, region_type, @@ -870,33 +1089,55 @@ impl PciBarConfiguration { } } - pub fn set_register_index(mut self, reg_idx: usize) -> Self { - self.reg_idx = reg_idx; + #[must_use] + pub fn set_index(mut self, idx: usize) -> Self { + self.idx = idx; self } + #[must_use] pub fn set_address(mut self, addr: u64) -> Self { self.addr = addr; self } + #[must_use] pub fn set_size(mut self, size: u64) -> Self { self.size = size; self } - pub fn get_size(&self) -> u64 { - self.size - } - + #[must_use] pub fn set_region_type(mut self, region_type: PciBarRegionType) -> Self { self.region_type = region_type; self } - pub fn set_prefetch(mut self, prefetchable: PciBarPrefetchable) -> Self { + + #[must_use] + pub fn set_prefetchable(mut self, prefetchable: PciBarPrefetchable) -> Self { self.prefetchable = prefetchable; self } + + pub fn idx(&self) -> usize { + self.idx + } + + pub fn addr(&self) -> u64 { + self.addr + } + + pub fn size(&self) -> u64 { + self.size + } + + pub fn region_type(&self) -> 
PciBarRegionType { + self.region_type + } + + pub fn prefetchable(&self) -> PciBarPrefetchable { + self.prefetchable + } } #[cfg(test)] @@ -913,7 +1154,7 @@ mod tests { foo: u8, } - // It is safe to implement BytesValued; all members are simple numbers and any value is valid. + // SAFETY: All members are simple numbers and any value is valid. unsafe impl ByteValued for TestCap {} impl PciCapability for TestCap { @@ -939,6 +1180,7 @@ mod tests { 0xABCD, 0x2468, None, + None, ); // Add two capabilities with different contents. @@ -995,6 +1237,7 @@ mod tests { 0xABCD, 0x2468, None, + None, ); let class_reg = cfg.read_reg(2); diff --git a/src/pci/src/device.rs b/src/pci/src/device.rs index 1e9ef45fe5b..fec1c149e2b 100644 --- a/src/pci/src/device.rs +++ b/src/pci/src/device.rs @@ -1,14 +1,19 @@ // Copyright 2018 The Chromium OS Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE-BSD-3-Clause file. +// +// SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause -use crate::configuration::{self, PciBarRegionType}; use std::any::Any; use std::fmt::{self, Display}; -use std::sync::{Arc, Barrier}; -use std::{self, io, result}; -use vm_memory::{GuestAddress, GuestUsize}; -use vm_system_allocator::SystemAllocator; +use std::sync::{Arc, Barrier, Mutex}; +use std::{io, result}; + +use vm_system_allocator::{AddressAllocator, SystemAllocator}; +use vm_device::Resource; + +use crate::configuration::{self, PciBarRegionType}; +use crate::PciBarConfiguration; #[derive(Debug)] pub enum Error { @@ -18,6 +23,10 @@ pub enum Error { IoAllocationFailed(u64), /// Registering an IO BAR failed. IoRegistrationFailed(u64, configuration::Error), + /// Expected resource not found. + MissingResource, + /// Invalid resource. 
+ InvalidResource(Resource), } pub type Result = std::result::Result; @@ -26,13 +35,15 @@ impl Display for Error { use self::Error::*; match self { - CapabilitiesSetup(e) => write!(f, "failed to add capability {}", e), + CapabilitiesSetup(e) => write!(f, "failed to add capability {e}"), IoAllocationFailed(size) => { - write!(f, "failed to allocate space for an IO BAR, size={}", size) + write!(f, "failed to allocate space for an IO BAR, size={size}") } IoRegistrationFailed(addr, e) => { - write!(f, "failed to register an IO BAR, addr={} err={}", addr, e) + write!(f, "failed to register an IO BAR, addr={addr} err={e}") } + MissingResource => write!(f, "failed to find expected resource"), + InvalidResource(r) => write!(f, "invalid resource {r:?}"), } } } @@ -45,24 +56,32 @@ pub struct BarReprogrammingParams { pub region_type: PciBarRegionType, } -pub trait PciDevice { +pub trait PciDevice: Send { /// Allocates the needed PCI BARs space using the `allocate` function which takes a size and /// returns an address. Returns a Vec of (GuestAddress, GuestUsize) tuples. fn allocate_bars( &mut self, - _allocator: &mut SystemAllocator, - ) -> Result> { + _allocator: &Arc>, + _mmio32_allocator: &mut AddressAllocator, + _mmio64_allocator: &mut AddressAllocator, + _resources: Option>, + ) -> Result> { Ok(Vec::new()) } /// Frees the PCI BARs previously allocated with a call to allocate_bars(). - fn free_bars(&mut self, _allocator: &mut SystemAllocator) -> Result<()> { + fn free_bars( + &mut self, + _allocator: &mut SystemAllocator, + _mmio32_allocator: &mut AddressAllocator, + _mmio64_allocator: &mut AddressAllocator, + ) -> Result<()> { Ok(()) } /// Sets a register in the configuration space. /// * `reg_idx` - The index of the config register to modify. - /// * `offset` - Offset in to the register. + /// * `offset` - Offset into the register. 
fn write_config_register( &mut self, reg_idx: usize, @@ -72,12 +91,19 @@ pub trait PciDevice { /// Gets a register from the configuration space. /// * `reg_idx` - The index of the config register to read. fn read_config_register(&mut self, reg_idx: usize) -> u32; - - /// Reads from a BAR region mapped in to the device. + /// Detects if a BAR is being reprogrammed. + fn detect_bar_reprogramming( + &mut self, + _reg_idx: usize, + _data: &[u8], + ) -> Option { + None + } + /// Reads from a BAR region mapped into the device. /// * `addr` - The guest address inside the BAR. /// * `data` - Filled with the data from `addr`. fn read_bar(&mut self, _base: u64, _offset: u64, _data: &mut [u8]) {} - /// Writes to a BAR region mapped in to the device. + /// Writes to a BAR region mapped into the device. /// * `addr` - The guest address inside the BAR. /// * `data` - The data to write. fn write_bar(&mut self, _base: u64, _offset: u64, _data: &[u8]) -> Option> { @@ -90,14 +116,9 @@ pub trait PciDevice { /// Provides a mutable reference to the Any trait. This is useful to let /// the caller have access to the underlying type behind the trait. fn as_any(&mut self) -> &mut dyn Any; - /// Detects if a BAR is being reprogrammed. - fn detect_bar_reprogramming( - &mut self, - _reg_idx: usize, - _data: &[u8], - ) -> Option { - None - } + + /// Optionally returns a unique identifier. + fn id(&self) -> Option; } /// This trait defines a set of functions which can be triggered whenever a diff --git a/src/pci/src/lib.rs b/src/pci/src/lib.rs index e95b1dd03a5..6d7c8272345 100644 --- a/src/pci/src/lib.rs +++ b/src/pci/src/lib.rs @@ -1,8 +1,12 @@ // Copyright 2018 The Chromium OS Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE-BSD-3-Clause file. +// +// SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause //! Implements pci devices and busses. 
+#[macro_use] +extern crate log; // mod bus; pub mod configuration; @@ -11,21 +15,26 @@ pub mod msi; pub mod msix; pub mod vfio; -// pub use self::bus::{PciBus, PciConfigIo, PciConfigMmio, PciRoot, PciRootError}; -pub use self::device::{ - BarReprogrammingParams, DeviceRelocation, Error as PciDeviceError, PciDevice, -}; +use std::fmt::{self, Display}; +use std::num::ParseIntError; +use std::str::FromStr; +use serde::de::Visitor; + +// pub use self::bus::{PciBus, PciConfigIo, PciConfigMmio, PciRoot, PciRootError}; pub use self::configuration::{ PciBarConfiguration, PciBarPrefetchable, PciBarRegionType, PciCapability, PciCapabilityId, - PciClassCode, PciConfiguration, PciHeaderType, PciMassStorageSubclass, + PciClassCode, PciConfiguration, PciExpressCapabilityId, PciHeaderType, PciMassStorageSubclass, PciNetworkControllerSubclass, PciProgrammingInterface, PciSerialBusSubClass, PciSubclass, + PCI_CONFIGURATION_ID, +}; +pub use self::device::{ + BarReprogrammingParams, DeviceRelocation, Error as PciDeviceError, PciDevice, }; - pub use self::msi::{msi_num_enabled_vectors, MsiCap, MsiConfig}; -pub use self::msix::{MsixCap, MsixConfig, MsixTableEntry, MSIX_TABLE_ENTRY_SIZE}; +pub use self::msix::{MsixCap, MsixConfig, MsixTableEntry, MSIX_CONFIG_ID, MSIX_TABLE_ENTRY_SIZE}; +pub use self::vfio::{MmioRegion, VfioDmaMapping, VfioPciDevice, VfioPciError}; -pub use self::vfio::{VfioPciDevice, VfioPciError}; /// PCI has four interrupt pins A->D. 
#[derive(Copy, Clone)] pub enum PciInterruptPin { @@ -40,3 +49,138 @@ impl PciInterruptPin { self as u32 } } + +#[cfg(target_arch = "x86_64")] +pub const PCI_CONFIG_IO_PORT: u64 = 0xcf8; +#[cfg(target_arch = "x86_64")] +pub const PCI_CONFIG_IO_PORT_SIZE: u64 = 0x8; + +#[derive(Clone, Copy, PartialEq, Eq, PartialOrd)] +pub struct PciBdf(u32); + +struct PciBdfVisitor; + +impl<'de> Visitor<'de> for PciBdfVisitor { + type Value = PciBdf; + + fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result { + formatter.write_str("struct PciBdf") + } + + fn visit_str(self, v: &str) -> Result + where + E: serde::de::Error, + { + Ok(v.into()) + } +} + +impl<'de> serde::Deserialize<'de> for PciBdf { + fn deserialize(deserializer: D) -> Result + where + D: serde::Deserializer<'de>, + { + deserializer.deserialize_str(PciBdfVisitor) + } +} + +impl serde::Serialize for PciBdf { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + serializer.collect_str(&self.to_string()) + } +} + +impl PciBdf { + pub fn segment(&self) -> u16 { + ((self.0 >> 16) & 0xffff) as u16 + } + + pub fn bus(&self) -> u8 { + ((self.0 >> 8) & 0xff) as u8 + } + + pub fn device(&self) -> u8 { + ((self.0 >> 3) & 0x1f) as u8 + } + + pub fn function(&self) -> u8 { + (self.0 & 0x7) as u8 + } + + pub fn new(segment: u16, bus: u8, device: u8, function: u8) -> Self { + Self( + (segment as u32) << 16 + | (bus as u32) << 8 + | ((device & 0x1f) as u32) << 3 + | (function & 0x7) as u32, + ) + } +} + +impl From for PciBdf { + fn from(bdf: u32) -> Self { + Self(bdf) + } +} + +impl From for u32 { + fn from(bdf: PciBdf) -> Self { + bdf.0 + } +} + +impl From<&PciBdf> for u32 { + fn from(bdf: &PciBdf) -> Self { + bdf.0 + } +} + +impl From for u16 { + fn from(bdf: PciBdf) -> Self { + (bdf.0 & 0xffff) as u16 + } +} + +impl From<&PciBdf> for u16 { + fn from(bdf: &PciBdf) -> Self { + (bdf.0 & 0xffff) as u16 + } +} + +impl Display for PciBdf { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> 
std::fmt::Result { + write!( + f, + "{:04x}:{:02x}:{:02x}.{:01x}", + self.segment(), + self.bus(), + self.device(), + self.function() + ) + } +} + +impl FromStr for PciBdf { + type Err = ParseIntError; + + fn from_str(s: &str) -> Result { + let items: Vec<&str> = s.split('.').collect(); + assert_eq!(items.len(), 2); + let function = u8::from_str_radix(items[1], 16)?; + let items: Vec<&str> = items[0].split(':').collect(); + assert_eq!(items.len(), 3); + let segment = u16::from_str_radix(items[0], 16)?; + let bus = u8::from_str_radix(items[1], 16)?; + let device = u8::from_str_radix(items[2], 16)?; + Ok(PciBdf::new(segment, bus, device, function)) + } +} + +impl From<&str> for PciBdf { + fn from(bdf: &str) -> Self { + Self::from_str(bdf).unwrap() + } +} diff --git a/src/pci/src/msi.rs b/src/pci/src/msi.rs index 0485f4c2f71..c8b41e68823 100644 --- a/src/pci/src/msi.rs +++ b/src/pci/src/msi.rs @@ -3,17 +3,16 @@ // SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause // -extern crate byteorder; -extern crate vm_memory; +use std::io; +use std::sync::Arc; use byteorder::{ByteOrder, LittleEndian}; -use std::sync::Arc; +use serde::{Deserialize, Serialize}; +use thiserror::Error; use vm_device::interrupt::{ InterruptIndex, InterruptSourceConfig, InterruptSourceGroup, MsiIrqSourceConfig, }; -use log::error; - // MSI control masks const MSI_CTL_ENABLE: u16 = 0x1; const MSI_CTL_MULTI_MSG_ENABLE: u16 = 0x70; @@ -37,7 +36,17 @@ pub fn msi_num_enabled_vectors(msg_ctl: u16) -> usize { 1 << field } -#[derive(Clone, Copy, Default)] +#[derive(Error, Debug)] +pub enum Error { + #[error("Failed enabling the interrupt route: {0}")] + EnableInterruptRoute(io::Error), + #[error("Failed updating the interrupt route: {0}")] + UpdateInterruptRoute(io::Error), +} + +pub const MSI_CONFIG_ID: &str = "msi_config"; + +#[derive(Clone, Copy, Default, Serialize, Deserialize)] pub struct MsiCap { // Message Control Register // 0: MSI enable. 
@@ -162,22 +171,67 @@ impl MsiCap { } } -pub struct MsiConfig { +#[derive(Serialize, Deserialize)] +pub struct MsiConfigState { cap: MsiCap, - interrupt_source_group: Arc>, +} + +pub struct MsiConfig { + pub cap: MsiCap, + interrupt_source_group: Arc, } impl MsiConfig { - pub fn new(msg_ctl: u16, interrupt_source_group: Arc>) -> Self { - let cap = MsiCap { - msg_ctl, - ..Default::default() + pub fn new( + msg_ctl: u16, + interrupt_source_group: Arc, + state: Option, + ) -> Result { + let cap = if let Some(state) = state { + if state.cap.enabled() { + for idx in 0..state.cap.num_enabled_vectors() { + let config = MsiIrqSourceConfig { + high_addr: state.cap.msg_addr_hi, + low_addr: state.cap.msg_addr_lo, + data: state.cap.msg_data as u32, + devid: 0, + }; + + interrupt_source_group + .update( + idx as InterruptIndex, + InterruptSourceConfig::MsiIrq(config), + state.cap.vector_masked(idx), + false, + ) + .map_err(Error::UpdateInterruptRoute)?; + } + + interrupt_source_group + .set_gsi() + .map_err(Error::EnableInterruptRoute)?; + + interrupt_source_group + .enable() + .map_err(Error::EnableInterruptRoute)?; + } + + state.cap + } else { + MsiCap { + msg_ctl, + ..Default::default() + } }; - MsiConfig { + Ok(MsiConfig { cap, interrupt_source_group, - } + }) + } + + fn state(&self) -> MsiConfigState { + MsiConfigState { cap: self.cap } } pub fn enabled(&self) -> bool { @@ -206,20 +260,14 @@ impl MsiConfig { devid: 0, }; - if let Err(e) = self - .interrupt_source_group - .update(idx as InterruptIndex, InterruptSourceConfig::MsiIrq(config)) - { + if let Err(e) = self.interrupt_source_group.update( + idx as InterruptIndex, + InterruptSourceConfig::MsiIrq(config), + self.cap.vector_masked(idx), + true, + ) { error!("Failed updating vector: {:?}", e); } - - if self.cap.vector_masked(idx) { - if let Err(e) = self.interrupt_source_group.mask(idx as InterruptIndex) { - error!("Failed masking vector: {:?}", e); - } - } else if let Err(e) = self.interrupt_source_group.unmask(idx as 
InterruptIndex) { - error!("Failed unmasking vector: {:?}", e); - } } if !old_enabled { diff --git a/src/pci/src/msix.rs b/src/pci/src/msix.rs index d5d7eb6bec6..c393085667c 100644 --- a/src/pci/src/msix.rs +++ b/src/pci/src/msix.rs @@ -3,18 +3,17 @@ // SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause // -extern crate byteorder; -extern crate vm_memory; +use std::sync::Arc; +use std::{io, result}; -use crate::{PciCapability, PciCapabilityId}; use byteorder::{ByteOrder, LittleEndian}; -use std::sync::Arc; +use serde::{Deserialize, Serialize}; use vm_device::interrupt::{ InterruptIndex, InterruptSourceConfig, InterruptSourceGroup, MsiIrqSourceConfig, }; use vm_memory::ByteValued; -use log::{debug, error}; +use crate::{PciCapability, PciCapabilityId}; const MAX_MSIX_VECTORS_PER_DEVICE: u16 = 2048; const MSIX_TABLE_ENTRIES_MODULO: u64 = 16; @@ -25,8 +24,17 @@ const MSIX_ENABLE_BIT: u8 = 15; const FUNCTION_MASK_MASK: u16 = (1 << FUNCTION_MASK_BIT) as u16; const MSIX_ENABLE_MASK: u16 = (1 << MSIX_ENABLE_BIT) as u16; pub const MSIX_TABLE_ENTRY_SIZE: usize = 16; +pub const MSIX_CONFIG_ID: &str = "msix_config"; + +#[derive(Debug)] +pub enum Error { + /// Failed enabling the interrupt route. + EnableInterruptRoute(io::Error), + /// Failed updating the interrupt route. 
+ UpdateInterruptRoute(io::Error), +} -#[derive(Debug, Clone)] +#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)] pub struct MsixTableEntry { pub msg_addr_lo: u32, pub msg_addr_hi: u32, @@ -51,11 +59,19 @@ impl Default for MsixTableEntry { } } +#[derive(Serialize, Deserialize)] +pub struct MsixConfigState { + table_entries: Vec, + pba_entries: Vec, + masked: bool, + enabled: bool, +} + pub struct MsixConfig { pub table_entries: Vec, pub pba_entries: Vec, pub devid: u32, - interrupt_source_group: Arc>, + interrupt_source_group: Arc, masked: bool, enabled: bool, } @@ -63,24 +79,73 @@ pub struct MsixConfig { impl MsixConfig { pub fn new( msix_vectors: u16, - interrupt_source_group: Arc>, + interrupt_source_group: Arc, devid: u32, - ) -> Self { + state: Option, + ) -> result::Result { assert!(msix_vectors <= MAX_MSIX_VECTORS_PER_DEVICE); - let mut table_entries: Vec = Vec::new(); - table_entries.resize_with(msix_vectors as usize, Default::default); - let mut pba_entries: Vec = Vec::new(); - let num_pba_entries: usize = ((msix_vectors as usize) / BITS_PER_PBA_ENTRY) + 1; - pba_entries.resize_with(num_pba_entries, Default::default); + let (table_entries, pba_entries, masked, enabled) = if let Some(state) = state { + if state.enabled && !state.masked { + for (idx, table_entry) in state.table_entries.iter().enumerate() { + if table_entry.masked() { + continue; + } - MsixConfig { + let config = MsiIrqSourceConfig { + high_addr: table_entry.msg_addr_hi, + low_addr: table_entry.msg_addr_lo, + data: table_entry.msg_data, + devid, + }; + + interrupt_source_group + .update( + idx as InterruptIndex, + InterruptSourceConfig::MsiIrq(config), + state.masked, + true, + ) + .map_err(Error::UpdateInterruptRoute)?; + + interrupt_source_group + .enable() + .map_err(Error::EnableInterruptRoute)?; + } + } + + ( + state.table_entries, + state.pba_entries, + state.masked, + state.enabled, + ) + } else { + let mut table_entries: Vec = Vec::new(); + 
table_entries.resize_with(msix_vectors as usize, Default::default); + let mut pba_entries: Vec = Vec::new(); + let num_pba_entries: usize = ((msix_vectors as usize) / BITS_PER_PBA_ENTRY) + 1; + pba_entries.resize_with(num_pba_entries, Default::default); + + (table_entries, pba_entries, true, false) + }; + + Ok(MsixConfig { table_entries, pba_entries, devid, interrupt_source_group, - masked: true, - enabled: false, + masked, + enabled, + }) + } + + fn state(&self) -> MsixConfigState { + MsixConfigState { + table_entries: self.table_entries.clone(), + pba_entries: self.pba_entries.clone(), + masked: self.masked, + enabled: self.enabled, } } @@ -102,6 +167,7 @@ impl MsixConfig { // Update interrupt routing if old_masked != self.masked || old_enabled != self.enabled { if self.enabled && !self.masked { + debug!("MSI-X enabled for device 0x{:x}", self.devid); for (idx, table_entry) in self.table_entries.iter().enumerate() { let config = MsiIrqSourceConfig { high_addr: table_entry.msg_addr_hi, @@ -110,23 +176,17 @@ impl MsixConfig { devid: self.devid, }; - if let Err(e) = self - .interrupt_source_group - .update(idx as InterruptIndex, InterruptSourceConfig::MsiIrq(config)) - { + if let Err(e) = self.interrupt_source_group.update( + idx as InterruptIndex, + InterruptSourceConfig::MsiIrq(config), + table_entry.masked(), + true, + ) { error!("Failed updating vector: {:?}", e); } - - if table_entry.masked() { - if let Err(e) = self.interrupt_source_group.mask(idx as InterruptIndex) { - error!("Failed masking vector: {:?}", e); - } - } else if let Err(e) = self.interrupt_source_group.unmask(idx as InterruptIndex) - { - error!("Failed unmasking vector: {:?}", e); - } } } else if old_enabled || !old_masked { + debug!("MSI-X disabled for device 0x{:x}", self.devid); if let Err(e) = self.interrupt_source_group.disable() { error!("Failed disabling irq_fd: {:?}", e); } @@ -152,6 +212,12 @@ impl MsixConfig { let index: usize = (offset / MSIX_TABLE_ENTRIES_MODULO) as usize; let 
modulo_offset = offset % MSIX_TABLE_ENTRIES_MODULO; + if index >= self.table_entries.len() { + debug!("Invalid MSI-X table entry index {index}"); + data.copy_from_slice(&[0xff; 8][..data.len()]); + return; + } + match data.len() { 4 => { let value = match modulo_offset { @@ -199,8 +265,13 @@ impl MsixConfig { let index: usize = (offset / MSIX_TABLE_ENTRIES_MODULO) as usize; let modulo_offset = offset % MSIX_TABLE_ENTRIES_MODULO; + if index >= self.table_entries.len() { + debug!("Invalid MSI-X table entry index {index}"); + return; + } + // Store the value of the entry before modification - let mut old_entry: Option = None; + let old_entry = self.table_entries[index].clone(); match data.len() { 4 => { @@ -210,7 +281,6 @@ impl MsixConfig { 0x4 => self.table_entries[index].msg_addr_hi = value, 0x8 => self.table_entries[index].msg_data = value, 0xc => { - old_entry = Some(self.table_entries[index].clone()); self.table_entries[index].vector_ctl = value; } _ => error!("invalid offset"), @@ -226,7 +296,6 @@ impl MsixConfig { self.table_entries[index].msg_addr_hi = (value >> 32) as u32; } 0x8 => { - old_entry = Some(self.table_entries[index].clone()); self.table_entries[index].msg_data = (value & 0xffff_ffffu64) as u32; self.table_entries[index].vector_ctl = (value >> 32) as u32; } @@ -238,10 +307,18 @@ impl MsixConfig { _ => error!("invalid data length"), }; - // Update interrupt routes - if self.enabled && !self.masked { - let table_entry = &self.table_entries[index]; + let table_entry = &self.table_entries[index]; + + // Optimisation to avoid excessive updates + if &old_entry == table_entry { + return; + } + // Update interrupt routes + // Optimisation: only update routes if the entry is not masked; + // this is safe because if the entry is masked (starts masked as per spec) + // in the table then it won't be triggered. 
(See: #4273) + if self.enabled && !self.masked && !table_entry.masked() { let config = MsiIrqSourceConfig { high_addr: table_entry.msg_addr_hi, low_addr: table_entry.msg_addr_lo, @@ -252,17 +329,11 @@ impl MsixConfig { if let Err(e) = self.interrupt_source_group.update( index as InterruptIndex, InterruptSourceConfig::MsiIrq(config), + table_entry.masked(), + true, ) { error!("Failed updating vector: {:?}", e); } - - if table_entry.masked() { - if let Err(e) = self.interrupt_source_group.mask(index as InterruptIndex) { - error!("Failed masking vector: {:?}", e); - } - } else if let Err(e) = self.interrupt_source_group.unmask(index as InterruptIndex) { - error!("Failed unmasking vector: {:?}", e); - } } // After the MSI-X table entry has been updated, it is necessary to @@ -272,15 +343,15 @@ impl MsixConfig { // has been injected, the pending bit in the PBA needs to be cleared. // All of this is valid only if MSI-X has not been masked for the whole // device. - if let Some(old_entry) = old_entry { - // Check if bit has been flipped - if !self.masked() - && old_entry.masked() - && !self.table_entries[index].masked() - && self.get_pba_bit(index as u16) == 1 - { - self.inject_msix_and_clear_pba(index); - } + + // Check if bit has been flipped + if !self.masked() + && self.enabled() + && old_entry.masked() + && !table_entry.masked() + && self.get_pba_bit(index as u16) == 1 + { + self.inject_msix_and_clear_pba(index); } } @@ -290,6 +361,12 @@ impl MsixConfig { let index: usize = (offset / MSIX_PBA_ENTRIES_MODULO) as usize; let modulo_offset = offset % MSIX_PBA_ENTRIES_MODULO; + if index >= self.pba_entries.len() { + debug!("Invalid MSI-X PBA entry index {index}"); + data.copy_from_slice(&[0xff; 8][..data.len()]); + return; + } + match data.len() { 4 => { let value: u32 = match modulo_offset { @@ -367,7 +444,7 @@ impl MsixConfig { #[allow(dead_code)] #[repr(packed)] -#[derive(Clone, Copy, Default)] +#[derive(Clone, Copy, Default, Serialize, Deserialize)] pub struct MsixCap 
{ // Message Control Register // 10-0: MSI-X Table size @@ -385,7 +462,7 @@ pub struct MsixCap { pub pba: u32, } -// It is safe to implement ByteValued. All members are simple numbers and any value is valid. +// SAFETY: All members are simple numbers and any value is valid. unsafe impl ByteValued for MsixCap {} impl PciCapability for MsixCap { @@ -439,6 +516,16 @@ impl MsixCap { self.pba & 0xffff_fff8 } + pub fn table_set_offset(&mut self, addr: u32) { + self.table &= 0x7; + self.table += addr; + } + + pub fn pba_set_offset(&mut self, addr: u32) { + self.pba &= 0x7; + self.pba += addr; + } + pub fn table_bir(&self) -> u32 { self.table & 0x7 } @@ -450,4 +537,16 @@ impl MsixCap { pub fn table_size(&self) -> u16 { (self.msg_ctl & 0x7ff) + 1 } + + pub fn table_range(&self) -> (u64, u64) { + // The table takes 16 bytes per entry. + let size = self.table_size() as u64 * 16; + (self.table_offset() as u64, size) + } + + pub fn pba_range(&self) -> (u64, u64) { + // The table takes 1 bit per entry modulo 8 bytes. 
+ let size = ((self.table_size() as u64 / 64) + 1) * 8; + (self.pba_offset() as u64, size) + } } diff --git a/src/pci/src/vfio.rs b/src/pci/src/vfio.rs index 7f30cd54482..de7a962b7d7 100644 --- a/src/pci/src/vfio.rs +++ b/src/pci/src/vfio.rs @@ -2,77 +2,81 @@ // // SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause // -use crate::{ - msi_num_enabled_vectors, BarReprogrammingParams, MsiConfig, MsixCap, MsixConfig, - PciBarConfiguration, PciBarRegionType, PciCapabilityId, PciClassCode, PciConfiguration, - PciDevice, PciDeviceError, PciHeaderType, PciSubclass, MSIX_TABLE_ENTRY_SIZE, -}; -use byteorder::{ByteOrder, LittleEndian}; -use kvm_ioctls::VmFd; +use core::fmt; use std::any::Any; +use std::collections::{BTreeMap, HashMap}; use std::fmt::{Debug, Formatter}; +use std::io; use std::os::unix::io::AsRawFd; use std::ptr::null_mut; use std::sync::{Arc, Barrier, Mutex}; -use std::{fmt, io, result}; -use vfio_bindings::bindings::vfio::*; -use vfio_ioctls::{VfioContainer, VfioDevice, VfioError}; +use anyhow::{anyhow, Error}; +use byteorder::{ByteOrder, LittleEndian}; +use kvm_ioctls::VmFd; +use libc::{sysconf, _SC_PAGESIZE}; +use serde::{Deserialize, Serialize}; +use thiserror::Error; +use vfio_bindings::bindings::vfio::*; +use vfio_ioctls::{ + VfioContainer, VfioDevice, VfioIrq, VfioRegionInfoCap, VfioRegionSparseMmapArea, +}; +use vm_system_allocator::page_size::{ + align_page_size_down, align_page_size_up, is_4k_aligned, is_4k_multiple, is_page_size_aligned, +}; +use vm_system_allocator::{AddressAllocator, SystemAllocator}; +use vm_device::dma_mapping::ExternalDmaMapping; use vm_device::interrupt::{ - InterruptIndex, InterruptManager, InterruptSourceGroup, - MsiIrqGroupConfig + InterruptIndex, InterruptManager, InterruptSourceGroup, MsiIrqGroupConfig, }; - -use vm_memory::{Address, GuestAddress, GuestUsize}; +use vm_device::Resource; +use vm_memory::{Address, GuestAddress, GuestAddressSpace, GuestMemory, GuestUsize}; use vmm_sys_util::eventfd::EventFd; -use 
vm_system_allocator::SystemAllocator; + +use crate::msi::{MsiConfigState, MSI_CONFIG_ID}; +use crate::msix::MsixConfigState; +use crate::{ + msi_num_enabled_vectors, BarReprogrammingParams, MsiCap, MsiConfig, MsixCap, MsixConfig, + PciBarConfiguration, PciBarPrefetchable, PciBarRegionType, PciBdf, PciCapabilityId, + PciClassCode, PciConfiguration, PciDevice, PciDeviceError, PciExpressCapabilityId, + PciHeaderType, PciSubclass, MSIX_CONFIG_ID, MSIX_TABLE_ENTRY_SIZE, PCI_CONFIGURATION_ID, +}; pub use kvm_bindings::kvm_userspace_memory_region as MemoryRegion; -use log::error; +pub(crate) const VFIO_COMMON_ID: &str = "vfio_common"; -#[derive(Debug)] +#[derive(Debug, Error)] pub enum VfioPciError { - AllocateGsi, - DmaMap(VfioError), - DmaUnmap(VfioError), - EnableIntx(VfioError), - EnableMsi(VfioError), - EnableMsix(VfioError), - EventFd(io::Error), - InterruptSourceGroupCreate(io::Error), - // IrqFd(hypervisor::HypervisorVmError), - MapRegionGuest(anyhow::Error), + #[error("Failed to create user memory region: {0}")] + MapRegionGuest(#[source] Error), + #[error("Failed to DMA map: {0}")] + DmaMap(#[source] vfio_ioctls::VfioError), + #[error("Failed to DMA unmap: {0}")] + DmaUnmap(#[source] vfio_ioctls::VfioError), + #[error("Failed to enable INTx: {0}")] + EnableIntx(#[source] VfioError), + #[error("Failed to enable MSI: {0}")] + EnableMsi(#[source] VfioError), + #[error("Failed to enable MSI-x: {0}")] + EnableMsix(#[source] VfioError), + #[error("Failed to mmap the area")] + MmapArea, + #[error("Failed to notifier's eventfd")] MissingNotifier, - MsiNotConfigured, - MsixNotConfigured, - NewVfioPciDevice, -} -pub type Result = std::result::Result; - -impl fmt::Display for VfioPciError { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - match self { - VfioPciError::AllocateGsi => write!(f, "failed to allocate GSI"), - VfioPciError::DmaMap(e) => write!(f, "failed to DMA map: {}", e), - VfioPciError::DmaUnmap(e) => write!(f, "failed to DMA unmap: {}", e), - 
VfioPciError::EnableIntx(e) => write!(f, "failed to enable INTx: {}", e), - VfioPciError::EnableMsi(e) => write!(f, "failed to enable MSI: {}", e), - VfioPciError::EnableMsix(e) => write!(f, "failed to enable MSI-X: {}", e), - VfioPciError::EventFd(e) => write!(f, "failed to create eventfd: {}", e), - VfioPciError::InterruptSourceGroupCreate(e) => { - write!(f, "failed to create interrupt source group: {}", e) - } - VfioPciError::MapRegionGuest(e) => { - write!(f, "failed to map VFIO PCI region into guest: {}", e) - } - VfioPciError::MissingNotifier => write!(f, "failed to notifier's eventfd"), - VfioPciError::MsiNotConfigured => write!(f, "MSI interrupt not yet configured"), - VfioPciError::MsixNotConfigured => write!(f, "MSI-X interrupt not yet configured"), - VfioPciError::NewVfioPciDevice => write!(f, "failed to create VFIO PCI device"), - } - } + #[error("Invalid region alignment")] + RegionAlignment, + #[error("Invalid region size")] + RegionSize, + #[error("Failed to retrieve MsiConfigState: {0}")] + RetrieveMsiConfigState(#[source] anyhow::Error), + #[error("Failed to retrieve MsixConfigState: {0}")] + RetrieveMsixConfigState(#[source] anyhow::Error), + #[error("Failed to retrieve PciConfigurationState: {0}")] + RetrievePciConfigurationState(#[source] anyhow::Error), + #[error("Failed to retrieve VfioCommonState: {0}")] + RetrieveVfioCommonState(#[source] anyhow::Error), } #[derive(Copy, Clone)] @@ -93,15 +97,26 @@ enum InterruptUpdateAction { DisableMsix, } -struct VfioIntx { - interrupt_source_group: Arc>, +#[derive(Serialize, Deserialize)] +struct IntxState { + enabled: bool, +} + +pub(crate) struct VfioIntx { + interrupt_source_group: Arc, enabled: bool, } -struct VfioMsi { - cfg: MsiConfig, +#[derive(Serialize, Deserialize)] +struct MsiState { + cap: MsiCap, + cap_offset: u32, +} + +pub(crate) struct VfioMsi { + pub(crate) cfg: MsiConfig, cap_offset: u32, - interrupt_source_group: Arc>, + interrupt_source_group: Arc, } impl VfioMsi { @@ -124,11 +139,18 
@@ impl VfioMsi { } } -struct VfioMsix { - bar: MsixConfig, +#[derive(Serialize, Deserialize)] +struct MsixState { + cap: MsixCap, + cap_offset: u32, + bdf: u32, +} + +pub(crate) struct VfioMsix { + pub(crate) bar: MsixConfig, cap: MsixCap, cap_offset: u32, - interrupt_source_group: Arc>, + interrupt_source_group: Arc, } impl VfioMsix { @@ -162,10 +184,10 @@ impl VfioMsix { } } -struct Interrupt { - intx: Option, - msi: Option, - msix: Option, +pub(crate) struct Interrupt { + pub(crate) intx: Option, + pub(crate) msi: Option, + pub(crate) msix: Option, } impl Interrupt { @@ -230,7 +252,7 @@ impl Interrupt { } } - fn intx_in_use(&self) -> bool { + pub(crate) fn intx_in_use(&self) -> bool { if let Some(intx) = &self.intx { return intx.enabled; } @@ -239,344 +261,843 @@ impl Interrupt { } } - #[derive(Copy, Clone)] +pub struct UserMemoryRegion { + pub slot: u32, + pub start: u64, + pub size: u64, + pub host_addr: u64, +} + +#[derive(Clone)] pub struct MmioRegion { pub start: GuestAddress, pub length: GuestUsize, - type_: PciBarRegionType, - index: u32, - mem_slot: Option, - pub host_addr: Option, - mmap_size: Option, + pub(crate) type_: PciBarRegionType, + pub(crate) index: u32, + pub(crate) user_memory_regions: Vec, } -struct VfioPciConfig { - device: Arc, +trait MmioRegionRange { + fn check_range(&self, guest_addr: u64, size: u64) -> bool; + fn find_user_address(&self, guest_addr: u64) -> Result; } -impl VfioPciConfig { - fn new(device: Arc) -> Self { - VfioPciConfig { device } +impl MmioRegionRange for Vec { + // Check if a guest address is within the range of mmio regions + fn check_range(&self, guest_addr: u64, size: u64) -> bool { + for region in self.iter() { + let Some(guest_addr_end) = guest_addr.checked_add(size) else { + return false; + }; + let Some(region_end) = region.start.raw_value().checked_add(region.length) else { + return false; + }; + if guest_addr >= region.start.raw_value() && guest_addr_end <= region_end { + return true; + } + } + false + } + + 
// Locate the user region address for a guest address within all mmio regions + fn find_user_address(&self, guest_addr: u64) -> Result { + for region in self.iter() { + for user_region in region.user_memory_regions.iter() { + if guest_addr >= user_region.start + && guest_addr < user_region.start + user_region.size + { + return Ok(user_region.host_addr + (guest_addr - user_region.start)); + } + } + } + + Err(io::Error::new( + io::ErrorKind::Other, + format!("unable to find user address: 0x{guest_addr:x}"), + )) } +} + +#[derive(Debug, Error)] +pub enum VfioError { + #[error("Kernel VFIO error: {0}")] + KernelVfio(#[source] vfio_ioctls::VfioError), +} +pub(crate) trait Vfio: Send + Sync { fn read_config_byte(&self, offset: u32) -> u8 { let mut data: [u8; 1] = [0]; - self.device - .region_read(VFIO_PCI_CONFIG_REGION_INDEX, data.as_mut(), offset.into()); - + self.read_config(offset, &mut data); data[0] } fn read_config_word(&self, offset: u32) -> u16 { let mut data: [u8; 2] = [0, 0]; - self.device - .region_read(VFIO_PCI_CONFIG_REGION_INDEX, data.as_mut(), offset.into()); - + self.read_config(offset, &mut data); u16::from_le_bytes(data) } fn read_config_dword(&self, offset: u32) -> u32 { let mut data: [u8; 4] = [0, 0, 0, 0]; - self.device - .region_read(VFIO_PCI_CONFIG_REGION_INDEX, data.as_mut(), offset.into()); - + self.read_config(offset, &mut data); u32::from_le_bytes(data) } - fn write_config_dword(&self, buf: u32, offset: u32) { + fn write_config_dword(&self, offset: u32, buf: u32) { let data: [u8; 4] = buf.to_le_bytes(); - self.device - .region_write(VFIO_PCI_CONFIG_REGION_INDEX, &data, offset.into()) + self.write_config(offset, &data) + } + + fn read_config(&self, offset: u32, data: &mut [u8]) { + self.region_read(VFIO_PCI_CONFIG_REGION_INDEX, offset.into(), data.as_mut()); + } + + fn write_config(&self, offset: u32, data: &[u8]) { + self.region_write(VFIO_PCI_CONFIG_REGION_INDEX, offset.into(), data) + } + + fn enable_msi(&self, fds: Vec<&EventFd>) -> 
Result<(), VfioError> { + self.enable_irq(VFIO_PCI_MSI_IRQ_INDEX, fds) + } + + fn disable_msi(&self) -> Result<(), VfioError> { + self.disable_irq(VFIO_PCI_MSI_IRQ_INDEX) + } + + fn enable_msix(&self, fds: Vec<&EventFd>) -> Result<(), VfioError> { + self.enable_irq(VFIO_PCI_MSIX_IRQ_INDEX, fds) + } + + fn disable_msix(&self) -> Result<(), VfioError> { + self.disable_irq(VFIO_PCI_MSIX_IRQ_INDEX) + } + + fn region_read(&self, _index: u32, _offset: u64, _data: &mut [u8]) { + unimplemented!() + } + + fn region_write(&self, _index: u32, _offset: u64, _data: &[u8]) { + unimplemented!() + } + + fn get_irq_info(&self, _irq_index: u32) -> Option { + unimplemented!() + } + + fn enable_irq(&self, _irq_index: u32, _event_fds: Vec<&EventFd>) -> Result<(), VfioError> { + unimplemented!() + } + + fn disable_irq(&self, _irq_index: u32) -> Result<(), VfioError> { + unimplemented!() + } + + fn unmask_irq(&self, _irq_index: u32) -> Result<(), VfioError> { + unimplemented!() } } -/// VfioPciDevice represents a VFIO PCI device. -/// This structure implements the BusDevice and PciDevice traits. -/// -/// A VfioPciDevice is bound to a VfioDevice and is also a PCI device. -/// The VMM creates a VfioDevice, then assigns it to a VfioPciDevice, -/// which then gets added to the PCI bus. 
-pub struct VfioPciDevice { - vm: Arc>, +struct VfioDeviceWrapper { device: Arc, - container: Arc, - vfio_pci_configuration: VfioPciConfig, - configuration: PciConfiguration, - mmio_regions: Vec, - interrupt: Interrupt, - iommu_attached: bool, } -impl Debug for VfioPciDevice { - fn fmt(&self, f: &mut Formatter) -> fmt::Result { - f.debug_struct("VfioPciDevice") - .finish() +impl VfioDeviceWrapper { + fn new(device: Arc) -> Self { + Self { device } } } -impl VfioPciDevice { - /// Constructs a new Vfio Pci device for the given Vfio device - pub fn new( - vm: Arc>, - device: VfioDevice, - container: Arc, - msi_interrupt_manager: &Arc>, - legacy_interrupt_group: Option>>, - iommu_attached: bool, - ) -> Result { - let device = Arc::new(device); - device.reset(); +impl Vfio for VfioDeviceWrapper { + fn region_read(&self, index: u32, offset: u64, data: &mut [u8]) { + self.device.region_read(index, data, offset) + } + + fn region_write(&self, index: u32, offset: u64, data: &[u8]) { + self.device.region_write(index, data, offset) + } + + fn get_irq_info(&self, irq_index: u32) -> Option { + self.device.get_irq_info(irq_index).copied() + } + + fn enable_irq(&self, irq_index: u32, event_fds: Vec<&EventFd>) -> Result<(), VfioError> { + self.device + .enable_irq(irq_index, event_fds) + .map_err(VfioError::KernelVfio) + } + + fn disable_irq(&self, irq_index: u32) -> Result<(), VfioError> { + self.device + .disable_irq(irq_index) + .map_err(VfioError::KernelVfio) + } + + fn unmask_irq(&self, irq_index: u32) -> Result<(), VfioError> { + self.device + .unmask_irq(irq_index) + .map_err(VfioError::KernelVfio) + } +} + +#[derive(Serialize, Deserialize)] +struct VfioCommonState { + intx_state: Option, + msi_state: Option, + msix_state: Option, +} + +pub(crate) struct ConfigPatch { + mask: u32, + patch: u32, +} + +pub(crate) struct VfioCommon { + pub(crate) configuration: PciConfiguration, + pub(crate) mmio_regions: Vec, + pub(crate) interrupt: Interrupt, + pub(crate) 
msi_interrupt_manager: Arc>, + pub(crate) legacy_interrupt_group: Option>, + pub(crate) vfio_wrapper: Arc, + pub(crate) patches: HashMap, + x_nv_gpudirect_clique: Option, +} + +impl VfioCommon { + pub(crate) fn new( + msi_interrupt_manager: Arc>, + legacy_interrupt_group: Option>, + vfio_wrapper: Arc, + subclass: &dyn PciSubclass, + bdf: PciBdf, + x_nv_gpudirect_clique: Option, + ) -> Result { + let pci_configuration_state = None; let configuration = PciConfiguration::new( 0, 0, 0, PciClassCode::Other, - &PciVfioSubclass::VfioSubclass, + subclass, None, PciHeaderType::Device, 0, 0, None, + pci_configuration_state, ); - let vfio_pci_configuration = VfioPciConfig::new(Arc::clone(&device)); - - let mut vfio_pci_device = VfioPciDevice { - vm: Arc::clone(&vm), - device, - container, - configuration, - vfio_pci_configuration, + let mut vfio_common = VfioCommon { mmio_regions: Vec::new(), + configuration, interrupt: Interrupt { intx: None, msi: None, msix: None, }, - iommu_attached, + msi_interrupt_manager, + legacy_interrupt_group, + vfio_wrapper, + patches: HashMap::new(), + x_nv_gpudirect_clique, }; - vfio_pci_device.parse_capabilities(msi_interrupt_manager); - - vfio_pci_device.initialize_legacy_interrupt(legacy_interrupt_group)?; - - Ok(vfio_pci_device) - } - - fn enable_intx(&mut self) -> Result<()> { - if let Some(intx) = &mut self.interrupt.intx { - if !intx.enabled { - if let Some(eventfd) = intx.interrupt_source_group.notifier(0) { - self.device - .enable_irq(VFIO_PCI_INTX_IRQ_INDEX, vec![&eventfd]) - .map_err(VfioPciError::EnableIntx)?; + let state: Option = None; + let msi_state = None; + let msix_state = None; - intx.enabled = true; - } else { - return Err(VfioPciError::MissingNotifier); - } - } + if let Some(state) = state.as_ref() { + vfio_common.set_state(state, msi_state, msix_state)?; + } else { + vfio_common.parse_capabilities(bdf); + vfio_common.initialize_legacy_interrupt()?; } - Ok(()) + Ok(vfio_common) } - fn disable_intx(&mut self) { - if let 
Some(intx) = &mut self.interrupt.intx { - if intx.enabled { - if let Err(e) = self.device.disable_irq(VFIO_PCI_INTX_IRQ_INDEX) { - error!("Could not disable INTx: {}", e); - } else { - intx.enabled = false; - } - } - } - } + /// In case msix table offset is not page size aligned, we need do some fixup to achieve it. + /// Because we don't want the MMIO RW region and trap region overlap each other. + fn fixup_msix_region(&mut self, bar_id: u32, region_size: u64) -> u64 { + if let Some(msix) = self.interrupt.msix.as_mut() { + let msix_cap = &mut msix.cap; - fn enable_msi(&self) -> Result<()> { - if let Some(msi) = &self.interrupt.msi { - let mut irq_fds: Vec = Vec::new(); - for i in 0..msi.cfg.num_enabled_vectors() { - if let Some(eventfd) = msi.interrupt_source_group.notifier(i as InterruptIndex) { - irq_fds.push(eventfd); - } else { - return Err(VfioPciError::MissingNotifier); - } + // Suppose table_bir equals to pba_bir here. Am I right? + let (table_offset, table_size) = msix_cap.table_range(); + if is_page_size_aligned(table_offset) || msix_cap.table_bir() != bar_id { + return region_size; } - self.device - .enable_msi(irq_fds.iter().collect()) - .map_err(VfioPciError::EnableMsi)?; - } - - Ok(()) - } + let (pba_offset, pba_size) = msix_cap.pba_range(); + let msix_sz = align_page_size_up(table_size + pba_size); + // Expand region to hold RW and trap region which both page size aligned + let size = std::cmp::max(region_size * 2, msix_sz * 2); + // let table starts from the middle of the region + msix_cap.table_set_offset((size / 2) as u32); + msix_cap.pba_set_offset((size / 2 + pba_offset - table_offset) as u32); - fn disable_msi(&self) { - if let Err(e) = self.device.disable_msi() { - error!("Could not disable MSI: {}", e); + size + } else { + // MSI-X not supported for this device + region_size } } - fn enable_msix(&self) -> Result<()> { - if let Some(msix) = &self.interrupt.msix { - let mut irq_fds: Vec = Vec::new(); - for i in 0..msix.bar.table_entries.len() { 
- if let Some(eventfd) = msix.interrupt_source_group.notifier(i as InterruptIndex) { - irq_fds.push(eventfd); - } else { - return Err(VfioPciError::MissingNotifier); + // The `allocator` argument is unused on `aarch64` + #[allow(unused_variables)] + pub(crate) fn allocate_bars( + &mut self, + allocator: &Arc>, + mmio32_allocator: &mut AddressAllocator, + mmio64_allocator: &mut AddressAllocator, + resources: Option>, + ) -> Result, PciDeviceError> { + let mut bars = Vec::new(); + let mut bar_id = VFIO_PCI_BAR0_REGION_INDEX; + + // Going through all regular regions to compute the BAR size. + // We're not saving the BAR address to restore it, because we + // are going to allocate a guest address for each BAR and write + // that new address back. + while bar_id < VFIO_PCI_CONFIG_REGION_INDEX { + let mut region_size: u64 = 0; + let mut region_type = PciBarRegionType::Memory32BitRegion; + let mut prefetchable = PciBarPrefetchable::NotPrefetchable; + let mut flags: u32 = 0; + + let mut restored_bar_addr = None; + if let Some(resources) = &resources { + for resource in resources { + if let Resource::PciBar { + index, + base, + size, + type_, + .. + } = resource + { + if *index == bar_id as usize { + restored_bar_addr = Some(GuestAddress(*base)); + region_size = *size; + region_type = PciBarRegionType::from(*type_); + break; + } + } } - } + if restored_bar_addr.is_none() { + bar_id += 1; + continue; + } + } else { + let bar_offset = if bar_id == VFIO_PCI_ROM_REGION_INDEX { + (PCI_ROM_EXP_BAR_INDEX * 4) as u32 + } else { + PCI_CONFIG_BAR_OFFSET + bar_id * 4 + }; - self.device - .enable_msix(irq_fds.iter().collect()) - .map_err(VfioPciError::EnableMsix)?; - } + // First read flags + flags = self.vfio_wrapper.read_config_dword(bar_offset); - Ok(()) - } + // Is this an IO BAR? 
+ let io_bar = if bar_id != VFIO_PCI_ROM_REGION_INDEX { + matches!(flags & PCI_CONFIG_IO_BAR, PCI_CONFIG_IO_BAR) + } else { + false + }; - fn disable_msix(&self) { - if let Err(e) = self.device.disable_msix() { - error!("Could not disable MSI-X: {}", e); - } - } + // Is this a 64-bit BAR? + let is_64bit_bar = if bar_id != VFIO_PCI_ROM_REGION_INDEX { + matches!( + flags & PCI_CONFIG_MEMORY_BAR_64BIT, + PCI_CONFIG_MEMORY_BAR_64BIT + ) + } else { + false + }; - fn initialize_legacy_interrupt( - &mut self, - legacy_interrupt_group: Option>>, - ) -> Result<()> { - if let Some(irq_info) = self.device.get_irq_info(VFIO_PCI_INTX_IRQ_INDEX) { - if irq_info.count == 0 { - error!("Device does not want legacy IRQ"); - // A count of 0 means the INTx IRQ is not supported, therefore - // it shouldn't be initialized. - return Ok(()); - } - } - if let Some(interrupt_source_group) = legacy_interrupt_group { - self.interrupt.intx = Some( - VfioIntx { - interrupt_source_group, - enabled: false, - }); - } + if matches!( + flags & PCI_CONFIG_BAR_PREFETCHABLE, + PCI_CONFIG_BAR_PREFETCHABLE + ) { + prefetchable = PciBarPrefetchable::Prefetchable + }; - self.enable_intx()?; + // To get size write all 1s + self.vfio_wrapper + .write_config_dword(bar_offset, 0xffff_ffff); - Ok(()) - } + // And read back BAR value. 
The device will write zeros for bits it doesn't care about + let mut lower = self.vfio_wrapper.read_config_dword(bar_offset); - fn parse_msix_capabilities( - &mut self, - cap: u8, - interrupt_manager: &Arc>, - ) { - let msg_ctl = self - .vfio_pci_configuration - .read_config_word((cap + 2).into()); + if io_bar { + // Mask flag bits (lowest 2 for I/O bars) + lower &= !0b11; - let table = self - .vfio_pci_configuration - .read_config_dword((cap + 4).into()); + // BAR is not enabled + if lower == 0 { + bar_id += 1; + continue; + } - let pba = self - .vfio_pci_configuration - .read_config_dword((cap + 8).into()); + // IO BAR + region_type = PciBarRegionType::IoRegion; - let msix_cap = MsixCap { - msg_ctl, - table, - pba, - }; + // Invert bits and add 1 to calculate size + region_size = (!lower + 1) as u64; + } else if is_64bit_bar { + // 64 bits Memory BAR + region_type = PciBarRegionType::Memory64BitRegion; - let interrupt_source_group = interrupt_manager - .create_group(MsiIrqGroupConfig { - base: 0, - count: msix_cap.table_size() as InterruptIndex, - }) - .unwrap(); + // Query size of upper BAR of 64-bit BAR + let upper_offset: u32 = PCI_CONFIG_BAR_OFFSET + (bar_id + 1) * 4; + self.vfio_wrapper + .write_config_dword(upper_offset, 0xffff_ffff); + let upper = self.vfio_wrapper.read_config_dword(upper_offset); - let msix_config = MsixConfig::new(msix_cap.table_size(), interrupt_source_group.clone(), 0); + let mut combined_size = u64::from(upper) << 32 | u64::from(lower); - self.interrupt.msix = Some(VfioMsix { - bar: msix_config, - cap: msix_cap, - cap_offset: cap.into(), - interrupt_source_group, - }); - } + // Mask out flag bits (lowest 4 for memory bars) + combined_size &= !0b1111; - fn parse_msi_capabilities( - &mut self, - cap: u8, - interrupt_manager: &Arc>, - ) { - let msg_ctl = self - .vfio_pci_configuration - .read_config_word((cap + 2).into()); + // BAR is not enabled + if combined_size == 0 { + bar_id += 1; + continue; + } - let interrupt_source_group = 
interrupt_manager - .create_group(MsiIrqGroupConfig { - base: 0, - count: msi_num_enabled_vectors(msg_ctl) as InterruptIndex, - }) - .unwrap(); + // Invert and add 1 to to find size + region_size = !combined_size + 1; + } else { + region_type = PciBarRegionType::Memory32BitRegion; - let msi_config = MsiConfig::new(msg_ctl, interrupt_source_group.clone()); + // Mask out flag bits (lowest 4 for memory bars) + lower &= !0b1111; - self.interrupt.msi = Some(VfioMsi { - cfg: msi_config, - cap_offset: cap.into(), + if lower == 0 { + bar_id += 1; + continue; + } + + // Invert and add 1 to to find size + region_size = (!lower + 1) as u64; + } + } + + let bar_addr = match region_type { + PciBarRegionType::IoRegion => { + #[cfg(not(target_arch = "x86_64"))] + unimplemented!(); + + // The address needs to be 4 bytes aligned. + #[cfg(target_arch = "x86_64")] + allocator + .lock() + .unwrap() + .allocate_io_addresses(restored_bar_addr, region_size, Some(0x4)) + .ok_or(PciDeviceError::IoAllocationFailed(region_size))? + } + PciBarRegionType::Memory32BitRegion => { + // BAR allocation must be naturally aligned + mmio32_allocator + .allocate(restored_bar_addr, region_size, Some(region_size)) + .ok_or(PciDeviceError::IoAllocationFailed(region_size))? + } + PciBarRegionType::Memory64BitRegion => { + // We need do some fixup to keep MMIO RW region and msix cap region page size + // aligned. + region_size = self.fixup_msix_region(bar_id, region_size); + mmio64_allocator + .allocate( + restored_bar_addr, + region_size, + Some(std::cmp::max( + // SAFETY: FFI call. Trivially safe. + unsafe { sysconf(_SC_PAGESIZE) as GuestUsize }, + region_size, + )), + ) + .ok_or(PciDeviceError::IoAllocationFailed(region_size))? + } + }; + + // We can now build our BAR configuration block. 
+ let bar = PciBarConfiguration::default() + .set_index(bar_id as usize) + .set_address(bar_addr.raw_value()) + .set_size(region_size) + .set_region_type(region_type) + .set_prefetchable(prefetchable); + + if bar_id == VFIO_PCI_ROM_REGION_INDEX { + self.configuration + .add_pci_rom_bar(&bar, flags & 0x1) + .map_err(|e| PciDeviceError::IoRegistrationFailed(bar_addr.raw_value(), e))?; + } else { + self.configuration + .add_pci_bar(&bar) + .map_err(|e| PciDeviceError::IoRegistrationFailed(bar_addr.raw_value(), e))?; + } + + bars.push(bar); + self.mmio_regions.push(MmioRegion { + start: bar_addr, + length: region_size, + type_: region_type, + index: bar_id, + user_memory_regions: Vec::new(), + }); + + bar_id += 1; + if region_type == PciBarRegionType::Memory64BitRegion { + bar_id += 1; + } + } + + Ok(bars) + } + + // The `allocator` argument is unused on `aarch64` + #[allow(unused_variables)] + pub(crate) fn free_bars( + &mut self, + allocator: &mut SystemAllocator, + mmio32_allocator: &mut AddressAllocator, + mmio64_allocator: &mut AddressAllocator, + ) -> Result<(), PciDeviceError> { + for region in self.mmio_regions.iter() { + match region.type_ { + PciBarRegionType::IoRegion => { + #[cfg(target_arch = "x86_64")] + allocator.free_io_addresses(region.start, region.length); + #[cfg(not(target_arch = "x86_64"))] + error!("I/O region is not supported"); + } + PciBarRegionType::Memory32BitRegion => { + mmio32_allocator.free(region.start, region.length); + } + PciBarRegionType::Memory64BitRegion => { + mmio64_allocator.free(region.start, region.length); + } + } + } + Ok(()) + } + + pub(crate) fn parse_msix_capabilities(&mut self, cap: u8) -> MsixCap { + let msg_ctl = self.vfio_wrapper.read_config_word((cap + 2).into()); + + let table = self.vfio_wrapper.read_config_dword((cap + 4).into()); + + let pba = self.vfio_wrapper.read_config_dword((cap + 8).into()); + + MsixCap { + msg_ctl, + table, + pba, + } + } + + pub(crate) fn initialize_msix( + &mut self, + msix_cap: 
MsixCap, + cap_offset: u32, + bdf: PciBdf, + state: Option, + ) { + let interrupt_source_group = self + .msi_interrupt_manager + .create_group(MsiIrqGroupConfig { + base: 0, + count: msix_cap.table_size() as InterruptIndex, + }) + .unwrap(); + + let msix_config = MsixConfig::new( + msix_cap.table_size(), + interrupt_source_group.clone(), + bdf.into(), + state, + ) + .unwrap(); + + self.interrupt.msix = Some(VfioMsix { + bar: msix_config, + cap: msix_cap, + cap_offset, interrupt_source_group, }); } - fn parse_capabilities( + pub(crate) fn parse_msi_capabilities(&mut self, cap: u8) -> u16 { + self.vfio_wrapper.read_config_word((cap + 2).into()) + } + + pub(crate) fn initialize_msi( &mut self, - interrupt_manager: &Arc>, + msg_ctl: u16, + cap_offset: u32, + state: Option, ) { + let interrupt_source_group = self + .msi_interrupt_manager + .create_group(MsiIrqGroupConfig { + base: 0, + count: msi_num_enabled_vectors(msg_ctl) as InterruptIndex, + }) + .unwrap(); + + let msi_config = MsiConfig::new(msg_ctl, interrupt_source_group.clone(), state).unwrap(); + + self.interrupt.msi = Some(VfioMsi { + cfg: msi_config, + cap_offset, + interrupt_source_group, + }); + } + + pub(crate) fn get_msix_cap_idx(&self) -> Option { let mut cap_next = self - .vfio_pci_configuration + .vfio_wrapper .read_config_byte(PCI_CONFIG_CAPABILITY_OFFSET); while cap_next != 0 { - let cap_id = self - .vfio_pci_configuration - .read_config_byte(cap_next.into()); + let cap_id = self.vfio_wrapper.read_config_byte(cap_next.into()); + if PciCapabilityId::from(cap_id) == PciCapabilityId::MsiX { + return Some(cap_next as usize); + } else { + cap_next = self.vfio_wrapper.read_config_byte((cap_next + 1).into()); + } + } + + None + } + + pub(crate) fn parse_capabilities(&mut self, bdf: PciBdf) { + let mut cap_iter = self + .vfio_wrapper + .read_config_byte(PCI_CONFIG_CAPABILITY_OFFSET); + + let mut pci_express_cap_found = false; + let mut power_management_cap_found = false; + + while cap_iter != 0 { + let 
cap_id = self.vfio_wrapper.read_config_byte(cap_iter.into()); match PciCapabilityId::from(cap_id) { PciCapabilityId::MessageSignalledInterrupts => { - if let Some(irq_info) = self.device.get_irq_info(VFIO_PCI_MSI_IRQ_INDEX) { + if let Some(irq_info) = self.vfio_wrapper.get_irq_info(VFIO_PCI_MSI_IRQ_INDEX) { if irq_info.count > 0 { // Parse capability only if the VFIO device // supports MSI. - self.parse_msi_capabilities(cap_next, interrupt_manager); + let msg_ctl = self.parse_msi_capabilities(cap_iter); + self.initialize_msi(msg_ctl, cap_iter as u32, None); } } } PciCapabilityId::MsiX => { - if let Some(irq_info) = self.device.get_irq_info(VFIO_PCI_MSIX_IRQ_INDEX) { + if let Some(irq_info) = self.vfio_wrapper.get_irq_info(VFIO_PCI_MSIX_IRQ_INDEX) + { if irq_info.count > 0 { // Parse capability only if the VFIO device // supports MSI-X. - self.parse_msix_capabilities(cap_next, interrupt_manager); + let msix_cap = self.parse_msix_capabilities(cap_iter); + self.initialize_msix(msix_cap, cap_iter as u32, bdf, None); } } } + PciCapabilityId::PciExpress => pci_express_cap_found = true, + PciCapabilityId::PowerManagement => power_management_cap_found = true, _ => {} }; - cap_next = self - .vfio_pci_configuration - .read_config_byte((cap_next + 1).into()); + let cap_next = self.vfio_wrapper.read_config_byte((cap_iter + 1).into()); + if cap_next == 0 { + break; + } + + cap_iter = cap_next; + } + + if let Some(clique_id) = self.x_nv_gpudirect_clique { + self.add_nv_gpudirect_clique_cap(cap_iter, clique_id); + } + + if pci_express_cap_found && power_management_cap_found { + self.parse_extended_capabilities(); } } - fn update_msi_capabilities(&mut self, offset: u64, data: &[u8]) -> Result<()> { + fn add_nv_gpudirect_clique_cap(&mut self, cap_iter: u8, clique_id: u8) { + // Turing, Ampere, Hopper, and Lovelace GPUs have dedicated space + // at 0xD4 for this capability. 
+ let cap_offset = 0xd4u32; + + let reg_idx = (cap_iter / 4) as usize; + self.patches.insert( + reg_idx, + ConfigPatch { + mask: 0x0000_ff00, + patch: cap_offset << 8, + }, + ); + + let reg_idx = (cap_offset / 4) as usize; + self.patches.insert( + reg_idx, + ConfigPatch { + mask: 0xffff_ffff, + patch: 0x50080009u32, + }, + ); + self.patches.insert( + reg_idx + 1, + ConfigPatch { + mask: 0xffff_ffff, + patch: u32::from(clique_id) << 19 | 0x5032, + }, + ); + } + + fn parse_extended_capabilities(&mut self) { + let mut current_offset = PCI_CONFIG_EXTENDED_CAPABILITY_OFFSET; + + loop { + let ext_cap_hdr = self.vfio_wrapper.read_config_dword(current_offset); + + let cap_id: u16 = (ext_cap_hdr & 0xffff) as u16; + let cap_next: u16 = ((ext_cap_hdr >> 20) & 0xfff) as u16; + + match PciExpressCapabilityId::from(cap_id) { + PciExpressCapabilityId::AlternativeRoutingIdentificationInterpretation + | PciExpressCapabilityId::ResizeableBar + | PciExpressCapabilityId::SingleRootIoVirtualization => { + let reg_idx = (current_offset / 4) as usize; + self.patches.insert( + reg_idx, + ConfigPatch { + mask: 0x0000_ffff, + patch: PciExpressCapabilityId::NullCapability as u32, + }, + ); + } + _ => {} + } + + if cap_next == 0 { + break; + } + + current_offset = cap_next.into(); + } + } + + pub(crate) fn enable_intx(&mut self) -> Result<(), VfioPciError> { + if let Some(intx) = &mut self.interrupt.intx { + if !intx.enabled { + if let Some(eventfd) = intx.interrupt_source_group.notifier(0) { + self.vfio_wrapper + .enable_irq(VFIO_PCI_INTX_IRQ_INDEX, vec![&eventfd]) + .map_err(VfioPciError::EnableIntx)?; + + intx.enabled = true; + } else { + return Err(VfioPciError::MissingNotifier); + } + } + } + + Ok(()) + } + + pub(crate) fn disable_intx(&mut self) { + if let Some(intx) = &mut self.interrupt.intx { + if intx.enabled { + if let Err(e) = self.vfio_wrapper.disable_irq(VFIO_PCI_INTX_IRQ_INDEX) { + error!("Could not disable INTx: {}", e); + } else { + intx.enabled = false; + } + } + } + } + + 
pub(crate) fn enable_msi(&self) -> Result<(), VfioPciError> { + if let Some(msi) = &self.interrupt.msi { + let mut irq_fds: Vec = Vec::new(); + for i in 0..msi.cfg.num_enabled_vectors() { + if let Some(eventfd) = msi.interrupt_source_group.notifier(i as InterruptIndex) { + irq_fds.push(eventfd); + } else { + return Err(VfioPciError::MissingNotifier); + } + } + + self.vfio_wrapper + .enable_msi(irq_fds.iter().collect()) + .map_err(VfioPciError::EnableMsi)?; + } + + Ok(()) + } + + pub(crate) fn disable_msi(&self) { + if let Err(e) = self.vfio_wrapper.disable_msi() { + error!("Could not disable MSI: {}", e); + } + } + + pub(crate) fn enable_msix(&self) -> Result<(), VfioPciError> { + if let Some(msix) = &self.interrupt.msix { + let mut irq_fds: Vec = Vec::new(); + for i in 0..msix.bar.table_entries.len() { + if let Some(eventfd) = msix.interrupt_source_group.notifier(i as InterruptIndex) { + irq_fds.push(eventfd); + } else { + return Err(VfioPciError::MissingNotifier); + } + } + + self.vfio_wrapper + .enable_msix(irq_fds.iter().collect()) + .map_err(VfioPciError::EnableMsix)?; + } + + Ok(()) + } + + pub(crate) fn disable_msix(&self) { + if let Err(e) = self.vfio_wrapper.disable_msix() { + error!("Could not disable MSI-X: {}", e); + } + } + + pub(crate) fn initialize_legacy_interrupt(&mut self) -> Result<(), VfioPciError> { + if let Some(irq_info) = self.vfio_wrapper.get_irq_info(VFIO_PCI_INTX_IRQ_INDEX) { + if irq_info.count == 0 { + // A count of 0 means the INTx IRQ is not supported, therefore + // it shouldn't be initialized. 
+ return Ok(()); + } + } + + if let Some(interrupt_source_group) = self.legacy_interrupt_group.clone() { + self.interrupt.intx = Some(VfioIntx { + interrupt_source_group, + enabled: false, + }); + + self.enable_intx()?; + } + + Ok(()) + } + + pub(crate) fn update_msi_capabilities( + &mut self, + offset: u64, + data: &[u8], + ) -> Result<(), VfioPciError> { match self.interrupt.update_msi(offset, data) { Some(InterruptUpdateAction::EnableMsi) => { // Disable INTx before we can enable MSI @@ -594,14 +1115,16 @@ impl VfioPciDevice { Ok(()) } - fn update_msix_capabilities(&mut self, offset: u64, data: &[u8]) -> Result<()> { + pub(crate) fn update_msix_capabilities( + &mut self, + offset: u64, + data: &[u8], + ) -> Result<(), VfioPciError> { match self.interrupt.update_msix(offset, data) { Some(InterruptUpdateAction::EnableMsix) => { // Disable INTx before we can enable MSI-X self.disable_intx(); self.enable_msix()?; - - error!("MSIX enabled.") } Some(InterruptUpdateAction::DisableMsix) => { // Fallback onto INTx when disabling MSI-X @@ -614,13 +1137,12 @@ impl VfioPciDevice { Ok(()) } - fn find_region(&self, addr: u64) -> Option { + pub(crate) fn find_region(&self, addr: u64) -> Option { for region in self.mmio_regions.iter() { - // error!("Finding region {:x} vs {:x} len {:x}", addr, region.start.raw_value(), region.length); if addr >= region.start.raw_value() && addr < region.start.unchecked_add(region.length).raw_value() { - return Some(*region); + return Some(region.clone()); } } None @@ -648,6 +1170,361 @@ impl VfioPciDevice { }, } } + + pub(crate) fn read_bar(&mut self, base: u64, offset: u64, data: &mut [u8]) { + let addr = base + offset; + if let Some(region) = self.find_region(addr) { + let offset = addr - region.start.raw_value(); + + if self.interrupt.msix_table_accessed(region.index, offset) { + self.interrupt.msix_read_table(offset, data); + } else { + self.vfio_wrapper.region_read(region.index, offset, data); + } + } + + // INTx EOI + // The guest reading 
from the BAR potentially means the interrupt has + // been received and can be acknowledged. + if self.interrupt.intx_in_use() { + if let Err(e) = self.vfio_wrapper.unmask_irq(VFIO_PCI_INTX_IRQ_INDEX) { + error!("Failed unmasking INTx IRQ: {}", e); + } + } + } + + pub(crate) fn write_bar( + &mut self, + base: u64, + offset: u64, + data: &[u8], + ) -> Option> { + let addr = base + offset; + if let Some(region) = self.find_region(addr) { + let offset = addr - region.start.raw_value(); + + // If the MSI-X table is written to, we need to update our cache. + if self.interrupt.msix_table_accessed(region.index, offset) { + self.interrupt.msix_write_table(offset, data); + } else { + self.vfio_wrapper.region_write(region.index, offset, data); + } + } + + // INTx EOI + // The guest writing to the BAR potentially means the interrupt has + // been received and can be acknowledged. + if self.interrupt.intx_in_use() { + if let Err(e) = self.vfio_wrapper.unmask_irq(VFIO_PCI_INTX_IRQ_INDEX) { + error!("Failed unmasking INTx IRQ: {}", e); + } + } + + None + } + + pub(crate) fn write_config_register( + &mut self, + reg_idx: usize, + offset: u64, + data: &[u8], + ) -> Option> { + // When the guest wants to write to a BAR, we trap it into + // our local configuration space. We're not reprogramming + // VFIO device. + if (PCI_CONFIG_BAR0_INDEX..PCI_CONFIG_BAR0_INDEX + BAR_NUMS).contains(®_idx) + || reg_idx == PCI_ROM_EXP_BAR_INDEX + { + // We keep our local cache updated with the BARs. + // We'll read it back from there when the guest is asking + // for BARs (see read_config_register()). + self.configuration + .write_config_register(reg_idx, offset, data); + return None; + } + + let reg = (reg_idx * PCI_CONFIG_REGISTER_SIZE) as u64; + + // If the MSI or MSI-X capabilities are accessed, we need to + // update our local cache accordingly. + // Depending on how the capabilities are modified, this could + // trigger a VFIO MSI or MSI-X toggle. 
+ if let Some((cap_id, cap_base)) = self.interrupt.accessed(reg) { + let cap_offset: u64 = reg - cap_base + offset; + match cap_id { + PciCapabilityId::MessageSignalledInterrupts => { + if let Err(e) = self.update_msi_capabilities(cap_offset, data) { + error!("Could not update MSI capabilities: {}", e); + } + } + PciCapabilityId::MsiX => { + if let Err(e) = self.update_msix_capabilities(cap_offset, data) { + error!("Could not update MSI-X capabilities: {}", e); + } + } + _ => {} + } + } + + // Make sure to write to the device's PCI config space after MSI/MSI-X + // interrupts have been enabled/disabled. In case of MSI, when the + // interrupts are enabled through VFIO (using VFIO_DEVICE_SET_IRQS), + // the MSI Enable bit in the MSI capability structure found in the PCI + // config space is disabled by default. That's why when the guest is + // enabling this bit, we first need to enable the MSI interrupts with + // VFIO through VFIO_DEVICE_SET_IRQS ioctl, and only after we can write + // to the device region to update the MSI Enable bit. + self.vfio_wrapper.write_config((reg + offset) as u32, data); + + None + } + + pub(crate) fn read_config_register(&mut self, reg_idx: usize) -> u32 { + // When reading the BARs, we trap it and return what comes + // from our local configuration space. We want the guest to + // use that and not the VFIO device BARs as it does not map + // with the guest address space. + if (PCI_CONFIG_BAR0_INDEX..PCI_CONFIG_BAR0_INDEX + BAR_NUMS).contains(®_idx) + || reg_idx == PCI_ROM_EXP_BAR_INDEX + { + return self.configuration.read_reg(reg_idx); + } + + if let Some(id) = self.get_msix_cap_idx() { + let msix = self.interrupt.msix.as_mut().unwrap(); + if reg_idx * 4 == id + 4 { + return msix.cap.table; + } else if reg_idx * 4 == id + 8 { + return msix.cap.pba; + } + } + + // Since we don't support passing multi-functions devices, we should + // mask the multi-function bit, bit 7 of the Header Type byte on the + // register 3. 
+ let mask = if reg_idx == PCI_HEADER_TYPE_REG_INDEX { + 0xff7f_ffff + } else { + 0xffff_ffff + }; + + // The config register read comes from the VFIO device itself. + let mut value = self.vfio_wrapper.read_config_dword((reg_idx * 4) as u32) & mask; + + if let Some(config_patch) = self.patches.get(®_idx) { + value = (value & !config_patch.mask) | config_patch.patch; + } + + value + } + + fn state(&self) -> VfioCommonState { + let intx_state = self.interrupt.intx.as_ref().map(|intx| IntxState { + enabled: intx.enabled, + }); + + let msi_state = self.interrupt.msi.as_ref().map(|msi| MsiState { + cap: msi.cfg.cap, + cap_offset: msi.cap_offset, + }); + + let msix_state = self.interrupt.msix.as_ref().map(|msix| MsixState { + cap: msix.cap, + cap_offset: msix.cap_offset, + bdf: msix.bar.devid, + }); + + VfioCommonState { + intx_state, + msi_state, + msix_state, + } + } + + fn set_state( + &mut self, + state: &VfioCommonState, + msi_state: Option, + msix_state: Option, + ) -> Result<(), VfioPciError> { + if let (Some(intx), Some(interrupt_source_group)) = + (&state.intx_state, self.legacy_interrupt_group.clone()) + { + self.interrupt.intx = Some(VfioIntx { + interrupt_source_group, + enabled: false, + }); + + if intx.enabled { + self.enable_intx()?; + } + } + + if let Some(msi) = &state.msi_state { + self.initialize_msi(msi.cap.msg_ctl, msi.cap_offset, msi_state); + } + + if let Some(msix) = &state.msix_state { + self.initialize_msix(msix.cap, msix.cap_offset, msix.bdf.into(), msix_state); + } + + Ok(()) + } +} + +/// VfioPciDevice represents a VFIO PCI device. +/// This structure implements the BusDevice and PciDevice traits. +/// +/// A VfioPciDevice is bound to a VfioDevice and is also a PCI device. +/// The VMM creates a VfioDevice, then assigns it to a VfioPciDevice, +/// which then gets added to the PCI bus. 
+pub struct VfioPciDevice { + id: String, + vm: Arc>, + device: Arc, + container: Arc, + common: VfioCommon, + iommu_attached: bool, + memory_slot: Arc u32 + Send + Sync>, +} + +impl Debug for VfioPciDevice { + fn fmt(&self, f: &mut Formatter) -> fmt::Result { + f.debug_struct("VfioPciDevice") + .finish() + } +} + +impl VfioPciDevice { + /// Constructs a new Vfio Pci device for the given Vfio device + #[allow(clippy::too_many_arguments)] + pub fn new( + id: String, + vm: Arc>, + device: VfioDevice, + container: Arc, + msi_interrupt_manager: Arc>, + legacy_interrupt_group: Option>, + iommu_attached: bool, + bdf: PciBdf, + memory_slot: Arc u32 + Send + Sync>, + x_nv_gpudirect_clique: Option, + ) -> Result { + let device = Arc::new(device); + device.reset(); + + let vfio_wrapper = VfioDeviceWrapper::new(Arc::clone(&device)); + + let common = VfioCommon::new( + msi_interrupt_manager, + legacy_interrupt_group, + Arc::new(vfio_wrapper) as Arc, + &PciVfioSubclass::VfioSubclass, + bdf, + x_nv_gpudirect_clique, + )?; + + let vfio_pci_device = VfioPciDevice { + id, + vm: vm.clone(), + device, + container, + common, + iommu_attached, + memory_slot, + }; + + Ok(vfio_pci_device) + } + + pub fn iommu_attached(&self) -> bool { + self.iommu_attached + } + + fn generate_sparse_areas( + caps: &[VfioRegionInfoCap], + region_index: u32, + region_start: u64, + region_size: u64, + vfio_msix: Option<&VfioMsix>, + ) -> Result, VfioPciError> { + for cap in caps { + match cap { + VfioRegionInfoCap::SparseMmap(sparse_mmap) => return Ok(sparse_mmap.areas.clone()), + VfioRegionInfoCap::MsixMappable => { + if !is_4k_aligned(region_start) { + error!( + "Region start address 0x{:x} must be at least aligned on 4KiB", + region_start + ); + return Err(VfioPciError::RegionAlignment); + } + if !is_4k_multiple(region_size) { + error!( + "Region size 0x{:x} must be at least a multiple of 4KiB", + region_size + ); + return Err(VfioPciError::RegionSize); + } + + // In case the region contains the MSI-X 
vectors table or + // the MSI-X PBA table, we must calculate the subregions + // around them, leading to a list of sparse areas. + // We want to make sure we will still trap MMIO accesses + // to these MSI-X specific ranges. If these region don't align + // with pagesize, we can achieve it by enlarging its range. + // + // Using a BtreeMap as the list provided through the iterator is sorted + // by key. This ensures proper split of the whole region. + let mut inter_ranges = BTreeMap::new(); + if let Some(msix) = vfio_msix { + if region_index == msix.cap.table_bir() { + let (offset, size) = msix.cap.table_range(); + let offset = align_page_size_down(offset); + let size = align_page_size_up(size); + inter_ranges.insert(offset, size); + } + if region_index == msix.cap.pba_bir() { + let (offset, size) = msix.cap.pba_range(); + let offset = align_page_size_down(offset); + let size = align_page_size_up(size); + inter_ranges.insert(offset, size); + } + } + + let mut sparse_areas = Vec::new(); + let mut current_offset = 0; + for (range_offset, range_size) in inter_ranges { + if range_offset > current_offset { + sparse_areas.push(VfioRegionSparseMmapArea { + offset: current_offset, + size: range_offset - current_offset, + }); + } + current_offset = align_page_size_down(range_offset + range_size); + } + + if region_size > current_offset { + sparse_areas.push(VfioRegionSparseMmapArea { + offset: current_offset, + size: region_size - current_offset, + }); + } + + return Ok(sparse_areas); + } + _ => {} + } + } + + // In case no relevant capabilities have been found, create a single + // sparse area corresponding to the entire MMIO region. + Ok(vec![VfioRegionSparseMmapArea { + offset: 0, + size: region_size, + }]) + } + /// Map MMIO regions into the guest, and avoid VM exits when the guest tries /// to reach those regions. /// @@ -656,21 +1533,11 @@ impl VfioPciDevice { /// * `vm` - The VM object. It is used to set the VFIO MMIO regions /// as user memory regions. 
/// * `mem_slot` - The closure to return a memory slot. - pub fn map_mmio_regions(&mut self) -> Result<()> { + pub fn map_mmio_regions(&mut self) -> Result<(), VfioPciError> { let fd = self.device.as_raw_fd(); - let mut slot = 2; - - error!("Mmap mmio regions count {}", self.mmio_regions.len()); - for region in self.mmio_regions.iter_mut() { - // We want to skip the mapping of the BAR containing the MSI-X - // table even if it is mappable. The reason is we need to trap - // any access to the MSI-X table and update the GSI routing - // accordingly. - if let Some(msix) = &self.interrupt.msix { - if region.index == msix.cap.table_bir() || region.index == msix.cap.pba_bir() { - continue; - } - } + + + for region in self.common.mmio_regions.iter_mut() { let region_flags = self.device.get_region_flags(region.index); if region_flags & VFIO_REGION_INFO_FLAG_MMAP != 0 { let mut prot = 0; @@ -680,68 +1547,102 @@ impl VfioPciDevice { if region_flags & VFIO_REGION_INFO_FLAG_WRITE != 0 { prot |= libc::PROT_WRITE; } - let mmap_offset = self.device.get_region_offset(region.index); - let mmap_size = self.device.get_region_size(region.index); - - let offset = self.device.get_region_offset(region.index) + mmap_offset; - error!( - "VFIO region {}, offset {:x}, size {:x}", - region.index, offset, mmap_size - ); - let host_addr = unsafe { - libc::mmap( - null_mut(), - mmap_size as usize, - prot, - libc::MAP_SHARED, - fd, - offset as libc::off_t, - ) + + // Retrieve the list of capabilities found on the region + let caps = if region_flags & VFIO_REGION_INFO_FLAG_CAPS != 0 { + self.device.get_region_caps(region.index) + } else { + Vec::new() }; - if host_addr == libc::MAP_FAILED { - error!( - "Could not mmap regions, error:{}", - io::Error::last_os_error() - ); - continue; + // Don't try to mmap the region if it contains MSI-X table or + // MSI-X PBA subregion, and if we couldn't find MSIX_MAPPABLE + // in the list of supported capabilities. 
+ if let Some(msix) = self.common.interrupt.msix.as_ref() { + if (region.index == msix.cap.table_bir() || region.index == msix.cap.pba_bir()) + && !caps.contains(&VfioRegionInfoCap::MsixMappable) + { + continue; + } } - error!( - "Mmap slot {} gpa {:x} size {} hva {:x}", - slot, - region.start.raw_value() + mmap_offset, - mmap_size as u64, - host_addr as u64 - ); + let mmap_size = self.device.get_region_size(region.index); + let mmap_offset = self.device.get_region_offset(region.index); - let mem_region = Self::make_user_memory_region( - slot, - region.start.raw_value() + mmap_offset, - mmap_size as u64, - host_addr as u64, - false, - false, - ); + let sparse_areas = Self::generate_sparse_areas( + &caps, + region.index, + region.start.0, + mmap_size, + self.common.interrupt.msix.as_ref(), + )?; + + for area in sparse_areas.iter() { + // SAFETY: FFI call with correct arguments + let host_addr = unsafe { + libc::mmap( + null_mut(), + area.size as usize, + prot, + libc::MAP_SHARED, + fd, + mmap_offset as libc::off_t + area.offset as libc::off_t, + ) + }; + + if host_addr == libc::MAP_FAILED { + error!( + "Could not mmap sparse area (offset = 0x{:x}, size = 0x{:x}): {}", + area.offset, + area.size, + std::io::Error::last_os_error() + ); + return Err(VfioPciError::MmapArea); + } - unsafe { - self.vm.lock().expect("Poisoned lock") - .set_user_memory_region(mem_region) - .map_err(|e| VfioPciError::MapRegionGuest(e.into()))?; - } + if !is_page_size_aligned(area.size) || !is_page_size_aligned(area.offset) { + warn!( + "Could not mmap sparse area that is not page size aligned (offset = 0x{:x}, size = 0x{:x})", + area.offset, + area.size, + ); + return Ok(()); + } - // self.container.vfio_dma_map( - // region.start.raw_value() + mmap_offset, - // mmap_size, - // host_addr as u64 - // ).unwrap(); + let user_memory_region = UserMemoryRegion { + slot: (self.memory_slot)(), + start: region.start.0 + area.offset, + size: area.size, + host_addr: host_addr as u64, + }; + + 
region.user_memory_regions.push(user_memory_region); + + let mem_region = VfioCommon::make_user_memory_region( + user_memory_region.slot, + user_memory_region.start, + user_memory_region.size, + user_memory_region.host_addr, + false, + false, + ); - // Update the region with memory mapped info. - region.mem_slot = Some(slot); - region.host_addr = Some(host_addr as u64); - region.mmap_size = Some(mmap_size as usize); + unsafe { + self.vm.lock().expect("Poisoned lock") + .set_user_memory_region(mem_region) + .map_err(|e| VfioPciError::MapRegionGuest(e.into()))?; + } - slot += 1; + if !self.iommu_attached { + self.container + .vfio_dma_map( + user_memory_region.start, + user_memory_region.size, + user_memory_region.host_addr, + ) + .map_err(VfioPciError::DmaMap)?; + } + } } } @@ -749,18 +1650,24 @@ impl VfioPciDevice { } pub fn unmap_mmio_regions(&mut self) { - for region in self.mmio_regions.iter() { - if let (Some(host_addr), Some(mmap_size), Some(mem_slot)) = - (region.host_addr, region.mmap_size, region.mem_slot) - { - let mmap_offset = self.device.get_region_offset(region.index); + for region in self.common.mmio_regions.iter() { + for user_memory_region in region.user_memory_regions.iter() { + // Unmap from vfio container + if !self.iommu_attached { + if let Err(e) = self + .container + .vfio_dma_unmap(user_memory_region.start, user_memory_region.size) + { + error!("Could not unmap mmio region from vfio container: {}", e); + } + } // Remove region - let r = Self::make_user_memory_region( - mem_slot, - region.start.raw_value() + mmap_offset, - 0, - host_addr as u64, + let r = VfioCommon::make_user_memory_region( + user_memory_region.slot, + user_memory_region.start, + user_memory_region.size, + user_memory_region.host_addr, false, false, ); @@ -769,7 +1676,13 @@ impl VfioPciDevice { error!("Could not remove the userspace memory region: {}", e); } - let ret = unsafe { libc::munmap(host_addr as *mut libc::c_void, mmap_size) }; + // SAFETY: FFI call with correct 
arguments + let ret = unsafe { + libc::munmap( + user_memory_region.host_addr as *mut libc::c_void, + user_memory_region.size as usize, + ) + }; if ret != 0 { error!( "Could not unmap region {}, error:{}", @@ -781,16 +1694,17 @@ impl VfioPciDevice { } } - pub fn dma_map(&self, iova: u64, size: u64, user_addr: u64) -> Result<()> { + pub fn dma_map(&self, iova: u64, size: u64, user_addr: u64) -> Result<(), VfioPciError> { if !self.iommu_attached { self.container .vfio_dma_map(iova, size, user_addr) .map_err(VfioPciError::DmaMap)?; } + Ok(()) } - pub fn dma_unmap(&self, iova: u64, size: u64) -> Result<()> { + pub fn dma_unmap(&self, iova: u64, size: u64) -> Result<(), VfioPciError> { if !self.iommu_attached { self.container .vfio_dma_unmap(iova, size) @@ -801,15 +1715,7 @@ impl VfioPciDevice { } pub fn mmio_regions(&self) -> Vec { - self.mmio_regions.clone() - } - - pub fn bus_read(&mut self, base: u64, offset: u64, data: &mut [u8]) { - self.read_bar(base, offset, data); - } - - pub fn bus_write(&mut self, base: u64, offset: u64, data: &[u8]) { - self.write_bar(base, offset, data); + self.common.mmio_regions.clone() } } @@ -817,23 +1723,47 @@ impl Drop for VfioPciDevice { fn drop(&mut self) { self.unmap_mmio_regions(); - if self.interrupt.intx_in_use() { - self.disable_intx(); + if let Some(msix) = &self.common.interrupt.msix { + if msix.bar.enabled() { + self.common.disable_msix(); + } + } + + if let Some(msi) = &self.common.interrupt.msi { + if msi.cfg.enabled() { + self.common.disable_msi() + } + } + + if self.common.interrupt.intx_in_use() { + self.common.disable_intx(); } } } +impl VfioPciDevice { + pub fn bus_read(&mut self, base: u64, offset: u64, data: &mut [u8]) { + self.read_bar(base, offset, data) + } + + pub fn bus_write(&mut self, base: u64, offset: u64, data: &[u8]) { + self.write_bar(base, offset, data); + () + } +} // First BAR offset in the PCI config space. 
const PCI_CONFIG_BAR_OFFSET: u32 = 0x10; // Capability register offset in the PCI config space. const PCI_CONFIG_CAPABILITY_OFFSET: u32 = 0x34; +// Extended capabilities register offset in the PCI config space. +const PCI_CONFIG_EXTENDED_CAPABILITY_OFFSET: u32 = 0x100; // IO BAR when first BAR bit is 1. const PCI_CONFIG_IO_BAR: u32 = 0x1; -// Memory BAR flags (lower 4 bits). -const PCI_CONFIG_MEMORY_BAR_FLAG_MASK: u32 = 0xf; // 64-bit memory bar flag. const PCI_CONFIG_MEMORY_BAR_64BIT: u32 = 0x4; +// Prefetchable BAR bit +const PCI_CONFIG_BAR_PREFETCHABLE: u32 = 0x8; // PCI config register size (4 bytes). const PCI_CONFIG_REGISTER_SIZE: usize = 4; // Number of BARs for a PCI device @@ -848,198 +1778,23 @@ const PCI_ROM_EXP_BAR_INDEX: usize = 12; impl PciDevice for VfioPciDevice { fn allocate_bars( &mut self, - allocator: &mut SystemAllocator, - ) -> std::result::Result, PciDeviceError> - { - let mut ranges = Vec::new(); - let mut bar_id = VFIO_PCI_BAR0_REGION_INDEX as u32; - - // Going through all regular regions to compute the BAR size. - // We're not saving the BAR address to restore it, because we - // are going to allocate a guest address for each BAR and write - // that new address back. - while bar_id < VFIO_PCI_CONFIG_REGION_INDEX { - let mut lsb_size: u32 = 0xffff_ffff; - let mut msb_size = 0; - let mut region_size: u64; - let bar_addr: GuestAddress; - - // Read the BAR size (Starts by all 1s to the BAR) - let bar_offset = if bar_id == VFIO_PCI_ROM_REGION_INDEX { - (PCI_ROM_EXP_BAR_INDEX * 4) as u32 - } else { - PCI_CONFIG_BAR_OFFSET + bar_id * 4 - }; - - self.vfio_pci_configuration - .write_config_dword(lsb_size, bar_offset); - lsb_size = self.vfio_pci_configuration.read_config_dword(bar_offset); - - // We've just read the BAR size back. Or at least its LSB. - let lsb_flag = lsb_size & PCI_CONFIG_MEMORY_BAR_FLAG_MASK; - - if lsb_size == 0 { - bar_id += 1; - continue; - } - - // Is this an IO BAR? 
- let io_bar = if bar_id != VFIO_PCI_ROM_REGION_INDEX { - matches!(lsb_flag & PCI_CONFIG_IO_BAR, PCI_CONFIG_IO_BAR) - } else { - false - }; - - // Is this a 64-bit BAR? - let is_64bit_bar = if bar_id != VFIO_PCI_ROM_REGION_INDEX { - matches!( - lsb_flag & PCI_CONFIG_MEMORY_BAR_64BIT, - PCI_CONFIG_MEMORY_BAR_64BIT - ) - } else { - false - }; - - // By default, the region type is 32 bits memory BAR. - let mut region_type = PciBarRegionType::Memory32BitRegion; - - if io_bar { - #[cfg(target_arch = "x86_64")] - { - // IO BAR - region_type = PciBarRegionType::IoRegion; - - // Clear first bit. - lsb_size &= 0xffff_fffc; - - // Find the first bit that's set to 1. - let first_bit = lsb_size.trailing_zeros(); - region_size = 2u64.pow(first_bit); - // We need to allocate a guest PIO address range for that BAR. - // The address needs to be 4 bytes aligned. - bar_addr = allocator - .allocate_io_addresses(None, region_size, Some(0x4)) - .ok_or(PciDeviceError::IoAllocationFailed(region_size))?; - } - #[cfg(target_arch = "aarch64")] - unimplemented!() - } else { - if is_64bit_bar { - // 64 bits Memory BAR - region_type = PciBarRegionType::Memory64BitRegion; - - msb_size = 0xffff_ffff; - let msb_bar_offset: u32 = PCI_CONFIG_BAR_OFFSET + (bar_id + 1) * 4; - - self.vfio_pci_configuration - .write_config_dword(msb_size, msb_bar_offset); - - msb_size = self - .vfio_pci_configuration - .read_config_dword(msb_bar_offset); - } - - // Clear the first four bytes from our LSB. - lsb_size &= 0xffff_fff0; - - region_size = u64::from(msb_size); - region_size <<= 32; - region_size |= u64::from(lsb_size); - - // Find the first that's set to 1. - let first_bit = region_size.trailing_zeros(); - region_size = 2u64.pow(first_bit); - - // We need to allocate a guest MMIO address range for that BAR. - // In case the BAR is mappable directly, this means it might be - // set as user memory region, which expects to deal with 4K - // pages. Therefore, the alignment has to be set accordingly. 
- let bar_alignment = if (bar_id == VFIO_PCI_ROM_REGION_INDEX) - || (self.device.get_region_flags(bar_id) & VFIO_REGION_INFO_FLAG_MMAP != 0) - { - // 4K alignment - 0x1000 - } else { - // Default 16 bytes alignment - 0x10 - }; - if is_64bit_bar { - bar_addr = allocator - .allocate_mmio_addresses(None, region_size, Some(bar_alignment)) - .ok_or(PciDeviceError::IoAllocationFailed(region_size))?; - } else { - bar_addr = allocator - .allocate_mmio_hole_addresses(None, region_size, Some(bar_alignment)) - .ok_or(PciDeviceError::IoAllocationFailed(region_size))?; - } - } - - let reg_idx = if bar_id == VFIO_PCI_ROM_REGION_INDEX { - PCI_ROM_EXP_BAR_INDEX - } else { - bar_id as usize - }; - - // We can now build our BAR configuration block. - let config = PciBarConfiguration::default() - .set_register_index(reg_idx) - .set_address(bar_addr.raw_value()) - .set_size(region_size) - .set_region_type(region_type); - - if bar_id == VFIO_PCI_ROM_REGION_INDEX { - self.configuration - .add_pci_rom_bar(&config, lsb_flag & 0x1) - .map_err(|e| PciDeviceError::IoRegistrationFailed(bar_addr.raw_value(), e))?; - } else { - self.configuration - .add_pci_bar(&config) - .map_err(|e| PciDeviceError::IoRegistrationFailed(bar_addr.raw_value(), e))?; - } - - error!("Bar addr: {:?}", bar_addr); - ranges.push((bar_addr, region_size, region_type)); - self.mmio_regions.push(MmioRegion { - start: bar_addr, - length: region_size, - type_: region_type, - index: bar_id as u32, - mem_slot: None, - host_addr: None, - mmap_size: None, - }); - - - bar_id += 1; - if is_64bit_bar { - bar_id += 1; - } - } - - Ok(ranges) + allocator: &Arc>, + mmio32_allocator: &mut AddressAllocator, + mmio64_allocator: &mut AddressAllocator, + resources: Option>, + ) -> Result, PciDeviceError> { + self.common + .allocate_bars(allocator, mmio32_allocator, mmio64_allocator, resources) } fn free_bars( &mut self, allocator: &mut SystemAllocator, - ) -> std::result::Result<(), PciDeviceError> { - for region in 
self.mmio_regions.iter() { - match region.type_ { - PciBarRegionType::IoRegion => { - #[cfg(target_arch = "x86_64")] - allocator.free_io_addresses(region.start, region.length); - #[cfg(target_arch = "aarch64")] - error!("I/O region is not supported"); - } - PciBarRegionType::Memory32BitRegion => { - allocator.free_mmio_hole_addresses(region.start, region.length); - } - PciBarRegionType::Memory64BitRegion => { - allocator.free_mmio_addresses(region.start, region.length); - } - } - } - Ok(()) + mmio32_allocator: &mut AddressAllocator, + mmio64_allocator: &mut AddressAllocator, + ) -> Result<(), PciDeviceError> { + self.common + .free_bars(allocator, mmio32_allocator, mmio64_allocator) } fn write_config_register( @@ -1048,81 +1803,11 @@ impl PciDevice for VfioPciDevice { offset: u64, data: &[u8], ) -> Option> { - // When the guest wants to write to a BAR, we trap it into - // our local configuration space. We're not reprogramming - // VFIO device. - if (PCI_CONFIG_BAR0_INDEX..PCI_CONFIG_BAR0_INDEX + BAR_NUMS).contains(®_idx) - || reg_idx == PCI_ROM_EXP_BAR_INDEX - { - // We keep our local cache updated with the BARs. - // We'll read it back from there when the guest is asking - // for BARs (see read_config_register()). - self.configuration - .write_config_register(reg_idx, offset, data); - return None; - } - - let reg = (reg_idx * PCI_CONFIG_REGISTER_SIZE) as u64; - - // If the MSI or MSI-X capabilities are accessed, we need to - // update our local cache accordingly. - // Depending on how the capabilities are modified, this could - // trigger a VFIO MSI or MSI-X toggle. 
- if let Some((cap_id, cap_base)) = self.interrupt.accessed(reg) { - let cap_offset: u64 = reg - cap_base + offset; - match cap_id { - PciCapabilityId::MessageSignalledInterrupts => { - if let Err(e) = self.update_msi_capabilities(cap_offset, data) { - error!("Could not update MSI capabilities: {}", e); - } - } - PciCapabilityId::MsiX => { - if let Err(e) = self.update_msix_capabilities(cap_offset, data) { - error!("Could not update MSI-X capabilities: {}", e); - } - } - _ => {} - } - } - - // Make sure to write to the device's PCI config space after MSI/MSI-X - // interrupts have been enabled/disabled. In case of MSI, when the - // interrupts are enabled through VFIO (using VFIO_DEVICE_SET_IRQS), - // the MSI Enable bit in the MSI capability structure found in the PCI - // config space is disabled by default. That's why when the guest is - // enabling this bit, we first need to enable the MSI interrupts with - // VFIO through VFIO_DEVICE_SET_IRQS ioctl, and only after we can write - // to the device region to update the MSI Enable bit. - self.device - .region_write(VFIO_PCI_CONFIG_REGION_INDEX, data, reg + offset); - - None + self.common.write_config_register(reg_idx, offset, data) } fn read_config_register(&mut self, reg_idx: usize) -> u32 { - // When reading the BARs, we trap it and return what comes - // from our local configuration space. We want the guest to - // use that and not the VFIO device BARs as it does not map - // with the guest address space. - if (PCI_CONFIG_BAR0_INDEX..PCI_CONFIG_BAR0_INDEX + BAR_NUMS).contains(®_idx) - || reg_idx == PCI_ROM_EXP_BAR_INDEX - { - return self.configuration.read_reg(reg_idx); - } - - // Since we don't support passing multi-functions devices, we should - // mask the multi-function bit, bit 7 of the Header Type byte on the - // register 3. - let mask = if reg_idx == PCI_HEADER_TYPE_REG_INDEX { - 0xff7f_ffffu32 - } else { - 0xffff_ffffu32 - }; - - // The config register read comes from the VFIO device itself. 
- self.vfio_pci_configuration - .read_config_dword((reg_idx * 4) as u32) - & mask + self.common.read_config_register(reg_idx) } fn detect_bar_reprogramming( @@ -1130,96 +1815,57 @@ impl PciDevice for VfioPciDevice { reg_idx: usize, data: &[u8], ) -> Option { - self.configuration.detect_bar_reprogramming(reg_idx, data) + self.common + .configuration + .detect_bar_reprogramming(reg_idx, data) } fn read_bar(&mut self, base: u64, offset: u64, data: &mut [u8]) { - let addr = base + offset; - if let Some(region) = self.find_region(addr) { - let offset = addr - region.start.raw_value(); - - if self.interrupt.msix_table_accessed(region.index, offset) { - self.interrupt.msix_read_table(offset, data); - } else { - self.device.region_read(region.index, data, offset); - } - } - - // INTx EOI - // The guest reading from the BAR potentially means the interrupt has - // been received and can be acknowledged. - if self.interrupt.intx_in_use() { - if let Err(e) = self.device.unmask_irq(VFIO_PCI_INTX_IRQ_INDEX) { - error!("Failed unmasking INTx IRQ: {}", e); - } - } + self.common.read_bar(base, offset, data) } fn write_bar(&mut self, base: u64, offset: u64, data: &[u8]) -> Option> { - let addr = base + offset; - if let Some(region) = self.find_region(addr) { - let offset = addr - region.start.raw_value(); - - // If the MSI-X table is written to, we need to update our cache. - if self.interrupt.msix_table_accessed(region.index, offset) { - self.interrupt.msix_write_table(offset, data); - } else { - self.device.region_write(region.index, data, offset); - } - } - - // INTx EOI - // The guest writing to the BAR potentially means the interrupt has - // been received and can be acknowledged. 
- if self.interrupt.intx_in_use() { - if let Err(e) = self.device.unmask_irq(VFIO_PCI_INTX_IRQ_INDEX) { - error!("Failed unmasking INTx IRQ: {}", e); - } - } - - None + self.common.write_bar(base, offset, data) } - fn move_bar(&mut self, old_base: u64, new_base: u64) -> result::Result<(), io::Error> { - for region in self.mmio_regions.iter_mut() { + fn move_bar(&mut self, old_base: u64, new_base: u64) -> Result<(), io::Error> { + for region in self.common.mmio_regions.iter_mut() { if region.start.raw_value() == old_base { region.start = GuestAddress(new_base); - if let Some(mem_slot) = region.mem_slot { - if let Some(host_addr) = region.host_addr { - let mmap_offset = self.device.get_region_offset(region.index); - let mmap_size = self.device.get_region_size(region.index); - - // Remove old region - let old_mem_region = Self::make_user_memory_region( - mem_slot, - old_base + mmap_offset, - 0, - host_addr as u64, - false, - false, - ); - - unsafe { self.vm.lock().expect("Poisoned lock") - .set_user_memory_region(old_mem_region) - .map_err(|e| io::Error::new(io::ErrorKind::Other, e))?; - } + for user_memory_region in region.user_memory_regions.iter_mut() { + // Remove old region + let old_mem_region = VfioCommon::make_user_memory_region( + user_memory_region.slot, + user_memory_region.start, + user_memory_region.size, + user_memory_region.host_addr, + false, + false, + ); - // Insert new region - let new_mem_region = Self::make_user_memory_region( - mem_slot, - new_base + mmap_offset, - mmap_size as u64, - host_addr as u64, - false, - false, - ); + unsafe { self.vm.lock().expect("Poisoned lock").set_user_memory_region(old_mem_region) } + .map_err(|e| io::Error::new(io::ErrorKind::Other, e))?; - unsafe { self.vm.lock().expect("Poisoned lock") - .set_user_memory_region(new_mem_region) - .map_err(|e| io::Error::new(io::ErrorKind::Other, e))?; - } + // Update the user memory region with the correct start address. 
+ if new_base > old_base { + user_memory_region.start += new_base - old_base; + } else { + user_memory_region.start -= old_base - new_base; } + + // Insert new region + let new_mem_region = VfioCommon::make_user_memory_region( + user_memory_region.slot, + user_memory_region.start, + user_memory_region.size, + user_memory_region.host_addr, + false, + false, + ); + + unsafe { self.vm.lock().expect("Poisoned lock").set_user_memory_region(new_mem_region) } + .map_err(|e| io::Error::new(io::ErrorKind::Other, e))?; } } } @@ -1230,4 +1876,85 @@ impl PciDevice for VfioPciDevice { fn as_any(&mut self) -> &mut dyn Any { self } -} \ No newline at end of file + + fn id(&self) -> Option { + Some(self.id.clone()) + } +} + +/// This structure implements the ExternalDmaMapping trait. It is meant to +/// be used when the caller tries to provide a way to update the mappings +/// associated with a specific VFIO container. +pub struct VfioDmaMapping { + container: Arc, + memory: Arc, + mmio_regions: Arc>>, +} + +impl VfioDmaMapping { + /// Create a DmaMapping object. + /// # Parameters + /// * `container`: VFIO container object. + /// * `memory`: guest memory to mmap. + /// * `mmio_regions`: mmio_regions to mmap. 
+ pub fn new( + container: Arc, + memory: Arc, + mmio_regions: Arc>>, + ) -> Self { + VfioDmaMapping { + container, + memory, + mmio_regions, + } + } +} + +impl ExternalDmaMapping for VfioDmaMapping { + fn map(&self, iova: u64, gpa: u64, size: u64) -> std::result::Result<(), io::Error> { + let mem = self.memory.memory(); + let guest_addr = GuestAddress(gpa); + let user_addr = if mem.check_range(guest_addr, size as usize) { + match mem.get_host_address(guest_addr) { + Ok(t) => t as u64, + Err(e) => { + return Err(io::Error::new( + io::ErrorKind::Other, + format!("unable to retrieve user address for gpa 0x{gpa:x} from guest memory region: {e}") + )); + } + } + } else if self.mmio_regions.lock().unwrap().check_range(gpa, size) { + self.mmio_regions.lock().unwrap().find_user_address(gpa)? + } else { + return Err(io::Error::new( + io::ErrorKind::Other, + format!("failed to locate guest address 0x{gpa:x} in guest memory"), + )); + }; + + self.container + .vfio_dma_map(iova, size, user_addr) + .map_err(|e| { + io::Error::new( + io::ErrorKind::Other, + format!( + "failed to map memory for VFIO container, \ + iova 0x{iova:x}, gpa 0x{gpa:x}, size 0x{size:x}: {e:?}" + ), + ) + }) + } + + fn unmap(&self, iova: u64, size: u64) -> std::result::Result<(), io::Error> { + self.container.vfio_dma_unmap(iova, size).map_err(|e| { + io::Error::new( + io::ErrorKind::Other, + format!( + "failed to unmap memory for VFIO container, \ + iova 0x{iova:x}, size 0x{size:x}: {e:?}" + ), + ) + }) + } +} diff --git a/src/vm-device/Cargo.toml b/src/vm-device/Cargo.toml index 7c9d1864b7a..4693afabb53 100644 --- a/src/vm-device/Cargo.toml +++ b/src/vm-device/Cargo.toml @@ -1,16 +1,18 @@ [package] +authors = ["The Cloud Hypervisor Authors"] +edition = "2021" name = "vm-device" version = "0.1.0" -authors = ["The Cloud Hypervisor Authors"] -edition = "2018" + +[features] +default = [] +kvm = ["vfio-ioctls/kvm"] +mshv = ["vfio-ioctls/mshv"] [dependencies] -anyhow = "1.0" -thiserror = "1.0" -serde = 
{version = ">=1.0.27", features = ["rc"] } -serde_derive = ">=1.0.27" -serde_json = ">=1.0.9" +anyhow = "1.0.87" +serde = { version = "1.0.208", features = ["derive", "rc"] } +thiserror = "1.0.62" vfio-ioctls = { git = "https://github.com/rust-vmm/vfio", branch = "main" } vm-memory = { version = "0.15.0", features = ["backend-mmap"] } vmm-sys-util = ">=0.3.1" - diff --git a/src/vm-device/src/bus.rs b/src/vm-device/src/bus.rs index 3388ee20514..3817d443f35 100644 --- a/src/vm-device/src/bus.rs +++ b/src/vm-device/src/bus.rs @@ -7,7 +7,7 @@ //! Handles routing to devices in an address space. -use std::cmp::{Ord, Ordering, PartialEq, PartialOrd}; +use std::cmp::Ordering; use std::collections::btree_map::BTreeMap; use std::sync::{Arc, Barrier, Mutex, RwLock, Weak}; use std::{convert, error, fmt, io, result}; @@ -24,8 +24,31 @@ pub trait BusDevice: Send { fn write(&mut self, base: u64, offset: u64, data: &[u8]) -> Option> { None } - /// Triggers the `irq_mask` interrupt on this device - fn interrupt(&self, irq_mask: u32) {} +} + +#[allow(unused_variables)] +pub trait BusDeviceSync: Send + Sync { + /// Reads at `offset` from this device + fn read(&self, base: u64, offset: u64, data: &mut [u8]) {} + /// Writes at `offset` into this device + fn write(&self, base: u64, offset: u64, data: &[u8]) -> Option> { + None + } +} + +impl BusDeviceSync for Mutex { + /// Reads at `offset` from this device + fn read(&self, base: u64, offset: u64, data: &mut [u8]) { + self.lock() + .expect("Failed to acquire device lock") + .read(base, offset, data) + } + /// Writes at `offset` into this device + fn write(&self, base: u64, offset: u64, data: &[u8]) -> Option> { + self.lock() + .expect("Failed to acquire device lock") + .write(base, offset, data) + } } #[derive(Debug)] @@ -42,7 +65,7 @@ pub type Result = result::Result; impl fmt::Display for Error { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "bus_error: {:?}", self) + write!(f, "bus_error: {self:?}") } } @@ -87,7 
+110,7 @@ impl Ord for BusRange { impl PartialOrd for BusRange { fn partial_cmp(&self, other: &BusRange) -> Option { - self.base.partial_cmp(&other.base) + Some(self.cmp(other)) } } @@ -97,7 +120,7 @@ impl PartialOrd for BusRange { /// only restriction is that no two devices can overlap in this address space. #[derive(Default)] pub struct Bus { - devices: RwLock>>>, + devices: RwLock>>, } impl Bus { @@ -108,17 +131,16 @@ impl Bus { } } - fn first_before(&self, addr: u64) -> Option<(BusRange, Arc>)> { + fn first_before(&self, addr: u64) -> Option<(BusRange, Arc)> { let devices = self.devices.read().unwrap(); let (range, dev) = devices .range(..=BusRange { base: addr, len: 1 }) - .rev() - .next()?; + .next_back()?; dev.upgrade().map(|d| (*range, d.clone())) } #[allow(clippy::type_complexity)] - pub fn resolve(&self, addr: u64) -> Option<(u64, u64, Arc>)> { + fn resolve(&self, addr: u64) -> Option<(u64, u64, Arc)> { if let Some((range, dev)) = self.first_before(addr) { let offset = addr - range.base; if offset < range.len { @@ -128,8 +150,7 @@ impl Bus { None } - /// Puts the given device at the given address space. - pub fn insert(&self, device: Arc>, base: u64, len: u64) -> Result<()> { + pub fn insert(&self, device: Arc, base: u64, len: u64) -> Result<()> { if len == 0 { return Err(Error::ZeroSizedRange); } @@ -174,7 +195,7 @@ impl Bus { } /// Removes all entries referencing the given device. - pub fn remove_by_device(&self, device: &Arc>) -> Result<()> { + pub fn remove_by_device(&self, device: &Arc) -> Result<()> { let mut device_list = self.devices.write().unwrap(); let mut remove_key_list = Vec::new(); @@ -219,9 +240,7 @@ impl Bus { pub fn read(&self, addr: u64, data: &mut [u8]) -> Result<()> { if let Some((base, offset, dev)) = self.resolve(addr) { // OK to unwrap as lock() failing is a serious error condition and should panic. 
- dev.lock() - .expect("Failed to acquire device lock") - .read(base, offset, data); + dev.read(base, offset, data); Ok(()) } else { Err(Error::MissingAddressRange) @@ -234,10 +253,7 @@ impl Bus { pub fn write(&self, addr: u64, data: &[u8]) -> Result>> { if let Some((base, offset, dev)) = self.resolve(addr) { // OK to unwrap as lock() failing is a serious error condition and should panic. - Ok(dev - .lock() - .expect("Failed to acquire device lock") - .write(base, offset, data)) + Ok(dev.write(base, offset, data)) } else { Err(Error::MissingAddressRange) } @@ -249,17 +265,17 @@ mod tests { use super::*; struct DummyDevice; - impl BusDevice for DummyDevice {} + impl BusDeviceSync for DummyDevice {} struct ConstantDevice; - impl BusDevice for ConstantDevice { - fn read(&mut self, _base: u64, offset: u64, data: &mut [u8]) { + impl BusDeviceSync for ConstantDevice { + fn read(&self, _base: u64, offset: u64, data: &mut [u8]) { for (i, v) in data.iter_mut().enumerate() { *v = (offset as u8) + (i as u8); } } - fn write(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option> { + fn write(&self, _base: u64, offset: u64, data: &[u8]) -> Option> { for (i, v) in data.iter().enumerate() { assert_eq!(*v, (offset as u8) + (i as u8)) } @@ -271,56 +287,55 @@ mod tests { #[test] fn bus_insert() { let bus = Bus::new(); - let dummy = Arc::new(Mutex::new(DummyDevice)); - assert!(bus.insert(dummy.clone(), 0x10, 0).is_err()); - assert!(bus.insert(dummy.clone(), 0x10, 0x10).is_ok()); + let dummy = Arc::new(DummyDevice); + bus.insert(dummy.clone(), 0x10, 0).unwrap_err(); + bus.insert(dummy.clone(), 0x10, 0x10).unwrap(); let result = bus.insert(dummy.clone(), 0x0f, 0x10); - assert!(result.is_err()); - assert_eq!(format!("{:?}", result), "Err(Overlap)"); - - assert!(bus.insert(dummy.clone(), 0x10, 0x10).is_err()); - assert!(bus.insert(dummy.clone(), 0x10, 0x15).is_err()); - assert!(bus.insert(dummy.clone(), 0x12, 0x15).is_err()); - assert!(bus.insert(dummy.clone(), 0x12, 0x01).is_err()); 
- assert!(bus.insert(dummy.clone(), 0x0, 0x20).is_err()); - assert!(bus.insert(dummy.clone(), 0x20, 0x05).is_ok()); - assert!(bus.insert(dummy.clone(), 0x25, 0x05).is_ok()); - assert!(bus.insert(dummy, 0x0, 0x10).is_ok()); + assert_eq!(format!("{result:?}"), "Err(Overlap)"); + + bus.insert(dummy.clone(), 0x10, 0x10).unwrap_err(); + bus.insert(dummy.clone(), 0x10, 0x15).unwrap_err(); + bus.insert(dummy.clone(), 0x12, 0x15).unwrap_err(); + bus.insert(dummy.clone(), 0x12, 0x01).unwrap_err(); + bus.insert(dummy.clone(), 0x0, 0x20).unwrap_err(); + bus.insert(dummy.clone(), 0x20, 0x05).unwrap(); + bus.insert(dummy.clone(), 0x25, 0x05).unwrap(); + bus.insert(dummy, 0x0, 0x10).unwrap(); } #[test] #[allow(clippy::redundant_clone)] fn bus_read_write() { let bus = Bus::new(); - let dummy = Arc::new(Mutex::new(DummyDevice)); - assert!(bus.insert(dummy.clone(), 0x10, 0x10).is_ok()); - assert!(bus.read(0x10, &mut [0, 0, 0, 0]).is_ok()); - assert!(bus.write(0x10, &[0, 0, 0, 0]).is_ok()); - assert!(bus.read(0x11, &mut [0, 0, 0, 0]).is_ok()); - assert!(bus.write(0x11, &[0, 0, 0, 0]).is_ok()); - assert!(bus.read(0x16, &mut [0, 0, 0, 0]).is_ok()); - assert!(bus.write(0x16, &[0, 0, 0, 0]).is_ok()); - assert!(bus.read(0x20, &mut [0, 0, 0, 0]).is_err()); - assert!(bus.write(0x20, &[0, 0, 0, 0]).is_err()); - assert!(bus.read(0x06, &mut [0, 0, 0, 0]).is_err()); - assert!(bus.write(0x06, &[0, 0, 0, 0]).is_err()); + let dummy = Arc::new(DummyDevice); + bus.insert(dummy.clone(), 0x10, 0x10).unwrap(); + bus.read(0x10, &mut [0, 0, 0, 0]).unwrap(); + bus.write(0x10, &[0, 0, 0, 0]).unwrap(); + bus.read(0x11, &mut [0, 0, 0, 0]).unwrap(); + bus.write(0x11, &[0, 0, 0, 0]).unwrap(); + bus.read(0x16, &mut [0, 0, 0, 0]).unwrap(); + bus.write(0x16, &[0, 0, 0, 0]).unwrap(); + bus.read(0x20, &mut [0, 0, 0, 0]).unwrap_err(); + bus.write(0x20, &[0, 0, 0, 0]).unwrap_err(); + bus.read(0x06, &mut [0, 0, 0, 0]).unwrap_err(); + bus.write(0x06, &[0, 0, 0, 0]).unwrap_err(); } #[test] 
#[allow(clippy::redundant_clone)] fn bus_read_write_values() { let bus = Bus::new(); - let dummy = Arc::new(Mutex::new(ConstantDevice)); - assert!(bus.insert(dummy.clone(), 0x10, 0x10).is_ok()); + let dummy = Arc::new(ConstantDevice); + bus.insert(dummy.clone(), 0x10, 0x10).unwrap(); let mut values = [0, 1, 2, 3]; - assert!(bus.read(0x10, &mut values).is_ok()); + bus.read(0x10, &mut values).unwrap(); assert_eq!(values, [0, 1, 2, 3]); - assert!(bus.write(0x10, &values).is_ok()); - assert!(bus.read(0x15, &mut values).is_ok()); + bus.write(0x10, &values).unwrap(); + bus.read(0x15, &mut values).unwrap(); assert_eq!(values, [5, 6, 7, 8]); - assert!(bus.write(0x15, &values).is_ok()); + bus.write(0x15, &values).unwrap(); } #[test] @@ -337,10 +352,10 @@ mod tests { let bus = Bus::new(); let mut data = [1, 2, 3, 4]; - let device = Arc::new(Mutex::new(DummyDevice)); - assert!(bus.insert(device.clone(), 0x10, 0x10).is_ok()); - assert!(bus.write(0x10, &data).is_ok()); - assert!(bus.read(0x10, &mut data).is_ok()); + let device = Arc::new(DummyDevice); + bus.insert(device.clone(), 0x10, 0x10).unwrap(); + bus.write(0x10, &data).unwrap(); + bus.read(0x10, &mut data).unwrap(); assert_eq!(data, [1, 2, 3, 4]); } diff --git a/src/vm-device/src/dma_mapping/mod.rs b/src/vm-device/src/dma_mapping/mod.rs index 62b5ceb1ced..69cd880eea3 100644 --- a/src/vm-device/src/dma_mapping/mod.rs +++ b/src/vm-device/src/dma_mapping/mod.rs @@ -2,8 +2,8 @@ // // SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause -pub mod vfio; - +/// Trait to trigger DMA mapping updates for devices managed by virtio-iommu +/// /// Trait meant for triggering the DMA mapping update related to an external /// device not managed fully through virtio. 
It is dedicated to virtio-iommu /// in order to trigger the map update anytime the mapping is updated from the diff --git a/src/vm-device/src/dma_mapping/vfio.rs b/src/vm-device/src/dma_mapping/vfio.rs deleted file mode 100644 index 5ed7f516887..00000000000 --- a/src/vm-device/src/dma_mapping/vfio.rs +++ /dev/null @@ -1,73 +0,0 @@ -// Copyright © 2021 Intel Corporation -// -// SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause - -use crate::dma_mapping::ExternalDmaMapping; -use std::io; -use std::sync::Arc; -use vfio_ioctls::VfioContainer; -use vm_memory::{GuestAddress, GuestAddressSpace, GuestMemory}; - -/// This structure implements the ExternalDmaMapping trait. It is meant to -/// be used when the caller tries to provide a way to update the mappings -/// associated with a specific VFIO container. -pub struct VfioDmaMapping { - container: Arc, - memory: Arc, -} - -impl VfioDmaMapping { - /// Create a DmaMapping object. - /// - /// # Parameters - /// * `container`: VFIO container object. - /// * `memory·: guest memory to mmap. 
- pub fn new(container: Arc, memory: Arc) -> Self { - VfioDmaMapping { container, memory } - } -} - -impl ExternalDmaMapping for VfioDmaMapping { - fn map(&self, iova: u64, gpa: u64, size: u64) -> std::result::Result<(), io::Error> { - let mem = self.memory.memory(); - let guest_addr = GuestAddress(gpa); - let user_addr = if mem.check_range(guest_addr, size as usize) { - mem.get_host_address(guest_addr).unwrap() as u64 - } else { - return Err(io::Error::new( - io::ErrorKind::Other, - format!( - "failed to convert guest address 0x{:x} into \ - host user virtual address", - gpa - ), - )); - }; - - self.container - .vfio_dma_map(iova, size, user_addr) - .map_err(|e| { - io::Error::new( - io::ErrorKind::Other, - format!( - "failed to map memory for VFIO container, \ - iova 0x{:x}, gpa 0x{:x}, size 0x{:x}: {:?}", - iova, gpa, size, e - ), - ) - }) - } - - fn unmap(&self, iova: u64, size: u64) -> std::result::Result<(), io::Error> { - self.container.vfio_dma_unmap(iova, size).map_err(|e| { - io::Error::new( - io::ErrorKind::Other, - format!( - "failed to unmap memory for VFIO container, \ - iova 0x{:x}, size 0x{:x}: {:?}", - iova, size, e - ), - ) - }) - } -} diff --git a/src/vm-device/src/interrupt/mod.rs b/src/vm-device/src/interrupt/mod.rs index 02fb028743c..7bdf7940a47 100644 --- a/src/vm-device/src/interrupt/mod.rs +++ b/src/vm-device/src/interrupt/mod.rs @@ -123,7 +123,7 @@ pub struct MsiIrqGroupConfig { /// /// The InterruptManager implementations should protect itself from concurrent accesses internally, /// so it could be invoked from multi-threaded context. -pub trait InterruptManager: { +pub trait InterruptManager: Send + Sync { type GroupConfig; /// Create an [InterruptSourceGroup](trait.InterruptSourceGroup.html) object to manage @@ -136,8 +136,7 @@ pub trait InterruptManager: { /// * interrupt_type: type of interrupt source. /// * base: base Interrupt Source ID to be managed by the group object. 
/// * count: number of Interrupt Sources to be managed by the group object. - fn create_group(&self, config: Self::GroupConfig) - -> Result>>; + fn create_group(&self, config: Self::GroupConfig) -> Result>; /// Destroy an [InterruptSourceGroup](trait.InterruptSourceGroup.html) object created by /// [create_group()](trait.InterruptManager.html#tymethod.create_group). @@ -145,7 +144,7 @@ pub trait InterruptManager: { /// Assume the caller takes the responsibility to disable all interrupt sources of the group /// before calling destroy_group(). This assumption helps to simplify InterruptSourceGroup /// implementations. - fn destroy_group(&self, group: Arc>) -> Result<()>; + fn destroy_group(&self, group: Arc) -> Result<()>; } pub trait InterruptSourceGroup: Send + Sync { @@ -179,19 +178,16 @@ pub trait InterruptSourceGroup: Send + Sync { /// # Arguments /// * index: sub-index into the group. /// * config: configuration data for the interrupt source. - fn update(&self, index: InterruptIndex, config: InterruptSourceConfig) -> Result<()>; - - /// Mask an interrupt from this interrupt source. - fn mask(&self, _index: InterruptIndex) -> Result<()> { - // Not all interrupt sources can be disabled. - // To accommodate this, we can have a no-op here. - Ok(()) - } - - /// Unmask an interrupt from this interrupt source. - fn unmask(&self, _index: InterruptIndex) -> Result<()> { - // Not all interrupt sources can be disabled. - // To accommodate this, we can have a no-op here. - Ok(()) - } + /// * masked: if the interrupt is masked + /// * set_gsi: whether update the GSI routing table. + fn update( + &self, + index: InterruptIndex, + config: InterruptSourceConfig, + masked: bool, + set_gsi: bool, + ) -> Result<()>; + + /// Set the interrupt group GSI routing table. 
+ fn set_gsi(&self) -> Result<()>; } diff --git a/src/vm-device/src/lib.rs b/src/vm-device/src/lib.rs index f5977a16144..c10731ea95a 100644 --- a/src/vm-device/src/lib.rs +++ b/src/vm-device/src/lib.rs @@ -3,25 +3,16 @@ // SPDX-License-Identifier: Apache-2.0 // -#[macro_use] -extern crate serde_derive; -extern crate vm_memory; - -use std::io; +use serde::{Deserialize, Serialize}; mod bus; pub mod dma_mapping; pub mod interrupt; -pub use self::bus::{Bus, BusDevice, Error as BusError}; - -#[derive(Debug)] -pub enum Error { - IoError(io::Error), -} +pub use self::bus::{Bus, BusDevice, BusDeviceSync, Error as BusError}; /// Type of Message Signalled Interrupt -#[derive(Copy, Clone, Debug, PartialEq, Serialize, Deserialize)] +#[derive(Copy, Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] pub enum MsiIrqType { /// PCI MSI IRQ numbers. PciMsi, @@ -31,6 +22,13 @@ pub enum MsiIrqType { GenericMsi, } +#[derive(Copy, Clone, PartialEq, Eq, Serialize, Deserialize, Debug)] +pub enum PciBarType { + Io, + Mmio32, + Mmio64, +} + /// Enumeration for device resources. #[allow(missing_docs)] #[derive(Clone, Debug, Serialize, Deserialize)] @@ -39,6 +37,14 @@ pub enum Resource { PioAddressRange { base: u16, size: u16 }, /// Memory Mapped IO address range. MmioAddressRange { base: u64, size: u64 }, + /// PCI BAR + PciBar { + index: usize, + base: u64, + size: u64, + type_: PciBarType, + prefetchable: bool, + }, /// Legacy IRQ number. 
LegacyIrq(u32), /// Message Signaled Interrupt diff --git a/src/vm-system-allocator/Cargo.toml b/src/vm-system-allocator/Cargo.toml index 68253138453..8f33386e8cb 100644 --- a/src/vm-system-allocator/Cargo.toml +++ b/src/vm-system-allocator/Cargo.toml @@ -1,9 +1,9 @@ [package] name = "vm-system-allocator" -version = "0.1.0" authors = ["The Chromium OS Authors"] -edition = "2018" +edition = "2021" +version = "0.1.0" [dependencies] -libc = "0.2.159" +libc = "0.2.158" vm-memory = "0.15.0" diff --git a/src/vm-system-allocator/src/address.rs b/src/vm-system-allocator/src/address.rs index 30d8ec5ef17..14e0335cdac 100644 --- a/src/vm-system-allocator/src/address.rs +++ b/src/vm-system-allocator/src/address.rs @@ -9,6 +9,7 @@ use std::collections::btree_map::BTreeMap; use std::result; + use vm_memory::{Address, GuestAddress, GuestUsize}; #[derive(Debug)] @@ -201,6 +202,16 @@ impl AddressAllocator { } } } + + /// Start address of the allocator + pub fn base(&self) -> GuestAddress { + self.base + } + + /// Last address of the allocator + pub fn end(&self) -> GuestAddress { + self.end + } } #[cfg(test)] @@ -209,10 +220,7 @@ mod tests { #[test] fn new_fails_overflow() { - assert_eq!( - AddressAllocator::new(GuestAddress(u64::max_value()), 0x100), - None - ); + assert_eq!(AddressAllocator::new(GuestAddress(u64::MAX), 0x100), None); } #[test] diff --git a/src/vm-system-allocator/src/arch/aarch64/layout.rs b/src/vm-system-allocator/src/arch/aarch64/layout.rs deleted file mode 100644 index 922cfbb66e6..00000000000 --- a/src/vm-system-allocator/src/arch/aarch64/layout.rs +++ /dev/null @@ -1,84 +0,0 @@ -// Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
-// SPDX-License-Identifier: Apache-2.0 - -// ==== Address map in use in ARM development systems today ==== -// -// - 32-bit - - 36-bit - - 40-bit - -// 1024GB + + +-------------------+ <- 40-bit -// | | DRAM | -// ~ ~ ~ ~ -// | | | -// | | | -// | | | -// | | | -// 544GB + + +-------------------+ -// | | Hole or DRAM | -// | | | -// 512GB + + +-------------------+ -// | | Mapped | -// | | I/O | -// ~ ~ ~ ~ -// | | | -// 256GB + + +-------------------+ -// | | Reserved | -// ~ ~ ~ ~ -// | | | -// 64GB + +-----------------------+-------------------+ <- 36-bit -// | | DRAM | -// ~ ~ ~ ~ -// | | | -// | | | -// 34GB + +-----------------------+-------------------+ -// | | Hole or DRAM | -// 32GB + +-----------------------+-------------------+ -// | | Mapped I/O | -// ~ ~ ~ ~ -// | | | -// 16GB + +-----------------------+-------------------+ -// | | Reserved | -// ~ ~ ~ ~ -// 4GB +-------------------+-----------------------+-------------------+ <- 32-bit -// | 2GB of DRAM | -// | | -// 2GB +-------------------+-----------------------+-------------------+ -// | Mapped I/O | -// 1GB +-------------------+-----------------------+-------------------+ -// | ROM & RAM & I/O | -// 0GB +-------------------+-----------------------+-------------------+ 0 -// - 32-bit - - 36-bit - - 40-bit - -// -// Taken from (http://infocenter.arm.com/help/topic/com.arm.doc.den0001c/DEN0001C_principles_of_arm_memory_maps.pdf). - -/// Start of RAM on 64 bit ARM. -pub const DRAM_MEM_START: u64 = 0x8000_0000; // 2 GB. -/// The maximum RAM size. -pub const DRAM_MEM_MAX_SIZE: usize = 0x00FF_8000_0000; // 1024 - 2 = 1022G. - -/// Start of RAM on 64 bit ARM. -pub const SYSTEM_MEM_START: u64 = DRAM_MEM_START; - -/// This is used by ACPI device manager for acpi tables or devices like vmgenid -/// In reality, 2MBs is an overkill, but immediately after this we write the kernel -/// image, which needs to be 2MB aligned. -pub const SYSTEM_MEM_SIZE: u64 = 0x20_0000; - -/// Kernel command line maximum size. 
-/// As per `arch/arm64/include/uapi/asm/setup.h`. -pub const CMDLINE_MAX_SIZE: usize = 2048; - -/// Maximum size of the device tree blob as specified in https://www.kernel.org/doc/Documentation/arm64/booting.txt. -pub const FDT_MAX_SIZE: usize = 0x20_0000; - -// As per virt/kvm/arm/vgic/vgic-kvm-device.c we need -// the number of interrupts our GIC will support to be: -// * bigger than 32 -// * less than 1023 and -// * a multiple of 32. -/// The highest usable SPI on aarch64. -pub const IRQ_MAX: u32 = 128; - -/// First usable interrupt on aarch64. -pub const IRQ_BASE: u32 = 32; - -/// Below this address will reside the GIC, above this address will reside the MMIO devices. -pub const MAPPED_IO_START: u64 = 1 << 30; // 1 GB diff --git a/src/vm-system-allocator/src/arch/aarch64/mod.rs b/src/vm-system-allocator/src/arch/aarch64/mod.rs deleted file mode 100644 index 6bbb87c941b..00000000000 --- a/src/vm-system-allocator/src/arch/aarch64/mod.rs +++ /dev/null @@ -1,11 +0,0 @@ -// Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -/// Layout for this aarch64 system. -pub mod layout; -/// Logic for configuring aarch64 registers. - -/// The start of the memory area reserved for MMIO devices. -pub const MMIO_MEM_START: u64 = layout::MAPPED_IO_START; -/// The size of the memory area reserved for MMIO devices. -pub const MMIO_MEM_SIZE: u64 = layout::DRAM_MEM_START - layout::MAPPED_IO_START; //>> 1GB diff --git a/src/vm-system-allocator/src/arch/mod.rs b/src/vm-system-allocator/src/arch/mod.rs deleted file mode 100644 index d2faa14ae09..00000000000 --- a/src/vm-system-allocator/src/arch/mod.rs +++ /dev/null @@ -1,19 +0,0 @@ -// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -/// Module for aarch64 related functionality. 
-#[cfg(target_arch = "aarch64")] -pub mod aarch64; - -#[cfg(target_arch = "aarch64")] -pub use aarch64::{ - layout::CMDLINE_MAX_SIZE, layout::IRQ_BASE, layout::IRQ_MAX, layout::SYSTEM_MEM_SIZE, - layout::SYSTEM_MEM_START, -}; - -/// Module for x86_64 related functionality. -#[cfg(target_arch = "x86_64")] -pub mod x86_64; - -#[cfg(target_arch = "x86_64")] -pub use crate::arch::x86_64::{layout::IRQ_BASE, layout::IRQ_MAX,}; diff --git a/src/vm-system-allocator/src/arch/x86_64/layout.rs b/src/vm-system-allocator/src/arch/x86_64/layout.rs deleted file mode 100644 index 1241508efb6..00000000000 --- a/src/vm-system-allocator/src/arch/x86_64/layout.rs +++ /dev/null @@ -1,9 +0,0 @@ -// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 -// - -// Typically, on x86 systems 24 IRQs are used (0-23). -/// First usable IRQ ID for virtio device interrupts on x86_64. -pub const IRQ_BASE: u32 = 5; -/// Last usable IRQ ID for virtio device interrupts on x86_64. -pub const IRQ_MAX: u32 = 23; diff --git a/src/vm-system-allocator/src/arch/x86_64/mod.rs b/src/vm-system-allocator/src/arch/x86_64/mod.rs deleted file mode 100644 index cc401574613..00000000000 --- a/src/vm-system-allocator/src/arch/x86_64/mod.rs +++ /dev/null @@ -1,9 +0,0 @@ -// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 -// -// Portions Copyright 2017 The Chromium OS Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the THIRD-PARTY file. - -/// Layout for the x86_64 system. 
-pub mod layout; diff --git a/src/vm-system-allocator/src/gsi.rs b/src/vm-system-allocator/src/gsi.rs index ed01bbfd4cb..19dec1776f5 100644 --- a/src/vm-system-allocator/src/gsi.rs +++ b/src/vm-system-allocator/src/gsi.rs @@ -63,7 +63,6 @@ impl GsiAllocator { } #[cfg(target_arch = "aarch64")] - #[allow(clippy::new_without_default)] /// New GSI allocator pub fn new() -> Self { GsiAllocator { @@ -106,3 +105,10 @@ impl GsiAllocator { Ok(irq) } } + +#[cfg(target_arch = "aarch64")] +impl Default for GsiAllocator { + fn default() -> Self { + GsiAllocator::new() + } +} diff --git a/src/vm-system-allocator/src/lib.rs b/src/vm-system-allocator/src/lib.rs index c911a87792a..801eff9faaa 100644 --- a/src/vm-system-allocator/src/lib.rs +++ b/src/vm-system-allocator/src/lib.rs @@ -10,13 +10,11 @@ //! Manages system resources that can be allocated to VMs and their devices. -extern crate libc; -extern crate vm_memory; - mod address; mod gsi; +/// page size related utility functions +pub mod page_size; mod system; -mod arch; pub use crate::address::AddressAllocator; pub use crate::gsi::GsiAllocator; diff --git a/src/vm-system-allocator/src/page_size.rs b/src/vm-system-allocator/src/page_size.rs new file mode 100644 index 00000000000..96ae01edf79 --- /dev/null +++ b/src/vm-system-allocator/src/page_size.rs @@ -0,0 +1,38 @@ +// Copyright 2023 Arm Limited (or its affiliates). All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +use libc::{sysconf, _SC_PAGESIZE}; + +/// get host page size +pub fn get_page_size() -> u64 { + // SAFETY: FFI call. Trivially safe. 
+ unsafe { sysconf(_SC_PAGESIZE) as u64 } +} + +/// round up address to let it align page size +pub fn align_page_size_up(address: u64) -> u64 { + let page_size = get_page_size(); + (address + page_size - 1) & !(page_size - 1) +} + +/// round down address to let it align page size +pub fn align_page_size_down(address: u64) -> u64 { + let page_size = get_page_size(); + address & !(page_size - 1) +} + +/// Test if address is 4k aligned +pub fn is_4k_aligned(address: u64) -> bool { + (address & 0xfff) == 0 +} + +/// Test if size is 4k aligned +pub fn is_4k_multiple(size: u64) -> bool { + (size & 0xfff) == 0 +} + +/// Test if address is page size aligned +pub fn is_page_size_aligned(address: u64) -> bool { + let page_size = get_page_size(); + address & (page_size - 1) == 0 +} diff --git a/src/vm-system-allocator/src/system.rs b/src/vm-system-allocator/src/system.rs index e73ee8b4844..e4031bfa22a 100644 --- a/src/vm-system-allocator/src/system.rs +++ b/src/vm-system-allocator/src/system.rs @@ -13,15 +13,7 @@ use crate::address::AddressAllocator; use crate::gsi::GsiAllocator; #[cfg(target_arch = "x86_64")] use crate::gsi::GsiApic; - -use libc::{sysconf, _SC_PAGESIZE}; - -/// Safe wrapper for `sysconf(_SC_PAGESIZE)`. -#[inline(always)] -fn pagesize() -> usize { - // Trivially safe - unsafe { sysconf(_SC_PAGESIZE) as usize } -} +use crate::page_size::get_page_size; /// Manages allocating system resources such as address space and interrupt numbers. 
/// @@ -37,53 +29,49 @@ fn pagesize() -> usize { /// #[cfg(target_arch = "x86_64")] GuestAddress(0x1000), /// #[cfg(target_arch = "x86_64")] 0x10000, /// GuestAddress(0x10000000), 0x10000000, -/// GuestAddress(0x20000000), 0x100000, /// #[cfg(target_arch = "x86_64")] vec![GsiApic::new(5, 19)]).unwrap(); /// #[cfg(target_arch = "x86_64")] /// assert_eq!(allocator.allocate_irq(), Some(5)); /// #[cfg(target_arch = "aarch64")] -/// assert_eq!(allocator.allocate_irq(), Some(0)); +/// assert_eq!(allocator.allocate_irq(), Some(32)); /// #[cfg(target_arch = "x86_64")] /// assert_eq!(allocator.allocate_irq(), Some(6)); /// #[cfg(target_arch = "aarch64")] -/// assert_eq!(allocator.allocate_irq(), Some(1)); -/// assert_eq!(allocator.allocate_mmio_addresses(None, 0x1000, Some(0x1000)), Some(GuestAddress(0x1fff_f000))); +/// assert_eq!(allocator.allocate_irq(), Some(33)); +/// assert_eq!(allocator.allocate_platform_mmio_addresses(None, 0x1000, Some(0x1000)), Some(GuestAddress(0x1fff_f000))); /// /// ``` pub struct SystemAllocator { #[cfg(target_arch = "x86_64")] io_address_space: AddressAllocator, - mmio_address_space: AddressAllocator, - mmio_hole_address_space: AddressAllocator, + platform_mmio_address_space: AddressAllocator, gsi_allocator: GsiAllocator, } impl SystemAllocator { - /// Creates a new `SystemAllocator` for managing addresses and irq numvers. + /// Creates a new `SystemAllocator` for managing addresses and irq numbers. /// Can return `None` if `base` + `size` overflows a u64 /// /// * `io_base` - (X86) The starting address of IO memory. /// * `io_size` - (X86) The size of IO memory. - /// * `mmio_base` - The starting address of MMIO memory. - /// * `mmio_size` - The size of MMIO memory. - /// * `mmio_hole_base` - The starting address of MMIO memory in 32-bit address space. - /// * `mmio_hole_size` - The size of MMIO memory in 32-bit address space. + /// * `platform_mmio_base` - The starting address of platform MMIO memory. 
+ /// * `platform_mmio_size` - The size of platform MMIO memory. /// * `apics` - (X86) Vector of APIC's. /// pub fn new( #[cfg(target_arch = "x86_64")] io_base: GuestAddress, #[cfg(target_arch = "x86_64")] io_size: GuestUsize, - mmio_base: GuestAddress, - mmio_size: GuestUsize, - mmio_hole_base: GuestAddress, - mmio_hole_size: GuestUsize, + platform_mmio_base: GuestAddress, + platform_mmio_size: GuestUsize, #[cfg(target_arch = "x86_64")] apics: Vec, ) -> Option { Some(SystemAllocator { #[cfg(target_arch = "x86_64")] io_address_space: AddressAllocator::new(io_base, io_size)?, - mmio_address_space: AddressAllocator::new(mmio_base, mmio_size)?, - mmio_hole_address_space: AddressAllocator::new(mmio_hole_base, mmio_hole_size)?, + platform_mmio_address_space: AddressAllocator::new( + platform_mmio_base, + platform_mmio_size, + )?, #[cfg(target_arch = "x86_64")] gsi_allocator: GsiAllocator::new(apics), #[cfg(target_arch = "aarch64")] @@ -113,31 +101,17 @@ impl SystemAllocator { .allocate(address, size, Some(align_size.unwrap_or(0x1))) } - /// Reserves a section of `size` bytes of MMIO address space. - pub fn allocate_mmio_addresses( - &mut self, - address: Option, - size: GuestUsize, - align_size: Option, - ) -> Option { - self.mmio_address_space.allocate( - address, - size, - Some(align_size.unwrap_or(pagesize() as u64)), - ) - } - - /// Reserves a section of `size` bytes of MMIO address space. - pub fn allocate_mmio_hole_addresses( + /// Reserves a section of `size` bytes of platform MMIO address space. + pub fn allocate_platform_mmio_addresses( &mut self, address: Option, size: GuestUsize, align_size: Option, ) -> Option { - self.mmio_hole_address_space.allocate( + self.platform_mmio_address_space.allocate( address, size, - Some(align_size.unwrap_or(pagesize() as u64)), + Some(align_size.unwrap_or_else(get_page_size)), ) } @@ -148,15 +122,9 @@ impl SystemAllocator { self.io_address_space.free(address, size) } - /// Free an MMIO address range. 
- /// We can only free a range if it matches exactly an already allocated range. - pub fn free_mmio_addresses(&mut self, address: GuestAddress, size: GuestUsize) { - self.mmio_address_space.free(address, size) - } - - /// Free an MMIO address range from the 32 bits hole. + /// Free a platform MMIO address range. /// We can only free a range if it matches exactly an already allocated range. - pub fn free_mmio_hole_addresses(&mut self, address: GuestAddress, size: GuestUsize) { - self.mmio_hole_address_space.free(address, size) + pub fn free_platform_mmio_addresses(&mut self, address: GuestAddress, size: GuestUsize) { + self.platform_mmio_address_space.free(address, size) } } diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index e37c6de0ddf..26c05006811 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -28,8 +28,7 @@ use seccompiler::BpfThreadMap; use userfaultfd::Uffd; use utils::time::TimestampUs; use vfio_ioctls::{VfioContainer, VfioDevice, VfioDeviceFd}; -use vm_allocator::AddressAllocator; -use vm_system_allocator::{GsiApic, SystemAllocator}; +use vm_system_allocator::{GsiApic, AddressAllocator, SystemAllocator}; use vm_device::interrupt::{InterruptManager, MsiIrqGroupConfig}; use vm_memory::ReadVolatile; #[cfg(target_arch = "aarch64")] @@ -42,7 +41,7 @@ compile_error!("GDB feature not supported on ARM"); #[cfg(target_arch = "x86_64")] use crate::acpi; -use crate::arch::InitrdConfig; +use crate::arch::{InitrdConfig, MEM_32BIT_DEVICES_SIZE, MEM_32BIT_DEVICES_START}; #[cfg(target_arch = "aarch64")] use crate::construct_kvm_mpidrs; use crate::cpu_config::templates::{ @@ -182,12 +181,26 @@ fn add_vfio_device( memory: GuestMemoryMmap, allocator: Arc> ) { + // alignment 4 << 10 + let pci_mmio32_allocator = Arc::new(Mutex::new( + AddressAllocator::new(GuestAddress(MEM_32BIT_DEVICES_START), MEM_32BIT_DEVICES_SIZE).unwrap(), + )); + + // alignment 4 << 30 + let pci_mmio64_allocator = Arc::new(Mutex::new( + AddressAllocator::new( + 
GuestAddress(0), + mmio_address_space_size(46), + ).unwrap() + )); + // We need to shift the device id since the 3 first bits // are dedicated to the PCI function, and we know we don't // do multifunction. Also, because we only support one PCI // bus, the bus 0, we don't need to add anything to the // global device ID. - let pci_device_bdf = pci.lock().expect("bad lock").next_device_id().unwrap() << 3; + let pci_device_id = pci.lock().expect("bad lock").next_device_id().unwrap(); + let pci_device_bdf = pci_device_id << 3; // Safe because we know the RawFd is valid. // @@ -222,7 +235,24 @@ fn add_vfio_device( let vfio_pci_device = - BusDevice::VfioPciDevice(VfioPciDevice::new(vm, vfio_device, vfio_container.clone(), &interrupt_manager, None, false).unwrap()); + BusDevice::VfioPciDevice(VfioPciDevice::new( + pci_device_id.to_string(), + vm, + vfio_device, + vfio_container.clone(), + interrupt_manager, + None, + false, + pci_device_bdf.into(), + Arc::new(move || { + static mut CURRENT: u32 = 1; + unsafe { + CURRENT += 1; + CURRENT + } + }), + None + ).unwrap()); let vfio_pci_device = Arc::new(Mutex::new(vfio_pci_device)); let bars = vfio_pci_device @@ -230,7 +260,12 @@ fn add_vfio_device( .expect("bad lock") .vfio_pci_device_mut() .unwrap() - .allocate_bars(&mut allocator.lock().expect("Poisoned lock")) + .allocate_bars( + &allocator, + &mut pci_mmio32_allocator.lock().unwrap(), + &mut pci_mmio64_allocator.lock().unwrap(), + None, + ) .unwrap(); // Register DMA mapping in IOMMU. 
@@ -335,8 +370,8 @@ fn create_vmm_and_vcpus( }, GuestAddress(0), mmio_address_space_size(46), - GuestAddress(crate::arch::MEM_32BIT_DEVICES_START), - crate::arch::MEM_32BIT_DEVICES_SIZE, + // GuestAddress(crate::arch::MEM_32BIT_DEVICES_START), + // crate::arch::MEM_32BIT_DEVICES_SIZE, #[cfg(target_arch = "x86_64")] vec![GsiApic::new( X86_64_IRQ_BASE, diff --git a/src/vmm/src/devices/pci.rs b/src/vmm/src/devices/pci.rs index 5f4536a44a1..8c59d5dcc87 100644 --- a/src/vmm/src/devices/pci.rs +++ b/src/vmm/src/devices/pci.rs @@ -1,22 +1,29 @@ // Copyright 2018 The Chromium OS Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE-BSD-3-Clause file. +// +// SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause -use pci::configuration::{ - PciBarRegionType, PciBridgeSubclass, PciClassCode, PciConfiguration, PciHeaderType, -}; -use pci::device::PciDevice; -use byteorder::{ByteOrder, LittleEndian}; -use log::error; use std::any::Any; use std::collections::HashMap; -use std::fmt; use std::fmt::{Debug, Formatter}; +use std::ops::DerefMut; use std::sync::{Arc, Barrier, Mutex}; -use vm_memory::{Address, GuestAddress, GuestUsize}; + +use byteorder::{ByteOrder, LittleEndian}; + +use pci::configuration::{ + PciBarRegionType, PciBridgeSubclass, PciClassCode, PciConfiguration, PciHeaderType, +}; +use pci::device::{Error as PciDeviceError, PciDevice}; +use pci::PciBarConfiguration; use super::{Bus, BusDevice}; +use crate::logger::error; + +use std::fmt; + const VENDOR_ID_INTEL: u16 = 0x8086; const DEVICE_ID_INTEL_VIRT_PCIE_HOST: u16 = 0x0d57; const NUM_DEVICE_IDS: usize = 32; @@ -24,8 +31,14 @@ const NUM_DEVICE_IDS: usize = 32; /// Errors for device manager. #[derive(Debug)] pub enum PciRootError { + /// Could not allocate device address space for the device. + AllocateDeviceAddrs(PciDeviceError), /// Could not allocate an IRQ number. AllocateIrq, + /// Could not add a device to the port io bus. 
+ PioInsert(vm_device::BusError), + /// Could not add a device to the mmio bus. + MmioInsert(vm_device::BusError), /// Could not find an available device slot on the PCI bus. NoPciDeviceSlotAvailable, /// Invalid PCI device identifier provided. @@ -66,6 +79,7 @@ impl PciRoot { 0, 0, None, + None, ), } } @@ -90,6 +104,10 @@ impl PciDevice for PciRoot { fn as_any(&mut self) -> &mut dyn Any { self } + + fn id(&self) -> Option { + None + } } pub struct PciBus { @@ -115,23 +133,24 @@ impl PciBus { pub fn register_mapping( &self, - dev: Arc>, - io_bus: &mut Bus, + dev: Arc>, + #[cfg(target_arch = "x86_64")] io_bus: &mut Bus, mmio_bus: &mut Bus, - bars: Vec<(GuestAddress, GuestUsize, PciBarRegionType)>, + bars: Vec, ) -> Result<()> { - for (address, size, type_) in bars { - match type_ { + for bar in bars { + match bar.region_type() { PciBarRegionType::IoRegion => { + #[cfg(target_arch = "x86_64")] io_bus - .insert(dev.clone(), address.raw_value(), size) + .insert(dev.clone(), bar.addr(), bar.size()) .unwrap(); - error!("cannot register bus mappings {:x} {:x} IO", address.0, size); + error!("cannot register bus mappings {:x} {:x} IO", bar.addr(), bar.size()); } PciBarRegionType::Memory32BitRegion | PciBarRegionType::Memory64BitRegion => { - error!("Registering bus mappings {:x} {:x}", address.0, size); + error!("Registering bus mappings {:x} {:x}", bar.addr(), bar.size()); mmio_bus - .insert(dev.clone(), address.raw_value(), size) + .insert(dev.clone(), bar.addr(), bar.size()) .unwrap(); } } @@ -139,12 +158,8 @@ impl PciBus { Ok(()) } - pub fn add_device( - &mut self, - pci_device_bdf: u32, - device: Arc>, - ) -> Result<()> { - self.devices.insert(pci_device_bdf >> 3, device); + pub fn add_device(&mut self, device_id: u32, device: Arc>) -> Result<()> { + self.devices.insert(device_id, device); Ok(()) } @@ -203,8 +218,8 @@ impl Debug for PciConfigIo { impl PciConfigIo { pub fn new(pci_bus: Arc>) -> Self { PciConfigIo { - pci_bus, config_address: 0, + pci_bus, } } @@ -233,6 
+248,7 @@ impl PciConfigIo { } self.pci_bus + .as_ref() .lock() .unwrap() .devices @@ -260,10 +276,12 @@ impl PciConfigIo { return None; } - let pci_bus = self.pci_bus.lock().unwrap(); + let pci_bus = self.pci_bus.as_ref().lock().unwrap(); if let Some(d) = pci_bus.devices.get(&(device as u32)) { let mut device = d.lock().unwrap(); + // Find out if one of the device's BAR is being reprogrammed, and + // reprogram it if needed. if let Some(params) = device.pci_device_mut().unwrap().detect_bar_reprogramming(register, data) { // if let Err(e) = pci_bus.device_reloc.move_bar( // params.old_base, @@ -272,11 +290,6 @@ impl PciConfigIo { // device.deref_mut(), // params.region_type, // ) { - // error!( - // "Failed moving device BAR: {}: 0x{:x}->0x{:x}(0x{:x})", - // e, params.old_base, params.new_base, params.len - // ); - // } error!( "Failed moving device BAR: 0x{:x}->0x{:x}(0x{:x})", params.old_base, params.new_base, params.len @@ -397,6 +410,8 @@ impl PciConfigMmio { if let Some(d) = pci_bus.devices.get(&(device as u32)) { let mut device = d.lock().unwrap(); + // Find out if one of the device's BAR is being reprogrammed, and + // reprogram it if needed. if let Some(params) = device.pci_device_mut().unwrap().detect_bar_reprogramming(register, data) { // if let Err(e) = pci_bus.device_reloc.move_bar( // params.old_base, @@ -427,7 +442,7 @@ impl PciConfigMmio { // Only allow reads to the register boundary. 
let start = offset as usize % 4; let end = start + data.len(); - if end > 4 || offset > u64::from(u32::max_value()) { + if end > 4 || offset > u64::from(u32::MAX) { for d in data { *d = 0xff; } diff --git a/src/vmm/src/interrupt.rs b/src/vmm/src/interrupt.rs index 6b82a6cfca7..6218b27a9ff 100644 --- a/src/vmm/src/interrupt.rs +++ b/src/vmm/src/interrupt.rs @@ -3,8 +3,6 @@ // SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause // -// use devices::interrupt_controller::InterruptController; -// use hypervisor::IrqRoutingEntry; use std::collections::HashMap; use std::io; use std::sync::atomic::{AtomicBool, Ordering}; @@ -13,6 +11,7 @@ use vm_device::interrupt::{ InterruptIndex, InterruptManager, InterruptSourceConfig, InterruptSourceGroup, MsiIrqGroupConfig, }; use kvm_ioctls::{VmFd}; +use vmm_sys_util::eventfd::EventFd; /// Reuse std::io::Result to simplify interoperability among crates. pub type Result = std::io::Result; @@ -42,7 +41,7 @@ impl InterruptRoute { vm.register_irqfd(&self.irq_fd, self.gsi).map_err(|e| { io::Error::new( io::ErrorKind::Other, - format!("Failed registering irq_fd: {}", e), + format!("Failed registering irq_fd: {e}"), ) })?; @@ -58,7 +57,7 @@ impl InterruptRoute { vm.unregister_irqfd(&self.irq_fd, self.gsi).map_err(|e| { io::Error::new( io::ErrorKind::Other, - format!("Failed unregistering irq_fd: {}", e), + format!("Failed unregistering irq_fd: {e}"), ) })?; @@ -99,7 +98,7 @@ use vm_system_allocator::SystemAllocator; impl MsiInterruptGroup { fn set_gsi_routes(&self, routes: &HashMap>) -> Result<()> { let mut entry_vec: Vec = Vec::new(); - + for i in 0..24 { let mut kvm_route = kvm_irq_routing_entry { gsi: i, @@ -135,7 +134,7 @@ impl MsiInterruptGroup { self.vm.lock().expect("Poisoned VmFd lock").set_gsi_routing(&irq_routing[0]).map_err(|e| { io::Error::new( io::ErrorKind::Other, - format!("Failed setting GSI routing: {}", e), + format!("Failed setting GSI routing: {e}"), ) }) } @@ -191,63 +190,52 @@ impl InterruptSourceGroup for 
MsiInterruptGroup { None } - fn update(&self, index: InterruptIndex, config: InterruptSourceConfig) -> Result<()> { + fn update( + &self, + index: InterruptIndex, + config: InterruptSourceConfig, + masked: bool, + set_gsi: bool, + ) -> Result<()> { if let Some(route) = self.irq_routes.get(&index) { let entry = RoutingEntry::<_>::make_entry(route.gsi, &config)?; - let mut routes = self.gsi_msi_routes.lock().unwrap(); - routes.insert(route.gsi, *entry); - return self.set_gsi_routes(&routes); - } - Err(io::Error::new( - io::ErrorKind::Other, - format!("update: Invalid interrupt index {}", index), - )) - } - fn mask(&self, index: InterruptIndex) -> Result<()> { - if let Some(route) = self.irq_routes.get(&index) { - let mut routes = self.gsi_msi_routes.lock().unwrap(); - if let Some(entry) = routes.get_mut(&route.gsi) { - entry.masked = true; - } else { - return Err(io::Error::new( - io::ErrorKind::Other, - format!("mask: No existing route for interrupt index {}", index), - )); + // When mask a msi irq, entry.masked is set to be true, + // and the gsi will not be passed to KVM through KVM_SET_GSI_ROUTING. 
+ // So it's required to call disable() (which deassign KVM_IRQFD) before + // set_gsi_routes() to avoid kernel panic (see #3827) + if masked { + route.disable(&self.vm.lock().unwrap())?; } - self.set_gsi_routes(&routes)?; - return route.disable(&self.vm.lock().expect("Poisoned lock")); - } - Err(io::Error::new( - io::ErrorKind::Other, - format!("mask: Invalid interrupt index {}", index), - )) - } - - fn unmask(&self, index: InterruptIndex) -> Result<()> { - if let Some(route) = self.irq_routes.get(&index) { let mut routes = self.gsi_msi_routes.lock().unwrap(); - if let Some(entry) = routes.get_mut(&route.gsi) { - entry.masked = false; - } else { - return Err(io::Error::new( - io::ErrorKind::Other, - format!("mask: No existing route for interrupt index {}", index), - )); + routes.insert(route.gsi, *entry); + if set_gsi { + self.set_gsi_routes(&routes)?; } - self.set_gsi_routes(&routes)?; - return route.enable(&&self.vm.lock().expect("Poisoned lock")); + + // Assign KVM_IRQFD after KVM_SET_GSI_ROUTING to avoid + // panic on kernel which not have commit a80ced6ea514 + // (KVM: SVM: fix panic on out-of-bounds guest IRQ). 
+ if !masked { + route.enable(&self.vm.lock().unwrap())?; + } + + return Ok(()); } Err(io::Error::new( io::ErrorKind::Other, - format!("unmask: Invalid interrupt index {}", index), + format!("update: Invalid interrupt index {index}"), )) } -} + fn set_gsi(&self) -> Result<()> { + let routes = self.gsi_msi_routes.lock().unwrap(); + self.set_gsi_routes(&routes) + } +} pub struct MsiInterruptManager { allocator: Arc>, vm: Arc>, @@ -273,10 +261,7 @@ impl MsiInterruptManager { impl InterruptManager for MsiInterruptManager { type GroupConfig = MsiIrqGroupConfig; - fn create_group( - &self, - config: Self::GroupConfig, - ) -> Result>> { + fn create_group(&self, config: Self::GroupConfig) -> Result> { let mut allocator = self.allocator.lock().unwrap(); let mut irq_routes: HashMap = HashMap::with_capacity(config.count as usize); @@ -284,14 +269,14 @@ impl InterruptManager for MsiInterruptManager { irq_routes.insert(i, InterruptRoute::new(&mut allocator)?); } - Ok(Arc::new(Box::new(MsiInterruptGroup::new( + Ok(Arc::new(MsiInterruptGroup::new( self.vm.clone(), self.gsi_msi_routes.clone(), irq_routes, - )))) + ))) } - fn destroy_group(&self, _group: Arc>) -> Result<()> { + fn destroy_group(&self, _group: Arc) -> Result<()> { Ok(()) } } From 2a00cdba51722720d6cf97048303ca3fe23737d8 Mon Sep 17 00:00:00 2001 From: Riccardo Mancini Date: Wed, 16 Oct 2024 15:16:42 +0100 Subject: [PATCH 04/22] fix device id the `add_device` function expects the id, not the bdf of the device Signed-off-by: Riccardo Mancini --- Cargo.lock | 31 ++++++++++++++++--------------- src/vmm/src/builder.rs | 2 +- 2 files changed, 17 insertions(+), 16 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 7931eeaabda..209373e1950 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -9,7 +9,7 @@ dependencies = [ "displaydoc", "thiserror", "vm-memory", - "zerocopy 0.8.4", + "zerocopy 0.8.5", ] [[package]] @@ -131,11 +131,12 @@ checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26" [[package]] name 
= "aws-lc-fips-sys" -version = "0.12.12" +version = "0.12.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec4795bbabc13643a8b3532184041ab41dec5740046aa15734428219cb9a0bfc" +checksum = "bf12b67bc9c5168f68655aadb2a12081689a58f1d9b1484705e4d1810ed6e4ac" dependencies = [ "bindgen 0.69.5", + "cc", "cmake", "dunce", "fs_extra", @@ -266,9 +267,9 @@ checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" [[package]] name = "cc" -version = "1.1.29" +version = "1.1.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "58e804ac3194a48bb129643eb1d62fcc20d18c6b8c181704489353d13120bcd1" +checksum = "b16803a61b81d9eabb7eae2588776c4c1e584b738ede45fdbb4c972cec1e9945" dependencies = [ "jobserver", "libc", @@ -1571,9 +1572,9 @@ dependencies = [ [[package]] name = "uuid" -version = "1.10.0" +version = "1.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "81dfa00651efa65069b0b6b651f4aaa31ba9e3c3ce0137aaad053604ee7e0314" +checksum = "f8c5f0a0af699448548ad1a2fbf920fb4bee257eae39953ba95cb84891a0446a" dependencies = [ "getrandom", "rand", @@ -1582,9 +1583,9 @@ dependencies = [ [[package]] name = "uuid-macro-internal" -version = "1.10.0" +version = "1.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ee1cd046f83ea2c4e920d6ee9f7c3537ef928d75dce5d84a87c2c5d6b3999a3a" +checksum = "6b91f57fe13a38d0ce9e28a03463d8d3c2468ed03d75375110ec71d93b449a08" dependencies = [ "proc-macro2", "quote", @@ -1745,7 +1746,7 @@ dependencies = [ "vm-superio", "vm-system-allocator", "vmm-sys-util", - "zerocopy 0.8.4", + "zerocopy 0.8.5", ] [[package]] @@ -1922,11 +1923,11 @@ dependencies = [ [[package]] name = "zerocopy" -version = "0.8.4" +version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f39ef66148c23d1ab5acda9ae26d65b88050b79e2ef638e8b560f869d991775c" +checksum = 
"6129d25825e874589a0e529175dd060c13dab4f3d960c6a0b711e5535b598bb2" dependencies = [ - "zerocopy-derive 0.8.4", + "zerocopy-derive 0.8.5", ] [[package]] @@ -1942,9 +1943,9 @@ dependencies = [ [[package]] name = "zerocopy-derive" -version = "0.8.4" +version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "88ac5bbf101d2213edf0a2ee03242f5fa15be9907123e13dc770e21a0b5b670e" +checksum = "d917df3784b4e2f5deb708d14623b2c02833890e1aa7a5dd1088998e8e9402b1" dependencies = [ "proc-macro2", "quote", diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index 26c05006811..3867d107200 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -303,7 +303,7 @@ fn add_vfio_device( pci.lock() .expect("bad lock") - .add_device(pci_device_bdf, vfio_pci_device.clone()) + .add_device(pci_device_id, vfio_pci_device.clone()) .unwrap(); pci.lock() From ae589a0eac423acda1e3a09533f9c0cdb3667b79 Mon Sep 17 00:00:00 2001 From: Riccardo Mancini Date: Mon, 21 Oct 2024 16:41:18 +0100 Subject: [PATCH 05/22] pci poc: move back pcibus definition to pci crate cleanup up some of the mess. 
ideally we'd use the pci crate but the devices implementation between cloud-hypervisor and firecracker has drifted quite a bit Signed-off-by: Riccardo Mancini --- src/pci/src/bus.rs | 80 +++--- src/pci/src/lib.rs | 4 +- src/vmm/src/builder.rs | 74 ++++-- src/vmm/src/devices/bus.rs | 112 ++++++-- src/vmm/src/devices/mod.rs | 2 - src/vmm/src/devices/pci.rs | 508 ------------------------------------- src/vmm/src/lib.rs | 1 + 7 files changed, 185 insertions(+), 596 deletions(-) delete mode 100644 src/vmm/src/devices/pci.rs diff --git a/src/pci/src/bus.rs b/src/pci/src/bus.rs index 906d8f683dc..fe70fe6d6fc 100644 --- a/src/pci/src/bus.rs +++ b/src/pci/src/bus.rs @@ -6,11 +6,11 @@ use std::any::Any; use std::collections::HashMap; +use std::fmt::Debug; use std::ops::DerefMut; use std::sync::{Arc, Barrier, Mutex}; use byteorder::{ByteOrder, LittleEndian}; -use vm_device::{Bus, BusDevice, BusDeviceSync}; use crate::configuration::{ PciBarRegionType, PciBridgeSubclass, PciClassCode, PciConfiguration, PciHeaderType, @@ -48,6 +48,14 @@ pub struct PciRoot { config: PciConfiguration, } +impl Debug for PciRoot { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("PciRoot") + // TODO + .finish() + } +} + impl PciRoot { /// Create an empty PCI root bridge. 
pub fn new(config: Option) -> Self { @@ -73,8 +81,6 @@ impl PciRoot { } } -impl BusDevice for PciRoot {} - impl PciDevice for PciRoot { fn write_config_register( &mut self, @@ -122,33 +128,6 @@ impl PciBus { } } - pub fn register_mapping( - &self, - dev: Arc, - #[cfg(target_arch = "x86_64")] io_bus: &Bus, - mmio_bus: &Bus, - bars: Vec, - ) -> Result<()> { - for bar in bars { - match bar.region_type() { - PciBarRegionType::IoRegion => { - #[cfg(target_arch = "x86_64")] - io_bus - .insert(dev.clone(), bar.addr(), bar.size()) - .map_err(PciRootError::PioInsert)?; - #[cfg(not(target_arch = "x86_64"))] - error!("I/O region is not supported"); - } - PciBarRegionType::Memory32BitRegion | PciBarRegionType::Memory64BitRegion => { - mmio_bus - .insert(dev.clone(), bar.addr(), bar.size()) - .map_err(PciRootError::MmioInsert)?; - } - } - } - Ok(()) - } - pub fn add_device(&mut self, device_id: u32, device: Arc>) -> Result<()> { self.devices.insert(device_id, device); Ok(()) @@ -199,6 +178,14 @@ pub struct PciConfigIo { pci_bus: Arc>, } +impl Debug for PciConfigIo { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("PciConfigIo") + // TODO + .finish() + } +} + impl PciConfigIo { pub fn new(pci_bus: Arc>) -> Self { PciConfigIo { @@ -301,10 +288,8 @@ impl PciConfigIo { }; self.config_address = (self.config_address & !mask) | value; } -} -impl BusDevice for PciConfigIo { - fn read(&mut self, _base: u64, offset: u64, data: &mut [u8]) { + pub fn bus_read(&mut self, _base: u64, offset: u64, data: &mut [u8]) { // `offset` is relative to 0xcf8 let value = match offset { 0..=3 => self.config_address, @@ -326,15 +311,12 @@ impl BusDevice for PciConfigIo { } } - fn write(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option> { + pub fn bus_write(&mut self, _base: u64, offset: u64, data: &[u8]) { // `offset` is relative to 0xcf8 match offset { - o @ 0..=3 => { - self.set_config_address(o, data); - None - } - o @ 4..=7 => self.config_space_write(o 
- 4, data), - _ => None, + o @ 0..=3 => self.set_config_address(o, data), + o @ 4..=7 => {self.config_space_write(o - 4, data);}, + _ => (), } } } @@ -344,6 +326,14 @@ pub struct PciConfigMmio { pci_bus: Arc>, } +impl Debug for PciConfigMmio { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("PciConfigMmio") + // TODO + .finish() + } +} + impl PciConfigMmio { pub fn new(pci_bus: Arc>) -> Self { PciConfigMmio { pci_bus } @@ -404,10 +394,8 @@ impl PciConfigMmio { device.write_config_register(register, offset, data); } } -} -impl BusDevice for PciConfigMmio { - fn read(&mut self, _base: u64, offset: u64, data: &mut [u8]) { + pub fn bus_read(&mut self, _base: u64, offset: u64, data: &mut [u8]) { // Only allow reads to the register boundary. let start = offset as usize % 4; let end = start + data.len(); @@ -424,13 +412,11 @@ impl BusDevice for PciConfigMmio { } } - fn write(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option> { + pub fn bus_write(&mut self, _base: u64, offset: u64, data: &[u8]) { if offset > u64::from(u32::MAX) { - return None; + return; } self.config_space_write(offset as u32, offset % 4, data); - - None } } diff --git a/src/pci/src/lib.rs b/src/pci/src/lib.rs index 6d7c8272345..ece54d0bf80 100644 --- a/src/pci/src/lib.rs +++ b/src/pci/src/lib.rs @@ -8,7 +8,7 @@ #[macro_use] extern crate log; -// mod bus; +mod bus; pub mod configuration; pub mod device; pub mod msi; @@ -21,7 +21,7 @@ use std::str::FromStr; use serde::de::Visitor; -// pub use self::bus::{PciBus, PciConfigIo, PciConfigMmio, PciRoot, PciRootError}; +pub use self::bus::{PciBus, PciConfigIo, PciConfigMmio, PciRoot, PciRootError}; pub use self::configuration::{ PciBarConfiguration, PciBarPrefetchable, PciBarRegionType, PciCapability, PciCapabilityId, PciClassCode, PciConfiguration, PciExpressCapabilityId, PciHeaderType, PciMassStorageSubclass, diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index 3867d107200..a956e184872 100644 --- 
a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -23,7 +23,7 @@ use linux_loader::loader::elf::Elf as Loader; #[cfg(target_arch = "aarch64")] use linux_loader::loader::pe::PE as Loader; use linux_loader::loader::KernelLoader; -use pci::{PciDevice, VfioPciDevice}; +use pci::{DeviceRelocation, PciBarConfiguration, PciBarRegionType, PciDevice, VfioPciDevice}; use seccompiler::BpfThreadMap; use userfaultfd::Uffd; use utils::time::TimestampUs; @@ -51,7 +51,7 @@ use crate::cpu_config::templates::{ use crate::device_manager::acpi::ACPIDeviceManager; #[cfg(target_arch = "x86_64")] use crate::device_manager::legacy::PortIODeviceManager; -use crate::device_manager::mmio::MMIODeviceManager; +use crate::device_manager::mmio::{MMIODeviceManager, MmioError}; use crate::device_manager::persist::{ ACPIDeviceManagerConstructorArgs, ACPIDeviceManagerRestoreError, MMIODevManagerConstructorArgs, }; @@ -61,7 +61,7 @@ use crate::devices::legacy::serial::SerialOut; #[cfg(target_arch = "aarch64")] use crate::devices::legacy::RTCDevice; use crate::devices::legacy::{EventFdTrigger, SerialEventsWrapper, SerialWrapper}; -use crate::devices::pci::{PciBus, PciConfigIo, PciConfigMmio, PciRoot}; +use pci::{PciBus, PciConfigIo, PciConfigMmio, PciRoot}; use crate::devices::virtio::balloon::Balloon; use crate::devices::virtio::block::device::Block; use crate::devices::virtio::device::VirtioDevice; @@ -69,7 +69,7 @@ use crate::devices::virtio::mmio::MmioTransport; use crate::devices::virtio::net::Net; use crate::devices::virtio::rng::Entropy; use crate::devices::virtio::vsock::{Vsock, VsockUnixBackend}; -use crate::devices::BusDevice; +use crate::devices::{Bus, BusDevice}; #[cfg(feature = "gdb")] use crate::gdb; use crate::interrupt::MsiInterruptManager; @@ -171,6 +171,32 @@ fn create_passthrough_device(vm: &VmFd) -> DeviceFd { vm.create_device(&mut vfio_dev).unwrap() } +fn register_pci_device_mapping( + dev: Arc>, + #[cfg(target_arch = "x86_64")] io_bus: &mut Bus, + mmio_bus: &mut Bus, + 
bars: Vec, +) -> Result<(), VmmError> { + for bar in bars { + match bar.region_type() { + PciBarRegionType::IoRegion => { + #[cfg(target_arch = "x86_64")] + io_bus + .insert(dev.clone(), bar.addr(), bar.size()) + .map_err(|e| VmmError::DeviceManager(MmioError::BusInsert(e)))?; + #[cfg(not(target_arch = "x86_64"))] + error!("I/O region is not supported"); + } + PciBarRegionType::Memory32BitRegion | PciBarRegionType::Memory64BitRegion => { + mmio_bus + .insert(dev.clone(), bar.addr(), bar.size()) + .map_err(|e| VmmError::DeviceManager(MmioError::BusInsert(e)))?; + } + } + } + Ok(()) +} + fn add_vfio_device( vm: Arc>, fd: DeviceFd, @@ -285,7 +311,7 @@ fn add_vfio_device( region.len() as u64, // memory.get_host_address(region.start_addr()).unwrap() as u64, region.as_ptr() as u64 - ); + ).unwrap(); // vfio_container.vfio_dma_map( // region.start_addr().0, // region.len() as u64, @@ -306,16 +332,13 @@ fn add_vfio_device( .add_device(pci_device_id, vfio_pci_device.clone()) .unwrap(); - pci.lock() - .expect("bad lock") - .register_mapping( - vfio_pci_device.clone(), - #[cfg(target_arch = "x86_64")] - &mut pio_manager.io_bus, - &mut dev_manager.bus, - bars.clone(), - ) - .unwrap(); + register_pci_device_mapping( + vfio_pci_device.clone(), + #[cfg(target_arch = "x86_64")] + &mut pio_manager.io_bus, + &mut dev_manager.bus, + bars.clone(), + ).unwrap(); // Need to register bus mappings ? 
} @@ -329,6 +352,24 @@ fn mmio_address_space_size(phys_bits: u8) -> u64 { (1 << phys_bits) - (1 << 16) } +struct DummyDeviceRelocation; +impl DeviceRelocation for DummyDeviceRelocation { + fn move_bar( + &self, + old_base: u64, + new_base: u64, + len: u64, + _pci_dev: &mut dyn PciDevice, + _region_type: PciBarRegionType, + ) -> std::result::Result<(), io::Error> { + error!( + "Failed moving device BAR: 0x{:x}->0x{:x}(0x{:x})", + old_base, new_base, len + ); + Ok(()) + } +} + #[cfg_attr(target_arch = "aarch64", allow(unused))] fn create_vmm_and_vcpus( instance_info: &InstanceInfo, @@ -402,8 +443,7 @@ fn create_vmm_and_vcpus( // Instantiate ACPI device manager. let acpi_device_manager = ACPIDeviceManager::new(); - let pci_root = BusDevice::PciRoot(PciRoot::new(None)); - let pci_bus = PciBus::new(pci_root); + let pci_bus = PciBus::new(PciRoot::new(None), Arc::new(DummyDeviceRelocation{})); let pci_bus = Arc::new(Mutex::new(pci_bus)); diff --git a/src/vmm/src/devices/bus.rs b/src/vmm/src/devices/bus.rs index 25e1d239694..caf3c174ba1 100644 --- a/src/vmm/src/devices/bus.rs +++ b/src/vmm/src/devices/bus.rs @@ -7,9 +7,11 @@ //! Handles routing to devices in an address space. +use std::any::Any; use std::cmp::{Ord, Ordering, PartialEq, PartialOrd}; use std::collections::btree_map::BTreeMap; -use std::sync::{Arc, Mutex}; +use std::result::Result; +use std::sync::{Arc, Barrier, Mutex}; /// Errors triggered during bus operations. 
#[derive(Debug, thiserror::Error, displaydoc::Display)] @@ -51,12 +53,15 @@ pub struct Bus { } use event_manager::{EventOps, Events, MutEventSubscriber}; -use pci::{PciDevice, VfioPciDevice}; +use pci::{BarReprogrammingParams, PciBarConfiguration, PciDevice, VfioPciDevice}; +use pci::device::Error as PciDeviceError; +use vm_device::Resource; +use vm_system_allocator::{AddressAllocator, SystemAllocator}; #[cfg(target_arch = "aarch64")] use super::legacy::RTCDevice; use super::legacy::{I8042Device, SerialDevice}; -use super::pci::{PciConfigIo, PciConfigMmio, PciRoot}; +use pci::{PciConfigIo, PciConfigMmio, PciRoot}; use super::pseudo::BootTimer; use super::virtio::mmio::MmioTransport; @@ -68,7 +73,6 @@ pub enum BusDevice { BootTimer(BootTimer), MmioTransport(MmioTransport), Serial(SerialDevice), - PciRoot(PciRoot), PioPciBus(PciConfigIo), MmioPciBus(PciConfigMmio), VfioPciDevice(VfioPciDevice), @@ -186,14 +190,12 @@ impl BusDevice { pub fn pci_device_ref(&self) -> Option<&dyn PciDevice> { match self { Self::VfioPciDevice(x) => Some(x), - Self::PciRoot(x) => Some(x), _ => None, } } pub fn pci_device_mut(&mut self) -> Option<&mut dyn PciDevice> { match self { Self::VfioPciDevice(x) => Some(x), - Self::PciRoot(x) => Some(x), _ => None, } } @@ -221,18 +223,6 @@ impl BusDevice { _ => None, } } - pub fn pci_root_ref(&self) -> Option<&PciRoot> { - match self { - Self::PciRoot(x) => Some(x), - _ => None, - } - } - pub fn pci_root_mut(&mut self) -> Option<&mut PciRoot> { - match self { - Self::PciRoot(x) => Some(x), - _ => None, - } - } pub fn read(&mut self, base: u64, offset: u64, data: &mut [u8]) { match self { @@ -245,7 +235,6 @@ impl BusDevice { Self::VfioPciDevice(x) => x.bus_read(base, offset, data), Self::MmioPciBus(x) => x.bus_read(base, offset, data), Self::PioPciBus(x) => x.bus_read(base, offset, data), - Self::PciRoot(x) => (), #[cfg(test)] Self::Dummy(x) => x.bus_read(offset, data), #[cfg(test)] @@ -264,7 +253,6 @@ impl BusDevice { Self::VfioPciDevice(x) => 
x.bus_write(base, offset, data), Self::MmioPciBus(x) => x.bus_write(base, offset, data), Self::PioPciBus(x) => x.bus_write(base, offset, data), - Self::PciRoot(x) => (), #[cfg(test)] Self::Dummy(x) => x.bus_write(offset, data), #[cfg(test)] @@ -273,6 +261,90 @@ impl BusDevice { } } +// TODO: hack to make pci crate compatible with firecracker BusDevices +type PciDeviceResult = Result; +impl PciDevice for BusDevice { + fn allocate_bars( + &mut self, + allocator: &Arc>, + mmio32_allocator: &mut AddressAllocator, + mmio64_allocator: &mut AddressAllocator, + resources: Option>, + ) -> PciDeviceResult> { + self.pci_device_mut() + .unwrap() + .allocate_bars(allocator, mmio32_allocator, mmio64_allocator, resources) + } + + fn free_bars( + &mut self, + allocator: &mut SystemAllocator, + mmio32_allocator: &mut AddressAllocator, + mmio64_allocator: &mut AddressAllocator, + ) -> PciDeviceResult<()> { + self.pci_device_mut() + .unwrap() + .free_bars(allocator, mmio32_allocator, mmio64_allocator) + } + + fn write_config_register( + &mut self, + reg_idx: usize, + offset: u64, + data: &[u8], + ) -> Option> { + self.pci_device_mut() + .unwrap() + .write_config_register(reg_idx, offset, data) + } + + fn read_config_register(&mut self, reg_idx: usize) -> u32 { + self.pci_device_mut() + .unwrap() + .read_config_register(reg_idx) + } + + fn detect_bar_reprogramming( + &mut self, + reg_idx: usize, + data: &[u8], + ) -> Option { + self.pci_device_mut() + .unwrap() + .detect_bar_reprogramming(reg_idx, data) + } + + fn read_bar(&mut self, base: u64, offset: u64, data: &mut [u8]) { + self.pci_device_mut() + .unwrap() + .read_bar(base, offset, data) + } + + fn write_bar(&mut self, base: u64, offset: u64, data: &[u8]) -> Option> { + self.pci_device_mut() + .unwrap() + .write_bar(base, offset, data) + } + + fn move_bar(&mut self, old_base: u64, new_base: u64) -> std::result::Result<(), std::io::Error> { + self.pci_device_mut() + .unwrap() + .move_bar(old_base, new_base) + } + + fn as_any(&mut 
self) -> &mut dyn Any { + self.pci_device_mut() + .unwrap() + .as_any() + } + + fn id(&self) -> Option { + self.pci_device_ref() + .unwrap() + .id() + } +} + impl MutEventSubscriber for BusDevice { fn process(&mut self, event: Events, ops: &mut EventOps) { match self { diff --git a/src/vmm/src/devices/mod.rs b/src/vmm/src/devices/mod.rs index 656644da60c..0ca445b6f82 100644 --- a/src/vmm/src/devices/mod.rs +++ b/src/vmm/src/devices/mod.rs @@ -14,8 +14,6 @@ pub mod bus; pub mod legacy; pub mod pseudo; pub mod virtio; -/// PCI Devices -pub mod pci; pub use bus::{Bus, BusDevice, BusError}; use log::error; diff --git a/src/vmm/src/devices/pci.rs b/src/vmm/src/devices/pci.rs deleted file mode 100644 index 8c59d5dcc87..00000000000 --- a/src/vmm/src/devices/pci.rs +++ /dev/null @@ -1,508 +0,0 @@ -// Copyright 2018 The Chromium OS Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE-BSD-3-Clause file. -// -// SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause - -use std::any::Any; -use std::collections::HashMap; -use std::fmt::{Debug, Formatter}; -use std::ops::DerefMut; -use std::sync::{Arc, Barrier, Mutex}; - -use byteorder::{ByteOrder, LittleEndian}; - -use pci::configuration::{ - PciBarRegionType, PciBridgeSubclass, PciClassCode, PciConfiguration, PciHeaderType, -}; -use pci::device::{Error as PciDeviceError, PciDevice}; -use pci::PciBarConfiguration; - -use super::{Bus, BusDevice}; - -use crate::logger::error; - -use std::fmt; - -const VENDOR_ID_INTEL: u16 = 0x8086; -const DEVICE_ID_INTEL_VIRT_PCIE_HOST: u16 = 0x0d57; -const NUM_DEVICE_IDS: usize = 32; - -/// Errors for device manager. -#[derive(Debug)] -pub enum PciRootError { - /// Could not allocate device address space for the device. - AllocateDeviceAddrs(PciDeviceError), - /// Could not allocate an IRQ number. - AllocateIrq, - /// Could not add a device to the port io bus. 
- PioInsert(vm_device::BusError), - /// Could not add a device to the mmio bus. - MmioInsert(vm_device::BusError), - /// Could not find an available device slot on the PCI bus. - NoPciDeviceSlotAvailable, - /// Invalid PCI device identifier provided. - InvalidPciDeviceSlot(usize), - /// Valid PCI device identifier but already used. - AlreadyInUsePciDeviceSlot(usize), -} -pub type Result = std::result::Result; - -/// Emulates the PCI Root bridge device. -pub struct PciRoot { - /// Configuration space. - config: PciConfiguration, -} - -impl Debug for PciRoot { - fn fmt(&self, f: &mut Formatter) -> fmt::Result { - f.debug_struct("PciRoot") - .finish() - } -} - -impl PciRoot { - /// Create an empty PCI root bridge. - pub fn new(config: Option) -> Self { - if let Some(config) = config { - PciRoot { config } - } else { - PciRoot { - config: PciConfiguration::new( - VENDOR_ID_INTEL, - DEVICE_ID_INTEL_VIRT_PCIE_HOST, - 0, - PciClassCode::BridgeDevice, - &PciBridgeSubclass::HostBridge, - None, - PciHeaderType::Device, - 0, - 0, - None, - None, - ), - } - } - } -} - -impl PciDevice for PciRoot { - fn write_config_register( - &mut self, - reg_idx: usize, - offset: u64, - data: &[u8], - ) -> Option> { - self.config.write_config_register(reg_idx, offset, data); - None - } - - fn read_config_register(&mut self, reg_idx: usize) -> u32 { - self.config.read_reg(reg_idx) - } - - fn as_any(&mut self) -> &mut dyn Any { - self - } - - fn id(&self) -> Option { - None - } -} - -pub struct PciBus { - /// Devices attached to this bus. - /// Device 0 is host bridge. 
- devices: HashMap>>, - device_ids: Vec, -} - -impl PciBus { - pub fn new(pci_root: BusDevice) -> Self { - let mut devices: HashMap>> = HashMap::new(); - let mut device_ids: Vec = vec![false; NUM_DEVICE_IDS]; - - devices.insert(0, Arc::new(Mutex::new(pci_root))); - device_ids[0] = true; - - PciBus { - devices, - device_ids, - } - } - - pub fn register_mapping( - &self, - dev: Arc>, - #[cfg(target_arch = "x86_64")] io_bus: &mut Bus, - mmio_bus: &mut Bus, - bars: Vec, - ) -> Result<()> { - for bar in bars { - match bar.region_type() { - PciBarRegionType::IoRegion => { - #[cfg(target_arch = "x86_64")] - io_bus - .insert(dev.clone(), bar.addr(), bar.size()) - .unwrap(); - error!("cannot register bus mappings {:x} {:x} IO", bar.addr(), bar.size()); - } - PciBarRegionType::Memory32BitRegion | PciBarRegionType::Memory64BitRegion => { - error!("Registering bus mappings {:x} {:x}", bar.addr(), bar.size()); - mmio_bus - .insert(dev.clone(), bar.addr(), bar.size()) - .unwrap(); - } - } - } - Ok(()) - } - - pub fn add_device(&mut self, device_id: u32, device: Arc>) -> Result<()> { - self.devices.insert(device_id, device); - Ok(()) - } - - pub fn remove_by_device(&mut self, device: &Arc>) -> Result<()> { - self.devices.retain(|_, dev| !Arc::ptr_eq(dev, device)); - Ok(()) - } - - pub fn next_device_id(&mut self) -> Result { - for (idx, device_id) in self.device_ids.iter_mut().enumerate() { - if !(*device_id) { - *device_id = true; - return Ok(idx as u32); - } - } - - Err(PciRootError::NoPciDeviceSlotAvailable) - } - - pub fn get_device_id(&mut self, id: usize) -> Result<()> { - if id < NUM_DEVICE_IDS { - if !self.device_ids[id] { - self.device_ids[id] = true; - Ok(()) - } else { - Err(PciRootError::AlreadyInUsePciDeviceSlot(id)) - } - } else { - Err(PciRootError::InvalidPciDeviceSlot(id)) - } - } - - pub fn put_device_id(&mut self, id: usize) -> Result<()> { - if id < NUM_DEVICE_IDS { - self.device_ids[id] = false; - Ok(()) - } else { - 
Err(PciRootError::InvalidPciDeviceSlot(id)) - } - } -} - -pub struct PciConfigIo { - /// Config space register. - config_address: u32, - pci_bus: Arc>, -} - -impl Debug for PciConfigIo { - fn fmt(&self, f: &mut Formatter) -> fmt::Result { - f.debug_struct("PciConfigIo") - .finish() - } -} - -impl PciConfigIo { - pub fn new(pci_bus: Arc>) -> Self { - PciConfigIo { - config_address: 0, - pci_bus, - } - } - - pub fn config_space_read(&self) -> u32 { - let enabled = (self.config_address & 0x8000_0000) != 0; - if !enabled { - return 0xffff_ffff; - } - - let (bus, device, function, register) = - parse_io_config_address(self.config_address & !0x8000_0000); - - error!( - "config space read {}:{}:{} reg {}", - bus, device, function, register - ); - - // Only support one bus. - if bus != 0 { - return 0xffff_ffff; - } - - // Don't support multi-function devices. - if function > 0 { - return 0xffff_ffff; - } - - self.pci_bus - .as_ref() - .lock() - .unwrap() - .devices - .get(&(device as u32)) - .map_or(0xffff_ffff, |d| { - d.lock().unwrap().pci_device_mut().unwrap().read_config_register(register) - }) - } - - pub fn config_space_write(&mut self, offset: u64, data: &[u8]) -> Option> { - if offset as usize + data.len() > 4 { - return None; - } - - let enabled = (self.config_address & 0x8000_0000) != 0; - if !enabled { - return None; - } - - let (bus, device, _function, register) = - parse_io_config_address(self.config_address & !0x8000_0000); - - // Only support one bus. - if bus != 0 { - return None; - } - - let pci_bus = self.pci_bus.as_ref().lock().unwrap(); - if let Some(d) = pci_bus.devices.get(&(device as u32)) { - let mut device = d.lock().unwrap(); - - // Find out if one of the device's BAR is being reprogrammed, and - // reprogram it if needed. 
- if let Some(params) = device.pci_device_mut().unwrap().detect_bar_reprogramming(register, data) { - // if let Err(e) = pci_bus.device_reloc.move_bar( - // params.old_base, - // params.new_base, - // params.len, - // device.deref_mut(), - // params.region_type, - // ) { - error!( - "Failed moving device BAR: 0x{:x}->0x{:x}(0x{:x})", - params.old_base, params.new_base, params.len - ); - } - // Update the register value - device.pci_device_mut().unwrap().write_config_register(register, offset, data) - } else { - None - } - } - - fn set_config_address(&mut self, offset: u64, data: &[u8]) { - if offset as usize + data.len() > 4 { - return; - } - let (mask, value): (u32, u32) = match data.len() { - 1 => ( - 0x0000_00ff << (offset * 8), - u32::from(data[0]) << (offset * 8), - ), - 2 => ( - 0x0000_ffff << (offset * 16), - (u32::from(data[1]) << 8 | u32::from(data[0])) << (offset * 16), - ), - 4 => (0xffff_ffff, LittleEndian::read_u32(data)), - _ => return, - }; - self.config_address = (self.config_address & !mask) | value; - } -} - -impl PciConfigIo { - pub fn bus_read(&mut self, _: u64, offset: u64, data: &mut [u8]) { - // `offset` is relative to 0xcf8 - let value = match offset { - 0..=3 => self.config_address, - 4..=7 => self.config_space_read(), - _ => 0xffff_ffff, - }; - - // Only allow reads to the register boundary. - let start = offset as usize % 4; - let end = start + data.len(); - if end <= 4 { - for i in start..end { - data[i - start] = (value >> (i * 8)) as u8; - } - } else { - for d in data { - *d = 0xff; - } - } - } - - pub fn bus_write(&mut self, _: u64, offset: u64, data: &[u8]) { - // `offset` is relative to 0xcf8 - match offset { - o @ 0..=3 => { - self.set_config_address(o, data); - } - o @ 4..=7 => { - self.config_space_write(o - 4, data); - } - _ => {} - } - } -} - -/// Emulates PCI memory-mapped configuration access mechanism. 
-pub struct PciConfigMmio { - pci_bus: Arc>, -} - -impl Debug for PciConfigMmio { - fn fmt(&self, f: &mut Formatter) -> fmt::Result { - f.debug_struct("PciConfigMmio") - .finish() - } -} - -impl PciConfigMmio { - pub fn new(pci_bus: Arc>) -> Self { - PciConfigMmio { pci_bus } - } - - fn config_space_read(&self, config_address: u32) -> u32 { - let (bus, device, _function, register) = parse_mmio_config_address(config_address); - - // Only support one bus. - if bus != 0 { - return 0xffff_ffff; - } - - self.pci_bus - .lock() - .unwrap() - .devices - .get(&(device as u32)) - .map_or(0xffff_ffff, |d| { - d.lock().unwrap().pci_device_mut().unwrap().read_config_register(register) - }) - } - - fn config_space_write(&mut self, config_address: u32, offset: u64, data: &[u8]) { - if offset as usize + data.len() > 4 { - return; - } - - let (bus, device, _function, register) = parse_mmio_config_address(config_address); - - // Only support one bus. - if bus != 0 { - return; - } - - let pci_bus = self.pci_bus.lock().unwrap(); - if let Some(d) = pci_bus.devices.get(&(device as u32)) { - let mut device = d.lock().unwrap(); - - // Find out if one of the device's BAR is being reprogrammed, and - // reprogram it if needed. 
- if let Some(params) = device.pci_device_mut().unwrap().detect_bar_reprogramming(register, data) { - // if let Err(e) = pci_bus.device_reloc.move_bar( - // params.old_base, - // params.new_base, - // params.len, - // device.deref_mut(), - // params.region_type, - // ) { - // error!( - // "Failed moving device BAR: {}: 0x{:x}->0x{:x}(0x{:x})", - // e, params.old_base, params.new_base, params.len - // ); - // } - error!( - "Failed moving device BAR: 0x{:x}->0x{:x}(0x{:x})", - params.old_base, params.new_base, params.len - ); - } - - // Update the register value - device.pci_device_mut().unwrap().write_config_register(register, offset, data); - } - } -} - -impl PciConfigMmio { - pub fn bus_read(&mut self, _: u64, offset: u64, data: &mut [u8]) { - // Only allow reads to the register boundary. - let start = offset as usize % 4; - let end = start + data.len(); - if end > 4 || offset > u64::from(u32::MAX) { - for d in data { - *d = 0xff; - } - return; - } - - let value = self.config_space_read(offset as u32); - for i in start..end { - data[i - start] = (value >> (i * 8)) as u8; - } - } - - pub fn bus_write(&mut self, _: u64, offset: u64, data: &[u8]) { - if offset > u64::from(u32::max_value()) { - return; - } - self.config_space_write(offset as u32, offset % 4, data); - } -} - -fn shift_and_mask(value: u32, offset: usize, mask: u32) -> usize { - ((value >> offset) & mask) as usize -} - -// Parse the MMIO address offset to a (bus, device, function, register) tuple. -// See section 7.2.2 PCI Express Enhanced Configuration Access Mechanism (ECAM) -// from the Pci Express Base Specification Revision 5.0 Version 1.0. 
-fn parse_mmio_config_address(config_address: u32) -> (usize, usize, usize, usize) { - const BUS_NUMBER_OFFSET: usize = 20; - const BUS_NUMBER_MASK: u32 = 0x00ff; - const DEVICE_NUMBER_OFFSET: usize = 15; - const DEVICE_NUMBER_MASK: u32 = 0x1f; - const FUNCTION_NUMBER_OFFSET: usize = 12; - const FUNCTION_NUMBER_MASK: u32 = 0x07; - const REGISTER_NUMBER_OFFSET: usize = 2; - const REGISTER_NUMBER_MASK: u32 = 0x3ff; - - ( - shift_and_mask(config_address, BUS_NUMBER_OFFSET, BUS_NUMBER_MASK), - shift_and_mask(config_address, DEVICE_NUMBER_OFFSET, DEVICE_NUMBER_MASK), - shift_and_mask(config_address, FUNCTION_NUMBER_OFFSET, FUNCTION_NUMBER_MASK), - shift_and_mask(config_address, REGISTER_NUMBER_OFFSET, REGISTER_NUMBER_MASK), - ) -} - -// Parse the CONFIG_ADDRESS register to a (bus, device, function, register) tuple. -fn parse_io_config_address(config_address: u32) -> (usize, usize, usize, usize) { - const BUS_NUMBER_OFFSET: usize = 16; - const BUS_NUMBER_MASK: u32 = 0x00ff; - const DEVICE_NUMBER_OFFSET: usize = 11; - const DEVICE_NUMBER_MASK: u32 = 0x1f; - const FUNCTION_NUMBER_OFFSET: usize = 8; - const FUNCTION_NUMBER_MASK: u32 = 0x07; - const REGISTER_NUMBER_OFFSET: usize = 2; - const REGISTER_NUMBER_MASK: u32 = 0x3f; - - ( - shift_and_mask(config_address, BUS_NUMBER_OFFSET, BUS_NUMBER_MASK), - shift_and_mask(config_address, DEVICE_NUMBER_OFFSET, DEVICE_NUMBER_MASK), - shift_and_mask(config_address, FUNCTION_NUMBER_OFFSET, FUNCTION_NUMBER_MASK), - shift_and_mask(config_address, REGISTER_NUMBER_OFFSET, REGISTER_NUMBER_MASK), - ) -} diff --git a/src/vmm/src/lib.rs b/src/vmm/src/lib.rs index 7825de4459a..44e428fee80 100644 --- a/src/vmm/src/lib.rs +++ b/src/vmm/src/lib.rs @@ -110,6 +110,7 @@ pub mod utils; pub mod vmm_config; /// Module with virtual state structs. 
pub mod vstate; +/// Module for MSI interrupt management. pub mod interrupt; use std::collections::HashMap; From 9e99193c53bd90676e39146f27068b311af17f9f Mon Sep 17 00:00:00 2001 From: Riccardo Mancini Date: Tue, 22 Oct 2024 10:52:39 +0100 Subject: [PATCH 06/22] mark MMCONFIG area as Reserved in E820 Signed-off-by: Riccardo Mancini --- src/vmm/src/arch/x86_64/mod.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/vmm/src/arch/x86_64/mod.rs b/src/vmm/src/arch/x86_64/mod.rs index 1066d9734c3..d7e0eb2a8ab 100644 --- a/src/vmm/src/arch/x86_64/mod.rs +++ b/src/vmm/src/arch/x86_64/mod.rs @@ -21,6 +21,7 @@ pub mod regs; #[allow(missing_docs)] pub mod gen; +use layout::PCI_MMCONFIG_SIZE; use linux_loader::configurator::linux::LinuxBootConfigurator; use linux_loader::configurator::{BootConfigurator, BootParams}; use linux_loader::loader::bootparam::boot_params; @@ -158,6 +159,8 @@ pub fn configure_system( E820_RESERVED, )?; + add_e820_entry(&mut params, layout::PCI_MMCONFIG_START, PCI_MMCONFIG_SIZE, E820_RESERVED)?; + let last_addr = guest_mem.last_addr(); if last_addr < end_32bit_gap_start { add_e820_entry(
use madt::Madt; +pub use mcfg::Mcfg; pub use rsdp::Rsdp; pub use xsdt::Xsdt; use zerocopy::little_endian::{U32, U64}; diff --git a/src/acpi-tables/src/mcfg.rs b/src/acpi-tables/src/mcfg.rs new file mode 100644 index 00000000000..e914f0cae7b --- /dev/null +++ b/src/acpi-tables/src/mcfg.rs @@ -0,0 +1,78 @@ +// Copyright © 2019 Intel Corporation +// Copyright © 2023 Rivos, Inc. +// Copyright 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// +// SPDX-License-Identifier: Apache-2.0 + +use std::mem::size_of; +use vm_memory::{Bytes, GuestAddress, GuestMemory}; +use zerocopy::{Immutable, IntoBytes}; + +use crate::{checksum, Result, Sdt, SdtHeader}; + +#[allow(dead_code)] +#[repr(packed)] +#[derive(Default, Debug, IntoBytes, Clone, Copy, Immutable)] +struct PciRangeEntry { + pub base_address: u64, + pub segment: u16, + pub start: u8, + pub end: u8, + _reserved: u32, +} + +#[allow(dead_code)] +#[repr(packed)] +#[derive(Clone, Copy, Debug, Default, IntoBytes, Immutable)] +pub struct Mcfg { + header: SdtHeader, + _reserved: u64, + pci_range_entry: PciRangeEntry, +} + +impl Mcfg { + pub fn new( + oem_id: [u8; 6], + oem_table_id: [u8; 8], + oem_revision: u32, + pci_mmio_config_addr: u64 + ) -> Self { + let header = SdtHeader::new( + *b"MCFG", + size_of::() + .try_into() + .unwrap(), + 1, + oem_id, + oem_table_id, + oem_revision, + ); + + let mut mcfg = Mcfg { + header, + pci_range_entry: PciRangeEntry { + base_address: pci_mmio_config_addr, + segment: 0, + start: 0, + end: 0, + ..Default::default() + }, + ..Default::default() + }; + + mcfg.header.checksum = checksum(&[mcfg.as_bytes()]); + + mcfg + } +} + +impl Sdt for Mcfg { + fn len(&self) -> usize { + self.as_bytes().len() + } + + fn write_to_guest(&mut self, mem: &M, address: GuestAddress) -> Result<()> { + mem.write_slice(self.as_bytes(), address)?; + Ok(()) + } +} diff --git a/src/vmm/src/acpi/mod.rs b/src/vmm/src/acpi/mod.rs index 75de9edfebc..102dbdc53f3 100644 --- a/src/vmm/src/acpi/mod.rs +++ 
b/src/vmm/src/acpi/mod.rs @@ -2,7 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 use acpi_tables::fadt::{FADT_F_HW_REDUCED_ACPI, FADT_F_PWR_BUTTON, FADT_F_SLP_BUTTON}; -use acpi_tables::{Aml, Dsdt, Fadt, Madt, Rsdp, Sdt, Xsdt}; +use acpi_tables::{Aml, Dsdt, Fadt, Madt, Mcfg, Rsdp, Sdt, Xsdt}; use log::{debug, error}; use vm_allocator::AllocPolicy; @@ -126,16 +126,29 @@ impl<'a> AcpiTableWriter<'a> { /// Build the XSDT table for the guest /// /// Currently, we pass to the guest just FADT and MADT tables. - fn build_xsdt(&mut self, fadt_addr: u64, madt_addr: u64) -> Result { + fn build_xsdt(&mut self, fadt_addr: u64, madt_addr: u64, mcfg_addr: u64) -> Result { let mut xsdt = Xsdt::new( OEM_ID, *b"FCMVXSDT", OEM_REVISION, - vec![fadt_addr, madt_addr], + vec![fadt_addr, madt_addr, mcfg_addr], ); self.write_acpi_table(&mut xsdt) } + /// Build the MCFG table for the guest + /// + /// It advertises the base address of the PCI MMIO configuration (ECAM) region for segment 0. + fn build_mcfg(&mut self, pci_mmio_config_addr: u64) -> Result { + let mut mcfg = Mcfg::new( + OEM_ID, + *b"CHMCFG ", + OEM_REVISION, + pci_mmio_config_addr, + ); + self.write_acpi_table(&mut mcfg) + } + /// Build the RSDP pointer for the guest.
/// /// This will build the RSDP pointer which points to the XSDT table and write it in guest @@ -164,6 +177,7 @@ pub(crate) fn create_acpi_tables( resource_allocator: &mut ResourceAllocator, mmio_device_manager: &MMIODeviceManager, acpi_device_manager: &ACPIDeviceManager, + pci_mmio_config_addr: u64, vcpus: &[Vcpu], ) -> Result<(), AcpiError> { let mut writer = AcpiTableWriter { @@ -174,7 +188,8 @@ pub(crate) fn create_acpi_tables( let dsdt_addr = writer.build_dsdt(mmio_device_manager, acpi_device_manager)?; let fadt_addr = writer.build_fadt(dsdt_addr)?; let madt_addr = writer.build_madt(vcpus.len().try_into().unwrap())?; - let xsdt_addr = writer.build_xsdt(fadt_addr, madt_addr)?; + let mcfg_addr = writer.build_mcfg(pci_mmio_config_addr)?; + let xsdt_addr = writer.build_xsdt(fadt_addr, madt_addr, mcfg_addr)?; writer.build_rsdp(xsdt_addr) } diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index a956e184872..ec9d3fbfb31 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -41,7 +41,7 @@ compile_error!("GDB feature not supported on ARM"); #[cfg(target_arch = "x86_64")] use crate::acpi; -use crate::arch::{InitrdConfig, MEM_32BIT_DEVICES_SIZE, MEM_32BIT_DEVICES_START}; +use crate::arch::{InitrdConfig, MEM_32BIT_DEVICES_SIZE, MEM_32BIT_DEVICES_START, PCI_MMCONFIG_START}; #[cfg(target_arch = "aarch64")] use crate::construct_kvm_mpidrs; use crate::cpu_config::templates::{ @@ -1124,6 +1124,7 @@ pub fn configure_system_for_boot( &mut vmm.resource_allocator, &vmm.mmio_device_manager, &vmm.acpi_device_manager, + PCI_MMCONFIG_START, vcpus, )?; } From 2622a4a0c3882a55b010ef28e38933590caa807e Mon Sep 17 00:00:00 2001 From: Riccardo Mancini Date: Tue, 22 Oct 2024 10:58:52 +0100 Subject: [PATCH 08/22] introduce PciSegment abstraction This will be useful to generate the AML code for the DSDT table in ACPI. 
Signed-off-by: Riccardo Mancini --- Cargo.lock | 2 + src/vmm/Cargo.toml | 2 + src/vmm/src/arch/x86_64/layout.rs | 4 +- src/vmm/src/builder.rs | 106 ++++--- src/vmm/src/devices/mod.rs | 1 + src/vmm/src/devices/pci_segment.rs | 491 +++++++++++++++++++++++++++++ src/vmm/src/lib.rs | 2 + 7 files changed, 561 insertions(+), 47 deletions(-) create mode 100644 src/vmm/src/devices/pci_segment.rs diff --git a/Cargo.lock b/Cargo.lock index 209373e1950..b605eb07685 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1702,6 +1702,7 @@ version = "0.1.0" dependencies = [ "acpi_tables", "aes-gcm", + "anyhow", "arrayvec", "aws-lc-rs", "base64", @@ -1737,6 +1738,7 @@ dependencies = [ "timerfd", "userfaultfd", "utils", + "uuid", "vfio-ioctls", "vhost", "vm-allocator", diff --git a/src/vmm/Cargo.toml b/src/vmm/Cargo.toml index f367f9d6e3f..16d7e7d3716 100644 --- a/src/vmm/Cargo.toml +++ b/src/vmm/Cargo.toml @@ -9,6 +9,7 @@ license = "Apache-2.0" bench = false [dependencies] +anyhow = "1.0.87" acpi_tables = { path = "../acpi-tables" } aes-gcm = { version = "0.10.1", default-features = false, features = ["aes"] } arrayvec = { version = "0.7.6", optional = true } @@ -43,6 +44,7 @@ thiserror = "1.0.64" timerfd = "1.5.0" userfaultfd = "0.8.1" utils = { path = "../utils" } +uuid = "1.8.0" vfio-ioctls = { git = "https://github.com/rust-vmm/vfio", branch = "main" } vhost = { version = "0.12.0", features = ["vhost-user-frontend"] } vm-allocator = "0.1.0" diff --git a/src/vmm/src/arch/x86_64/layout.rs b/src/vmm/src/arch/x86_64/layout.rs index 74a61149237..7ed1384f3b5 100644 --- a/src/vmm/src/arch/x86_64/layout.rs +++ b/src/vmm/src/arch/x86_64/layout.rs @@ -84,4 +84,6 @@ pub const MEM_32BIT_DEVICES_SIZE: u64 = 640 << 20; /// PCI_MMCONFIG_START pub const PCI_MMCONFIG_START: u64 = MEM_32BIT_DEVICES_START + MEM_32BIT_DEVICES_SIZE; /// PCI_MMCONFIG_SIZE -pub const PCI_MMCONFIG_SIZE: u64 = 256 << 20; \ No newline at end of file +pub const PCI_MMCONFIG_SIZE: u64 = 256 << 20; +/// 
PCI_MMIO_CONFIG_SIZE_PER_SEGMENT +pub const PCI_MMIO_CONFIG_SIZE_PER_SEGMENT: u64 = 4096 * 256; \ No newline at end of file diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index ec9d3fbfb31..240e96b29c8 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -61,6 +61,7 @@ use crate::devices::legacy::serial::SerialOut; #[cfg(target_arch = "aarch64")] use crate::devices::legacy::RTCDevice; use crate::devices::legacy::{EventFdTrigger, SerialEventsWrapper, SerialWrapper}; +use crate::devices::pci_segment::PciSegment; use pci::{PciBus, PciConfigIo, PciConfigMmio, PciRoot}; use crate::devices::virtio::balloon::Balloon; use crate::devices::virtio::block::device::Block; @@ -200,32 +201,20 @@ fn register_pci_device_mapping( fn add_vfio_device( vm: Arc>, fd: DeviceFd, - pci: Arc>, + pci_segment: &PciSegment, dev_manager: &mut MMIODeviceManager, pio_manager: &mut PortIODeviceManager, interrupt_manager: Arc>, memory: GuestMemoryMmap, allocator: Arc> ) { - // alignment 4 << 10 - let pci_mmio32_allocator = Arc::new(Mutex::new( - AddressAllocator::new(GuestAddress(MEM_32BIT_DEVICES_START), MEM_32BIT_DEVICES_SIZE).unwrap(), - )); - - // alignment 4 << 30 - let pci_mmio64_allocator = Arc::new(Mutex::new( - AddressAllocator::new( - GuestAddress(0), - mmio_address_space_size(46), - ).unwrap() - )); // We need to shift the device id since the 3 first bits // are dedicated to the PCI function, and we know we don't // do multifunction. Also, because we only support one PCI // bus, the bus 0, we don't need to add anything to the // global device ID. - let pci_device_id = pci.lock().expect("bad lock").next_device_id().unwrap(); + let pci_device_id = pci_segment.pci_bus.lock().expect("bad lock").next_device_id().unwrap(); let pci_device_bdf = pci_device_id << 3; // Safe because we know the RawFd is valid. 
@@ -288,8 +277,8 @@ fn add_vfio_device( .unwrap() .allocate_bars( &allocator, - &mut pci_mmio32_allocator.lock().unwrap(), - &mut pci_mmio64_allocator.lock().unwrap(), + &mut pci_segment.mem32_allocator.lock().unwrap(), + &mut pci_segment.mem64_allocator.lock().unwrap(), None, ) .unwrap(); @@ -327,7 +316,7 @@ fn add_vfio_device( .map_mmio_regions() .unwrap(); - pci.lock() + pci_segment.pci_bus.lock() .expect("bad lock") .add_device(pci_device_id, vfio_pci_device.clone()) .unwrap(); @@ -352,24 +341,6 @@ fn mmio_address_space_size(phys_bits: u8) -> u64 { (1 << phys_bits) - (1 << 16) } -struct DummyDeviceRelocation; -impl DeviceRelocation for DummyDeviceRelocation { - fn move_bar( - &self, - old_base: u64, - new_base: u64, - len: u64, - _pci_dev: &mut dyn PciDevice, - _region_type: PciBarRegionType, - ) -> std::result::Result<(), io::Error> { - error!( - "Failed moving device BAR: 0x{:x}->0x{:x}(0x{:x})", - old_base, new_base, len - ); - Ok(()) - } -} - #[cfg_attr(target_arch = "aarch64", allow(unused))] fn create_vmm_and_vcpus( instance_info: &InstanceInfo, @@ -443,9 +414,35 @@ fn create_vmm_and_vcpus( // Instantiate ACPI device manager. 
let acpi_device_manager = ACPIDeviceManager::new(); - let pci_bus = PciBus::new(PciRoot::new(None), Arc::new(DummyDeviceRelocation{})); + // alignment 4 << 10 + let pci_mmio32_allocator = Arc::new(Mutex::new( + AddressAllocator::new(GuestAddress(MEM_32BIT_DEVICES_START), MEM_32BIT_DEVICES_SIZE).unwrap(), + )); + + // alignment 4 << 30 + let pci_mmio64_allocator = Arc::new(Mutex::new( + AddressAllocator::new( + GuestAddress(0), + mmio_address_space_size(46), + ).unwrap() + )); - let pci_bus = Arc::new(Mutex::new(pci_bus)); + // TODO: allocate GSI for legacy interrupts + // let irqs = resource_allocator.allocate_gsi(8).unwrap(); + // let mut pci_irq_slots: [u8; 32] = [0; 32]; + // for i in 0..32 { + // pci_irq_slots[i] = irqs[i % 8] as u8; + // } + let pci_irq_slots: [u8; 32] = [(NUM_IOAPIC_PINS-1) as u8; 32]; + + let pci_segment = PciSegment::new( + 0, + 0, + pci_mmio32_allocator, + pci_mmio64_allocator, + &mut mmio_device_manager.bus, + &pci_irq_slots, + ).unwrap(); // For x86_64 we need to create the interrupt controller before calling `KVM_CREATE_VCPUS` // while on aarch64 we need to do it the other way around. 
@@ -467,7 +464,7 @@ fn create_vmm_and_vcpus( .map_err(VmmError::EventFd) .map_err(Internal)?; - let pci_config_io = Arc::new(Mutex::new(BusDevice::PioPciBus(PciConfigIo::new(Arc::clone(&pci_bus))))); + let pci_config_io = Arc::new(Mutex::new(BusDevice::PioPciBus(PciConfigIo::new(Arc::clone(&pci_segment.pci_bus))))); // create pio dev manager with legacy devices @@ -487,19 +484,13 @@ fn create_vmm_and_vcpus( add_vfio_device( Arc::clone(&vm_fd), device_fd, - Arc::clone(&pci_bus), + &pci_segment, &mut mmio_device_manager, &mut pio_device_manager, Arc::clone(&msi_interrupt_manager), guest_memory.clone(), Arc::clone(&allocator) ); - - - let pci_config_mmio = Arc::new(Mutex::new(BusDevice::MmioPciBus(PciConfigMmio::new(Arc::clone(&pci_bus))))); - mmio_device_manager - .register_pci_bus(pci_config_mmio) - .unwrap(); // On aarch64, the vCPUs need to be created (i.e call KVM_CREATE_VCPU) before setting up the // IRQ chip because the `KVM_CREATE_VCPU` ioctl will return error if the IRQCHIP @@ -526,6 +517,7 @@ fn create_vmm_and_vcpus( #[cfg(target_arch = "x86_64")] pio_device_manager, acpi_device_manager, + pci_segment, }; Ok((vmm, vcpus)) @@ -1399,7 +1391,7 @@ pub mod tests { let (mut vm, _) = Vm::new(vec![]).unwrap(); vm.memory_init(&guest_memory, false).unwrap(); - let mmio_device_manager = MMIODeviceManager::new(); + let mut mmio_device_manager = MMIODeviceManager::new(); let acpi_device_manager = ACPIDeviceManager::new(); let pci_bus = Arc::new(Mutex::new(BusDevice::Dummy(DummyDevice{}))); #[cfg(target_arch = "x86_64")] @@ -1429,6 +1421,27 @@ pub mod tests { setup_interrupt_controller(&mut vm, 1).unwrap(); } + let pci_mmio32_allocator = Arc::new(Mutex::new( + AddressAllocator::new(GuestAddress(MEM_32BIT_DEVICES_START), MEM_32BIT_DEVICES_SIZE).unwrap(), + )); + + // alignment 4 << 30 + let pci_mmio64_allocator = Arc::new(Mutex::new( + AddressAllocator::new( + GuestAddress(0), + mmio_address_space_size(46), + ).unwrap() + )); + + let pci_segment = PciSegment::new( + 0, + 
0, + pci_mmio32_allocator, + pci_mmio64_allocator, + &mut mmio_device_manager.bus, + &[0u8; 32], + ).unwrap(); + Vmm { events_observer: Some(std::io::stdin()), instance_info: InstanceInfo::default(), @@ -1443,6 +1456,7 @@ pub mod tests { #[cfg(target_arch = "x86_64")] pio_device_manager, acpi_device_manager, + pci_segment, } } diff --git a/src/vmm/src/devices/mod.rs b/src/vmm/src/devices/mod.rs index 0ca445b6f82..eb670a0d47c 100644 --- a/src/vmm/src/devices/mod.rs +++ b/src/vmm/src/devices/mod.rs @@ -14,6 +14,7 @@ pub mod bus; pub mod legacy; pub mod pseudo; pub mod virtio; +pub mod pci_segment; pub use bus::{Bus, BusDevice, BusError}; use log::error; diff --git a/src/vmm/src/devices/pci_segment.rs b/src/vmm/src/devices/pci_segment.rs new file mode 100644 index 00000000000..dc8fd9ed8c0 --- /dev/null +++ b/src/vmm/src/devices/pci_segment.rs @@ -0,0 +1,491 @@ +// Portions Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// +// Portions Copyright 2017 The Chromium OS Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE-BSD-3-Clause file. 
+// +// Copyright © 2019 - 2021 Intel Corporation +// +// SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause +// + +use std::{fmt::Debug, io, sync::{Arc, Mutex}}; + +use acpi_tables::{aml, Aml}; +use anyhow::{anyhow, Result}; +use pci::{DeviceRelocation, PciBarRegionType, PciBdf, PciBus, PciConfigMmio, PciDevice, PciRoot}; +#[cfg(target_arch = "x86_64")] +use pci::{PciConfigIo, PCI_CONFIG_IO_PORT, PCI_CONFIG_IO_PORT_SIZE}; +use uuid::Uuid; +use vm_system_allocator::{AddressAllocator, SystemAllocator}; + +use crate::{arch::x86_64::layout, devices::BusDevice, logger::{error, info}}; + +use super::Bus; + +pub(crate) struct PciSegment { + pub(crate) id: u16, + pub(crate) pci_bus: Arc>, + pub(crate) pci_config_mmio: Arc>, + pub(crate) mmio_config_address: u64, + pub(crate) proximity_domain: u32, + + #[cfg(target_arch = "x86_64")] + pub(crate) pci_config_io: Option>>, + + // Bitmap of PCI devices to hotplug. + pub(crate) pci_devices_up: u32, + // Bitmap of PCI devices to hotunplug. + pub(crate) pci_devices_down: u32, + // List of allocated IRQs for each PCI slot. 
+ pub(crate) pci_irq_slots: [u8; 32], + + // Device memory covered by this segment + pub(crate) start_of_mem32_area: u64, + pub(crate) end_of_mem32_area: u64, + + pub(crate) start_of_mem64_area: u64, + pub(crate) end_of_mem64_area: u64, + + pub(crate) mem32_allocator: Arc>, + pub(crate) mem64_allocator: Arc>, +} + +impl Debug for PciSegment { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("PciSegment") + .field("id", &self.id) + .finish() + } +} + +struct DummyDeviceRelocation; +impl DeviceRelocation for DummyDeviceRelocation { + fn move_bar( + &self, + old_base: u64, + new_base: u64, + len: u64, + _pci_dev: &mut dyn PciDevice, + _region_type: PciBarRegionType, + ) -> std::result::Result<(), io::Error> { + error!( + "Failed moving device BAR: 0x{:x}->0x{:x}(0x{:x})", + old_base, new_base, len + ); + Ok(()) + } +} + +impl PciSegment { + pub(crate) fn new( + id: u16, + numa_node: u32, + mem32_allocator: Arc>, + mem64_allocator: Arc>, + mmio_bus: &mut Bus, + pci_irq_slots: &[u8; 32], + ) -> Result { + let pci_root = PciRoot::new(None); + let pci_bus = Arc::new(Mutex::new(PciBus::new( + pci_root, + Arc::new(DummyDeviceRelocation{}) + ))); + + let pci_config_mmio = Arc::new(Mutex::new(BusDevice::MmioPciBus(PciConfigMmio::new(Arc::clone(&pci_bus))))); + let mmio_config_address = + layout::PCI_MMCONFIG_START + layout::PCI_MMIO_CONFIG_SIZE_PER_SEGMENT * id as u64; + + mmio_bus + .insert( + pci_config_mmio.clone(), + mmio_config_address, + layout::PCI_MMIO_CONFIG_SIZE_PER_SEGMENT, + ) + .map_err(|e| anyhow!("error adding pci bus to mmio bus {e}"))?; + + let start_of_mem32_area = mem32_allocator.lock().unwrap().base().0; + let end_of_mem32_area = mem32_allocator.lock().unwrap().end().0; + + let start_of_mem64_area = mem64_allocator.lock().unwrap().base().0; + let end_of_mem64_area = mem64_allocator.lock().unwrap().end().0; + + let segment = PciSegment { + id, + pci_bus, + pci_config_mmio, + mmio_config_address, + proximity_domain: 
numa_node, + pci_devices_up: 0, + pci_devices_down: 0, + #[cfg(target_arch = "x86_64")] + pci_config_io: None, + mem32_allocator, + mem64_allocator, + start_of_mem32_area, + end_of_mem32_area, + start_of_mem64_area, + end_of_mem64_area, + pci_irq_slots: *pci_irq_slots, + }; + + info!( + "Adding PCI segment: id={}, PCI MMIO config address: 0x{:x}, mem32 area [0x{:x}-0x{:x}, mem64 area [0x{:x}-0x{:x}", + segment.id, segment.mmio_config_address, segment.start_of_mem32_area, segment.end_of_mem32_area, segment.start_of_mem64_area, segment.end_of_mem64_area + ); + Ok(segment) + } + + #[cfg(target_arch = "x86_64")] + pub(crate) fn new_default_segment( + mem32_allocator: Arc>, + mem64_allocator: Arc>, + mmio_bus: &mut Bus, + io_bus: &mut Bus, + pci_irq_slots: &[u8; 32], + ) -> Result { + let mut segment = Self::new( + 0, + 0, + mem32_allocator, + mem64_allocator, + mmio_bus, + pci_irq_slots, + )?; + let pci_config_io = Arc::new(Mutex::new(BusDevice::PioPciBus(PciConfigIo::new(Arc::clone(&segment.pci_bus))))); + + io_bus + .insert( + pci_config_io.clone(), + PCI_CONFIG_IO_PORT, + PCI_CONFIG_IO_PORT_SIZE, + ) + .map_err(|e| anyhow!("error adding pci bus to pio bus {e}"))?; + + segment.pci_config_io = Some(pci_config_io); + + Ok(segment) + } + + #[cfg(target_arch = "aarch64")] + pub(crate) fn new_default_segment( + address_manager: &Arc, + mem32_allocator: Arc>, + mem64_allocator: Arc>, + pci_irq_slots: &[u8; 32], + ) -> DeviceManagerResult { + Self::new( + 0, + 0, + address_manager, + mem32_allocator, + mem64_allocator, + pci_irq_slots, + ) + } + + pub(crate) fn next_device_bdf(&self) -> Result { + Ok(PciBdf::new( + self.id, + 0, + self.pci_bus + .lock() + .unwrap() + .next_device_id() + .map_err(|_e| anyhow!("error adding getting device id"))? as u8, + 0, + )) + } + + pub fn reserve_legacy_interrupts_for_pci_devices( + allocator: &Arc>, + pci_irq_slots: &mut [u8; 32], + ) -> Result<()> { + // Reserve 8 IRQs which will be shared across all PCI devices. 
+ let num_irqs = 8; + let mut irqs: Vec = Vec::new(); + for _ in 0..num_irqs { + irqs.push( + allocator + .lock() + .unwrap() + .allocate_irq() + .ok_or(anyhow!("error allocating irq"))? as u8, + ); + } + + // There are 32 devices on the PCI bus, let's assign them an IRQ. + for i in 0..32 { + pci_irq_slots[i] = irqs[i % num_irqs]; + } + + Ok(()) + } +} + +struct PciDevSlot { + device_id: u8, +} + +impl Aml for PciDevSlot { + fn append_aml_bytes(&self, v: &mut Vec) { + let sun = self.device_id; + let adr: u32 = (self.device_id as u32) << 16; + aml::Device::new( + format!("S{:03}", self.device_id).as_str().into(), + vec![ + &aml::Name::new("_SUN".into(), &sun), + &aml::Name::new("_ADR".into(), &adr), + &aml::Method::new( + "_EJ0".into(), + 1, + true, + vec![&aml::MethodCall::new( + "\\_SB_.PHPR.PCEJ".into(), + vec![&aml::Path::new("_SUN"), &aml::Path::new("_SEG")], + )], + ), + ], + ) + .append_aml_bytes(v) + } +} + +struct PciDevSlotNotify { + device_id: u8, +} + +impl Aml for PciDevSlotNotify { + fn append_aml_bytes(&self, v: &mut Vec) { + let device_id_mask: u32 = 1 << self.device_id; + let object = aml::Path::new(&format!("S{:03}", self.device_id)); + aml::And::new(&aml::Local(0), &aml::Arg(0), &device_id_mask).append_aml_bytes(v); + aml::If::new( + &aml::Equal::new(&aml::Local(0), &device_id_mask), + vec![&aml::Notify::new(&object, &aml::Arg(1))], + ) + .append_aml_bytes(v); + } +} + +struct PciDevSlotMethods {} + +impl Aml for PciDevSlotMethods { + fn append_aml_bytes(&self, v: &mut Vec) { + let mut device_notifies = Vec::new(); + for device_id in 0..32 { + device_notifies.push(PciDevSlotNotify { device_id }); + } + + let mut device_notifies_refs: Vec<&dyn Aml> = Vec::new(); + for device_notify in device_notifies.iter() { + device_notifies_refs.push(device_notify); + } + + aml::Method::new("DVNT".into(), 2, true, device_notifies_refs).append_aml_bytes(v); + aml::Method::new( + "PCNT".into(), + 0, + true, + vec![ + &aml::Acquire::new("\\_SB_.PHPR.BLCK".into(), 
0xffff), + &aml::Store::new(&aml::Path::new("\\_SB_.PHPR.PSEG"), &aml::Path::new("_SEG")), + &aml::MethodCall::new( + "DVNT".into(), + vec![&aml::Path::new("\\_SB_.PHPR.PCIU"), &aml::ONE], + ), + &aml::MethodCall::new( + "DVNT".into(), + vec![&aml::Path::new("\\_SB_.PHPR.PCID"), &3usize], + ), + &aml::Release::new("\\_SB_.PHPR.BLCK".into()), + ], + ) + .append_aml_bytes(v) + } +} + +struct PciDsmMethod {} + +impl Aml for PciDsmMethod { + fn append_aml_bytes(&self, v: &mut Vec) { + // Refer to ACPI spec v6.3 Ch 9.1.1 and PCI Firmware spec v3.3 Ch 4.6.1 + // _DSM (Device Specific Method), the following is the implementation in ASL. + /* + Method (_DSM, 4, NotSerialized) // _DSM: Device-Specific Method + { + If ((Arg0 == ToUUID ("e5c937d0-3553-4d7a-9117-ea4d19c3434d") /* Device Labeling Interface */)) + { + If ((Arg2 == Zero)) + { + Return (Buffer (One) { 0x21 }) + } + If ((Arg2 == 0x05)) + { + Return (Zero) + } + } + + Return (Buffer (One) { 0x00 }) + } + */ + /* + * As per ACPI v6.3 Ch 19.6.142, the UUID is required to be in mixed endian: + * Among the fields of a UUID: + * {d1 (8 digits)} - {d2 (4 digits)} - {d3 (4 digits)} - {d4 (16 digits)} + * d1 ~ d3 need to be little endian, d4 be big endian. + * See https://en.wikipedia.org/wiki/Universally_unique_identifier#Encoding . 
+ */ + let uuid = Uuid::parse_str("E5C937D0-3553-4D7A-9117-EA4D19C3434D").unwrap(); + let (uuid_d1, uuid_d2, uuid_d3, uuid_d4) = uuid.as_fields(); + let mut uuid_buf = vec![]; + uuid_buf.extend(uuid_d1.to_le_bytes()); + uuid_buf.extend(uuid_d2.to_le_bytes()); + uuid_buf.extend(uuid_d3.to_le_bytes()); + uuid_buf.extend(uuid_d4); + aml::Method::new( + "_DSM".into(), + 4, + false, + vec![ + &aml::If::new( + &aml::Equal::new(&aml::Arg(0), &aml::Buffer::new(uuid_buf)), + vec![ + &aml::If::new( + &aml::Equal::new(&aml::Arg(2), &aml::ZERO), + vec![&aml::Return::new(&aml::Buffer::new(vec![0x21]))], + ), + &aml::If::new( + &aml::Equal::new(&aml::Arg(2), &0x05u8), + vec![&aml::Return::new(&aml::ZERO)], + ), + ], + ), + &aml::Return::new(&aml::Buffer::new(vec![0])), + ], + ) + .append_aml_bytes(v) + } +} + +impl Aml for PciSegment { + fn append_aml_bytes(&self, v: &mut Vec) { + let mut pci_dsdt_inner_data: Vec<&dyn Aml> = Vec::new(); + let hid = aml::Name::new("_HID".into(), &aml::EisaName::new("PNP0A08")); + pci_dsdt_inner_data.push(&hid); + let cid = aml::Name::new("_CID".into(), &aml::EisaName::new("PNP0A03")); + pci_dsdt_inner_data.push(&cid); + let adr = aml::Name::new("_ADR".into(), &aml::ZERO); + pci_dsdt_inner_data.push(&adr); + let seg = aml::Name::new("_SEG".into(), &self.id); + pci_dsdt_inner_data.push(&seg); + let uid = aml::Name::new("_UID".into(), &aml::ZERO); + pci_dsdt_inner_data.push(&uid); + let cca = aml::Name::new("_CCA".into(), &aml::ONE); + pci_dsdt_inner_data.push(&cca); + let supp = aml::Name::new("SUPP".into(), &aml::ZERO); + pci_dsdt_inner_data.push(&supp); + + let proximity_domain = self.proximity_domain; + let pxm_return = aml::Return::new(&proximity_domain); + let pxm = aml::Method::new("_PXM".into(), 0, false, vec![&pxm_return]); + pci_dsdt_inner_data.push(&pxm); + + let pci_dsm = PciDsmMethod {}; + pci_dsdt_inner_data.push(&pci_dsm); + + #[allow(clippy::if_same_then_else)] + let crs = if self.id == 0 { + aml::Name::new( + "_CRS".into(), + 
&aml::ResourceTemplate::new(vec![ + &aml::AddressSpace::new_bus_number(0x0u16, 0x0u16), + #[cfg(target_arch = "x86_64")] + &aml::Io::new(0xcf8, 0xcf8, 1, 0x8), + &aml::Memory32Fixed::new( + true, + self.mmio_config_address as u32, + layout::PCI_MMIO_CONFIG_SIZE_PER_SEGMENT as u32, + ), + &aml::AddressSpace::new_memory( + aml::AddressSpaceCachable::NotCacheable, + true, + self.start_of_mem32_area, + self.end_of_mem32_area, + ), + &aml::AddressSpace::new_memory( + aml::AddressSpaceCachable::NotCacheable, + true, + self.start_of_mem64_area, + self.end_of_mem64_area, + ), + #[cfg(target_arch = "x86_64")] + &aml::AddressSpace::new_io(0u16, 0x0cf7u16), + #[cfg(target_arch = "x86_64")] + &aml::AddressSpace::new_io(0x0d00u16, 0xffffu16), + ]), + ) + } else { + aml::Name::new( + "_CRS".into(), + &aml::ResourceTemplate::new(vec![ + &aml::AddressSpace::new_bus_number(0x0u16, 0x0u16), + &aml::Memory32Fixed::new( + true, + self.mmio_config_address as u32, + layout::PCI_MMIO_CONFIG_SIZE_PER_SEGMENT as u32, + ), + &aml::AddressSpace::new_memory( + aml::AddressSpaceCachable::NotCacheable, + true, + self.start_of_mem32_area, + self.end_of_mem32_area, + ), + &aml::AddressSpace::new_memory( + aml::AddressSpaceCachable::NotCacheable, + true, + self.start_of_mem64_area, + self.end_of_mem64_area, + ), + ]), + ) + }; + pci_dsdt_inner_data.push(&crs); + + let mut pci_devices = Vec::new(); + for device_id in 0..32 { + let pci_device = PciDevSlot { device_id }; + pci_devices.push(pci_device); + } + for pci_device in pci_devices.iter() { + pci_dsdt_inner_data.push(pci_device); + } + + let pci_device_methods = PciDevSlotMethods {}; + pci_dsdt_inner_data.push(&pci_device_methods); + + // Build PCI routing table, listing IRQs assigned to PCI devices. 
+ let prt_package_list: Vec<(u32, u32)> = self + .pci_irq_slots + .iter() + .enumerate() + .map(|(i, irq)| (((((i as u32) & 0x1fu32) << 16) | 0xffffu32), *irq as u32)) + .collect(); + let prt_package_list: Vec = prt_package_list + .iter() + .map(|(bdf, irq)| aml::Package::new(vec![bdf, &0u8, &0u8, irq])) + .collect(); + let prt_package_list: Vec<&dyn Aml> = prt_package_list + .iter() + .map(|item| item as &dyn Aml) + .collect(); + let prt = aml::Name::new("_PRT".into(), &aml::Package::new(prt_package_list)); + pci_dsdt_inner_data.push(&prt); + + aml::Device::new( + format!("_SB_.PC{:02X}", self.id).as_str().into(), + pci_dsdt_inner_data, + ) + .append_aml_bytes(v) + } +} diff --git a/src/vmm/src/lib.rs b/src/vmm/src/lib.rs index 44e428fee80..3bb0f836a97 100644 --- a/src/vmm/src/lib.rs +++ b/src/vmm/src/lib.rs @@ -123,6 +123,7 @@ use std::time::Duration; use device_manager::acpi::ACPIDeviceManager; use device_manager::resources::ResourceAllocator; use devices::acpi::vmgenid::VmGenIdError; +use devices::pci_segment::PciSegment; use event_manager::{EventManager as BaseEventManager, EventOps, Events, MutEventSubscriber}; use seccompiler::BpfProgram; use userfaultfd::Uffd; @@ -327,6 +328,7 @@ pub struct Vmm { #[cfg(target_arch = "x86_64")] pio_device_manager: PortIODeviceManager, acpi_device_manager: ACPIDeviceManager, + pci_segment: PciSegment, } impl Vmm { From 615e9d9c601b9514ae90f9155b47a9c059cb1ee6 Mon Sep 17 00:00:00 2001 From: Riccardo Mancini Date: Tue, 22 Oct 2024 11:00:03 +0100 Subject: [PATCH 09/22] write PCI segment information into DSDT table Signed-off-by: Riccardo Mancini --- src/vmm/src/acpi/mod.rs | 7 ++++++- src/vmm/src/builder.rs | 1 + 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/src/vmm/src/acpi/mod.rs b/src/vmm/src/acpi/mod.rs index 102dbdc53f3..dc47a28141b 100644 --- a/src/vmm/src/acpi/mod.rs +++ b/src/vmm/src/acpi/mod.rs @@ -12,6 +12,7 @@ use crate::acpi::x86_64::{ use crate::device_manager::acpi::ACPIDeviceManager; use 
crate::device_manager::mmio::MMIODeviceManager; use crate::device_manager::resources::ResourceAllocator; +use crate::devices::pci_segment::PciSegment; use crate::vstate::memory::{GuestAddress, GuestMemoryMmap}; use crate::Vcpu; @@ -78,6 +79,7 @@ impl<'a> AcpiTableWriter<'a> { fn build_dsdt( &mut self, mmio_device_manager: &MMIODeviceManager, + pci_segment: &PciSegment, acpi_device_manager: &ACPIDeviceManager, ) -> Result { let mut dsdt_data = Vec::new(); @@ -91,6 +93,8 @@ impl<'a> AcpiTableWriter<'a> { // Architecture specific DSDT data setup_arch_dsdt(&mut dsdt_data); + pci_segment.append_aml_bytes(&mut dsdt_data); + let mut dsdt = Dsdt::new(OEM_ID, *b"FCVMDSDT", OEM_REVISION, dsdt_data); self.write_acpi_table(&mut dsdt) } @@ -177,6 +181,7 @@ pub(crate) fn create_acpi_tables( resource_allocator: &mut ResourceAllocator, mmio_device_manager: &MMIODeviceManager, acpi_device_manager: &ACPIDeviceManager, + pci_segment: &PciSegment, pci_mmio_config_addr: u64, vcpus: &[Vcpu], ) -> Result<(), AcpiError> { @@ -185,7 +190,7 @@ pub(crate) fn create_acpi_tables( resource_allocator, }; - let dsdt_addr = writer.build_dsdt(mmio_device_manager, acpi_device_manager)?; + let dsdt_addr = writer.build_dsdt(mmio_device_manager, pci_segment, acpi_device_manager)?; let fadt_addr = writer.build_fadt(dsdt_addr)?; let madt_addr = writer.build_madt(vcpus.len().try_into().unwrap())?; let mcfg_addr = writer.build_mcfg(pci_mmio_config_addr)?; diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index 240e96b29c8..f66c88c31a5 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -1116,6 +1116,7 @@ pub fn configure_system_for_boot( &mut vmm.resource_allocator, &vmm.mmio_device_manager, &vmm.acpi_device_manager, + &vmm.pci_segment, PCI_MMCONFIG_START, vcpus, )?; From 90215c71f4f45f7e81d9f4b8a359fa7be1ad974d Mon Sep 17 00:00:00 2001 From: Riccardo Mancini Date: Tue, 22 Oct 2024 11:06:52 +0100 Subject: [PATCH 10/22] remove IA-PC flags to fix missing MSI support in guest I 
don't think we need the other flags either. Signed-off-by: Riccardo Mancini --- src/vmm/src/acpi/x86_64.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/vmm/src/acpi/x86_64.rs b/src/vmm/src/acpi/x86_64.rs index 2f06c35264f..a328251d76b 100644 --- a/src/vmm/src/acpi/x86_64.rs +++ b/src/vmm/src/acpi/x86_64.rs @@ -33,11 +33,11 @@ pub(crate) fn setup_arch_fadt(fadt: &mut Fadt) { // neither do we support ASPM, or MSI type of interrupts. // More info here: // https://uefi.org/specs/ACPI/6.5/05_ACPI_Software_Programming_Model.html?highlight=0a06#ia-pc-boot-architecture-flags - fadt.setup_iapc_flags( - 1 << IAPC_BOOT_ARG_FLAGS_VGA_NOT_PRESENT - | 1 << IAPC_BOOT_ARG_FLAGS_PCI_ASPM - | 1 << IAPC_BOOT_ARG_FLAGS_MSI_NOT_PRESENT, - ); + // fadt.setup_iapc_flags( + // 1 << IAPC_BOOT_ARG_FLAGS_VGA_NOT_PRESENT + // | 1 << IAPC_BOOT_ARG_FLAGS_PCI_ASPM + // | 1 << IAPC_BOOT_ARG_FLAGS_MSI_NOT_PRESENT, + // ); } #[inline(always)] From 493ec5995db47cabf151d87e62465978f4e803ec Mon Sep 17 00:00:00 2001 From: Andrei Sandu Date: Wed, 30 Jun 2021 16:43:07 +0300 Subject: [PATCH 11/22] Enable MTRR in boot msrs The NVIDIA GPU driver won't work if CPUID does not present it. Signed-off-by: Andrei Sandu --- src/vmm/src/arch/x86_64/msr.rs | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/vmm/src/arch/x86_64/msr.rs b/src/vmm/src/arch/x86_64/msr.rs index 325d6ed6b29..9e5a358dc5c 100644 --- a/src/vmm/src/arch/x86_64/msr.rs +++ b/src/vmm/src/arch/x86_64/msr.rs @@ -394,6 +394,11 @@ pub fn get_msrs_to_dump(kvm_fd: &Kvm) -> Result { Ok(msr_index_list) } +/// IA32_MTRR_DEF_TYPE MSR: E (MTRRs enabled) flag, bit 11 +pub const MTRR_ENABLE: u64 = 0x800; +/// Default - writeback memory +pub const MTRR_MEM_TYPE_WB: u64 = 0x6; + /// Creates and populates required MSR entries for booting Linux on X86_64. 
pub fn create_boot_msr_entries() -> Vec { let msr_entry_default = |msr| kvm_msr_entry { @@ -419,6 +424,11 @@ pub fn create_boot_msr_entries() -> Vec { data: u64::from(MSR_IA32_MISC_ENABLE_FAST_STRING), ..Default::default() }, + kvm_msr_entry { + index: MSR_MTRRdefType, + data: u64::from(MTRR_ENABLE | MTRR_MEM_TYPE_WB), + ..Default::default() + }, ] } From 41e68b634d73967ba2c6d189bd84d1825c5e69b5 Mon Sep 17 00:00:00 2001 From: Riccardo Mancini Date: Fri, 25 Oct 2024 14:08:07 +0100 Subject: [PATCH 12/22] virtio-pci: preparation: implement debug for some types These types will get into types that derive Debug so we need to implement it. These are just placeholders to make the compiler happy; we'd need a better way to implement Debug for these types. Signed-off-by: Riccardo Mancini --- src/vm-device/src/interrupt/mod.rs | 8 +++++++- src/vm-system-allocator/src/system.rs | 9 +++++++++ src/vmm/src/interrupt.rs | 9 +++++++++ 3 files changed, 25 insertions(+), 1 deletion(-) diff --git a/src/vm-device/src/interrupt/mod.rs b/src/vm-device/src/interrupt/mod.rs index 7bdf7940a47..1f845b80197 100644 --- a/src/vm-device/src/interrupt/mod.rs +++ b/src/vm-device/src/interrupt/mod.rs @@ -57,7 +57,7 @@ //! * The virtual device backend requests the interrupt manager to create an interrupt group //! according to guest configuration information -use std::sync::Arc; +use std::{fmt::{Debug, Formatter}, sync::Arc}; use vmm_sys_util::eventfd::EventFd; /// Reuse std::io::Result to simplify interoperability among crates. @@ -147,6 +147,12 @@ pub trait InterruptManager: Send + Sync { fn destroy_group(&self, group: Arc) -> Result<()>; } +impl Debug for dyn InterruptManager { + fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { + write!(f, "MsiIrqInterruptManager") + } +} + pub trait InterruptSourceGroup: Send + Sync { /// Enable the interrupt sources in the group to generate interrupts.
fn enable(&self) -> Result<()> { diff --git a/src/vm-system-allocator/src/system.rs b/src/vm-system-allocator/src/system.rs index e4031bfa22a..f709ae101ad 100644 --- a/src/vm-system-allocator/src/system.rs +++ b/src/vm-system-allocator/src/system.rs @@ -7,6 +7,8 @@ // // SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause +use std::fmt::{Debug, Formatter}; + use vm_memory::{GuestAddress, GuestUsize}; use crate::address::AddressAllocator; @@ -48,6 +50,13 @@ pub struct SystemAllocator { gsi_allocator: GsiAllocator, } +impl Debug for SystemAllocator { + fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { + f.debug_struct("SystemAllocator") + .finish() + } +} + impl SystemAllocator { /// Creates a new `SystemAllocator` for managing addresses and irq numbers. /// Can return `None` if `base` + `size` overflows a u64 diff --git a/src/vmm/src/interrupt.rs b/src/vmm/src/interrupt.rs index 6218b27a9ff..e8b1e80516e 100644 --- a/src/vmm/src/interrupt.rs +++ b/src/vmm/src/interrupt.rs @@ -4,6 +4,7 @@ // use std::collections::HashMap; +use std::fmt::{Debug, Formatter}; use std::io; use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::{Arc, Mutex}; @@ -242,6 +243,14 @@ pub struct MsiInterruptManager { gsi_msi_routes: Arc>>>, } +impl Debug for MsiInterruptManager { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + // TODO + f.debug_struct("MsiInterruptManager") + .finish() + } +} + impl MsiInterruptManager { pub fn new(allocator: Arc>, vm: Arc>) -> Self { // Create a shared list of GSI that can be shared through all PCI From c077f0e2071c047bf77a739485f7c18b5d1e6ab3 Mon Sep 17 00:00:00 2001 From: Riccardo Mancini Date: Fri, 25 Oct 2024 14:12:55 +0100 Subject: [PATCH 13/22] virtio-pci: preparation: todo do not store irq state This will probably break snapshot resumption for now, but I didn't want to implement how to restore irq state when introducing the new VirtioInterrupt type. 
Signed-off-by: Riccardo Mancini --- src/vmm/src/devices/virtio/balloon/persist.rs | 4 ++-- src/vmm/src/devices/virtio/block/virtio/persist.rs | 2 +- src/vmm/src/devices/virtio/net/persist.rs | 2 +- src/vmm/src/devices/virtio/persist.rs | 4 ++-- src/vmm/src/devices/virtio/rng/persist.rs | 2 +- src/vmm/src/devices/virtio/vsock/persist.rs | 4 ++-- 6 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/vmm/src/devices/virtio/balloon/persist.rs b/src/vmm/src/devices/virtio/balloon/persist.rs index 4e768ddd2e2..1f051262f01 100644 --- a/src/vmm/src/devices/virtio/balloon/persist.rs +++ b/src/vmm/src/devices/virtio/balloon/persist.rs @@ -138,8 +138,8 @@ impl Persist<'_> for Balloon { FIRECRACKER_MAX_QUEUE_SIZE, ) .map_err(|_| Self::Error::QueueRestoreError)?; - balloon.irq_trigger.irq_status = - Arc::new(AtomicU32::new(state.virtio_state.interrupt_status)); + // balloon.irq_trigger.irq_status = + // Arc::new(AtomicU32::new(state.virtio_state.interrupt_status)); balloon.avail_features = state.virtio_state.avail_features; balloon.acked_features = state.virtio_state.acked_features; balloon.latest_stats = state.latest_stats.create_stats(); diff --git a/src/vmm/src/devices/virtio/block/virtio/persist.rs b/src/vmm/src/devices/virtio/block/virtio/persist.rs index 61bffbeaa40..5955b18a0e0 100644 --- a/src/vmm/src/devices/virtio/block/virtio/persist.rs +++ b/src/vmm/src/devices/virtio/block/virtio/persist.rs @@ -131,7 +131,7 @@ impl Persist<'_> for VirtioBlock { queues, queue_evts, device_state, - irq_trigger, + virtio_interrupt: Some(Arc::new(irq_trigger)), id: state.id.clone(), partuuid: state.partuuid.clone(), diff --git a/src/vmm/src/devices/virtio/net/persist.rs b/src/vmm/src/devices/virtio/net/persist.rs index 51394af2d4e..4e4439c21e7 100644 --- a/src/vmm/src/devices/virtio/net/persist.rs +++ b/src/vmm/src/devices/virtio/net/persist.rs @@ -149,7 +149,7 @@ impl Persist<'_> for Net { NET_NUM_QUEUES, FIRECRACKER_MAX_QUEUE_SIZE, )?; - net.irq_trigger.irq_status = 
Arc::new(AtomicU32::new(state.virtio_state.interrupt_status)); + // net.irq_trigger.irq_status = Arc::new(AtomicU32::new(state.virtio_state.interrupt_status)); net.avail_features = state.virtio_state.avail_features; net.acked_features = state.virtio_state.acked_features; diff --git a/src/vmm/src/devices/virtio/persist.rs b/src/vmm/src/devices/virtio/persist.rs index 23293a25eab..27fd73670a0 100644 --- a/src/vmm/src/devices/virtio/persist.rs +++ b/src/vmm/src/devices/virtio/persist.rs @@ -12,7 +12,7 @@ use serde::{Deserialize, Serialize}; use super::queue::QueueError; use crate::devices::virtio::device::VirtioDevice; use crate::devices::virtio::gen::virtio_ring::VIRTIO_RING_F_EVENT_IDX; -use crate::devices::virtio::mmio::MmioTransport; +use crate::devices::virtio::transport::MmioTransport; use crate::devices::virtio::queue::Queue; use crate::snapshot::Persist; use crate::vstate::memory::{GuestAddress, GuestMemoryMmap}; @@ -261,7 +261,7 @@ mod tests { use crate::devices::virtio::block::virtio::device::FileEngineType; use crate::devices::virtio::block::virtio::test_utils::default_block_with_path; use crate::devices::virtio::block::virtio::VirtioBlock; - use crate::devices::virtio::mmio::tests::DummyDevice; + use crate::devices::virtio::transport::mmio::tests::DummyDevice; use crate::devices::virtio::net::test_utils::default_net; use crate::devices::virtio::net::Net; use crate::devices::virtio::test_utils::default_mem; diff --git a/src/vmm/src/devices/virtio/rng/persist.rs b/src/vmm/src/devices/virtio/rng/persist.rs index 4aa9e449344..7f6f0b3e91d 100644 --- a/src/vmm/src/devices/virtio/rng/persist.rs +++ b/src/vmm/src/devices/virtio/rng/persist.rs @@ -66,7 +66,7 @@ impl Persist<'_> for Entropy { let mut entropy = Entropy::new_with_queues(queues, rate_limiter)?; entropy.set_avail_features(state.virtio_state.avail_features); entropy.set_acked_features(state.virtio_state.acked_features); - entropy.set_irq_status(state.virtio_state.interrupt_status); + // 
entropy.set_irq_status(state.virtio_state.interrupt_status); if state.virtio_state.activated { entropy.set_activated(constructor_args.0); } diff --git a/src/vmm/src/devices/virtio/vsock/persist.rs b/src/vmm/src/devices/virtio/vsock/persist.rs index dce545fd68d..2b8a3109df7 100644 --- a/src/vmm/src/devices/virtio/vsock/persist.rs +++ b/src/vmm/src/devices/virtio/vsock/persist.rs @@ -121,8 +121,8 @@ where vsock.acked_features = state.virtio_state.acked_features; vsock.avail_features = state.virtio_state.avail_features; - vsock.irq_trigger.irq_status = - Arc::new(AtomicU32::new(state.virtio_state.interrupt_status)); + // vsock.irq_trigger.irq_status = + // Arc::new(AtomicU32::new(state.virtio_state.interrupt_status)); vsock.device_state = if state.virtio_state.activated { DeviceState::Activated(constructor_args.mem) } else { From 172c701e36e5cf4822838b82113bd1210430d9fe Mon Sep 17 00:00:00 2001 From: Riccardo Mancini Date: Fri, 25 Oct 2024 14:16:18 +0100 Subject: [PATCH 14/22] virtio-pci: preparation: move mmio into transport crate This crate will host the new VirtioPciDevice as well. 
Signed-off-by: Riccardo Mancini --- src/vmm/src/builder.rs | 2 +- src/vmm/src/device_manager/mmio.rs | 2 +- src/vmm/src/device_manager/persist.rs | 2 +- src/vmm/src/devices/virtio/device.rs | 2 +- src/vmm/src/devices/virtio/mod.rs | 2 +- .../devices/virtio/{ => transport}/mmio.rs | 26 ++++++++++--------- src/vmm/src/devices/virtio/transport/mod.rs | 11 ++++++++ 7 files changed, 30 insertions(+), 17 deletions(-) rename src/vmm/src/devices/virtio/{ => transport}/mmio.rs (97%) create mode 100644 src/vmm/src/devices/virtio/transport/mod.rs diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index 6fa5d289566..24bbdc496e6 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -63,7 +63,7 @@ use pci::{PciBus, PciConfigIo, PciConfigMmio, PciRoot}; use crate::devices::virtio::balloon::Balloon; use crate::devices::virtio::block::device::Block; use crate::devices::virtio::device::VirtioDevice; -use crate::devices::virtio::mmio::MmioTransport; +use crate::devices::virtio::transport::MmioTransport; use crate::devices::virtio::net::Net; use crate::devices::virtio::rng::Entropy; use crate::devices::virtio::vsock::{Vsock, VsockUnixBackend}; diff --git a/src/vmm/src/device_manager/mmio.rs b/src/vmm/src/device_manager/mmio.rs index 9a1f2cd505c..4c6d0e06b0e 100644 --- a/src/vmm/src/device_manager/mmio.rs +++ b/src/vmm/src/device_manager/mmio.rs @@ -30,7 +30,7 @@ use crate::devices::pseudo::BootTimer; use crate::devices::virtio::balloon::Balloon; use crate::devices::virtio::block::device::Block; use crate::devices::virtio::device::VirtioDevice; -use crate::devices::virtio::mmio::MmioTransport; +use crate::devices::virtio::transport::MmioTransport; use crate::devices::virtio::net::Net; use crate::devices::virtio::rng::Entropy; use crate::devices::virtio::vsock::{Vsock, VsockUnixBackend, TYPE_VSOCK}; diff --git a/src/vmm/src/device_manager/persist.rs b/src/vmm/src/device_manager/persist.rs index 5773fa0ba09..134a1d3a5ae 100644 --- 
a/src/vmm/src/device_manager/persist.rs +++ b/src/vmm/src/device_manager/persist.rs @@ -24,7 +24,7 @@ use crate::devices::virtio::block::device::Block; use crate::devices::virtio::block::persist::{BlockConstructorArgs, BlockState}; use crate::devices::virtio::block::BlockError; use crate::devices::virtio::device::VirtioDevice; -use crate::devices::virtio::mmio::MmioTransport; +use crate::devices::virtio::transport::MmioTransport; use crate::devices::virtio::net::persist::{ NetConstructorArgs, NetPersistError as NetError, NetState, }; diff --git a/src/vmm/src/devices/virtio/device.rs b/src/vmm/src/devices/virtio/device.rs index b5af6862af9..50f7adb79b9 100644 --- a/src/vmm/src/devices/virtio/device.rs +++ b/src/vmm/src/devices/virtio/device.rs @@ -11,7 +11,7 @@ use std::sync::Arc; use vmm_sys_util::eventfd::EventFd; -use super::mmio::{VIRTIO_MMIO_INT_CONFIG, VIRTIO_MMIO_INT_VRING}; +use super::transport::mmio::{VIRTIO_MMIO_INT_CONFIG, VIRTIO_MMIO_INT_VRING}; use super::queue::{Queue, QueueError}; use super::ActivateError; use crate::devices::virtio::AsAny; diff --git a/src/vmm/src/devices/virtio/mod.rs b/src/vmm/src/devices/virtio/mod.rs index 9931e1211d1..1f5bed67a7f 100644 --- a/src/vmm/src/devices/virtio/mod.rs +++ b/src/vmm/src/devices/virtio/mod.rs @@ -18,12 +18,12 @@ pub mod device; pub mod gen; mod iov_deque; pub mod iovec; -pub mod mmio; pub mod net; pub mod persist; pub mod queue; pub mod rng; pub mod test_utils; +pub mod transport; pub mod vhost_user; pub mod vhost_user_metrics; pub mod vsock; diff --git a/src/vmm/src/devices/virtio/mmio.rs b/src/vmm/src/devices/virtio/transport/mmio.rs similarity index 97% rename from src/vmm/src/devices/virtio/mmio.rs rename to src/vmm/src/devices/virtio/transport/mmio.rs index 463d11ca2e2..58c148a9cc1 100644 --- a/src/vmm/src/devices/virtio/mmio.rs +++ b/src/vmm/src/devices/virtio/transport/mmio.rs @@ -9,7 +9,7 @@ use std::fmt::Debug; use std::sync::atomic::{AtomicU32, Ordering}; use std::sync::{Arc, Mutex, MutexGuard}; 
-use crate::devices::virtio::device::{IrqType, VirtioDevice}; +use crate::devices::virtio::device::{IrqType, VirtioDevice, VirtioInterruptType}; use crate::devices::virtio::device_status; use crate::devices::virtio::queue::Queue; use crate::logger::{error, warn}; @@ -187,7 +187,7 @@ impl MmioTransport { let device_activated = self.locked_device().is_activated(); if !device_activated && self.are_queues_valid() { // temporary variable needed for borrow checker - let activate_result = self.locked_device().activate(self.mem.clone()); + let activate_result = self.locked_device().activate(self.mem.clone(), None); if let Err(err) = activate_result { self.device_status |= DEVICE_NEEDS_RESET; @@ -195,8 +195,8 @@ impl MmioTransport { // configuration change interrupt let _ = self .locked_device() - .interrupt_trigger() - .trigger_irq(IrqType::Config); + .interrupt() + .trigger(VirtioInterruptType::Config); error!("Failed to activate virtio device: {}", err) } @@ -373,10 +373,11 @@ impl MmioTransport { #[cfg(test)] pub(crate) mod tests { + use aes_gcm::aes::cipher::inout::IntoArrayError; use vmm_sys_util::eventfd::EventFd; use super::*; - use crate::devices::virtio::device::IrqTrigger; + use crate::devices::virtio::device::{IrqTrigger, VirtioInterrupt, VirtioInterruptType}; use crate::devices::virtio::device_status::DEVICE_NEEDS_RESET; use crate::devices::virtio::ActivateError; use crate::test_utils::single_region_mem; @@ -388,7 +389,7 @@ pub(crate) mod tests { pub(crate) struct DummyDevice { acked_features: u64, avail_features: u64, - interrupt_trigger: IrqTrigger, + virtio_interrupt: Arc, queue_evts: Vec, queues: Vec, device_activated: bool, @@ -401,7 +402,7 @@ pub(crate) mod tests { DummyDevice { acked_features: 0, avail_features: 0, - interrupt_trigger: IrqTrigger::new().unwrap(), + virtio_interrupt: Arc::new(IrqTrigger::new().unwrap()), queue_evts: vec![ EventFd::new(libc::EFD_NONBLOCK).unwrap(), EventFd::new(libc::EFD_NONBLOCK).unwrap(), @@ -447,8 +448,8 @@ pub(crate) 
mod tests { &self.queue_evts } - fn interrupt_trigger(&self) -> &IrqTrigger { - &self.interrupt_trigger + fn interrupt(&self) -> Arc { + self.virtio_interrupt.clone() } fn read_config(&self, offset: u64, data: &mut [u8]) { @@ -461,7 +462,7 @@ pub(crate) mod tests { } } - fn activate(&mut self, _: GuestMemoryMmap) -> Result<(), ActivateError> { + fn activate(&mut self, _: GuestMemoryMmap, _: Option>) -> Result<(), ActivateError> { self.device_activated = true; if self.activate_should_error { Err(ActivateError::EventFd) @@ -892,8 +893,9 @@ pub(crate) mod tests { // We actually wrote to the eventfd assert_eq!( d.locked_device() - .interrupt_trigger() - .irq_evt + .interrupt() + .notifier(VirtioInterruptType::Config) + .unwrap() .read() .unwrap(), 1 diff --git a/src/vmm/src/devices/virtio/transport/mod.rs b/src/vmm/src/devices/virtio/transport/mod.rs new file mode 100644 index 00000000000..555a4a45a64 --- /dev/null +++ b/src/vmm/src/devices/virtio/transport/mod.rs @@ -0,0 +1,11 @@ +// Copyright © 2019 Intel Corporation +// +// SPDX-License-Identifier: Apache-2.0 + +use vmm_sys_util::eventfd::EventFd; +// mod pci_common_config; +// mod pci_device; +pub(crate) mod mmio; +pub use mmio::MmioTransport; +pub use pci_common_config::{VirtioPciCommonConfig, VIRTIO_PCI_COMMON_CONFIG_ID}; +pub use pci_device::{VirtioPciDevice, VirtioPciDeviceError}; From b8b82f01e7772a533834624a8113e633c07d5a7f Mon Sep 17 00:00:00 2001 From: Riccardo Mancini Date: Fri, 25 Oct 2024 14:19:04 +0100 Subject: [PATCH 15/22] virtio-pci: preparation: prepare bus for device relocation When relocating a BAR we need to remove and re-insert a bus device into a different address range. 
Signed-off-by: Riccardo Mancini --- src/vmm/src/device_manager/legacy.rs | 16 +++++---- src/vmm/src/device_manager/mmio.rs | 4 +-- src/vmm/src/devices/bus.rs | 51 +++++++++++++++++++++++----- 3 files changed, 54 insertions(+), 17 deletions(-) diff --git a/src/vmm/src/device_manager/legacy.rs b/src/vmm/src/device_manager/legacy.rs index 7130eb44b1f..77742e8bb75 100644 --- a/src/vmm/src/device_manager/legacy.rs +++ b/src/vmm/src/device_manager/legacy.rs @@ -38,7 +38,7 @@ pub struct PortIODeviceManager { pub stdio_serial: Arc>, // BusDevice::I8042Device pub i8042: Arc>, - pub pci_bus: Arc>, + pub pci_bus: Option>>, // Communication event on ports 1 & 3. pub com_evt_1_3: EventFdTrigger, @@ -75,7 +75,6 @@ impl PortIODeviceManager { pub fn new( serial: Arc>, i8042_reset_evfd: EventFd, - pci_bus: Arc>, ) -> Result { debug_assert!(matches!(*serial.lock().unwrap(), BusDevice::Serial(_))); let io_bus = crate::devices::Bus::new(); @@ -98,13 +97,17 @@ impl PortIODeviceManager { io_bus, stdio_serial: serial, i8042, - pci_bus, + pci_bus: None, com_evt_1_3, com_evt_2_4, kbd_evt, }) } + pub fn put_pci_bus(&mut self, pci_bus: Arc>) { + self.pci_bus = Some(pci_bus); + } + /// Register supported legacy devices. 
pub fn register_devices(&mut self, vm_fd: &VmFd) -> Result<(), LegacyDeviceError> { let serial_2_4 = Arc::new(Mutex::new(BusDevice::Serial(SerialDevice { @@ -127,11 +130,13 @@ impl PortIODeviceManager { ), input: None, }))); - self.io_bus.insert( - self.pci_bus.clone(), + if let Some(ref pci_bus) = self.pci_bus { + self.io_bus.insert( + pci_bus.clone(), 0xcf8, 0x8 )?; + } self.io_bus.insert( self.stdio_serial.clone(), Self::SERIAL_PORT_ADDRESSES[0], @@ -268,7 +273,6 @@ mod tests { input: None, }))), EventFd::new(libc::EFD_NONBLOCK).unwrap(), - Arc::new(Mutex::new(BusDevice::Dummy(DummyDevice{}))) ) .unwrap(); ldm.register_devices(vm.fd()).unwrap(); diff --git a/src/vmm/src/device_manager/mmio.rs b/src/vmm/src/device_manager/mmio.rs index 4c6d0e06b0e..0c94f789610 100644 --- a/src/vmm/src/device_manager/mmio.rs +++ b/src/vmm/src/device_manager/mmio.rs @@ -374,7 +374,7 @@ impl MMIODeviceManager { &self, device_type: DeviceType, device_id: &str, - ) -> Option<&Mutex> { + ) -> Option>> { if let Some(device_info) = self .id_to_dev_info .get(&(device_type, device_id.to_string())) @@ -389,7 +389,7 @@ impl MMIODeviceManager { /// Run fn for each registered device. pub fn for_each_device(&self, mut f: F) -> Result<(), E> where - F: FnMut(&DeviceType, &String, &MMIODeviceInfo, &Mutex) -> Result<(), E>, + F: FnMut(&DeviceType, &String, &MMIODeviceInfo, Arc>) -> Result<(), E>, { for ((device_type, device_id), device_info) in self.get_device_info().iter() { let bus_device = self diff --git a/src/vmm/src/devices/bus.rs b/src/vmm/src/devices/bus.rs index caf3c174ba1..e8769b7d0a0 100644 --- a/src/vmm/src/devices/bus.rs +++ b/src/vmm/src/devices/bus.rs @@ -11,13 +11,15 @@ use std::any::Any; use std::cmp::{Ord, Ordering, PartialEq, PartialOrd}; use std::collections::btree_map::BTreeMap; use std::result::Result; -use std::sync::{Arc, Barrier, Mutex}; +use std::sync::{Arc, Barrier, Mutex, RwLock}; /// Errors triggered during bus operations. 
#[derive(Debug, thiserror::Error, displaydoc::Display)] pub enum BusError { /// The insertion failed because the new device overlapped with an old device. Overlap, + /// The relocation failed because no device was mapped at the address + MissingAddressRange, } #[derive(Debug, Copy, Clone)] @@ -49,7 +51,7 @@ impl PartialOrd for BusRange { /// only restriction is that no two devices can overlap in this address space. #[derive(Debug, Clone, Default)] pub struct Bus { - devices: BTreeMap>>, + devices: Arc>>>>, } use event_manager::{EventOps, Events, MutEventSubscriber}; @@ -364,21 +366,21 @@ impl Bus { /// Constructs an a bus with an empty address space. pub fn new() -> Bus { Bus { - devices: BTreeMap::new(), + devices: Arc::new(RwLock::new(BTreeMap::new())), } } - fn first_before(&self, addr: u64) -> Option<(BusRange, &Mutex)> { + fn first_before(&self, addr: u64) -> Option<(BusRange, Arc>)> { // for when we switch to rustc 1.17: self.devices.range(..addr).iter().rev().next() - for (range, dev) in self.devices.iter().rev() { + for (range, dev) in self.devices.read().unwrap().iter().rev() { if range.0 <= addr { - return Some((*range, dev)); + return Some((*range, dev.clone())); } } None } - pub fn get_device(&self, addr: u64) -> Option<(u64, u64, &Mutex)> { + pub fn get_device(&self, addr: u64) -> Option<(u64, u64, Arc>)> { if let Some((BusRange(start, len), dev)) = self.first_before(addr) { let offset = addr - start; if offset < len { @@ -390,7 +392,7 @@ impl Bus { /// Puts the given device at the given address space. 
pub fn insert( - &mut self, + &self, device: Arc>, base: u64, len: u64, @@ -416,13 +418,21 @@ impl Bus { } } - if self.devices.insert(BusRange(base, len), device).is_some() { + if self.devices.write().unwrap().insert(BusRange(base, len), device).is_some() { return Err(BusError::Overlap); } Ok(()) } + pub fn remove(&self, base: u64, len: u64) -> Result<(), BusError> { + let range = BusRange(base, len); + if self.devices.write().unwrap().remove(&range).is_none() { + return Err(BusError::MissingAddressRange); + } + Ok(()) + } + /// Reads data from the device that owns the range containing `addr` and puts it into `data`. /// /// Returns true on success, otherwise `data` is untouched. @@ -452,6 +462,29 @@ impl Bus { false } } + + /// Updates the address range for an existing device. + pub fn update_range( + &self, + old_base: u64, + old_len: u64, + new_base: u64, + new_len: u64, + ) -> Result<(), BusError> { + // Retrieve the device corresponding to the range + let device = if let Some((_, _, dev)) = self.get_device(old_base) { + dev.clone() + } else { + return Err(BusError::MissingAddressRange); + }; + + // Remove the old address range + self.remove(old_base, old_len)?; + + // Insert the new address range + self.insert(device, new_base, new_len) + } + } #[cfg(test)] From e9129c0ef497cb3ac35589a43f7d9b087bd66628 Mon Sep 17 00:00:00 2001 From: Riccardo Mancini Date: Fri, 25 Oct 2024 14:23:35 +0100 Subject: [PATCH 16/22] virtio-pci: implement device relocation I've introduced a new struct AddressManager, but it's not yet integrated with Vmm. It should probably be added as a member of the Vmm struct. 
Signed-off-by: Riccardo Mancini --- src/vmm/src/builder.rs | 165 ++++++++++++++++++----------- src/vmm/src/devices/pci_segment.rs | 115 +++++++++----------- src/vmm/src/lib.rs | 146 ++++++++++++++++++++++++- 3 files changed, 298 insertions(+), 128 deletions(-) diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index 24bbdc496e6..e3142836b37 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -15,7 +15,7 @@ use std::sync::{Arc, Mutex}; use event_manager::{MutEventSubscriber, SubscriberOps}; use kvm_bindings::{kvm_create_device, kvm_device_type_KVM_DEV_TYPE_VFIO}; -use kvm_ioctls::{DeviceFd, VmFd}; +use kvm_ioctls::{DeviceFd, IoEventAddress, NoDatamatch, VmFd}; use libc::EFD_NONBLOCK; use linux_loader::cmdline::Cmdline as LoaderKernelCmdline; #[cfg(target_arch = "x86_64")] @@ -23,7 +23,7 @@ use linux_loader::loader::elf::Elf as Loader; #[cfg(target_arch = "aarch64")] use linux_loader::loader::pe::PE as Loader; use linux_loader::loader::KernelLoader; -use pci::{DeviceRelocation, PciBarConfiguration, PciBarRegionType, PciDevice, VfioPciDevice}; +use pci::{DeviceRelocation, PciBarConfiguration, PciBarRegionType, PciBdf, PciDevice, VfioPciDevice}; use seccompiler::BpfThreadMap; use userfaultfd::Uffd; use utils::time::TimestampUs; @@ -67,7 +67,7 @@ use crate::devices::virtio::transport::MmioTransport; use crate::devices::virtio::net::Net; use crate::devices::virtio::rng::Entropy; use crate::devices::virtio::vsock::{Vsock, VsockUnixBackend}; -use crate::devices::{Bus, BusDevice}; +use crate::devices::{virtio, Bus, BusDevice}; #[cfg(feature = "gdb")] use crate::gdb; use crate::interrupt::MsiInterruptManager; @@ -82,7 +82,7 @@ use crate::vmm_config::machine_config::{VmConfig, VmConfigError}; use crate::vstate::memory::{GuestAddress, GuestMemory, GuestMemoryMmap, GuestMemoryRegion}; use crate::vstate::vcpu::{Vcpu, VcpuConfig, VcpuError}; use crate::vstate::vm::Vm; -use crate::{device_manager, EventManager, Vmm, VmmError}; +use crate::{device_manager, 
AddressManager, EventManager, Vmm, VmmError}; /// Errors associated with starting the instance. #[derive(Debug, thiserror::Error, displaydoc::Display)] @@ -149,6 +149,8 @@ pub enum StartMicrovmError { /// Error cloning Vcpu fds #[cfg(feature = "gdb")] VcpuFdCloneError(#[from] crate::vstate::vcpu::CopyKvmFdError), + /// TODO + Unknown, } /// It's convenient to automatically convert `linux_loader::cmdline::Error`s @@ -195,6 +197,49 @@ fn register_pci_device_mapping( Ok(()) } +fn add_pci_device( + bus_device: Arc>, + pci_segment: &PciSegment, + dev_manager: &mut MMIODeviceManager, + pio_manager: &mut PortIODeviceManager, + allocator: Arc>, + bdf: PciBdf, +) -> Result<(), VmmError> { + let bars = bus_device.lock().unwrap().pci_device_mut().unwrap() + .allocate_bars( + &allocator, + &mut pci_segment + .mem32_allocator + .lock() + .unwrap(), + &mut pci_segment + .mem64_allocator + .lock() + .unwrap(), + None, + ) + .map_err(|_| VmmError::Unknown)?; + + let mut pci_bus = pci_segment + .pci_bus + .lock() + .unwrap(); + + pci_bus + .add_device(bdf.device() as u32, bus_device.clone()) + .map_err(|_| VmmError::Unknown)?; + + register_pci_device_mapping( + bus_device, + #[cfg(target_arch = "x86_64")] + &mut pio_manager.io_bus, + &mut dev_manager.bus, + bars.clone() + )?; + + Ok(()) +} + fn add_vfio_device( vm: Arc>, fd: DeviceFd, @@ -267,21 +312,18 @@ fn add_vfio_device( ).unwrap()); let vfio_pci_device = Arc::new(Mutex::new(vfio_pci_device)); - let bars = vfio_pci_device - .lock() - .expect("bad lock") - .vfio_pci_device_mut() - .unwrap() - .allocate_bars( - &allocator, - &mut pci_segment.mem32_allocator.lock().unwrap(), - &mut pci_segment.mem64_allocator.lock().unwrap(), - None, - ) - .unwrap(); + + add_pci_device( + vfio_pci_device.clone(), + pci_segment, + dev_manager, + pio_manager, + allocator.clone(), + pci_device_bdf.into() + ).unwrap(); // Register DMA mapping in IOMMU. 
- for (index, region) in memory.iter().enumerate() { + for (_index, region) in memory.iter().enumerate() { info!( "Mapping DMA for {:x} len {:x} at hva {:x}", region.start_addr().0, @@ -305,27 +347,6 @@ fn add_vfio_device( // ) } - vfio_pci_device - .lock() - .expect("bad lock") - .vfio_pci_device_mut() - .unwrap() - .map_mmio_regions() - .unwrap(); - - pci_segment.pci_bus.lock() - .expect("bad lock") - .add_device(pci_device_id, vfio_pci_device.clone()) - .unwrap(); - - register_pci_device_mapping( - vfio_pci_device.clone(), - #[cfg(target_arch = "x86_64")] - &mut pio_manager.io_bus, - &mut dev_manager.bus, - bars.clone(), - ).unwrap(); - // Need to register bus mappings ? } @@ -432,14 +453,6 @@ fn create_vmm_and_vcpus( // } let pci_irq_slots: [u8; 32] = [(NUM_IOAPIC_PINS-1) as u8; 32]; - let pci_segment = PciSegment::new( - 0, - 0, - pci_mmio32_allocator, - pci_mmio64_allocator, - &mut mmio_device_manager.bus, - &pci_irq_slots, - ).unwrap(); // For x86_64 we need to create the interrupt controller before calling `KVM_CREATE_VCPUS` // while on aarch64 we need to do it the other way around. @@ -461,22 +474,35 @@ fn create_vmm_and_vcpus( .map_err(VmmError::EventFd) .map_err(Internal)?; - let pci_config_io = Arc::new(Mutex::new(BusDevice::PioPciBus(PciConfigIo::new(Arc::clone(&pci_segment.pci_bus))))); - - // create pio dev manager with legacy devices - let pio_device_manager = { - // TODO Remove these unwraps. - let mut pio_dev_mgr = PortIODeviceManager::new(serial_device, reset_evt, pci_config_io).unwrap(); - pio_dev_mgr.register_devices(vm.fd()).unwrap(); - pio_dev_mgr - }; + let pio_dev_mgr = PortIODeviceManager::new(serial_device, reset_evt).unwrap(); - (vcpus, pio_device_manager) + (vcpus, pio_dev_mgr) }; - // Create passthru device for a GPU. 
- let device_fd = create_passthrough_device(vm.fd()); + let address_manager = Arc::new(AddressManager{ + allocator: allocator.clone(), + io_bus: Arc::new(pio_device_manager.io_bus.clone()), + mmio_bus: Arc::new(mmio_device_manager.bus.clone()), + vm: vm_fd, + pci_mmio32_allocators: vec!(pci_mmio32_allocator.clone()), + pci_mmio64_allocators: vec!(pci_mmio64_allocator.clone()), + }); + + let pci_segment = PciSegment::new( + 0, + 0, + pci_mmio32_allocator, + pci_mmio64_allocator, + &mut mmio_device_manager.bus, + &pci_irq_slots, + address_manager, + ).unwrap(); + + let pci_config_io = Arc::new(Mutex::new(BusDevice::PioPciBus(PciConfigIo::new(Arc::clone(&pci_segment.pci_bus))))); + pio_device_manager.put_pci_bus(pci_config_io); + pio_device_manager.register_devices(vm.fd()).unwrap(); + add_vfio_device( Arc::clone(&vm_fd), @@ -1379,6 +1405,24 @@ pub mod tests { .unwrap() } + struct DummyDeviceRelocation; + impl DeviceRelocation for DummyDeviceRelocation { + fn move_bar( + &self, + old_base: u64, + new_base: u64, + len: u64, + _pci_dev: &mut dyn PciDevice, + _region_type: PciBarRegionType, + ) -> std::result::Result<(), io::Error> { + error!( + "Failed moving device BAR: 0x{:x}->0x{:x}(0x{:x})", + old_base, new_base, len + ); + Ok(()) + } + } + pub(crate) fn default_vmm() -> Vmm { let guest_memory = arch_mem(128 << 20); @@ -1387,11 +1431,10 @@ pub mod tests { .map_err(StartMicrovmError::Internal) .unwrap(); - let (mut vm, _) = Vm::new(vec![]).unwrap(); + let (mut vm, extra_fd) = Vm::new(vec![]).unwrap(); vm.memory_init(&guest_memory, false).unwrap(); let mut mmio_device_manager = MMIODeviceManager::new(); let acpi_device_manager = ACPIDeviceManager::new(); - let pci_bus = Arc::new(Mutex::new(BusDevice::Dummy(DummyDevice{}))); #[cfg(target_arch = "x86_64")] let pio_device_manager = PortIODeviceManager::new( Arc::new(Mutex::new(BusDevice::Serial(SerialWrapper { @@ -1405,7 +1448,6 @@ pub mod tests { input: None, }))), EventFd::new(libc::EFD_NONBLOCK).unwrap(), - pci_bus, ) 
.unwrap(); @@ -1438,6 +1480,7 @@ pub mod tests { pci_mmio64_allocator, &mut mmio_device_manager.bus, &[0u8; 32], + Arc::new(DummyDeviceRelocation{}), ).unwrap(); Vmm { diff --git a/src/vmm/src/devices/pci_segment.rs b/src/vmm/src/devices/pci_segment.rs index dc8fd9ed8c0..b8894fe8b8f 100644 --- a/src/vmm/src/devices/pci_segment.rs +++ b/src/vmm/src/devices/pci_segment.rs @@ -59,24 +59,6 @@ impl Debug for PciSegment { } } -struct DummyDeviceRelocation; -impl DeviceRelocation for DummyDeviceRelocation { - fn move_bar( - &self, - old_base: u64, - new_base: u64, - len: u64, - _pci_dev: &mut dyn PciDevice, - _region_type: PciBarRegionType, - ) -> std::result::Result<(), io::Error> { - error!( - "Failed moving device BAR: 0x{:x}->0x{:x}(0x{:x})", - old_base, new_base, len - ); - Ok(()) - } -} - impl PciSegment { pub(crate) fn new( id: u16, @@ -85,11 +67,12 @@ impl PciSegment { mem64_allocator: Arc>, mmio_bus: &mut Bus, pci_irq_slots: &[u8; 32], + device_relocation: Arc, ) -> Result { let pci_root = PciRoot::new(None); let pci_bus = Arc::new(Mutex::new(PciBus::new( pci_root, - Arc::new(DummyDeviceRelocation{}) + device_relocation, ))); let pci_config_mmio = Arc::new(Mutex::new(BusDevice::MmioPciBus(PciConfigMmio::new(Arc::clone(&pci_bus))))); @@ -136,53 +119,53 @@ impl PciSegment { Ok(segment) } - #[cfg(target_arch = "x86_64")] - pub(crate) fn new_default_segment( - mem32_allocator: Arc>, - mem64_allocator: Arc>, - mmio_bus: &mut Bus, - io_bus: &mut Bus, - pci_irq_slots: &[u8; 32], - ) -> Result { - let mut segment = Self::new( - 0, - 0, - mem32_allocator, - mem64_allocator, - mmio_bus, - pci_irq_slots, - )?; - let pci_config_io = Arc::new(Mutex::new(BusDevice::PioPciBus(PciConfigIo::new(Arc::clone(&segment.pci_bus))))); - - io_bus - .insert( - pci_config_io.clone(), - PCI_CONFIG_IO_PORT, - PCI_CONFIG_IO_PORT_SIZE, - ) - .map_err(|e| anyhow!("error adding pci bus to pio bus {e}"))?; - - segment.pci_config_io = Some(pci_config_io); - - Ok(segment) - } - - #[cfg(target_arch 
= "aarch64")] - pub(crate) fn new_default_segment( - address_manager: &Arc, - mem32_allocator: Arc>, - mem64_allocator: Arc>, - pci_irq_slots: &[u8; 32], - ) -> DeviceManagerResult { - Self::new( - 0, - 0, - address_manager, - mem32_allocator, - mem64_allocator, - pci_irq_slots, - ) - } + // #[cfg(target_arch = "x86_64")] + // pub(crate) fn new_default_segment( + // mem32_allocator: Arc>, + // mem64_allocator: Arc>, + // mmio_bus: &mut Bus, + // io_bus: &mut Bus, + // pci_irq_slots: &[u8; 32], + // ) -> Result { + // let mut segment = Self::new( + // 0, + // 0, + // mem32_allocator, + // mem64_allocator, + // mmio_bus, + // pci_irq_slots, + // )?; + // let pci_config_io = Arc::new(Mutex::new(BusDevice::PioPciBus(PciConfigIo::new(Arc::clone(&segment.pci_bus))))); + + // io_bus + // .insert( + // pci_config_io.clone(), + // PCI_CONFIG_IO_PORT, + // PCI_CONFIG_IO_PORT_SIZE, + // ) + // .map_err(|e| anyhow!("error adding pci bus to pio bus {e}"))?; + + // segment.pci_config_io = Some(pci_config_io); + + // Ok(segment) + // } + + // #[cfg(target_arch = "aarch64")] + // pub(crate) fn new_default_segment( + // address_manager: &Arc, + // mem32_allocator: Arc>, + // mem64_allocator: Arc>, + // pci_irq_slots: &[u8; 32], + // ) -> DeviceManagerResult { + // Self::new( + // 0, + // 0, + // address_manager, + // mem32_allocator, + // mem64_allocator, + // pci_irq_slots, + // ) + // } pub(crate) fn next_device_bdf(&self) -> Result { Ok(PciBdf::new( diff --git a/src/vmm/src/lib.rs b/src/vmm/src/lib.rs index 3bb0f836a97..4c819256f8f 100644 --- a/src/vmm/src/lib.rs +++ b/src/vmm/src/lib.rs @@ -124,9 +124,16 @@ use device_manager::acpi::ACPIDeviceManager; use device_manager::resources::ResourceAllocator; use devices::acpi::vmgenid::VmGenIdError; use devices::pci_segment::PciSegment; +use devices::virtio::transport::VirtioPciDevice; +use devices::Bus; use event_manager::{EventManager as BaseEventManager, EventOps, Events, MutEventSubscriber}; +use kvm_ioctls::{IoEventAddress, 
NoDatamatch, VmFd}; +use pci::{DeviceRelocation, PciBarRegionType, PciDevice}; use seccompiler::BpfProgram; use userfaultfd::Uffd; +use vm_device::interrupt::{InterruptManager, MsiIrqGroupConfig}; +use vm_memory::{GuestAddress, GuestUsize}; +use vm_system_allocator::{AddressAllocator, SystemAllocator}; use vmm_sys_util::epoll::EventSet; use vmm_sys_util::eventfd::EventFd; use vmm_sys_util::terminal::Terminal; @@ -265,6 +272,8 @@ pub enum VmmError { VmmObserverTeardown(vmm_sys_util::errno::Error), /// VMGenID error: {0} VMGenID(#[from] VmGenIdError), + /// Unknown + Unknown, } /// Shorthand type for KVM dirty page bitmap. @@ -352,7 +361,7 @@ impl Vmm { &self, device_type: DeviceType, device_id: &str, - ) -> Option<&Mutex> { + ) -> Option>> { self.mmio_device_manager.get_device(device_type, device_id) } @@ -1005,3 +1014,138 @@ impl MutEventSubscriber for Vmm { } } } + +struct AddressManager { + pub(crate) allocator: Arc>, + #[cfg(target_arch = "x86_64")] + pub(crate) io_bus: Arc, + pub(crate) mmio_bus: Arc, + pub(crate) vm: Arc>, + pci_mmio32_allocators: Vec>>, + pci_mmio64_allocators: Vec>>, +} + +// TODO implement this in a more granular way +impl DeviceRelocation for AddressManager { + fn move_bar( + &self, + old_base: u64, + new_base: u64, + len: u64, + pci_dev: &mut dyn PciDevice, + region_type: PciBarRegionType, + ) -> std::result::Result<(), std::io::Error> { + match region_type { + PciBarRegionType::IoRegion => { + #[cfg(target_arch = "x86_64")] + { + // Update system allocator + self.allocator + .lock() + .unwrap() + .free_io_addresses(GuestAddress(old_base), len as GuestUsize); + + self.allocator + .lock() + .unwrap() + .allocate_io_addresses( + Some(GuestAddress(new_base)), + len as GuestUsize, + None, + ) + .ok_or_else(|| { + io::Error::new(io::ErrorKind::Other, "failed allocating new IO range") + })?; + + // Update PIO bus + self.io_bus + .update_range(old_base, len, new_base, len) + .map_err(|e| io::Error::new(io::ErrorKind::Other, e))?; + } + 
#[cfg(target_arch = "aarch64")] + error!("I/O region is not supported"); + } + PciBarRegionType::Memory32BitRegion | PciBarRegionType::Memory64BitRegion => { + let allocators = if region_type == PciBarRegionType::Memory32BitRegion { + &self.pci_mmio32_allocators + } else { + &self.pci_mmio64_allocators + }; + + // Find the specific allocator that this BAR was allocated from and use it for new one + for allocator in allocators { + let allocator_base = allocator.lock().unwrap().base(); + let allocator_end = allocator.lock().unwrap().end(); + + if old_base >= allocator_base.0 && old_base <= allocator_end.0 { + allocator + .lock() + .unwrap() + .free(GuestAddress(old_base), len as GuestUsize); + break; + } + } + + for allocator in allocators { + let allocator_base = allocator.lock().unwrap().base(); + let allocator_end = allocator.lock().unwrap().end(); + + if new_base >= allocator_base.0 && new_base <= allocator_end.0 { + allocator + .lock() + .unwrap() + .allocate(Some(GuestAddress(new_base)), len as GuestUsize, Some(len)) + .ok_or_else(|| { + io::Error::new( + io::ErrorKind::Other, + "failed allocating new MMIO range", + ) + })?; + + break; + } + } + + // Update MMIO bus + self.mmio_bus + .update_range(old_base, len, new_base, len) + .map_err(|e| io::Error::new(io::ErrorKind::Other, e))?; + } + } + + let any_dev = pci_dev.as_any(); + if let Some(virtio_pci_dev) = any_dev.downcast_ref::() { + let bar_addr = virtio_pci_dev.config_bar_addr(); + if bar_addr == new_base { + const NOTIFICATION_BAR_OFFSET: u64 = 0x6000; + const NOTIFY_OFF_MULTIPLIER: u32 = 4; // A dword per notification address. 
+ + let notify_base = old_base + NOTIFICATION_BAR_OFFSET; + for (i, queue_evt) in virtio_pci_dev.virtio_device().lock().unwrap().queue_events().iter().enumerate() { + let addr = notify_base + i as u64 * u64::from(NOTIFY_OFF_MULTIPLIER); + let io_addr = IoEventAddress::Mmio(addr); + self.vm.lock().unwrap().unregister_ioevent(queue_evt, &io_addr, NoDatamatch).map_err(|e| { + io::Error::new( + io::ErrorKind::Other, + format!("failed to unregister ioevent: {e:?}"), + ) + })?; + } + for (i, queue_evt) in virtio_pci_dev.virtio_device().lock().unwrap().queue_events().iter().enumerate() { + let addr = notify_base + i as u64 * u64::from(NOTIFY_OFF_MULTIPLIER); + let io_addr = IoEventAddress::Mmio(addr); + self.vm.lock().unwrap() + .register_ioevent(queue_evt, &io_addr, NoDatamatch) + .map_err(|e| { + io::Error::new( + io::ErrorKind::Other, + format!("failed to register ioevent: {e:?}"), + ) + })?; + } + } + } + + pci_dev.move_bar(old_base, new_base) + } +} \ No newline at end of file From 78e8c7a58bc904191254513792cff6cb8ad93d03 Mon Sep 17 00:00:00 2001 From: Riccardo Mancini Date: Fri, 25 Oct 2024 14:28:53 +0100 Subject: [PATCH 17/22] virtio-pci: introduce VirtioInterrupt trait There's a bunch of work to make this work correctly. This should probably be done in a different way, but this is just a quick poc. 
Signed-off-by: Riccardo Mancini --- src/vmm/src/device_manager/mmio.rs | 21 ++++--- src/vmm/src/devices/virtio/balloon/device.rs | 53 ++++++++-------- .../devices/virtio/balloon/event_handler.rs | 2 +- .../src/devices/virtio/balloon/test_utils.rs | 2 +- src/vmm/src/devices/virtio/block/device.rs | 16 ++--- .../devices/virtio/block/vhost_user/device.rs | 28 +++++---- .../src/devices/virtio/block/virtio/device.rs | 63 ++++++++++--------- .../virtio/block/virtio/event_handler.rs | 2 +- .../devices/virtio/block/virtio/test_utils.rs | 8 +-- src/vmm/src/devices/virtio/device.rs | 52 ++++++++++++--- src/vmm/src/devices/virtio/net/device.rs | 61 +++++++++--------- src/vmm/src/devices/virtio/net/test_utils.rs | 6 +- src/vmm/src/devices/virtio/rng/device.rs | 29 +++++---- src/vmm/src/devices/virtio/test_utils.rs | 2 +- src/vmm/src/devices/virtio/vhost_user.rs | 13 ++-- src/vmm/src/devices/virtio/vsock/device.rs | 25 ++++---- .../src/devices/virtio/vsock/event_handler.rs | 35 +++++++---- .../src/devices/virtio/vsock/test_utils.rs | 7 ++- 18 files changed, 252 insertions(+), 173 deletions(-) diff --git a/src/vmm/src/device_manager/mmio.rs b/src/vmm/src/device_manager/mmio.rs index 0c94f789610..287c3e9d814 100644 --- a/src/vmm/src/device_manager/mmio.rs +++ b/src/vmm/src/device_manager/mmio.rs @@ -29,7 +29,7 @@ use crate::devices::legacy::RTCDevice; use crate::devices::pseudo::BootTimer; use crate::devices::virtio::balloon::Balloon; use crate::devices::virtio::block::device::Block; -use crate::devices::virtio::device::VirtioDevice; +use crate::devices::virtio::device::{VirtioDevice, VirtioInterruptType}; use crate::devices::virtio::transport::MmioTransport; use crate::devices::virtio::net::Net; use crate::devices::virtio::rng::Entropy; @@ -209,7 +209,9 @@ impl MMIODeviceManager { .map_err(MmioError::RegisterIoEvent)?; } vm.register_irqfd( - &locked_device.interrupt_trigger().irq_evt, + &locked_device.interrupt() + .notifier(VirtioInterruptType::Queue(0)) + .expect("mmio device 
should have evenfd"), device_info.irqs[0], ) .map_err(MmioError::RegisterIrqFd)?; @@ -509,7 +511,8 @@ impl MMIODeviceManager { .unwrap(); if vsock.is_activated() { info!("kick vsock {id}."); - vsock.signal_used_queue().unwrap(); + // TODO should we kick rx as well? + vsock.signal_used_queue(1).unwrap(); } } TYPE_RNG => { @@ -547,7 +550,7 @@ mod tests { use vmm_sys_util::eventfd::EventFd; use super::*; - use crate::devices::virtio::device::{IrqTrigger, VirtioDevice}; + use crate::devices::virtio::device::{IrqTrigger, VirtioDevice, VirtioInterrupt}; use crate::devices::virtio::queue::Queue; use crate::devices::virtio::ActivateError; use crate::test_utils::multi_region_mem; @@ -594,7 +597,7 @@ mod tests { dummy: u32, queues: Vec, queue_evts: [EventFd; 1], - interrupt_trigger: IrqTrigger, + interrupt: Arc, } impl DummyDevice { @@ -603,7 +606,7 @@ mod tests { dummy: 0, queues: QUEUE_SIZES.iter().map(|&s| Queue::new(s)).collect(), queue_evts: [EventFd::new(libc::EFD_NONBLOCK).expect("cannot create eventFD")], - interrupt_trigger: IrqTrigger::new().expect("cannot create eventFD"), + interrupt: Arc::new(IrqTrigger::new().expect("cannot create eventFD")), } } } @@ -635,8 +638,8 @@ mod tests { &self.queue_evts } - fn interrupt_trigger(&self) -> &IrqTrigger { - &self.interrupt_trigger + fn interrupt(&self) -> Arc { + self.interrupt.clone() } fn ack_features_by_page(&mut self, page: u32, value: u32) { @@ -654,7 +657,7 @@ mod tests { let _ = data; } - fn activate(&mut self, _: GuestMemoryMmap) -> Result<(), ActivateError> { + fn activate(&mut self, _: GuestMemoryMmap, _: Option>) -> Result<(), ActivateError> { Ok(()) } diff --git a/src/vmm/src/devices/virtio/balloon/device.rs b/src/vmm/src/devices/virtio/balloon/device.rs index 697928ae9c6..f3e706719a1 100644 --- a/src/vmm/src/devices/virtio/balloon/device.rs +++ b/src/vmm/src/devices/virtio/balloon/device.rs @@ -2,6 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 use std::fmt; +use std::sync::Arc; use std::time::Duration; use 
log::error; @@ -24,7 +25,7 @@ use super::{ VIRTIO_BALLOON_S_SWAP_OUT, }; use crate::devices::virtio::balloon::BalloonError; -use crate::devices::virtio::device::{IrqTrigger, IrqType}; +use crate::devices::virtio::device::{IrqTrigger, IrqType, VirtioInterrupt, VirtioInterruptType}; use crate::devices::virtio::gen::virtio_blk::VIRTIO_F_VERSION_1; use crate::logger::IncMetric; use crate::utils::u64_to_usize; @@ -161,7 +162,7 @@ pub struct Balloon { pub(crate) queues: Vec, pub(crate) queue_evts: [EventFd; BALLOON_NUM_QUEUES], pub(crate) device_state: DeviceState, - pub(crate) irq_trigger: IrqTrigger, + pub(crate) virtio_interrupt: Option>, // Implementation specific fields. pub(crate) restored: bool, @@ -188,7 +189,6 @@ impl fmt::Debug for Balloon { .field("queues", &self.queues) .field("queue_evts", &self.queue_evts) .field("device_state", &self.device_state) - .field("irq_trigger", &self.irq_trigger) .field("restored", &self.restored) .field("stats_polling_interval_s", &self.stats_polling_interval_s) .field("stats_desc_index", &self.stats_desc_index) @@ -242,7 +242,7 @@ impl Balloon { }, queue_evts, queues, - irq_trigger: IrqTrigger::new().map_err(BalloonError::EventFd)?, + virtio_interrupt: Some(Arc::new(IrqTrigger::new().map_err(BalloonError::EventFd)?)), device_state: DeviceState::Inactive, activate_evt: EventFd::new(libc::EFD_NONBLOCK).map_err(BalloonError::EventFd)?, restored, @@ -363,7 +363,7 @@ impl Balloon { } if needs_interrupt { - self.signal_used_queue()?; + self.signal_used_queue(INFLATE_INDEX)?; } Ok(()) @@ -381,7 +381,7 @@ impl Balloon { } if needs_interrupt { - self.signal_used_queue() + self.signal_used_queue(DEFLATE_INDEX) } else { Ok(()) } @@ -425,11 +425,13 @@ impl Balloon { Ok(()) } - pub(crate) fn signal_used_queue(&self) -> Result<(), BalloonError> { - self.irq_trigger.trigger_irq(IrqType::Vring).map_err(|err| { - METRICS.event_fails.inc(); - BalloonError::InterruptError(err) - }) + pub(crate) fn signal_used_queue(&self, queue_index: usize) -> 
Result<(), BalloonError> { + self.virtio_interrupt.as_ref().expect("queue should be initialized") + .trigger(VirtioInterruptType::Queue(queue_index as u16)).map_err(|err| { + METRICS.event_fails.inc(); + BalloonError::InterruptError(err) + } + ) } /// Process device virtio queue(s). @@ -450,7 +452,7 @@ impl Balloon { self.queues[STATS_INDEX] .add_used(index, 0) .map_err(BalloonError::Queue)?; - self.signal_used_queue() + self.signal_used_queue(STATS_INDEX) } else { error!("Failed to update balloon stats, missing descriptor."); Ok(()) @@ -461,8 +463,8 @@ impl Balloon { pub fn update_size(&mut self, amount_mib: u32) -> Result<(), BalloonError> { if self.is_activated() { self.config_space.num_pages = mib_to_pages(amount_mib)?; - self.irq_trigger - .trigger_irq(IrqType::Config) + self.virtio_interrupt.as_ref().expect("queue should be initialized") + .trigger(VirtioInterruptType::Config) .map_err(BalloonError::InterruptError) } else { Err(BalloonError::DeviceNotActive) @@ -573,8 +575,8 @@ impl VirtioDevice for Balloon { &self.queue_evts } - fn interrupt_trigger(&self) -> &IrqTrigger { - &self.irq_trigger + fn interrupt(&self) -> Arc { + self.virtio_interrupt.as_ref().expect("queue should be initialized").clone() } fn read_config(&self, offset: u64, data: &mut [u8]) { @@ -601,7 +603,8 @@ impl VirtioDevice for Balloon { dst.copy_from_slice(data); } - fn activate(&mut self, mem: GuestMemoryMmap) -> Result<(), ActivateError> { + fn activate(&mut self, mem: GuestMemoryMmap, virtio_interrupt: Option>) -> Result<(), ActivateError> { + self.virtio_interrupt = virtio_interrupt.or(self.virtio_interrupt.take()); for q in self.queues.iter_mut() { q.initialize(&mem) .map_err(ActivateError::QueueMemoryError)?; @@ -816,7 +819,7 @@ pub(crate) mod tests { // Only initialize the inflate queue to demonstrate invalid request handling. 
let infq = VirtQueue::new(GuestAddress(0), &mem, 16); balloon.set_queue(INFLATE_INDEX, infq.create_queue()); - balloon.activate(mem.clone()).unwrap(); + balloon.activate(mem.clone(), None).unwrap(); // Fill the second page with non-zero bytes. for i in 0..0x1000 { @@ -874,7 +877,7 @@ pub(crate) mod tests { let mem = default_mem(); let infq = VirtQueue::new(GuestAddress(0), &mem, 16); balloon.set_queue(INFLATE_INDEX, infq.create_queue()); - balloon.activate(mem.clone()).unwrap(); + balloon.activate(mem.clone(), None).unwrap(); // Fill the third page with non-zero bytes. for i in 0..0x1000 { @@ -944,7 +947,7 @@ pub(crate) mod tests { let mem = default_mem(); let defq = VirtQueue::new(GuestAddress(0), &mem, 16); balloon.set_queue(DEFLATE_INDEX, defq.create_queue()); - balloon.activate(mem.clone()).unwrap(); + balloon.activate(mem.clone(), None).unwrap(); let page_addr = 0x10; @@ -992,7 +995,7 @@ pub(crate) mod tests { let mem = default_mem(); let statsq = VirtQueue::new(GuestAddress(0), &mem, 16); balloon.set_queue(STATS_INDEX, statsq.create_queue()); - balloon.activate(mem.clone()).unwrap(); + balloon.activate(mem.clone(), None).unwrap(); let page_addr = 0x100; @@ -1068,7 +1071,7 @@ pub(crate) mod tests { assert!(balloon.stats_desc_index.is_some()); balloon.process_stats_timer_event().unwrap(); assert!(balloon.stats_desc_index.is_none()); - assert!(balloon.irq_trigger.has_pending_irq(IrqType::Vring)); + // assert!(balloon.irq_trigger.has_pending_irq(IrqType::Vring)); }); } } @@ -1083,7 +1086,7 @@ pub(crate) mod tests { balloon.set_queue(INFLATE_INDEX, infq.create_queue()); balloon.set_queue(DEFLATE_INDEX, defq.create_queue()); - balloon.activate(mem).unwrap(); + balloon.activate(mem, None).unwrap(); balloon.process_virtio_queues() } @@ -1091,7 +1094,7 @@ pub(crate) mod tests { fn test_update_stats_interval() { let mut balloon = Balloon::new(0, true, 0, false).unwrap(); let mem = default_mem(); - balloon.activate(mem).unwrap(); + balloon.activate(mem, None).unwrap(); 
assert_eq!( format!("{:?}", balloon.update_stats_polling_interval(1)), "Err(StatisticsStateChange)" @@ -1100,7 +1103,7 @@ pub(crate) mod tests { let mut balloon = Balloon::new(0, true, 1, false).unwrap(); let mem = default_mem(); - balloon.activate(mem).unwrap(); + balloon.activate(mem, None).unwrap(); assert_eq!( format!("{:?}", balloon.update_stats_polling_interval(0)), "Err(StatisticsStateChange)" diff --git a/src/vmm/src/devices/virtio/balloon/event_handler.rs b/src/vmm/src/devices/virtio/balloon/event_handler.rs index 3019d6877de..fd75f466a3b 100644 --- a/src/vmm/src/devices/virtio/balloon/event_handler.rs +++ b/src/vmm/src/devices/virtio/balloon/event_handler.rs @@ -177,7 +177,7 @@ pub mod tests { } // Now activate the device. - balloon.lock().unwrap().activate(mem.clone()).unwrap(); + balloon.lock().unwrap().activate(mem.clone(), None).unwrap(); // Process the activate event. let ev_count = event_manager.run_with_timeout(50).unwrap(); assert_eq!(ev_count, 1); diff --git a/src/vmm/src/devices/virtio/balloon/test_utils.rs b/src/vmm/src/devices/virtio/balloon/test_utils.rs index 8968aa70915..a7cdbed23e6 100644 --- a/src/vmm/src/devices/virtio/balloon/test_utils.rs +++ b/src/vmm/src/devices/virtio/balloon/test_utils.rs @@ -23,7 +23,7 @@ pub fn invoke_handler_for_queue_event(b: &mut Balloon, queue_index: usize) { _ => unreachable!(), }; // Validate the queue operation finished successfully. - assert!(b.irq_trigger.has_pending_irq(IrqType::Vring)); + // assert!(b.irq_trigger.has_pending_irq(IrqType::Vring)); } pub fn set_request(queue: &VirtQueue, idx: u16, addr: u64, len: u32, flags: u16) { diff --git a/src/vmm/src/devices/virtio/block/device.rs b/src/vmm/src/devices/virtio/block/device.rs index 7399fe39a0b..5d11c6cbf41 100644 --- a/src/vmm/src/devices/virtio/block/device.rs +++ b/src/vmm/src/devices/virtio/block/device.rs @@ -1,6 +1,8 @@ // Copyright 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
// SPDX-License-Identifier: Apache-2.0 +use std::sync::Arc; + use event_manager::{EventOps, Events, MutEventSubscriber}; use vmm_sys_util::eventfd::EventFd; @@ -8,7 +10,7 @@ use super::persist::{BlockConstructorArgs, BlockState}; use super::vhost_user::device::{VhostUserBlock, VhostUserBlockConfig}; use super::virtio::device::{VirtioBlock, VirtioBlockConfig}; use super::BlockError; -use crate::devices::virtio::device::{IrqTrigger, VirtioDevice}; +use crate::devices::virtio::device::{IrqTrigger, VirtioDevice, VirtioInterrupt}; use crate::devices::virtio::queue::Queue; use crate::devices::virtio::{ActivateError, TYPE_BLOCK}; use crate::rate_limiter::BucketUpdate; @@ -173,10 +175,10 @@ impl VirtioDevice for Block { } } - fn interrupt_trigger(&self) -> &IrqTrigger { + fn interrupt(&self) -> Arc { match self { - Self::Virtio(b) => &b.irq_trigger, - Self::VhostUser(b) => &b.irq_trigger, + Self::Virtio(b) => b.interrupt(), + Self::VhostUser(b) => b.interrupt(), } } @@ -194,10 +196,10 @@ impl VirtioDevice for Block { } } - fn activate(&mut self, mem: GuestMemoryMmap) -> Result<(), ActivateError> { + fn activate(&mut self, mem: GuestMemoryMmap, virtio_interrupt: Option>) -> Result<(), ActivateError> { match self { - Self::Virtio(b) => b.activate(mem), - Self::VhostUser(b) => b.activate(mem), + Self::Virtio(b) => b.activate(mem, virtio_interrupt), + Self::VhostUser(b) => b.activate(mem, virtio_interrupt), } } diff --git a/src/vmm/src/devices/virtio/block/vhost_user/device.rs b/src/vmm/src/devices/virtio/block/vhost_user/device.rs index 62218157c8b..4065311d4ea 100644 --- a/src/vmm/src/devices/virtio/block/vhost_user/device.rs +++ b/src/vmm/src/devices/virtio/block/vhost_user/device.rs @@ -14,6 +14,8 @@ use vmm_sys_util::eventfd::EventFd; use super::{VhostUserBlockError, NUM_QUEUES, QUEUE_SIZE}; use crate::devices::virtio::block::CacheType; +use crate::devices::virtio::device::VirtioInterrupt; +use crate::devices::virtio::device::VirtioInterruptType; use 
crate::devices::virtio::device::{DeviceState, IrqTrigger, IrqType, VirtioDevice}; use crate::devices::virtio::gen::virtio_blk::{ VIRTIO_BLK_F_FLUSH, VIRTIO_BLK_F_RO, VIRTIO_F_VERSION_1, @@ -118,7 +120,7 @@ pub struct VhostUserBlockImpl { pub queues: Vec, pub queue_evts: [EventFd; u64_to_usize(NUM_QUEUES)], pub device_state: DeviceState, - pub irq_trigger: IrqTrigger, + pub virtio_interrupt: Option>, // Implementation specific fields. pub id: String, @@ -144,7 +146,6 @@ impl std::fmt::Debug for VhostUserBlockImpl { .field("queues", &self.queues) .field("queue_evts", &self.queue_evts) .field("device_state", &self.device_state) - .field("irq_trigger", &self.irq_trigger) .field("id", &self.id) .field("partuuid", &self.partuuid) .field("cache_type", &self.cache_type) @@ -204,7 +205,7 @@ impl VhostUserBlockImpl { let queue_evts = [EventFd::new(libc::EFD_NONBLOCK).map_err(VhostUserBlockError::EventFd)?; u64_to_usize(NUM_QUEUES)]; let device_state = DeviceState::Inactive; - let irq_trigger = IrqTrigger::new().map_err(VhostUserBlockError::IrqTrigger)?; + let virtio_interrupt: Option> = Some(Arc::new(IrqTrigger::new().map_err(VhostUserBlockError::IrqTrigger)?)); // We negotiated features with backend. Now these acked_features // are available for guest driver to choose from. 
@@ -226,7 +227,7 @@ impl VhostUserBlockImpl { queues, queue_evts, device_state, - irq_trigger, + virtio_interrupt, id: config.drive_id, partuuid: config.partuuid, @@ -271,8 +272,9 @@ impl VhostUserBlockImpl { ) .map_err(VhostUserBlockError::Vhost)?; self.config_space = new_config_space; - self.irq_trigger - .trigger_irq(IrqType::Config) + self.virtio_interrupt.as_ref() + .expect("interrupt must be set up") + .trigger(VirtioInterruptType::Config) .map_err(VhostUserBlockError::IrqTrigger)?; let delta_us = get_time_us(ClockType::Monotonic) - start_time; @@ -311,8 +313,8 @@ impl VirtioDevice for VhostUserBlock &self.queue_evts } - fn interrupt_trigger(&self) -> &IrqTrigger { - &self.irq_trigger + fn interrupt(&self) -> Arc { + self.virtio_interrupt.as_ref().expect("interrupt must be set up").clone() } fn read_config(&self, offset: u64, data: &mut [u8]) { @@ -331,7 +333,9 @@ impl VirtioDevice for VhostUserBlock // Other block config fields are immutable. } - fn activate(&mut self, mem: GuestMemoryMmap) -> Result<(), ActivateError> { + fn activate(&mut self, mem: GuestMemoryMmap, virtio_interrupt: Option>) -> Result<(), ActivateError> { + self.virtio_interrupt = virtio_interrupt.or(self.virtio_interrupt.take()); + for q in self.queues.iter_mut() { q.initialize(&mem) .map_err(ActivateError::QueueMemoryError)?; @@ -346,7 +350,7 @@ impl VirtioDevice for VhostUserBlock self.vu_handle.setup_backend( &mem, &[(0, &self.queues[0], &self.queue_evts[0])], - &self.irq_trigger, + self.interrupt(), ) }) .map_err(|err| { @@ -376,7 +380,7 @@ mod tests { use super::*; use crate::devices::virtio::block::virtio::device::FileEngineType; - use crate::devices::virtio::mmio::VIRTIO_MMIO_INT_CONFIG; + use crate::devices::virtio::transport::mmio::VIRTIO_MMIO_INT_CONFIG; use crate::test_utils::create_tmp_socket; use crate::vstate::memory::{FileOffset, GuestAddress, GuestMemoryExtension}; @@ -786,7 +790,7 @@ mod tests { let guest_memory = GuestMemoryMmap::from_raw_regions_file(regions, false, 
false).unwrap(); // During actiavion of the device features, memory and queues should be set and activated. - vhost_block.activate(guest_memory).unwrap(); + vhost_block.activate(guest_memory, None).unwrap(); assert!(unsafe { *vhost_block.vu_handle.vu.features_are_set.get() }); assert!(unsafe { *vhost_block.vu_handle.vu.memory_is_set.get() }); assert!(unsafe { *vhost_block.vu_handle.vu.vring_enabled.get() }); diff --git a/src/vmm/src/devices/virtio/block/virtio/device.rs b/src/vmm/src/devices/virtio/block/virtio/device.rs index fd352fe2539..1106280dbea 100644 --- a/src/vmm/src/devices/virtio/block/virtio/device.rs +++ b/src/vmm/src/devices/virtio/block/virtio/device.rs @@ -25,14 +25,14 @@ use super::{ }; use crate::devices::virtio::block::virtio::metrics::{BlockDeviceMetrics, BlockMetricsPerDevice}; use crate::devices::virtio::block::CacheType; -use crate::devices::virtio::device::{DeviceState, IrqTrigger, IrqType, VirtioDevice}; +use crate::devices::virtio::device::{DeviceState, IrqTrigger, IrqType, VirtioDevice, VirtioInterrupt, VirtioInterruptType}; use crate::devices::virtio::gen::virtio_blk::{ VIRTIO_BLK_F_FLUSH, VIRTIO_BLK_F_RO, VIRTIO_BLK_ID_BYTES, VIRTIO_F_VERSION_1, }; use crate::devices::virtio::gen::virtio_ring::VIRTIO_RING_F_EVENT_IDX; use crate::devices::virtio::queue::Queue; use crate::devices::virtio::{ActivateError, TYPE_BLOCK}; -use crate::logger::{error, warn, IncMetric}; +use crate::logger::{error, warn, debug, IncMetric}; use crate::rate_limiter::{BucketUpdate, RateLimiter}; use crate::utils::u64_to_usize; use crate::vmm_config::drive::BlockDeviceConfig; @@ -253,7 +253,7 @@ pub struct VirtioBlock { pub queues: Vec, pub queue_evts: [EventFd; 1], pub device_state: DeviceState, - pub irq_trigger: IrqTrigger, + pub virtio_interrupt: Option>, // Implementation specific fields. 
pub id: String, @@ -322,7 +322,7 @@ impl VirtioBlock { queues, queue_evts, device_state: DeviceState::Inactive, - irq_trigger: IrqTrigger::new().map_err(VirtioBlockError::IrqTrigger)?, + virtio_interrupt: Some(Arc::new(IrqTrigger::new().map_err(VirtioBlockError::IrqTrigger)?)), id: config.drive_id.clone(), partuuid: config.partuuid, @@ -385,10 +385,11 @@ impl VirtioBlock { } fn add_used_descriptor( + queue_index: usize, queue: &mut Queue, index: u16, len: u32, - irq_trigger: &IrqTrigger, + interrupt: Arc, block_metrics: &BlockDeviceMetrics, ) { queue.add_used(index, len).unwrap_or_else(|err| { @@ -396,7 +397,7 @@ impl VirtioBlock { }); if queue.prepare_kick() { - irq_trigger.trigger_irq(IrqType::Vring).unwrap_or_else(|_| { + interrupt.trigger(VirtioInterruptType::Queue(queue_index as u16)).unwrap_or_else(|_| { block_metrics.event_fails.inc(); }); } @@ -444,10 +445,11 @@ impl VirtioBlock { } ProcessingResult::Executed(finished) => { Self::add_used_descriptor( + queue_index, queue, head.index, finished.num_bytes_to_mem, - &self.irq_trigger, + self.virtio_interrupt.as_ref().expect("interrupt must be set up").clone(), &self.metrics, ); } @@ -470,7 +472,8 @@ impl VirtioBlock { // This is safe since we checked in the event handler that the device is activated. let mem = self.device_state.mem().unwrap(); - let queue = &mut self.queues[0]; + let queue_index = 0; + let queue = &mut self.queues[queue_index]; loop { match engine.pop(mem) { @@ -495,10 +498,11 @@ impl VirtioBlock { let finished = pending.finish(mem, res, &self.metrics); Self::add_used_descriptor( + queue_index, queue, finished.desc_idx, finished.num_bytes_to_mem, - &self.irq_trigger, + self.virtio_interrupt.as_ref().expect("interrupt must be set up").clone(), &self.metrics, ); } @@ -527,7 +531,7 @@ impl VirtioBlock { self.config_space = self.disk.virtio_block_config_space(); // Kick the driver to pick up the changes. 
- self.irq_trigger.trigger_irq(IrqType::Config).unwrap(); + self.virtio_interrupt.as_ref().expect("interrupt must be set up").trigger(VirtioInterruptType::Config).unwrap(); self.metrics.update_count.inc(); Ok(()) @@ -594,8 +598,8 @@ impl VirtioDevice for VirtioBlock { &self.queue_evts } - fn interrupt_trigger(&self) -> &IrqTrigger { - &self.irq_trigger + fn interrupt(&self) -> Arc { + self.virtio_interrupt.as_ref().expect("interrupt must be set up").clone() } fn read_config(&self, offset: u64, mut data: &mut [u8]) { @@ -629,7 +633,9 @@ impl VirtioDevice for VirtioBlock { dst.copy_from_slice(data); } - fn activate(&mut self, mem: GuestMemoryMmap) -> Result<(), ActivateError> { + fn activate(&mut self, mem: GuestMemoryMmap, virtio_interrupt: Option>) -> Result<(), ActivateError> { + self.virtio_interrupt = virtio_interrupt.or(self.virtio_interrupt.take()); + for q in self.queues.iter_mut() { q.initialize(&mem) .map_err(ActivateError::QueueMemoryError)?; @@ -647,6 +653,7 @@ impl VirtioDevice for VirtioBlock { return Err(ActivateError::EventFd); } self.device_state = DeviceState::Activated(mem); + debug!("VirtioBlock activated"); Ok(()) } @@ -866,7 +873,7 @@ mod tests { let mem = default_mem(); let vq = VirtQueue::new(GuestAddress(0), &mem, 16); set_queue(&mut block, 0, vq.create_queue()); - block.activate(mem.clone()).unwrap(); + block.activate(mem.clone(), None).unwrap(); read_blk_req_descriptors(&vq); let request_type_addr = GuestAddress(vq.dtable[0].addr.get()); @@ -894,7 +901,7 @@ mod tests { let mem = default_mem(); let vq = VirtQueue::new(GuestAddress(0), &mem, 16); set_queue(&mut block, 0, vq.create_queue()); - block.activate(mem.clone()).unwrap(); + block.activate(mem.clone(), None).unwrap(); read_blk_req_descriptors(&vq); let request_type_addr = GuestAddress(vq.dtable[0].addr.get()); @@ -957,7 +964,7 @@ mod tests { let mem = default_mem(); let vq = VirtQueue::new(GuestAddress(0), &mem, 16); set_queue(&mut block, 0, vq.create_queue()); - 
block.activate(mem.clone()).unwrap(); + block.activate(mem.clone(), None).unwrap(); read_blk_req_descriptors(&vq); let request_type_addr = GuestAddress(vq.dtable[0].addr.get()); @@ -1008,7 +1015,7 @@ mod tests { let mem = default_mem(); let vq = VirtQueue::new(GuestAddress(0), &mem, 16); set_queue(&mut block, 0, vq.create_queue()); - block.activate(mem.clone()).unwrap(); + block.activate(mem.clone(), None).unwrap(); read_blk_req_descriptors(&vq); let request_type_addr = GuestAddress(vq.dtable[0].addr.get()); @@ -1040,7 +1047,7 @@ mod tests { let mem = default_mem(); let vq = VirtQueue::new(GuestAddress(0), &mem, 16); set_queue(&mut block, 0, vq.create_queue()); - block.activate(mem.clone()).unwrap(); + block.activate(mem.clone(), None).unwrap(); read_blk_req_descriptors(&vq); vq.dtable[1].set(0xf000, 0x1000, VIRTQ_DESC_F_NEXT | VIRTQ_DESC_F_WRITE, 2); @@ -1076,7 +1083,7 @@ mod tests { let mem = default_mem(); let vq = VirtQueue::new(GuestAddress(0), &mem, 16); set_queue(&mut block, 0, vq.create_queue()); - block.activate(mem.clone()).unwrap(); + block.activate(mem.clone(), None).unwrap(); read_blk_req_descriptors(&vq); let request_type_addr = GuestAddress(vq.dtable[0].addr.get()); @@ -1123,7 +1130,7 @@ mod tests { let mem = default_mem(); let vq = VirtQueue::new(GuestAddress(0), &mem, 16); set_queue(&mut block, 0, vq.create_queue()); - block.activate(mem.clone()).unwrap(); + block.activate(mem.clone(), None).unwrap(); read_blk_req_descriptors(&vq); let request_type_addr = GuestAddress(vq.dtable[0].addr.get()); @@ -1362,7 +1369,7 @@ mod tests { let mem = default_mem(); let vq = VirtQueue::new(GuestAddress(0), &mem, 16); set_queue(&mut block, 0, vq.create_queue()); - block.activate(mem.clone()).unwrap(); + block.activate(mem.clone(), None).unwrap(); read_blk_req_descriptors(&vq); vq.dtable[1].set(0xff00, 0x1000, VIRTQ_DESC_F_NEXT | VIRTQ_DESC_F_WRITE, 2); @@ -1403,7 +1410,7 @@ mod tests { let mem = default_mem(); let vq = VirtQueue::new(GuestAddress(0), &mem, 16); 
set_queue(&mut block, 0, vq.create_queue()); - block.activate(mem.clone()).unwrap(); + block.activate(mem.clone(), None).unwrap(); read_blk_req_descriptors(&vq); let request_type_addr = GuestAddress(vq.dtable[0].addr.get()); @@ -1449,7 +1456,7 @@ mod tests { let mem = default_mem(); let vq = VirtQueue::new(GuestAddress(0), &mem, 16); set_queue(&mut block, 0, vq.create_queue()); - block.activate(mem.clone()).unwrap(); + block.activate(mem.clone(), None).unwrap(); read_blk_req_descriptors(&vq); let request_type_addr = GuestAddress(vq.dtable[0].addr.get()); @@ -1572,7 +1579,7 @@ mod tests { let mem = default_mem(); let vq = VirtQueue::new(GuestAddress(0), &mem, IO_URING_NUM_ENTRIES * 4); - block.activate(mem.clone()).unwrap(); + block.activate(mem.clone(), None).unwrap(); // Run scenario that doesn't trigger FullSq BlockError: Add sq_size flush requests. add_flush_requests_batch(&mut block, &vq, IO_URING_NUM_ENTRIES); @@ -1605,7 +1612,7 @@ mod tests { let mem = default_mem(); let vq = VirtQueue::new(GuestAddress(0), &mem, IO_URING_NUM_ENTRIES * 4); - block.activate(mem.clone()).unwrap(); + block.activate(mem.clone(), None).unwrap(); // Run scenario that triggers FullCqError. Push 2 * IO_URING_NUM_ENTRIES and wait for // completion. Then try to push another entry. @@ -1634,7 +1641,7 @@ mod tests { let mem = default_mem(); let vq = VirtQueue::new(GuestAddress(0), &mem, 16); - block.activate(mem.clone()).unwrap(); + block.activate(mem.clone(), None).unwrap(); // Add a batch of flush requests. 
add_flush_requests_batch(&mut block, &vq, 5); @@ -1653,7 +1660,7 @@ mod tests { let mem = default_mem(); let vq = VirtQueue::new(GuestAddress(0), &mem, 16); set_queue(&mut block, 0, vq.create_queue()); - block.activate(mem.clone()).unwrap(); + block.activate(mem.clone(), None).unwrap(); read_blk_req_descriptors(&vq); let request_type_addr = GuestAddress(vq.dtable[0].addr.get()); @@ -1722,7 +1729,7 @@ mod tests { let mem = default_mem(); let vq = VirtQueue::new(GuestAddress(0), &mem, 16); set_queue(&mut block, 0, vq.create_queue()); - block.activate(mem.clone()).unwrap(); + block.activate(mem.clone(), None).unwrap(); read_blk_req_descriptors(&vq); let request_type_addr = GuestAddress(vq.dtable[0].addr.get()); diff --git a/src/vmm/src/devices/virtio/block/virtio/event_handler.rs b/src/vmm/src/devices/virtio/block/virtio/event_handler.rs index 8400766e06b..52dd35838c0 100644 --- a/src/vmm/src/devices/virtio/block/virtio/event_handler.rs +++ b/src/vmm/src/devices/virtio/block/virtio/event_handler.rs @@ -162,7 +162,7 @@ mod tests { assert_eq!(ev_count, 0); // Now activate the device. - block.lock().unwrap().activate(mem.clone()).unwrap(); + block.lock().unwrap().activate(mem.clone(), None).unwrap(); // Process the activate event. 
let ev_count = event_manager.run_with_timeout(50).unwrap(); assert_eq!(ev_count, 1); diff --git a/src/vmm/src/devices/virtio/block/virtio/test_utils.rs b/src/vmm/src/devices/virtio/block/virtio/test_utils.rs index 106da8177cd..8d902fcbd2a 100644 --- a/src/vmm/src/devices/virtio/block/virtio/test_utils.rs +++ b/src/vmm/src/devices/virtio/block/virtio/test_utils.rs @@ -81,9 +81,9 @@ pub fn simulate_queue_event(b: &mut VirtioBlock, maybe_expected_irq: Option std::result::Result<(), std::io::Error>; + fn notifier(&self, _int_type: VirtioInterruptType) -> Option { + None + } + // TODO hack to make it backwards compatible with IrqInterrupt + fn status(&self) -> Arc { + Arc::new(AtomicU32::new(0)) + } +} + +impl Debug for dyn VirtioInterrupt { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "VirtioInterrupt") + } +} /// Enum that indicates if a VirtioDevice is inactive or has been activated /// and memory attached to it. @@ -84,6 +105,23 @@ impl IrqTrigger { } } +impl VirtioInterrupt for IrqTrigger { + fn trigger(&self, int_type: VirtioInterruptType) -> std::result::Result<(), std::io::Error> { + match int_type { + VirtioInterruptType::Config => self.trigger_irq(IrqType::Config), + VirtioInterruptType::Queue(_) => self.trigger_irq(IrqType::Vring), + } + } + + fn notifier(&self, _int_type: VirtioInterruptType) -> Option { + Some(self.irq_evt.try_clone().ok()?) + } + + fn status(&self) -> Arc { + self.irq_status.clone() + } +} + /// Trait for virtio devices to be driven by a virtio transport. /// /// The lifecycle of a virtio device is to be moved to a virtio transport, which will then query the @@ -121,10 +159,10 @@ pub trait VirtioDevice: AsAny + Send { /// Returns the current device interrupt status. 
fn interrupt_status(&self) -> Arc { - Arc::clone(&self.interrupt_trigger().irq_status) + self.interrupt().status().clone() } - fn interrupt_trigger(&self) -> &IrqTrigger; + fn interrupt(&self) -> Arc; /// The set of feature bits shifted by `page * 32`. fn avail_features_by_page(&self, page: u32) -> u32 { @@ -170,14 +208,14 @@ pub trait VirtioDevice: AsAny + Send { fn write_config(&mut self, offset: u64, data: &[u8]); /// Performs the formal activation for a device, which can be verified also with `is_activated`. - fn activate(&mut self, mem: GuestMemoryMmap) -> Result<(), ActivateError>; + fn activate(&mut self, mem: GuestMemoryMmap, virtio_interrupt: Option>) -> Result<(), ActivateError>; /// Checks if the resources of this device are activated. fn is_activated(&self) -> bool; /// Optionally deactivates this device and returns ownership of the guest memory map, interrupt /// event, and queue events. - fn reset(&mut self) -> Option<(EventFd, Vec)> { + fn reset(&mut self) -> Option<(Arc, Vec)> { None } @@ -275,7 +313,7 @@ pub(crate) mod tests { todo!() } - fn interrupt_trigger(&self) -> &IrqTrigger { + fn interrupt(&self) -> Arc { todo!() } @@ -287,7 +325,7 @@ pub(crate) mod tests { todo!() } - fn activate(&mut self, _mem: GuestMemoryMmap) -> Result<(), ActivateError> { + fn activate(&mut self, _mem: GuestMemoryMmap, _virtio_interrupt: Option>) -> Result<(), ActivateError> { todo!() } diff --git a/src/vmm/src/devices/virtio/net/device.rs b/src/vmm/src/devices/virtio/net/device.rs index deb6976b2af..7d72b628191 100755 --- a/src/vmm/src/devices/virtio/net/device.rs +++ b/src/vmm/src/devices/virtio/net/device.rs @@ -14,7 +14,7 @@ use libc::{iovec, EAGAIN}; use log::error; use vmm_sys_util::eventfd::EventFd; -use crate::devices::virtio::device::{DeviceState, IrqTrigger, IrqType, VirtioDevice}; +use crate::devices::virtio::device::{DeviceState, IrqTrigger, IrqType, VirtioDevice, VirtioInterrupt, VirtioInterruptType}; use 
crate::devices::virtio::gen::virtio_blk::VIRTIO_F_VERSION_1; use crate::devices::virtio::gen::virtio_net::{ virtio_net_hdr_v1, VIRTIO_NET_F_CSUM, VIRTIO_NET_F_GUEST_CSUM, VIRTIO_NET_F_GUEST_TSO4, @@ -247,7 +247,7 @@ pub struct Net { tx_frame_headers: [u8; frame_hdr_len()], - pub(crate) irq_trigger: IrqTrigger, + pub(crate) virtio_interrupt: Option>, pub(crate) config_space: ConfigSpace, pub(crate) guest_mac: Option, @@ -311,7 +311,7 @@ impl Net { tx_rate_limiter, rx_frame_buf: [0u8; MAX_BUFFER_SIZE], tx_frame_headers: [0u8; frame_hdr_len()], - irq_trigger: IrqTrigger::new().map_err(NetError::EventFd)?, + virtio_interrupt: Some(Arc::new(IrqTrigger::new().map_err(NetError::EventFd)?)), config_space, guest_mac, device_state: DeviceState::Inactive, @@ -390,14 +390,15 @@ impl Net { /// https://docs.oasis-open.org/virtio/virtio/v1.1/csprd01/virtio-v1.1-csprd01.html#x1-320005 /// 2.6.7.1 Driver Requirements: Used Buffer Notification Suppression fn try_signal_queue(&mut self, queue_type: NetQueue) -> Result<(), DeviceError> { - let queue = match queue_type { - NetQueue::Rx => &mut self.queues[RX_INDEX], - NetQueue::Tx => &mut self.queues[TX_INDEX], + let queue_index = match queue_type { + NetQueue::Rx => RX_INDEX, + NetQueue::Tx => TX_INDEX, }; + let queue = &mut self.queues[queue_index]; if queue.prepare_kick() { - self.irq_trigger - .trigger_irq(IrqType::Vring) + self.virtio_interrupt.as_ref().expect("interrupt must be setup") + .trigger(VirtioInterruptType::Queue(queue_index as u16)) .map_err(|err| { self.metrics.event_fails.inc(); DeviceError::FailedSignalingIrq(err) @@ -961,8 +962,8 @@ impl VirtioDevice for Net { &self.queue_evts } - fn interrupt_trigger(&self) -> &IrqTrigger { - &self.irq_trigger + fn interrupt(&self) -> Arc { + self.virtio_interrupt.as_ref().expect("interrupt must be set up").clone() } fn read_config(&self, offset: u64, data: &mut [u8]) { if let Some(config_space_bytes) = self.config_space.as_slice().get(u64_to_usize(offset)..) 
{ @@ -992,7 +993,9 @@ impl VirtioDevice for Net { self.metrics.mac_address_updates.inc(); } - fn activate(&mut self, mem: GuestMemoryMmap) -> Result<(), ActivateError> { + fn activate(&mut self, mem: GuestMemoryMmap, virtio_interrupt: Option>) -> Result<(), ActivateError> { + self.virtio_interrupt = virtio_interrupt.or(self.virtio_interrupt.take()); + for q in self.queues.iter_mut() { q.initialize(&mem) .map_err(ActivateError::QueueMemoryError)?; @@ -1391,7 +1394,7 @@ pub mod tests { // Check that the used queue has advanced. assert_eq!(th.rxq.used.idx.get(), 4); - assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); + // assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); // Check that the invalid descriptor chains have been discarded th.rxq.check_used_elem(0, 0, 0); th.rxq.check_used_elem(1, 3, 0); @@ -1448,7 +1451,7 @@ pub mod tests { assert!(th.net().rx_buffer.used_descriptors == 0); // Check that the used queue has advanced. assert_eq!(th.rxq.used.idx.get(), 1); - assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); + // assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); // Check that the frame has been written successfully to the Rx descriptor chain. header_set_num_buffers(frame.as_mut_slice(), 1); th.rxq @@ -1511,7 +1514,7 @@ pub mod tests { assert!(th.net().rx_buffer.used_bytes == 0); // Check that the used queue has advanced. assert_eq!(th.rxq.used.idx.get(), 2); - assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); + // assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); // Check that the 1st frame was written successfully to the 1st Rx descriptor chain. header_set_num_buffers(frame_1.as_mut_slice(), 1); th.rxq @@ -1569,7 +1572,7 @@ pub mod tests { assert!(th.net().rx_buffer.used_bytes == 0); // Check that the used queue has advanced. 
assert_eq!(th.rxq.used.idx.get(), 2); - assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); + // assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); // 2 chains should be used for the packet. header_set_num_buffers(frame.as_mut_slice(), 2); @@ -1634,7 +1637,7 @@ pub mod tests { // Check that the used queue advanced. assert_eq!(th.txq.used.idx.get(), 1); - assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); + // assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); th.txq.check_used_elem(0, 0, 0); // Check that the frame was skipped. assert!(!tap_traffic_simulator.pop_rx_packet(&mut [])); @@ -1657,7 +1660,7 @@ pub mod tests { // Check that the used queue advanced. assert_eq!(th.txq.used.idx.get(), 1); - assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); + // assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); th.txq.check_used_elem(0, 0, 0); // Check that the frame was skipped. assert!(!tap_traffic_simulator.pop_rx_packet(&mut [])); @@ -1684,7 +1687,7 @@ pub mod tests { // Check that the used queue advanced. assert_eq!(th.txq.used.idx.get(), 1); - assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); + // assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); th.txq.check_used_elem(0, 0, 0); // Check that the frame was skipped. assert!(!tap_traffic_simulator.pop_rx_packet(&mut [])); @@ -1707,7 +1710,7 @@ pub mod tests { // Check that the used queue advanced. assert_eq!(th.txq.used.idx.get(), 1); - assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); + // assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); th.txq.check_used_elem(0, 0, 0); // Check that the frame was skipped. assert!(!tap_traffic_simulator.pop_rx_packet(&mut [])); @@ -1746,7 +1749,7 @@ pub mod tests { // Check that the used queue advanced. 
assert_eq!(th.txq.used.idx.get(), 4); - assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); + // assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); th.txq.check_used_elem(3, 4, 0); // Check that the valid frame was sent to the tap. let mut buf = vec![0; 1000]; @@ -1777,7 +1780,7 @@ pub mod tests { // Check that the used queue advanced. assert_eq!(th.txq.used.idx.get(), 1); - assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); + // assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); th.txq.check_used_elem(0, 3, 0); // Check that the frame was sent to the tap. let mut buf = vec![0; 1000]; @@ -1806,7 +1809,7 @@ pub mod tests { // Check that the used queue advanced. assert_eq!(th.txq.used.idx.get(), 1); - assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); + // assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); th.txq.check_used_elem(0, 0, 0); } @@ -1834,7 +1837,7 @@ pub mod tests { // Check that the used queue advanced. assert_eq!(th.txq.used.idx.get(), 2); - assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); + // assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); th.txq.check_used_elem(0, 0, 0); th.txq.check_used_elem(1, 3, 0); // Check that the first frame was sent to the tap. 
@@ -2181,7 +2184,7 @@ pub mod tests { assert_eq!(th.net().metrics.rx_rate_limiter_throttled.count(), 1); assert!(th.net().rx_buffer.used_descriptors != 0); // assert that no operation actually completed (limiter blocked it) - assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); + // assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); // make sure the data is still queued for processing assert_eq!(th.rxq.used.idx.get(), 0); } @@ -2209,7 +2212,7 @@ pub mod tests { // validate the rate_limiter is no longer blocked assert!(!th.net().rx_rate_limiter.is_blocked()); // make sure the virtio queue operation completed this time - assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); + // assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); // make sure the data queue advanced assert_eq!(th.rxq.used.idx.get(), 1); th.rxq @@ -2306,14 +2309,14 @@ pub mod tests { assert!(th.net().metrics.rx_rate_limiter_throttled.count() >= 1); assert!(th.net().rx_buffer.used_descriptors != 0); // assert that no operation actually completed (limiter blocked it) - assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); + // assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); // make sure the data is still queued for processing assert_eq!(th.rxq.used.idx.get(), 0); // trigger the RX handler again, this time it should do the limiter fast path exit th.simulate_event(NetEvent::Tap); // assert that no operation actually completed, that the limiter blocked it - assert!(!&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); + // assert!(!&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); // make sure the data is still queued for processing assert_eq!(th.rxq.used.idx.get(), 0); } @@ -2326,7 +2329,7 @@ pub mod tests { { th.simulate_event(NetEvent::RxRateLimiter); // make sure the virtio queue operation completed this time - assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); + // 
assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); // make sure the data queue advanced assert_eq!(th.rxq.used.idx.get(), 1); th.rxq @@ -2396,7 +2399,7 @@ pub mod tests { assert_eq!(net.queue_events().len(), NET_QUEUE_SIZES.len()); // Test interrupts. - assert!(!&net.irq_trigger.has_pending_irq(IrqType::Vring)); + // assert!(!&net.irq_trigger.has_pending_irq(IrqType::Vring)); } #[test] diff --git a/src/vmm/src/devices/virtio/net/test_utils.rs b/src/vmm/src/devices/virtio/net/test_utils.rs index ffe7bbc7279..14108503914 100644 --- a/src/vmm/src/devices/virtio/net/test_utils.rs +++ b/src/vmm/src/devices/virtio/net/test_utils.rs @@ -372,7 +372,7 @@ pub mod test { } pub fn activate_net(&mut self) { - self.net.lock().unwrap().activate(self.mem.clone()).unwrap(); + self.net.lock().unwrap().activate(self.mem.clone(), None).unwrap(); // Process the activate event. let ev_count = self.event_manager.run_with_timeout(100).unwrap(); assert_eq!(ev_count, 1); @@ -449,7 +449,7 @@ pub mod test { old_used_descriptors + 1 ); - assert!(&self.net().irq_trigger.has_pending_irq(IrqType::Vring)); + // assert!(&self.net().irq_trigger.has_pending_irq(IrqType::Vring)); frame } @@ -475,7 +475,7 @@ pub mod test { ); // Check that the expected frame was sent to the Rx queue eventually. 
assert_eq!(self.rxq.used.idx.get(), used_idx + 1); - assert!(&self.net().irq_trigger.has_pending_irq(IrqType::Vring)); + // assert!(&self.net().irq_trigger.has_pending_irq(IrqType::Vring)); self.rxq .check_used_elem(used_idx, 0, expected_frame.len().try_into().unwrap()); self.rxq.dtable[0].check_data(expected_frame); diff --git a/src/vmm/src/devices/virtio/rng/device.rs b/src/vmm/src/devices/virtio/rng/device.rs index 664fad5724d..bf49668fbaf 100644 --- a/src/vmm/src/devices/virtio/rng/device.rs +++ b/src/vmm/src/devices/virtio/rng/device.rs @@ -6,12 +6,13 @@ use std::sync::atomic::AtomicU32; use std::sync::Arc; use aws_lc_rs::rand; +use libc::IWEVEXPIRED; use vm_memory::GuestMemoryError; use vmm_sys_util::eventfd::EventFd; use super::metrics::METRICS; use super::{RNG_NUM_QUEUES, RNG_QUEUE}; -use crate::devices::virtio::device::{DeviceState, IrqTrigger, IrqType, VirtioDevice}; +use crate::devices::virtio::device::{DeviceState, IrqTrigger, IrqType, VirtioDevice, VirtioInterrupt, VirtioInterruptType}; use crate::devices::virtio::gen::virtio_rng::VIRTIO_F_VERSION_1; use crate::devices::virtio::iov_deque::IovDequeError; use crate::devices::virtio::iovec::IoVecBufferMut; @@ -47,7 +48,7 @@ pub struct Entropy { device_state: DeviceState, pub(crate) queues: Vec, queue_events: Vec, - irq_trigger: IrqTrigger, + virtio_interrupt: Option>, // Device specific fields rate_limiter: RateLimiter, @@ -78,7 +79,7 @@ impl Entropy { device_state: DeviceState::Inactive, queues, queue_events, - irq_trigger, + virtio_interrupt: Some(Arc::new(irq_trigger)), rate_limiter, buffer: IoVecBufferMut::new()?, }) @@ -88,9 +89,9 @@ impl Entropy { ENTROPY_DEV_ID } - fn signal_used_queue(&self) -> Result<(), DeviceError> { - self.irq_trigger - .trigger_irq(IrqType::Vring) + fn signal_used_queue(&self, queue_index: usize) -> Result<(), DeviceError> { + self.interrupt() + .trigger(VirtioInterruptType::Queue(queue_index as u16)) .map_err(DeviceError::FailedSignalingIrq) } @@ -188,7 +189,7 @@ impl 
Entropy { } if used_any { - self.signal_used_queue().unwrap_or_else(|err| { + self.signal_used_queue(RNG_QUEUE).unwrap_or_else(|err| { error!("entropy: {err:?}"); METRICS.entropy_event_fails.inc() }); @@ -237,9 +238,9 @@ impl Entropy { self.acked_features = features; } - pub(crate) fn set_irq_status(&mut self, status: u32) { - self.irq_trigger.irq_status = Arc::new(AtomicU32::new(status)); - } + // pub(crate) fn set_irq_status(&mut self, status: u32) { + // self.irq_trigger.irq_status = Arc::new(AtomicU32::new(status)); + // } pub(crate) fn set_activated(&mut self, mem: GuestMemoryMmap) { self.device_state = DeviceState::Activated(mem); @@ -267,8 +268,8 @@ impl VirtioDevice for Entropy { &self.queue_events } - fn interrupt_trigger(&self) -> &IrqTrigger { - &self.irq_trigger + fn interrupt(&self) -> Arc { + self.virtio_interrupt.as_ref().expect("interrupt must be set up").clone() } fn avail_features(&self) -> u64 { @@ -291,7 +292,9 @@ impl VirtioDevice for Entropy { self.device_state.is_activated() } - fn activate(&mut self, mem: GuestMemoryMmap) -> Result<(), ActivateError> { + fn activate(&mut self, mem: GuestMemoryMmap, virtio_interrupt: Option>) -> Result<(), ActivateError> { + self.virtio_interrupt = virtio_interrupt.or(self.virtio_interrupt.take()); + for q in self.queues.iter_mut() { q.initialize(&mem) .map_err(ActivateError::QueueMemoryError)?; diff --git a/src/vmm/src/devices/virtio/test_utils.rs b/src/vmm/src/devices/virtio/test_utils.rs index 9bb66db82ae..b25acc56cbc 100644 --- a/src/vmm/src/devices/virtio/test_utils.rs +++ b/src/vmm/src/devices/virtio/test_utils.rs @@ -414,7 +414,7 @@ pub(crate) mod test { /// Activate the device pub fn activate_device(&mut self, mem: &'a GuestMemoryMmap) { - self.device.lock().unwrap().activate(mem.clone()).unwrap(); + self.device.lock().unwrap().activate(mem.clone(), None).unwrap(); // Process the activate event let ev_count = self.event_manager.run_with_timeout(100).unwrap(); assert_eq!(ev_count, 1); diff --git 
a/src/vmm/src/devices/virtio/vhost_user.rs b/src/vmm/src/devices/virtio/vhost_user.rs index ad86c9942af..49f1b7b525a 100644 --- a/src/vmm/src/devices/virtio/vhost_user.rs +++ b/src/vmm/src/devices/virtio/vhost_user.rs @@ -6,6 +6,7 @@ use std::os::fd::AsRawFd; use std::os::unix::net::UnixStream; +use std::sync::Arc; use vhost::vhost_user::message::*; use vhost::vhost_user::{Frontend, VhostUserFrontend}; @@ -17,6 +18,8 @@ use crate::devices::virtio::device::IrqTrigger; use crate::devices::virtio::queue::Queue; use crate::vstate::memory::GuestMemoryMmap; +use super::device::{VirtioInterrupt, VirtioInterruptType}; + /// vhost-user error. #[derive(Debug, thiserror::Error, displaydoc::Display)] pub enum VhostUserError { @@ -400,7 +403,7 @@ impl VhostUserHandleImpl { &mut self, mem: &GuestMemoryMmap, queues: &[(usize, &Queue, &EventFd)], - irq_trigger: &IrqTrigger, + interrupt: Arc, ) -> Result<(), VhostUserError> { // Provide the memory table to the backend. self.update_mem_table(mem)?; @@ -442,7 +445,7 @@ impl VhostUserHandleImpl { // No matter the queue, we set irq_evt for signaling the guest that buffers were // consumed. 
self.vu - .set_vring_call(*queue_index, &irq_trigger.irq_evt) + .set_vring_call(*queue_index, &interrupt.notifier(VirtioInterruptType::Queue(*queue_index as u16)).expect("vring irq should be initialized")) .map_err(VhostUserError::VhostUserSetVringCall)?; self.vu @@ -895,11 +898,11 @@ mod tests { queue.initialize(&guest_memory).unwrap(); let event_fd = EventFd::new(0).unwrap(); - let irq_trigger = IrqTrigger::new().unwrap(); + let interrupt = Arc::new(IrqTrigger::new().unwrap()); let queues = [(0, &queue, &event_fd)]; - vuh.setup_backend(&guest_memory, &queues, &irq_trigger) + vuh.setup_backend(&guest_memory, &queues, interrupt.clone()) .unwrap(); // VhostUserHandleImpl should correctly send memory and queues information to @@ -923,7 +926,7 @@ mod tests { log_addr: None, }, base: queue.avail_ring_idx_get(), - call: irq_trigger.irq_evt.as_raw_fd(), + call: interrupt.notifier(VirtioInterruptType::Queue(0)).expect("vring irq should be initialized").as_raw_fd(), kick: event_fd.as_raw_fd(), enable: true, }; diff --git a/src/vmm/src/devices/virtio/vsock/device.rs b/src/vmm/src/devices/virtio/vsock/device.rs index bf438aca99f..9d305559924 100644 --- a/src/vmm/src/devices/virtio/vsock/device.rs +++ b/src/vmm/src/devices/virtio/vsock/device.rs @@ -21,6 +21,7 @@ //! - a backend FD. 
use std::fmt::Debug; +use std::sync::Arc; use log::{error, warn}; use vmm_sys_util::eventfd::EventFd; @@ -29,7 +30,7 @@ use super::super::super::DeviceError; use super::defs::uapi; use super::packet::{VsockPacketRx, VsockPacketTx, VSOCK_PKT_HDR_SIZE}; use super::{defs, VsockBackend}; -use crate::devices::virtio::device::{DeviceState, IrqTrigger, IrqType, VirtioDevice}; +use crate::devices::virtio::device::{DeviceState, IrqTrigger, IrqType, VirtioDevice, VirtioInterrupt, VirtioInterruptType}; use crate::devices::virtio::queue::Queue as VirtQueue; use crate::devices::virtio::vsock::metrics::METRICS; use crate::devices::virtio::vsock::VsockError; @@ -60,7 +61,7 @@ pub struct Vsock { pub(crate) backend: B, pub(crate) avail_features: u64, pub(crate) acked_features: u64, - pub(crate) irq_trigger: IrqTrigger, + pub(crate) virtio_interrupt: Option>, // This EventFd is the only one initially registered for a vsock device, and is used to convert // a VirtioDevice::activate call into an EventHandler read event which allows the other events // (queue and backend related) to be registered post virtio device activation. That's @@ -101,7 +102,7 @@ where backend, avail_features: AVAIL_FEATURES, acked_features: 0, - irq_trigger: IrqTrigger::new().map_err(VsockError::EventFd)?, + virtio_interrupt: Some(Arc::new(IrqTrigger::new().map_err(VsockError::EventFd)?)), activate_evt: EventFd::new(libc::EFD_NONBLOCK).map_err(VsockError::EventFd)?, device_state: DeviceState::Inactive, rx_packet: VsockPacketRx::new()?, @@ -135,9 +136,9 @@ where /// Signal the guest driver that we've used some virtio buffers that it had previously made /// available. 
- pub fn signal_used_queue(&self) -> Result<(), DeviceError> { - self.irq_trigger - .trigger_irq(IrqType::Vring) + pub fn signal_used_queue(&self, queue_index: usize) -> Result<(), DeviceError> { + self.virtio_interrupt.as_ref().expect("interrupt should be setup") + .trigger(VirtioInterruptType::Queue(queue_index as u16)) .map_err(DeviceError::FailedSignalingIrq) } @@ -257,7 +258,7 @@ where error!("Failed to add used descriptor {}: {}", head.index, err); }); - self.signal_used_queue()?; + self.signal_used_queue(EVQ_INDEX)?; Ok(()) } @@ -295,8 +296,8 @@ where &self.queue_events } - fn interrupt_trigger(&self) -> &IrqTrigger { - &self.irq_trigger + fn interrupt(&self) -> Arc { + self.virtio_interrupt.as_ref().expect("interrupt must be set up").clone() } fn read_config(&self, offset: u64, data: &mut [u8]) { @@ -328,7 +329,9 @@ where ); } - fn activate(&mut self, mem: GuestMemoryMmap) -> Result<(), ActivateError> { + fn activate(&mut self, mem: GuestMemoryMmap, virtio_interrupt: Option>) -> Result<(), ActivateError> { + self.virtio_interrupt = virtio_interrupt.or(self.virtio_interrupt.take()); + for q in self.queues.iter_mut() { q.initialize(&mem) .map_err(ActivateError::QueueMemoryError)?; @@ -430,6 +433,6 @@ mod tests { // } // Test a correct activation. 
- ctx.device.activate(ctx.mem.clone()).unwrap(); + ctx.device.activate(ctx.mem.clone(), None).unwrap(); } } diff --git a/src/vmm/src/devices/virtio/vsock/event_handler.rs b/src/vmm/src/devices/virtio/vsock/event_handler.rs index 40e75d1a9f5..e91ac707d3d 100755 --- a/src/vmm/src/devices/virtio/vsock/event_handler.rs +++ b/src/vmm/src/devices/virtio/vsock/event_handler.rs @@ -191,8 +191,17 @@ where Self::PROCESS_NOTIFY_BACKEND => raise_irq = self.notify_backend(evset), _ => warn!("Unexpected vsock event received: {:?}", source), } + let mut queue_index = 0; + match source { + Self::PROCESS_ACTIVATE => self.handle_activate_event(ops), + Self::PROCESS_RXQ => queue_index = RXQ_INDEX, + Self::PROCESS_TXQ => queue_index = TXQ_INDEX, + Self::PROCESS_EVQ => queue_index = EVQ_INDEX, + Self::PROCESS_NOTIFY_BACKEND => queue_index = TXQ_INDEX, // TODO this could be either tx or rx + _ => warn!("Unexpected vsock event received: {:?}", source), + } if raise_irq { - self.signal_used_queue().unwrap_or_default(); + self.signal_used_queue(queue_index).unwrap_or_default(); } } else { warn!( @@ -236,7 +245,7 @@ mod tests { { let test_ctx = TestContext::new(); let mut ctx = test_ctx.create_event_handler_context(); - ctx.mock_activate(test_ctx.mem.clone()); + ctx.mock_activate(test_ctx.mem.clone(), None); ctx.device.backend.set_pending_rx(false); ctx.signal_txq_event(); @@ -253,7 +262,7 @@ mod tests { { let test_ctx = TestContext::new(); let mut ctx = test_ctx.create_event_handler_context(); - ctx.mock_activate(test_ctx.mem.clone()); + ctx.mock_activate(test_ctx.mem.clone(), None); ctx.device.backend.set_pending_rx(true); ctx.signal_txq_event(); @@ -269,7 +278,7 @@ mod tests { { let test_ctx = TestContext::new(); let mut ctx = test_ctx.create_event_handler_context(); - ctx.mock_activate(test_ctx.mem.clone()); + ctx.mock_activate(test_ctx.mem.clone(), None); ctx.device.backend.set_pending_rx(false); ctx.device.backend.set_tx_err(Some(VsockError::NoData)); @@ -285,7 +294,7 @@ mod tests { { 
let test_ctx = TestContext::new(); let mut ctx = test_ctx.create_event_handler_context(); - ctx.mock_activate(test_ctx.mem.clone()); + ctx.mock_activate(test_ctx.mem.clone(), None); // Invalidate the descriptor chain, by setting its length to 0. ctx.guest_txvq.dtable[0].len.set(0); @@ -302,7 +311,7 @@ mod tests { { let test_ctx = TestContext::new(); let mut ctx = test_ctx.create_event_handler_context(); - ctx.mock_activate(test_ctx.mem.clone()); + ctx.mock_activate(test_ctx.mem.clone(), None); assert!(!ctx.device.handle_txq_event(EventSet::IN)); } @@ -317,7 +326,7 @@ mod tests { { let test_ctx = TestContext::new(); let mut ctx = test_ctx.create_event_handler_context(); - ctx.mock_activate(test_ctx.mem.clone()); + ctx.mock_activate(test_ctx.mem.clone(), None); ctx.device.backend.set_pending_rx(true); ctx.device.backend.set_rx_err(Some(VsockError::NoData)); @@ -334,7 +343,7 @@ mod tests { { let test_ctx = TestContext::new(); let mut ctx = test_ctx.create_event_handler_context(); - ctx.mock_activate(test_ctx.mem.clone()); + ctx.mock_activate(test_ctx.mem.clone(), None); ctx.device.backend.set_pending_rx(true); ctx.signal_rxq_event(); @@ -347,7 +356,7 @@ mod tests { { let test_ctx = TestContext::new(); let mut ctx = test_ctx.create_event_handler_context(); - ctx.mock_activate(test_ctx.mem.clone()); + ctx.mock_activate(test_ctx.mem.clone(), None); // Invalidate the descriptor chain, by setting its length to 0. 
ctx.guest_rxvq.dtable[0].len.set(0); @@ -363,7 +372,7 @@ mod tests { { let test_ctx = TestContext::new(); let mut ctx = test_ctx.create_event_handler_context(); - ctx.mock_activate(test_ctx.mem.clone()); + ctx.mock_activate(test_ctx.mem.clone(), None); ctx.device.backend.set_pending_rx(false); assert!(!ctx.device.handle_rxq_event(EventSet::IN)); } @@ -388,7 +397,7 @@ mod tests { { let test_ctx = TestContext::new(); let mut ctx = test_ctx.create_event_handler_context(); - ctx.mock_activate(test_ctx.mem.clone()); + ctx.mock_activate(test_ctx.mem.clone(), None); ctx.device.backend.set_pending_rx(true); ctx.device.notify_backend(EventSet::IN); @@ -407,7 +416,7 @@ mod tests { { let test_ctx = TestContext::new(); let mut ctx = test_ctx.create_event_handler_context(); - ctx.mock_activate(test_ctx.mem.clone()); + ctx.mock_activate(test_ctx.mem.clone(), None); ctx.device.backend.set_pending_rx(false); ctx.device.notify_backend(EventSet::IN); @@ -568,7 +577,7 @@ mod tests { vsock .lock() .unwrap() - .activate(test_ctx.mem.clone()) + .activate(test_ctx.mem.clone(), None) .unwrap(); // Process the activate event. 
let ev_count = event_manager.run_with_timeout(50).unwrap(); diff --git a/src/vmm/src/devices/virtio/vsock/test_utils.rs b/src/vmm/src/devices/virtio/vsock/test_utils.rs index 4a5fdb2c941..40f8285275d 100644 --- a/src/vmm/src/devices/virtio/vsock/test_utils.rs +++ b/src/vmm/src/devices/virtio/vsock/test_utils.rs @@ -5,12 +5,13 @@ #![doc(hidden)] use std::os::unix::io::{AsRawFd, RawFd}; +use std::sync::Arc; use vmm_sys_util::epoll::EventSet; use vmm_sys_util::eventfd::EventFd; use super::packet::{VsockPacketRx, VsockPacketTx}; -use crate::devices::virtio::device::VirtioDevice; +use crate::devices::virtio::device::{VirtioDevice, VirtioInterrupt}; use crate::devices::virtio::queue::{VIRTQ_DESC_F_NEXT, VIRTQ_DESC_F_WRITE}; use crate::devices::virtio::test_utils::VirtQueue as GuestQ; use crate::devices::virtio::vsock::device::{RXQ_INDEX, TXQ_INDEX}; @@ -191,9 +192,9 @@ pub struct EventHandlerContext<'a> { } impl<'a> EventHandlerContext<'a> { - pub fn mock_activate(&mut self, mem: GuestMemoryMmap) { + pub fn mock_activate(&mut self, mem: GuestMemoryMmap, interrupt: Option>) { // Artificially activate the device. - self.device.activate(mem).unwrap(); + self.device.activate(mem, interrupt).unwrap(); } pub fn signal_txq_event(&mut self) { From 5764e5bcbe3d4a52193f50a11820e3e4f9213a48 Mon Sep 17 00:00:00 2001 From: Riccardo Mancini Date: Fri, 25 Oct 2024 14:30:19 +0100 Subject: [PATCH 18/22] virtio-pci: fix bug in queue initialization size=0 means the queue is not enabled in PCI spec. It doesn't matter what it is initialized to in mmio spec as the driver will write a value here during initialization. 
Signed-off-by: Riccardo Mancini --- src/vmm/src/devices/virtio/queue.rs | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/src/vmm/src/devices/virtio/queue.rs b/src/vmm/src/devices/virtio/queue.rs index b80b2571c12..84500a024e3 100644 --- a/src/vmm/src/devices/virtio/queue.rs +++ b/src/vmm/src/devices/virtio/queue.rs @@ -265,7 +265,7 @@ impl Queue { pub fn new(max_size: u16) -> Queue { Queue { max_size, - size: 0, + size: max_size, ready: false, desc_table_address: GuestAddress(0), avail_ring_address: GuestAddress(0), @@ -692,6 +692,18 @@ impl Queue { new - used_event - Wrapping(1) < new - old } + + pub(crate) fn reset(&mut self) { + self.ready = false; + self.size = self.max_size; + self.desc_table_address = GuestAddress(0); + self.avail_ring_address = GuestAddress(0); + self.used_ring_address = GuestAddress(0); + self.next_avail = Wrapping(0); + self.next_used = Wrapping(0); + self.num_added = Wrapping(0); + self.uses_notif_suppression = false; + } } #[cfg(kani)] From 702bbaf76a2a2499af9b2b403b38df4713a0e7c2 Mon Sep 17 00:00:00 2001 From: Riccardo Mancini Date: Fri, 25 Oct 2024 14:31:27 +0100 Subject: [PATCH 19/22] virtio-pci: introduce VirtioPciDevice and wire them up to the builder This is hardcoding to use PCI, but it works. We should make this configurable, of course.
Signed-off-by: Riccardo Mancini --- src/vmm/src/builder.rs | 132 ++- src/vmm/src/devices/bus.rs | 19 +- src/vmm/src/devices/virtio/transport/mod.rs | 4 +- .../virtio/transport/pci_common_config.rs | 409 +++++++ .../devices/virtio/transport/pci_device.rs | 1016 +++++++++++++++++ src/vmm/src/lib.rs | 2 + 6 files changed, 1562 insertions(+), 20 deletions(-) create mode 100644 src/vmm/src/devices/virtio/transport/pci_common_config.rs create mode 100644 src/vmm/src/devices/virtio/transport/pci_device.rs diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index e3142836b37..cb5d70d0f8f 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -59,6 +59,7 @@ use crate::devices::legacy::serial::SerialOut; use crate::devices::legacy::RTCDevice; use crate::devices::legacy::{EventFdTrigger, SerialEventsWrapper, SerialWrapper}; use crate::devices::pci_segment::PciSegment; +use crate::devices::virtio::transport::VirtioPciDevice; use pci::{PciBus, PciConfigIo, PciConfigMmio, PciRoot}; use crate::devices::virtio::balloon::Balloon; use crate::devices::virtio::block::device::Block; @@ -321,7 +322,7 @@ fn add_vfio_device( allocator.clone(), pci_device_bdf.into() ).unwrap(); - + // Register DMA mapping in IOMMU. for (_index, region) in memory.iter().enumerate() { info!( @@ -452,7 +453,8 @@ fn create_vmm_and_vcpus( // pci_irq_slots[i] = irqs[i % 8] as u8; // } let pci_irq_slots: [u8; 32] = [(NUM_IOAPIC_PINS-1) as u8; 32]; - + + // For x86_64 we need to create the interrupt controller before calling `KVM_CREATE_VCPUS` // while on aarch64 we need to do it the other way around. @@ -504,16 +506,19 @@ fn create_vmm_and_vcpus( pio_device_manager.register_devices(vm.fd()).unwrap(); - add_vfio_device( - Arc::clone(&vm_fd), - device_fd, - &pci_segment, - &mut mmio_device_manager, - &mut pio_device_manager, - Arc::clone(&msi_interrupt_manager), - guest_memory.clone(), - Arc::clone(&allocator) - ); + // // Create passthru device for a GPU. 
+ // let device_fd = create_passthrough_device(vm.fd()); + + // add_vfio_device( + // Arc::clone(&vm_fd), + // device_fd, + // &pci_segment, + // &mut mmio_device_manager, + // &mut pio_device_manager, + // Arc::clone(&msi_interrupt_manager), + // guest_memory.clone(), + // Arc::clone(&allocator) + // ); // On aarch64, the vCPUs need to be created (i.e call KVM_CREATE_VCPU) before setting up the // IRQ chip because the `KVM_CREATE_VCPU` ioctl will return error if the IRQCHIP @@ -541,6 +546,8 @@ fn create_vmm_and_vcpus( pio_device_manager, acpi_device_manager, pci_segment, + msi_interrupt_manager, + allocator, }; Ok((vmm, vcpus)) @@ -1192,6 +1199,69 @@ fn attach_virtio_device( .map(|_| ()) } +fn attach_virtio_pci_device( + event_manager: &mut EventManager, + vmm: &mut Vmm, + id: String, + device: Arc>, + cmdline: &mut LoaderKernelCmdline, + is_vhost_user: bool, +) -> Result<(), StartMicrovmError>{ + event_manager.add_subscriber(device.clone()); + + let pci_segment_id = vmm.pci_segment.id; + let pci_device_bdf = vmm.pci_segment.next_device_bdf().map_err(|_| StartMicrovmError::Unknown)?; + + // Allows support for one MSI-X vector per queue. It also adds 1 + // as we need to take into account the dedicated vector to notify + // about a virtio config change. + let msix_num = (device.lock().unwrap().queues().len() + 1) as u16; + + let memory = vmm.guest_memory().clone(); + + let device_type = device.lock().unwrap().device_type(); + let virtio_pci_device = Arc::new(Mutex::new( + BusDevice::VirtioPciDevice(VirtioPciDevice::new( + id.clone(), + memory, + device, + msix_num, + &vmm.msi_interrupt_manager, + pci_device_bdf.into(), + // All device types *except* virtio block devices should be allocated a 64-bit bar + // The block devices should be given a 32-bit BAR so that they are easily accessible + // to firmware without requiring excessive identity mapping. + // The exception being if not on the default PCI segment. 
+ pci_segment_id > 0 || device_type != virtio::TYPE_BLOCK, + None, + ) + .map_err(|_| StartMicrovmError::Unknown)?, + ))); + + add_pci_device( + virtio_pci_device.clone(), + &vmm.pci_segment, + &mut vmm.mmio_device_manager, + &mut vmm.pio_device_manager, + vmm.allocator.clone(), + pci_device_bdf, + ).map_err(|_| StartMicrovmError::Unknown)?; + + let bar_addr = virtio_pci_device.lock().unwrap().virtio_pci_device_ref().unwrap().config_bar_addr(); + for (i, queue_evt) in virtio_pci_device.lock().unwrap().virtio_pci_device_ref().unwrap().virtio_device().lock().unwrap().queue_events().iter().enumerate() { + const NOTIFICATION_BAR_OFFSET: u64 = 0x6000; + const NOTIFY_OFF_MULTIPLIER: u32 = 4; // A dword per notification address. + let notify_base = bar_addr + NOTIFICATION_BAR_OFFSET; + let io_addr = IoEventAddress::Mmio( + notify_base + i as u64 * u64::from(NOTIFY_OFF_MULTIPLIER) + ); + vmm.vm.fd().register_ioevent(queue_evt, &io_addr, NoDatamatch) + .map_err(MmioError::RegisterIoEvent)?; + } + + Ok(()) +} + pub(crate) fn attach_boot_timer_device( vmm: &mut Vmm, request_ts: TimestampUs, @@ -1230,7 +1300,7 @@ fn attach_entropy_device( .id() .to_string(); - attach_virtio_device( + attach_virtio_pci_device( event_manager, vmm, id, @@ -1264,7 +1334,7 @@ fn attach_block_devices<'a, I: Iterator>> + Debug>( (locked.id().to_string(), locked.is_vhost_user()) }; // The device mutex mustn't be locked here otherwise it will deadlock. - attach_virtio_device( + attach_virtio_pci_device( event_manager, vmm, id, @@ -1285,7 +1355,7 @@ fn attach_net_devices<'a, I: Iterator>> + Debug>( for net_device in net_devices { let id = net_device.lock().expect("Poisoned lock").id().clone(); // The device mutex mustn't be locked here otherwise it will deadlock. 
- attach_virtio_device(event_manager, vmm, id, net_device.clone(), cmdline, false)?; + attach_virtio_pci_device(event_manager, vmm, id, net_device.clone(), cmdline, false)?; } Ok(()) } @@ -1298,7 +1368,7 @@ fn attach_unixsock_vsock_device( ) -> Result<(), StartMicrovmError> { let id = String::from(unix_vsock.lock().expect("Poisoned lock").id()); // The device mutex mustn't be locked here otherwise it will deadlock. - attach_virtio_device(event_manager, vmm, id, unix_vsock.clone(), cmdline, false) + attach_virtio_pci_device(event_manager, vmm, id, unix_vsock.clone(), cmdline, false) } fn attach_balloon_device( @@ -1309,7 +1379,7 @@ fn attach_balloon_device( ) -> Result<(), StartMicrovmError> { let id = String::from(balloon.lock().expect("Poisoned lock").id()); // The device mutex mustn't be locked here otherwise it will deadlock. - attach_virtio_device(event_manager, vmm, id, balloon.clone(), cmdline, false) + attach_virtio_pci_device(event_manager, vmm, id, balloon.clone(), cmdline, false) } // Adds `O_NONBLOCK` to the stdout flags. 
@@ -1483,6 +1553,32 @@ pub mod tests { Arc::new(DummyDeviceRelocation{}), ).unwrap(); + let allocator = Arc::new(Mutex::new( + SystemAllocator::new( + #[cfg(target_arch = "x86_64")] + { + GuestAddress(0) + }, + #[cfg(target_arch = "x86_64")] + { + 1 << 16 + }, + GuestAddress(0), + mmio_address_space_size(46), + // GuestAddress(crate::arch::MEM_32BIT_DEVICES_START), + // crate::arch::MEM_32BIT_DEVICES_SIZE, + #[cfg(target_arch = "x86_64")] + vec![], + ) + .unwrap() + )); + + let msi_interrupt_manager: Arc> = + Arc::new(MsiInterruptManager::new( + Arc::clone(&allocator), + Arc::new(Mutex::new(extra_fd)), + )); + Vmm { events_observer: Some(std::io::stdin()), instance_info: InstanceInfo::default(), @@ -1498,6 +1594,8 @@ pub mod tests { pio_device_manager, acpi_device_manager, pci_segment, + msi_interrupt_manager, + allocator, } } diff --git a/src/vmm/src/devices/bus.rs b/src/vmm/src/devices/bus.rs index e8769b7d0a0..c278397c3e1 100644 --- a/src/vmm/src/devices/bus.rs +++ b/src/vmm/src/devices/bus.rs @@ -65,7 +65,7 @@ use super::legacy::RTCDevice; use super::legacy::{I8042Device, SerialDevice}; use pci::{PciConfigIo, PciConfigMmio, PciRoot}; use super::pseudo::BootTimer; -use super::virtio::mmio::MmioTransport; +use super::virtio::transport::{MmioTransport, VirtioPciDevice}; #[derive(Debug)] pub enum BusDevice { @@ -78,6 +78,7 @@ pub enum BusDevice { PioPciBus(PciConfigIo), MmioPciBus(PciConfigMmio), VfioPciDevice(VfioPciDevice), + VirtioPciDevice(VirtioPciDevice), #[cfg(test)] Dummy(DummyDevice), #[cfg(test)] @@ -189,15 +190,29 @@ impl BusDevice { _ => None, } } + pub fn virtio_pci_device_ref(&self) -> Option<&VirtioPciDevice> { + match self { + Self::VirtioPciDevice(x) => Some(x), + _ => None, + } + } + pub fn virtio_pci_device_mut(&mut self) -> Option<&mut VirtioPciDevice> { + match self { + Self::VirtioPciDevice(x) => Some(x), + _ => None, + } + } pub fn pci_device_ref(&self) -> Option<&dyn PciDevice> { match self { Self::VfioPciDevice(x) => Some(x), + 
Self::VirtioPciDevice(x) => Some(x), _ => None, } } pub fn pci_device_mut(&mut self) -> Option<&mut dyn PciDevice> { match self { Self::VfioPciDevice(x) => Some(x), + Self::VirtioPciDevice(x) => Some(x), _ => None, } } @@ -235,6 +250,7 @@ impl BusDevice { Self::MmioTransport(x) => x.bus_read(offset, data), Self::Serial(x) => x.bus_read(offset, data), Self::VfioPciDevice(x) => x.bus_read(base, offset, data), + Self::VirtioPciDevice(x) => x.bus_read(base, offset, data), Self::MmioPciBus(x) => x.bus_read(base, offset, data), Self::PioPciBus(x) => x.bus_read(base, offset, data), #[cfg(test)] @@ -253,6 +269,7 @@ impl BusDevice { Self::MmioTransport(x) => x.bus_write(offset, data), Self::Serial(x) => x.bus_write(offset, data), Self::VfioPciDevice(x) => x.bus_write(base, offset, data), + Self::VirtioPciDevice(x) => x.bus_write(base, offset, data), Self::MmioPciBus(x) => x.bus_write(base, offset, data), Self::PioPciBus(x) => x.bus_write(base, offset, data), #[cfg(test)] diff --git a/src/vmm/src/devices/virtio/transport/mod.rs b/src/vmm/src/devices/virtio/transport/mod.rs index 555a4a45a64..6d9df90bc08 100644 --- a/src/vmm/src/devices/virtio/transport/mod.rs +++ b/src/vmm/src/devices/virtio/transport/mod.rs @@ -3,8 +3,8 @@ // SPDX-License-Identifier: Apache-2.0 use vmm_sys_util::eventfd::EventFd; -// mod pci_common_config; -// mod pci_device; +mod pci_common_config; +mod pci_device; pub(crate) mod mmio; pub use mmio::MmioTransport; pub use pci_common_config::{VirtioPciCommonConfig, VIRTIO_PCI_COMMON_CONFIG_ID}; diff --git a/src/vmm/src/devices/virtio/transport/pci_common_config.rs b/src/vmm/src/devices/virtio/transport/pci_common_config.rs new file mode 100644 index 00000000000..c2a45a88ec1 --- /dev/null +++ b/src/vmm/src/devices/virtio/transport/pci_common_config.rs @@ -0,0 +1,409 @@ +// Copyright 2018 The Chromium OS Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE-BSD-3-Clause file. 
+// +// Copyright © 2019 Intel Corporation +// +// SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause + +use std::sync::atomic::{AtomicU16, Ordering}; +use std::sync::{Arc, Mutex}; + +use byteorder::{ByteOrder, LittleEndian}; +use serde::{Deserialize, Serialize}; +use vm_memory::GuestAddress; +use crate::devices::virtio::queue::Queue; + +use crate::devices::virtio::device::VirtioDevice; + +use crate::logger::{debug, error, info, trace, warn}; +pub const VIRTIO_PCI_COMMON_CONFIG_ID: &str = "virtio_pci_common_config"; + +#[derive(Clone, Serialize, Deserialize)] +pub struct VirtioPciCommonConfigState { + pub driver_status: u8, + pub config_generation: u8, + pub device_feature_select: u32, + pub driver_feature_select: u32, + pub queue_select: u16, + pub msix_config: u16, + pub msix_queues: Vec, +} + +/* The standard layout for the ring is a continuous chunk of memory which looks + * like this. We assume num is a power of 2. + * + * struct vring + * { + * // The actual descriptors (16 bytes each) + * struct vring_desc desc[num]; + * + * // A ring of available descriptor heads with free-running index. + * __virtio16 avail_flags; + * __virtio16 avail_idx; + * __virtio16 available[num]; + * __virtio16 used_event_idx; + * + * // Padding to the next align boundary. + * char pad[]; + * + * // A ring of used descriptor heads with free-running index. + * __virtio16 used_flags; + * __virtio16 used_idx; + * struct vring_used_elem used[num]; + * __virtio16 avail_event_idx; + * }; + * struct vring_desc { + * __virtio64 addr; + * __virtio32 len; + * __virtio16 flags; + * __virtio16 next; + * }; + * + * struct vring_avail { + * __virtio16 flags; + * __virtio16 idx; + * __virtio16 ring[]; + * }; + * + * // u32 is used here for ids for padding reasons. + * struct vring_used_elem { + * // Index of start of used descriptor chain. 
+ * __virtio32 id; + * // Total length of the descriptor chain which was used (written to) + * __virtio32 len; + * }; +* + * Kernel header used for this reference: include/uapi/linux/virtio_ring.h + * Virtio Spec: https://docs.oasis-open.org/virtio/virtio/v1.2/csd01/virtio-v1.2-csd01.html + * + */ +const VRING_DESC_ELEMENT_SIZE: usize = 16; +const VRING_AVAIL_ELEMENT_SIZE: usize = 2; +const VRING_USED_ELEMENT_SIZE: usize = 8; +pub enum VringType { + Desc, + Avail, + Used, +} + +pub fn get_vring_size(t: VringType, queue_size: u16) -> u64 { + let (length_except_ring, element_size) = match t { + VringType::Desc => (0, VRING_DESC_ELEMENT_SIZE), + VringType::Avail => (6, VRING_AVAIL_ELEMENT_SIZE), + VringType::Used => (6, VRING_USED_ELEMENT_SIZE), + }; + (length_except_ring + element_size * queue_size as usize) as u64 +} + +/// Contains the data for reading and writing the common configuration structure of a virtio PCI +/// device. +/// +/// * Registers: +/// +/// ** About the whole device. +/// le32 device_feature_select; // 0x00 // read-write +/// le32 device_feature; // 0x04 // read-only for driver +/// le32 driver_feature_select; // 0x08 // read-write +/// le32 driver_feature; // 0x0C // read-write +/// le16 msix_config; // 0x10 // read-write +/// le16 num_queues; // 0x12 // read-only for driver +/// u8 device_status; // 0x14 // read-write (driver_status) +/// u8 config_generation; // 0x15 // read-only for driver +/// +/// ** About a specific virtqueue. +/// le16 queue_select; // 0x16 // read-write +/// le16 queue_size; // 0x18 // read-write, power of 2, or 0. 
+/// le16 queue_msix_vector; // 0x1A // read-write +/// le16 queue_enable; // 0x1C // read-write (Ready) +/// le16 queue_notify_off; // 0x1E // read-only for driver +/// le64 queue_desc; // 0x20 // read-write +/// le64 queue_avail; // 0x28 // read-write +/// le64 queue_used; // 0x30 // read-write +pub struct VirtioPciCommonConfig { + pub driver_status: u8, + pub config_generation: u8, + pub device_feature_select: u32, + pub driver_feature_select: u32, + pub queue_select: u16, + pub msix_config: Arc, + pub msix_queues: Arc>>, +} + +impl VirtioPciCommonConfig { + pub fn new( + state: VirtioPciCommonConfigState + ) -> Self { + VirtioPciCommonConfig { + driver_status: state.driver_status, + config_generation: state.config_generation, + device_feature_select: state.device_feature_select, + driver_feature_select: state.driver_feature_select, + queue_select: state.queue_select, + msix_config: Arc::new(AtomicU16::new(state.msix_config)), + msix_queues: Arc::new(Mutex::new(state.msix_queues)), + } + } + + fn state(&self) -> VirtioPciCommonConfigState { + VirtioPciCommonConfigState { + driver_status: self.driver_status, + config_generation: self.config_generation, + device_feature_select: self.device_feature_select, + driver_feature_select: self.driver_feature_select, + queue_select: self.queue_select, + msix_config: self.msix_config.load(Ordering::Acquire), + msix_queues: self.msix_queues.lock().unwrap().clone(), + } + } + + pub fn read( + &mut self, + offset: u64, + data: &mut [u8], + device: Arc>, + ) { + assert!(data.len() <= 8); + + match data.len() { + 1 => { + let v = self.read_common_config_byte(offset); + data[0] = v; + } + 2 => { + let v = self.read_common_config_word(offset, device.lock().unwrap().queues()); + LittleEndian::write_u16(data, v); + } + 4 => { + let v = self.read_common_config_dword(offset, device); + LittleEndian::write_u32(data, v); + } + 8 => { + let v = self.read_common_config_qword(offset); + LittleEndian::write_u64(data, v); + } + _ => 
error!("invalid data length for virtio read: len {}", data.len()), + } + } + + pub fn write( + &mut self, + offset: u64, + data: &[u8], + device: Arc>, + ) { + assert!(data.len() <= 8); + + match data.len() { + 1 => self.write_common_config_byte(offset, data[0]), + 2 => self.write_common_config_word(offset, LittleEndian::read_u16(data), device.lock().unwrap().queues_mut()), + 4 => { + self.write_common_config_dword(offset, LittleEndian::read_u32(data), device) + } + 8 => self.write_common_config_qword(offset, LittleEndian::read_u64(data), device.lock().unwrap().queues_mut()), + _ => error!("invalid data length for virtio write: len {}", data.len()), + } + } + + fn read_common_config_byte(&self, offset: u64) -> u8 { + debug!("read_common_config_byte: offset 0x{:x}", offset); + // The driver is only allowed to do aligned, properly sized access. + match offset { + 0x14 => self.driver_status, + 0x15 => self.config_generation, + _ => { + warn!("invalid virtio config byte read: 0x{:x}", offset); + 0 + } + } + } + + fn write_common_config_byte(&mut self, offset: u64, value: u8) { + debug!("write_common_config_byte: offset 0x{:x}", offset); + match offset { + 0x14 => self.driver_status = value, + _ => { + warn!("invalid virtio config byte write: 0x{:x}", offset); + } + } + } + + fn read_common_config_word(&self, offset: u64, queues: &[Queue]) -> u16 { + debug!("read_common_config_word: offset 0x{:x}", offset); + match offset { + 0x10 => self.msix_config.load(Ordering::Acquire), + 0x12 => queues.len() as u16, // num_queues + 0x16 => self.queue_select, + 0x18 => self.with_queue(queues, |q| q.size).unwrap_or(0), + 0x1a => self.msix_queues.lock().unwrap()[self.queue_select as usize], + 0x1c => u16::from(self.with_queue(queues, |q| q.ready).unwrap_or(false)), + 0x1e => self.queue_select, // notify_off + _ => { + warn!("invalid virtio register word read: 0x{:x}", offset); + 0 + } + } + } + + fn write_common_config_word(&mut self, offset: u64, value: u16, queues: &mut [Queue]) { 
+ debug!("write_common_config_word: offset 0x{:x}", offset); + match offset { + 0x10 => self.msix_config.store(value, Ordering::Release), + 0x16 => self.queue_select = value, + 0x18 => self.with_queue_mut(queues, |q| q.size = (value & 0xffff) as u16), + 0x1a => self.msix_queues.lock().unwrap()[self.queue_select as usize] = value, + 0x1c => self.with_queue_mut(queues, |q| { + q.ready = value == 1; + }), + _ => { + warn!("invalid virtio register word write: 0x{:x}", offset); + } + } + } + + fn read_common_config_dword(&self, offset: u64, device: Arc>) -> u32 { + debug!("read_common_config_dword: offset 0x{:x}", offset); + match offset { + 0x00 => self.device_feature_select, + 0x04 => { + let locked_device = device.lock().unwrap(); + // Only 64 bits of features (2 pages) are defined for now, so limit + // device_feature_select to avoid shifting by 64 or more bits. + if self.device_feature_select < 2 { + (locked_device.avail_features() >> (self.device_feature_select * 32)) as u32 + } else { + 0 + } + } + 0x08 => self.driver_feature_select, + _ => { + warn!("invalid virtio register dword read: 0x{:x}", offset); + 0 + } + } + } + + fn write_common_config_dword( + &mut self, + offset: u64, + value: u32, + device: Arc>, + ) { + debug!("write_common_config_dword: offset 0x{:x}", offset); + fn hi(v: &mut GuestAddress, x: u32) { + *v = (*v & 0xffff_ffff) | (u64::from(x) << 32) + } + + fn lo(v: &mut GuestAddress, x: u32) { + *v = (*v & !0xffff_ffff) | u64::from(x) + } + + let mut locked_device = device.lock().unwrap(); + + match offset { + 0x00 => self.device_feature_select = value, + 0x08 => self.driver_feature_select = value, + 0x0c => locked_device.ack_features_by_page(self.driver_feature_select, value), + 0x20 => self.with_queue_mut(locked_device.queues_mut(), |q| lo(&mut q.desc_table_address, value)), + 0x24 => self.with_queue_mut(locked_device.queues_mut(), |q| hi(&mut q.desc_table_address, value)), + 0x28 => self.with_queue_mut(locked_device.queues_mut(), |q| lo(&mut 
q.avail_ring_address, value)), + 0x2c => self.with_queue_mut(locked_device.queues_mut(), |q| hi(&mut q.avail_ring_address, value)), + 0x30 => self.with_queue_mut(locked_device.queues_mut(), |q| lo(&mut q.used_ring_address, value)), + 0x34 => self.with_queue_mut(locked_device.queues_mut(), |q| hi(&mut q.used_ring_address, value)), + _ => { + warn!("invalid virtio register dword write: 0x{:x}", offset); + } + } + } + + fn read_common_config_qword(&self, _offset: u64) -> u64 { + debug!("read_common_config_qword: offset 0x{:x}", _offset); + 0 // Assume the guest has no reason to read write-only registers. + } + + fn write_common_config_qword(&mut self, offset: u64, value: u64, queues: &mut [Queue]) { + debug!("write_common_config_qword: offset 0x{:x}", offset); + + let low = Some((value & 0xffff_ffff) as u32); + let high = Some((value >> 32) as u32); + + match offset { + 0x20 => self.with_queue_mut(queues, |q| q.desc_table_address.0 = value), + 0x28 => self.with_queue_mut(queues, |q| q.avail_ring_address.0 = value), + 0x30 => self.with_queue_mut(queues, |q| q.used_ring_address.0 = value), + _ => { + warn!("invalid virtio register qword write: 0x{:x}", offset); + } + } + } + + fn with_queue(&self, queues: &[Queue], f: F) -> Option + where + F: FnOnce(&Queue) -> U, + { + queues.get(self.queue_select as usize).map(f) + } + + fn with_queue_mut(&self, queues: &mut [Queue], f: F) { + if let Some(queue) = queues.get_mut(self.queue_select as usize) { + f(queue); + } + } +} + +#[cfg(test)] +mod tests { + use crate::devices::virtio::transport::mmio::tests::DummyDevice; + + use super::*; + + + #[test] + fn write_base_regs() { + let mut regs = VirtioPciCommonConfig { + driver_status: 0xaa, + config_generation: 0x55, + device_feature_select: 0x0, + driver_feature_select: 0x0, + queue_select: 0xff, + msix_config: Arc::new(AtomicU16::new(0)), + msix_queues: Arc::new(Mutex::new(vec![0; 3])), + }; + + let dev = Arc::new(Mutex::new(DummyDevice::new())); + // Can set all bits of 
driver_status. + regs.write(0x14, &[0x55], dev.clone()); + let mut read_back = vec![0x00]; + regs.read(0x14, &mut read_back, dev.clone()); + assert_eq!(read_back[0], 0x55); + + // The config generation register is read only. + regs.write(0x15, &[0xaa], dev.clone()); + let mut read_back = vec![0x00]; + regs.read(0x15, &mut read_back, dev.clone()); + assert_eq!(read_back[0], 0x55); + + // Device features is read-only and passed through from the device. + regs.write(0x04, &[0, 0, 0, 0], dev.clone()); + let mut read_back = vec![0, 0, 0, 0]; + regs.read(0x04, &mut read_back, dev.clone()); + assert_eq!(LittleEndian::read_u32(&read_back), 0u32); + + // Feature select registers are read/write. + regs.write(0x00, &[1, 2, 3, 4], dev.clone()); + let mut read_back = vec![0, 0, 0, 0]; + regs.read(0x00, &mut read_back, dev.clone()); + assert_eq!(LittleEndian::read_u32(&read_back), 0x0403_0201); + regs.write(0x08, &[1, 2, 3, 4], dev.clone()); + let mut read_back = vec![0, 0, 0, 0]; + regs.read(0x08, &mut read_back, dev.clone()); + assert_eq!(LittleEndian::read_u32(&read_back), 0x0403_0201); + + // 'queue_select' can be read and written. + regs.write(0x16, &[0xaa, 0x55], dev.clone()); + let mut read_back = vec![0x00, 0x00]; + regs.read(0x16, &mut read_back, dev); + assert_eq!(read_back[0], 0xaa); + assert_eq!(read_back[1], 0x55); + } +} diff --git a/src/vmm/src/devices/virtio/transport/pci_device.rs b/src/vmm/src/devices/virtio/transport/pci_device.rs new file mode 100644 index 00000000000..342d8a18321 --- /dev/null +++ b/src/vmm/src/devices/virtio/transport/pci_device.rs @@ -0,0 +1,1016 @@ +// Copyright 2018 The Chromium OS Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE-BSD-3-Clause file. 
+// +// Copyright © 2019 Intel Corporation +// +// SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause + +use std::any::Any; +use std::cmp; +use std::fmt::{Debug, Formatter}; +use std::io::Write; +use std::sync::atomic::{AtomicBool, AtomicU16, AtomicUsize, Ordering}; +use std::sync::{Arc, Barrier, Mutex}; + +use anyhow::anyhow; +use pci::{ + BarReprogrammingParams, MsixCap, MsixConfig, PciBarConfiguration, PciBarRegionType, + PciCapability, PciCapabilityId, PciClassCode, PciConfiguration, PciDevice, PciDeviceError, + PciHeaderType, PciMassStorageSubclass, PciNetworkControllerSubclass, PciSubclass, +}; +use serde::{Deserialize, Serialize}; +use thiserror::Error; +use crate::devices::virtio::device::{VirtioDevice, VirtioInterrupt, VirtioInterruptType}; +use crate::devices::virtio::queue::Queue; +use crate::vstate::memory::GuestMemoryMmap; +use vm_system_allocator::{AddressAllocator, SystemAllocator}; +use vm_device::dma_mapping::ExternalDmaMapping; +use vm_device::interrupt::{ + InterruptIndex, InterruptManager, InterruptSourceGroup, MsiIrqGroupConfig, +}; +use vm_device::{PciBarType, Resource}; +use vm_memory::{Address, ByteValued, GuestAddress, Le32}; +use vmm_sys_util::eventfd::EventFd; + +use super::pci_common_config::VirtioPciCommonConfigState; +use crate::devices::virtio::transport::VirtioPciCommonConfig; +use crate::logger::{debug, error}; + +const DEVICE_INIT: u32 = 0x00; +const DEVICE_ACKNOWLEDGE: u32 = 0x01; +const DEVICE_DRIVER: u32 = 0x02; +const DEVICE_DRIVER_OK: u32 = 0x04; +const DEVICE_FEATURES_OK: u32 = 0x08; +const DEVICE_FAILED: u32 = 0x80; + +const VIRTIO_F_RING_INDIRECT_DESC: u32 = 28; +const VIRTIO_F_RING_EVENT_IDX: u32 = 29; +const VIRTIO_F_VERSION_1: u32 = 32; +const VIRTIO_F_IOMMU_PLATFORM: u32 = 33; +const VIRTIO_F_IN_ORDER: u32 = 35; +const VIRTIO_F_ORDER_PLATFORM: u32 = 36; +#[allow(dead_code)] +const VIRTIO_F_SR_IOV: u32 = 37; +const VIRTIO_F_NOTIFICATION_DATA: u32 = 38; + +/// Vector value used to disable MSI for a queue. 
+const VIRTQ_MSI_NO_VECTOR: u16 = 0xffff; + +enum PciCapabilityType { + Common = 1, + Notify = 2, + Isr = 3, + Device = 4, + Pci = 5, + SharedMemory = 8, +} + +// This offset represents the 2 bytes omitted from the VirtioPciCap structure +// as they are already handled through add_capability(). These 2 bytes are the +// fields cap_vndr (1 byte) and cap_next (1 byte) defined in the virtio spec. +const VIRTIO_PCI_CAP_OFFSET: usize = 2; + +#[allow(dead_code)] +#[repr(packed)] +#[derive(Clone, Copy, Default)] +struct VirtioPciCap { + cap_len: u8, // Generic PCI field: capability length + cfg_type: u8, // Identifies the structure. + pci_bar: u8, // Where to find it. + id: u8, // Multiple capabilities of the same type + padding: [u8; 2], // Pad to full dword. + offset: Le32, // Offset within bar. + length: Le32, // Length of the structure, in bytes. +} +// SAFETY: All members are simple numbers and any value is valid. +unsafe impl ByteValued for VirtioPciCap {} + +impl PciCapability for VirtioPciCap { + fn bytes(&self) -> &[u8] { + self.as_slice() + } + + fn id(&self) -> PciCapabilityId { + PciCapabilityId::VendorSpecific + } +} + +const VIRTIO_PCI_CAP_LEN_OFFSET: u8 = 2; + +impl VirtioPciCap { + pub fn new(cfg_type: PciCapabilityType, pci_bar: u8, offset: u32, length: u32) -> Self { + VirtioPciCap { + cap_len: (std::mem::size_of::() as u8) + VIRTIO_PCI_CAP_LEN_OFFSET, + cfg_type: cfg_type as u8, + pci_bar, + id: 0, + padding: [0; 2], + offset: Le32::from(offset), + length: Le32::from(length), + } + } +} + +#[allow(dead_code)] +#[repr(packed)] +#[derive(Clone, Copy, Default)] +struct VirtioPciNotifyCap { + cap: VirtioPciCap, + notify_off_multiplier: Le32, +} +// SAFETY: All members are simple numbers and any value is valid. 
+unsafe impl ByteValued for VirtioPciNotifyCap {} + +impl PciCapability for VirtioPciNotifyCap { + fn bytes(&self) -> &[u8] { + self.as_slice() + } + + fn id(&self) -> PciCapabilityId { + PciCapabilityId::VendorSpecific + } +} + +impl VirtioPciNotifyCap { + pub fn new( + cfg_type: PciCapabilityType, + pci_bar: u8, + offset: u32, + length: u32, + multiplier: Le32, + ) -> Self { + VirtioPciNotifyCap { + cap: VirtioPciCap { + cap_len: (std::mem::size_of::() as u8) + + VIRTIO_PCI_CAP_LEN_OFFSET, + cfg_type: cfg_type as u8, + pci_bar, + id: 0, + padding: [0; 2], + offset: Le32::from(offset), + length: Le32::from(length), + }, + notify_off_multiplier: multiplier, + } + } +} + +#[allow(dead_code)] +#[repr(packed)] +#[derive(Clone, Copy, Default)] +struct VirtioPciCap64 { + cap: VirtioPciCap, + offset_hi: Le32, + length_hi: Le32, +} +// SAFETY: All members are simple numbers and any value is valid. +unsafe impl ByteValued for VirtioPciCap64 {} + +impl PciCapability for VirtioPciCap64 { + fn bytes(&self) -> &[u8] { + self.as_slice() + } + + fn id(&self) -> PciCapabilityId { + PciCapabilityId::VendorSpecific + } +} + +impl VirtioPciCap64 { + pub fn new(cfg_type: PciCapabilityType, pci_bar: u8, id: u8, offset: u64, length: u64) -> Self { + VirtioPciCap64 { + cap: VirtioPciCap { + cap_len: (std::mem::size_of::() as u8) + VIRTIO_PCI_CAP_LEN_OFFSET, + cfg_type: cfg_type as u8, + pci_bar, + id, + padding: [0; 2], + offset: Le32::from(offset as u32), + length: Le32::from(length as u32), + }, + offset_hi: Le32::from((offset >> 32) as u32), + length_hi: Le32::from((length >> 32) as u32), + } + } +} + +#[allow(dead_code)] +#[repr(packed)] +#[derive(Clone, Copy, Default)] +struct VirtioPciCfgCap { + cap: VirtioPciCap, + pci_cfg_data: [u8; 4], +} +// SAFETY: All members are simple numbers and any value is valid. 
+unsafe impl ByteValued for VirtioPciCfgCap {} + +impl PciCapability for VirtioPciCfgCap { + fn bytes(&self) -> &[u8] { + self.as_slice() + } + + fn id(&self) -> PciCapabilityId { + PciCapabilityId::VendorSpecific + } +} + +impl VirtioPciCfgCap { + fn new() -> Self { + VirtioPciCfgCap { + cap: VirtioPciCap::new(PciCapabilityType::Pci, 0, 0, 0), + ..Default::default() + } + } +} + +#[derive(Clone, Copy, Default)] +struct VirtioPciCfgCapInfo { + offset: usize, + cap: VirtioPciCfgCap, +} + +#[allow(dead_code)] +#[derive(Copy, Clone)] +pub enum PciVirtioSubclass { + NonTransitionalBase = 0xff, +} + +impl PciSubclass for PciVirtioSubclass { + fn get_register_value(&self) -> u8 { + *self as u8 + } +} + +// Allocate one bar for the structs pointed to by the capability structures. +// As per the PCI specification, because the same BAR shares MSI-X and non +// MSI-X structures, it is recommended to use 8KiB alignment for all those +// structures. +const COMMON_CONFIG_BAR_OFFSET: u64 = 0x0000; +const COMMON_CONFIG_SIZE: u64 = 56; +const ISR_CONFIG_BAR_OFFSET: u64 = 0x2000; +const ISR_CONFIG_SIZE: u64 = 1; +const DEVICE_CONFIG_BAR_OFFSET: u64 = 0x4000; +const DEVICE_CONFIG_SIZE: u64 = 0x1000; +const NOTIFICATION_BAR_OFFSET: u64 = 0x6000; +const NOTIFICATION_SIZE: u64 = 0x1000; +const MSIX_TABLE_BAR_OFFSET: u64 = 0x8000; +// The size is 256KiB because the table can hold up to 2048 entries, with each +// entry being 128 bits (4 DWORDS). +const MSIX_TABLE_SIZE: u64 = 0x40000; +const MSIX_PBA_BAR_OFFSET: u64 = 0x48000; +// The size is 2KiB because the Pending Bit Array has one bit per vector and it +// can support up to 2048 vectors. +const MSIX_PBA_SIZE: u64 = 0x800; +// The BAR size must be a power of 2. +const CAPABILITY_BAR_SIZE: u64 = 0x80000; +const VIRTIO_COMMON_BAR_INDEX: usize = 0; +const VIRTIO_SHM_BAR_INDEX: usize = 2; + +const NOTIFY_OFF_MULTIPLIER: u32 = 4; // A dword per notification address. 
+ +const VIRTIO_PCI_VENDOR_ID: u16 = 0x1af4; +const VIRTIO_PCI_DEVICE_ID_BASE: u16 = 0x1040; // Add to device type to get device ID. + +#[derive(Serialize, Deserialize)] +struct QueueState { + max_size: u16, + size: u16, + ready: bool, + desc_table: u64, + avail_ring: u64, + used_ring: u64, +} + +#[derive(Serialize, Deserialize)] +pub struct VirtioPciDeviceState { + device_activated: bool, + queues: Vec, + interrupt_status: usize, + cap_pci_cfg_offset: usize, + cap_pci_cfg: Vec, +} + +#[derive(Error, Debug)] +pub enum VirtioPciDeviceError { + #[error("Failed creating VirtioPciDevice: {0}")] + CreateVirtioPciDevice(#[source] anyhow::Error), +} +pub type Result = std::result::Result; + +pub struct VirtioPciDevice { + id: String, + + // PCI configuration registers. + configuration: PciConfiguration, + + // virtio PCI common configuration + common_config: VirtioPciCommonConfig, + + // MSI-X config + msix_config: Option>>, + + // Number of MSI-X vectors + msix_num: u16, + + // Virtio device reference and status + device: Arc>, + device_activated: Arc, + + // PCI interrupts. + interrupt_status: Arc, + virtio_interrupt: Option>, + interrupt_source_group: Arc, + + // Guest memory + memory: GuestMemoryMmap, + + // Settings PCI BAR + settings_bar: u8, + + // Whether to use 64-bit bar location or 32-bit + use_64bit_bar: bool, + + // Add a dedicated structure to hold information about the very specific + // virtio-pci capability VIRTIO_PCI_CAP_PCI_CFG. This is needed to support + // the legacy/backward compatible mechanism of letting the guest access the + // other virtio capabilities without mapping the PCI BARs. This can be + // needed when the guest tries to early access the virtio configuration of + // a device. 
+ cap_pci_cfg_info: VirtioPciCfgCapInfo, + + // Details of bar regions to free + bar_regions: Vec, + + // Optional DMA handler + dma_handler: Option>, +} + +impl Debug for VirtioPciDevice { + fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { + f.debug_struct("VirtioPciDevice") + .field("id", &self.id) + .finish() + } +} + +impl VirtioPciDevice { + /// Constructs a new PCI transport for the given virtio device. + #[allow(clippy::too_many_arguments)] + pub fn new( + id: String, + memory: GuestMemoryMmap, + device: Arc>, + msix_num: u16, + interrupt_manager: &Arc>, + pci_device_bdf: u32, + use_64bit_bar: bool, + dma_handler: Option>, + ) -> Result { + let locked_device = device.lock().unwrap(); + + let num_queues = locked_device.queues().len(); + + let pci_device_id = VIRTIO_PCI_DEVICE_ID_BASE + locked_device.device_type() as u16; + + let interrupt_source_group = interrupt_manager + .create_group(MsiIrqGroupConfig { + base: 0, + count: msix_num as InterruptIndex, + }) + .map_err(|e| { + VirtioPciDeviceError::CreateVirtioPciDevice(anyhow!( + "Failed creating MSI interrupt group: {}", + e + )) + })?; + + let (msix_config, msix_config_clone) = if msix_num > 0 { + let msix_config = Arc::new(Mutex::new( + MsixConfig::new( + msix_num, + interrupt_source_group.clone(), + pci_device_bdf, + None, + ) + .unwrap(), + )); + let msix_config_clone = msix_config.clone(); + (Some(msix_config), Some(msix_config_clone)) + } else { + (None, None) + }; + + let (class, subclass) = match locked_device.device_type() { + TYPE_NET => ( + PciClassCode::NetworkController, + &PciNetworkControllerSubclass::EthernetController as &dyn PciSubclass, + ), + TYPE_BLOCK => ( + PciClassCode::MassStorage, + &PciMassStorageSubclass::MassStorage as &dyn PciSubclass, + ), + _ => ( + PciClassCode::Other, + &PciVirtioSubclass::NonTransitionalBase as &dyn PciSubclass, + ), + }; + + let configuration = PciConfiguration::new( + VIRTIO_PCI_VENDOR_ID, + pci_device_id, + 0x1, // For modern virtio-PCI devices + 
class, + subclass, + None, + PciHeaderType::Device, + VIRTIO_PCI_VENDOR_ID, + pci_device_id, + msix_config_clone, + None, + ); + + let common_config = VirtioPciCommonConfig::new( + VirtioPciCommonConfigState { + driver_status: 0, + config_generation: 0, + device_feature_select: 0, + driver_feature_select: 0, + queue_select: 0, + msix_config: VIRTQ_MSI_NO_VECTOR, + msix_queues: vec![VIRTQ_MSI_NO_VECTOR; num_queues], + }, + ); + let (device_activated, interrupt_status, cap_pci_cfg_info) = (false, 0, VirtioPciCfgCapInfo::default()); + + // Dropping the MutexGuard to unlock the VirtioDevice. This is required + // in the context of a restore given the device might require some + // activation, meaning it will require locking. Dropping the lock + // prevents from a subtle deadlock. + std::mem::drop(locked_device); + + let mut virtio_pci_device = VirtioPciDevice { + id, + configuration, + common_config, + msix_config, + msix_num, + device, + device_activated: Arc::new(AtomicBool::new(device_activated)), + interrupt_status: Arc::new(AtomicUsize::new(interrupt_status)), + virtio_interrupt: None, + memory, + settings_bar: 0, + use_64bit_bar, + interrupt_source_group, + cap_pci_cfg_info, + bar_regions: vec![], + dma_handler, + }; + + if let Some(msix_config) = &virtio_pci_device.msix_config { + virtio_pci_device.virtio_interrupt = Some(Arc::new(VirtioInterruptMsix::new( + msix_config.clone(), + virtio_pci_device.common_config.msix_config.clone(), + virtio_pci_device.common_config.msix_queues.clone(), + virtio_pci_device.interrupt_source_group.clone(), + ))); + } + + Ok(virtio_pci_device) + } + + fn is_driver_ready(&self) -> bool { + let ready_bits = + (DEVICE_ACKNOWLEDGE | DEVICE_DRIVER | DEVICE_DRIVER_OK | DEVICE_FEATURES_OK) as u8; + self.common_config.driver_status == ready_bits + && self.common_config.driver_status & DEVICE_FAILED as u8 == 0 + } + + /// Determines if the driver has requested the device (re)init / reset itself + fn is_driver_init(&self) -> bool { + 
self.common_config.driver_status == DEVICE_INIT as u8 + } + + pub fn config_bar_addr(&self) -> u64 { + self.configuration.get_bar_addr(self.settings_bar as usize) + } + + fn add_pci_capabilities( + &mut self, + settings_bar: u8, + ) -> std::result::Result<(), PciDeviceError> { + // Add pointers to the different configuration structures from the PCI capabilities. + let common_cap = VirtioPciCap::new( + PciCapabilityType::Common, + settings_bar, + COMMON_CONFIG_BAR_OFFSET as u32, + COMMON_CONFIG_SIZE as u32, + ); + self.configuration + .add_capability(&common_cap) + .map_err(PciDeviceError::CapabilitiesSetup)?; + + let isr_cap = VirtioPciCap::new( + PciCapabilityType::Isr, + settings_bar, + ISR_CONFIG_BAR_OFFSET as u32, + ISR_CONFIG_SIZE as u32, + ); + self.configuration + .add_capability(&isr_cap) + .map_err(PciDeviceError::CapabilitiesSetup)?; + + // TODO(dgreid) - set based on device's configuration size? + let device_cap = VirtioPciCap::new( + PciCapabilityType::Device, + settings_bar, + DEVICE_CONFIG_BAR_OFFSET as u32, + DEVICE_CONFIG_SIZE as u32, + ); + self.configuration + .add_capability(&device_cap) + .map_err(PciDeviceError::CapabilitiesSetup)?; + + let notify_cap = VirtioPciNotifyCap::new( + PciCapabilityType::Notify, + settings_bar, + NOTIFICATION_BAR_OFFSET as u32, + NOTIFICATION_SIZE as u32, + Le32::from(NOTIFY_OFF_MULTIPLIER), + ); + self.configuration + .add_capability(¬ify_cap) + .map_err(PciDeviceError::CapabilitiesSetup)?; + + let configuration_cap = VirtioPciCfgCap::new(); + self.cap_pci_cfg_info.offset = self + .configuration + .add_capability(&configuration_cap) + .map_err(PciDeviceError::CapabilitiesSetup)? 
+ + VIRTIO_PCI_CAP_OFFSET; + self.cap_pci_cfg_info.cap = configuration_cap; + + if self.msix_config.is_some() { + let msix_cap = MsixCap::new( + settings_bar, + self.msix_num, + MSIX_TABLE_BAR_OFFSET as u32, + settings_bar, + MSIX_PBA_BAR_OFFSET as u32, + ); + self.configuration + .add_capability(&msix_cap) + .map_err(PciDeviceError::CapabilitiesSetup)?; + } + + self.settings_bar = settings_bar; + Ok(()) + } + + fn read_cap_pci_cfg(&mut self, offset: usize, mut data: &mut [u8]) { + let cap_slice = self.cap_pci_cfg_info.cap.as_slice(); + let data_len = data.len(); + let cap_len = cap_slice.len(); + if offset + data_len > cap_len { + error!("Failed to read cap_pci_cfg from config space"); + return; + } + + if offset < std::mem::size_of::() { + if let Some(end) = offset.checked_add(data_len) { + // This write can't fail, offset and end are checked against config_len. + data.write_all(&cap_slice[offset..cmp::min(end, cap_len)]) + .unwrap(); + } + } else { + let bar_offset: u32 = + // SAFETY: we know self.cap_pci_cfg_info.cap.cap.offset is 32bits long. + unsafe { std::mem::transmute(self.cap_pci_cfg_info.cap.cap.offset) }; + self.read_bar(0, bar_offset as u64, data) + } + } + + fn write_cap_pci_cfg(&mut self, offset: usize, data: &[u8]) -> Option> { + let cap_slice = self.cap_pci_cfg_info.cap.as_mut_slice(); + let data_len = data.len(); + let cap_len = cap_slice.len(); + if offset + data_len > cap_len { + error!("Failed to write cap_pci_cfg to config space"); + return None; + } + + if offset < std::mem::size_of::() { + let (_, right) = cap_slice.split_at_mut(offset); + right[..data_len].copy_from_slice(data); + None + } else { + let bar_offset: u32 = + // SAFETY: we know self.cap_pci_cfg_info.cap.cap.offset is 32bits long. 
+ unsafe { std::mem::transmute(self.cap_pci_cfg_info.cap.cap.offset) }; + self.write_bar(0, bar_offset as u64, data) + } + } + + pub fn virtio_device(&self) -> Arc> { + self.device.clone() + } + + fn needs_activation(&self) -> bool { + !self.device_activated.load(Ordering::SeqCst) && self.is_driver_ready() + } + + pub fn dma_handler(&self) -> Option<&Arc> { + self.dma_handler.as_ref() + } +} + + +pub struct VirtioInterruptMsix { + msix_config: Arc>, + config_vector: Arc, + queues_vectors: Arc>>, + interrupt_source_group: Arc, +} + +impl VirtioInterruptMsix { + pub fn new( + msix_config: Arc>, + config_vector: Arc, + queues_vectors: Arc>>, + interrupt_source_group: Arc, + ) -> Self { + VirtioInterruptMsix { + msix_config, + config_vector, + queues_vectors, + interrupt_source_group, + } + } +} + + +impl VirtioInterrupt for VirtioInterruptMsix { + fn trigger(&self, int_type: VirtioInterruptType) -> std::result::Result<(), std::io::Error> { + let vector = match int_type { + VirtioInterruptType::Config => self.config_vector.load(Ordering::Acquire), + VirtioInterruptType::Queue(queue_index) => { + self.queues_vectors.lock().unwrap()[queue_index as usize] + } + }; + + if vector == VIRTQ_MSI_NO_VECTOR { + return Ok(()); + } + + let config = &mut self.msix_config.lock().unwrap(); + let entry = &config.table_entries[vector as usize]; + // In case the vector control register associated with the entry + // has its first bit set, this means the vector is masked and the + // device should not inject the interrupt. + // Instead, the Pending Bit Array table is updated to reflect there + // is a pending interrupt for this specific vector. 
+ if config.masked() || entry.masked() { + config.set_pba_bit(vector, false); + return Ok(()); + } + + self.interrupt_source_group + .trigger(vector as InterruptIndex) + } + + fn notifier(&self, int_type: VirtioInterruptType) -> Option { + let vector = match int_type { + VirtioInterruptType::Config => self.config_vector.load(Ordering::Acquire), + VirtioInterruptType::Queue(queue_index) => { + self.queues_vectors.lock().unwrap()[queue_index as usize] + } + }; + + self.interrupt_source_group + .notifier(vector as InterruptIndex) + } +} + +impl PciDevice for VirtioPciDevice { + fn write_config_register( + &mut self, + reg_idx: usize, + offset: u64, + data: &[u8], + ) -> Option> { + // Handle the special case where the capability VIRTIO_PCI_CAP_PCI_CFG + // is accessed. This capability has a special meaning as it allows the + // guest to access other capabilities without mapping the PCI BAR. + let base = reg_idx * 4; + if base + offset as usize >= self.cap_pci_cfg_info.offset + && base + offset as usize + data.len() + <= self.cap_pci_cfg_info.offset + self.cap_pci_cfg_info.cap.bytes().len() + { + let offset = base + offset as usize - self.cap_pci_cfg_info.offset; + self.write_cap_pci_cfg(offset, data) + } else { + self.configuration + .write_config_register(reg_idx, offset, data); + None + } + } + + fn read_config_register(&mut self, reg_idx: usize) -> u32 { + // Handle the special case where the capability VIRTIO_PCI_CAP_PCI_CFG + // is accessed. This capability has a special meaning as it allows the + // guest to access other capabilities without mapping the PCI BAR. 
+ let base = reg_idx * 4; + if base >= self.cap_pci_cfg_info.offset + && base + 4 <= self.cap_pci_cfg_info.offset + self.cap_pci_cfg_info.cap.bytes().len() + { + let offset = base - self.cap_pci_cfg_info.offset; + let mut data = [0u8; 4]; + self.read_cap_pci_cfg(offset, &mut data); + u32::from_le_bytes(data) + } else { + self.configuration.read_reg(reg_idx) + } + } + + fn detect_bar_reprogramming( + &mut self, + reg_idx: usize, + data: &[u8], + ) -> Option { + self.configuration.detect_bar_reprogramming(reg_idx, data) + } + + fn allocate_bars( + &mut self, + _allocator: &Arc>, + mmio32_allocator: &mut AddressAllocator, + mmio64_allocator: &mut AddressAllocator, + resources: Option>, + ) -> std::result::Result, PciDeviceError> { + let mut bars = Vec::new(); + let device_clone = self.device.clone(); + let device = device_clone.lock().unwrap(); + + let mut settings_bar_addr = None; + let mut use_64bit_bar = self.use_64bit_bar; + let restoring = resources.is_some(); + if let Some(resources) = resources { + for resource in resources { + if let Resource::PciBar { + index, base, type_, .. + } = resource + { + if index == VIRTIO_COMMON_BAR_INDEX { + settings_bar_addr = Some(GuestAddress(base)); + use_64bit_bar = match type_ { + PciBarType::Io => { + return Err(PciDeviceError::InvalidResource(resource)) + } + PciBarType::Mmio32 => false, + PciBarType::Mmio64 => true, + }; + break; + } + } + } + // Error out if no resource was matching the BAR id. + if settings_bar_addr.is_none() { + return Err(PciDeviceError::MissingResource); + } + } + + // Allocate the virtio-pci capability BAR. 
+ // See http://docs.oasis-open.org/virtio/virtio/v1.0/cs04/virtio-v1.0-cs04.html#x1-740004 + let (virtio_pci_bar_addr, region_type) = if use_64bit_bar { + let region_type = PciBarRegionType::Memory64BitRegion; + let addr = mmio64_allocator + .allocate( + settings_bar_addr, + CAPABILITY_BAR_SIZE, + Some(CAPABILITY_BAR_SIZE), + ) + .ok_or(PciDeviceError::IoAllocationFailed(CAPABILITY_BAR_SIZE))?; + (addr, region_type) + } else { + let region_type = PciBarRegionType::Memory32BitRegion; + let addr = mmio32_allocator + .allocate( + settings_bar_addr, + CAPABILITY_BAR_SIZE, + Some(CAPABILITY_BAR_SIZE), + ) + .ok_or(PciDeviceError::IoAllocationFailed(CAPABILITY_BAR_SIZE))?; + (addr, region_type) + }; + + let bar = PciBarConfiguration::default() + .set_index(VIRTIO_COMMON_BAR_INDEX) + .set_address(virtio_pci_bar_addr.raw_value()) + .set_size(CAPABILITY_BAR_SIZE) + .set_region_type(region_type); + + // The creation of the PCI BAR and its associated capabilities must + // happen only during the creation of a brand new VM. When a VM is + // restored from a known state, the BARs are already created with the + // right content, therefore we don't need to go through this codepath. + if !restoring { + self.configuration.add_pci_bar(&bar).map_err(|e| { + PciDeviceError::IoRegistrationFailed(virtio_pci_bar_addr.raw_value(), e) + })?; + + // Once the BARs are allocated, the capabilities can be added to the PCI configuration. + self.add_pci_capabilities(VIRTIO_COMMON_BAR_INDEX as u8)?; + } + + bars.push(bar); + + self.bar_regions.clone_from(&bars); + + Ok(bars) + } + + fn free_bars( + &mut self, + _allocator: &mut SystemAllocator, + mmio32_allocator: &mut AddressAllocator, + mmio64_allocator: &mut AddressAllocator, + ) -> std::result::Result<(), PciDeviceError> { + for bar in self.bar_regions.drain(..) 
{ + match bar.region_type() { + PciBarRegionType::Memory32BitRegion => { + mmio32_allocator.free(GuestAddress(bar.addr()), bar.size()); + } + PciBarRegionType::Memory64BitRegion => { + mmio64_allocator.free(GuestAddress(bar.addr()), bar.size()); + } + _ => error!("Unexpected PCI bar type"), + } + } + Ok(()) + } + + fn move_bar( + &mut self, + old_base: u64, + new_base: u64, + ) -> std::result::Result<(), std::io::Error> { + // We only update our idea of the bar in order to support free_bars() above. + // The majority of the reallocation is done inside DeviceManager. + for bar in self.bar_regions.iter_mut() { + if bar.addr() == old_base { + *bar = bar.set_address(new_base); + } + } + + Ok(()) + } + + fn read_bar(&mut self, _base: u64, offset: u64, data: &mut [u8]) { + match offset { + o if o < COMMON_CONFIG_BAR_OFFSET + COMMON_CONFIG_SIZE => self.common_config.read( + o - COMMON_CONFIG_BAR_OFFSET, + data, + self.device.clone(), + ), + o if (ISR_CONFIG_BAR_OFFSET..ISR_CONFIG_BAR_OFFSET + ISR_CONFIG_SIZE).contains(&o) => { + if let Some(v) = data.get_mut(0) { + // Reading this register resets it to 0. + *v = self.interrupt_status.swap(0, Ordering::AcqRel) as u8; + } + } + o if (DEVICE_CONFIG_BAR_OFFSET..DEVICE_CONFIG_BAR_OFFSET + DEVICE_CONFIG_SIZE) + .contains(&o) => + { + let device = self.device.lock().unwrap(); + device.read_config(o - DEVICE_CONFIG_BAR_OFFSET, data); + } + o if (NOTIFICATION_BAR_OFFSET..NOTIFICATION_BAR_OFFSET + NOTIFICATION_SIZE) + .contains(&o) => + { + // Handled with ioeventfds. 
+ } + o if (MSIX_TABLE_BAR_OFFSET..MSIX_TABLE_BAR_OFFSET + MSIX_TABLE_SIZE).contains(&o) => { + if let Some(msix_config) = &self.msix_config { + msix_config + .lock() + .unwrap() + .read_table(o - MSIX_TABLE_BAR_OFFSET, data); + } + } + o if (MSIX_PBA_BAR_OFFSET..MSIX_PBA_BAR_OFFSET + MSIX_PBA_SIZE).contains(&o) => { + if let Some(msix_config) = &self.msix_config { + msix_config + .lock() + .unwrap() + .read_pba(o - MSIX_PBA_BAR_OFFSET, data); + } + } + _ => (), + } + } + + fn write_bar(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option> { + match offset { + o if o < COMMON_CONFIG_BAR_OFFSET + COMMON_CONFIG_SIZE => self.common_config.write( + o - COMMON_CONFIG_BAR_OFFSET, + data, + self.device.clone(), + ), + o if (ISR_CONFIG_BAR_OFFSET..ISR_CONFIG_BAR_OFFSET + ISR_CONFIG_SIZE).contains(&o) => { + if let Some(v) = data.first() { + self.interrupt_status + .fetch_and(!(*v as usize), Ordering::AcqRel); + } + } + o if (DEVICE_CONFIG_BAR_OFFSET..DEVICE_CONFIG_BAR_OFFSET + DEVICE_CONFIG_SIZE) + .contains(&o) => + { + let mut device = self.device.lock().unwrap(); + device.write_config(o - DEVICE_CONFIG_BAR_OFFSET, data); + } + o if (NOTIFICATION_BAR_OFFSET..NOTIFICATION_BAR_OFFSET + NOTIFICATION_SIZE) + .contains(&o) => + { + #[cfg(feature = "sev_snp")] + for (_event, _addr) in self.ioeventfds(_base) { + if _addr == _base + offset { + _event.write(1).unwrap(); + } + } + // Handled with ioeventfds. 
+ #[cfg(not(feature = "sev_snp"))] + error!("Unexpected write to notification BAR: offset = 0x{:x}", o); + + } + o if (MSIX_TABLE_BAR_OFFSET..MSIX_TABLE_BAR_OFFSET + MSIX_TABLE_SIZE).contains(&o) => { + if let Some(msix_config) = &self.msix_config { + msix_config + .lock() + .unwrap() + .write_table(o - MSIX_TABLE_BAR_OFFSET, data); + } + } + o if (MSIX_PBA_BAR_OFFSET..MSIX_PBA_BAR_OFFSET + MSIX_PBA_SIZE).contains(&o) => { + if let Some(msix_config) = &self.msix_config { + msix_config + .lock() + .unwrap() + .write_pba(o - MSIX_PBA_BAR_OFFSET, data); + } + } + _ => (), + }; + + // Try and activate the device if the driver status has changed + if self.needs_activation() { + debug!("Activating device"); + self.virtio_device().lock().unwrap().activate(self.memory.clone(), self.virtio_interrupt.as_ref().map(Arc::clone)) + .unwrap_or_else(|err| error!("Error activating device: {err:?}")); + } else { + debug!("Device doesn't need activation"); + } + + // Device has been reset by the driver + if self.device_activated.load(Ordering::SeqCst) && self.is_driver_init() { + let mut device = self.device.lock().unwrap(); + let reset_result = device.reset(); + match reset_result { + Some((virtio_interrupt, mut _queue_evts)) => { + // Upon reset the device returns its interrupt EventFD + self.virtio_interrupt = Some(virtio_interrupt); + self.device_activated.store(false, Ordering::SeqCst); + + // Reset queue readiness (changes queue_enable), queue sizes + // and selected_queue as per spec for reset + self.virtio_device().lock().unwrap().queues_mut().iter_mut().for_each(Queue::reset); + self.common_config.queue_select = 0; + } + None => { + error!("Attempt to reset device when not implemented in underlying device"); + self.common_config.driver_status = DEVICE_FAILED as u8; + } + } + } + + None + } + + fn as_any(&mut self) -> &mut dyn Any { + self + } + + fn id(&self) -> Option { + Some(self.id.clone()) + } +} + +impl VirtioPciDevice { + pub fn bus_read(&mut self, base: u64, offset: 
u64, data: &mut [u8]) { + self.read_bar(base, offset, data) + } + + pub fn bus_write(&mut self, base: u64, offset: u64, data: &[u8]) { + self.write_bar(base, offset, data); + } +} \ No newline at end of file diff --git a/src/vmm/src/lib.rs b/src/vmm/src/lib.rs index 4c819256f8f..e58f858a8ea 100644 --- a/src/vmm/src/lib.rs +++ b/src/vmm/src/lib.rs @@ -338,6 +338,8 @@ pub struct Vmm { pio_device_manager: PortIODeviceManager, acpi_device_manager: ACPIDeviceManager, pci_segment: PciSegment, + msi_interrupt_manager: Arc>, + allocator: Arc>, } impl Vmm { From a675c7eaf56663acc4b6a90387825c96dbae0c61 Mon Sep 17 00:00:00 2001 From: Riccardo Mancini Date: Fri, 25 Oct 2024 18:14:53 +0100 Subject: [PATCH 20/22] virtio-pci: fix device relocation There was a bug that made the ioeventfd notification be put back on the same address it was removed from. TODO: check if the use of the mmio 64b allocator is correct Signed-off-by: Riccardo Mancini --- src/vmm/src/lib.rs | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/src/vmm/src/lib.rs b/src/vmm/src/lib.rs index e58f858a8ea..11f81281f85 100644 --- a/src/vmm/src/lib.rs +++ b/src/vmm/src/lib.rs @@ -1068,14 +1068,12 @@ impl DeviceRelocation for AddressManager { error!("I/O region is not supported"); } PciBarRegionType::Memory32BitRegion | PciBarRegionType::Memory64BitRegion => { - let allocators = if region_type == PciBarRegionType::Memory32BitRegion { - &self.pci_mmio32_allocators - } else { - &self.pci_mmio64_allocators - }; + let allocators = self.pci_mmio32_allocators + .iter() + .chain(self.pci_mmio64_allocators.iter()); // Find the specific allocator that this BAR was allocated from and use it for new one - for allocator in allocators { + for allocator in allocators.clone() { let allocator_base = allocator.lock().unwrap().base(); let allocator_end = allocator.lock().unwrap().end(); @@ -1122,9 +1120,9 @@ impl DeviceRelocation for AddressManager { const NOTIFICATION_BAR_OFFSET: u64 = 0x6000; const 
NOTIFY_OFF_MULTIPLIER: u32 = 4; // A dword per notification address. - let notify_base = old_base + NOTIFICATION_BAR_OFFSET; + let old_notify_base = old_base + NOTIFICATION_BAR_OFFSET; for (i, queue_evt) in virtio_pci_dev.virtio_device().lock().unwrap().queue_events().iter().enumerate() { - let addr = notify_base + i as u64 * u64::from(NOTIFY_OFF_MULTIPLIER); + let addr = old_notify_base + i as u64 * u64::from(NOTIFY_OFF_MULTIPLIER); let io_addr = IoEventAddress::Mmio(addr); self.vm.lock().unwrap().unregister_ioevent(queue_evt, &io_addr, NoDatamatch).map_err(|e| { io::Error::new( @@ -1133,8 +1131,9 @@ impl DeviceRelocation for AddressManager { ) })?; } + let new_notify_base = new_base + NOTIFICATION_BAR_OFFSET; for (i, queue_evt) in virtio_pci_dev.virtio_device().lock().unwrap().queue_events().iter().enumerate() { - let addr = notify_base + i as u64 * u64::from(NOTIFY_OFF_MULTIPLIER); + let addr = new_notify_base + i as u64 * u64::from(NOTIFY_OFF_MULTIPLIER); let io_addr = IoEventAddress::Mmio(addr); self.vm.lock().unwrap() .register_ioevent(queue_evt, &io_addr, NoDatamatch) From ea7932854bc29e426f255171b5486ae1a52ce0d9 Mon Sep 17 00:00:00 2001 From: Riccardo Mancini Date: Fri, 25 Oct 2024 18:17:49 +0100 Subject: [PATCH 21/22] virtio-pci: allow configuration of pci from config json - pci can be enabled/disabled for all mmio devices - if pci is enabled, multiple vfio devices can be added only one vfio device works. 
multiple trigger errno 16: device or resource busy Signed-off-by: Riccardo Mancini --- src/vmm/src/acpi/mod.rs | 19 ++- src/vmm/src/builder.rs | 313 ++++++++++++++++++---------------- src/vmm/src/lib.rs | 9 +- src/vmm/src/resources.rs | 11 ++ src/vmm/src/vmm_config/mod.rs | 2 + src/vmm/src/vmm_config/pci.rs | 13 ++ 6 files changed, 210 insertions(+), 157 deletions(-) create mode 100644 src/vmm/src/vmm_config/pci.rs diff --git a/src/vmm/src/acpi/mod.rs b/src/vmm/src/acpi/mod.rs index dc47a28141b..1527c8874f6 100644 --- a/src/vmm/src/acpi/mod.rs +++ b/src/vmm/src/acpi/mod.rs @@ -79,7 +79,7 @@ impl<'a> AcpiTableWriter<'a> { fn build_dsdt( &mut self, mmio_device_manager: &MMIODeviceManager, - pci_segment: &PciSegment, + pci_segment: Option<&PciSegment>, acpi_device_manager: &ACPIDeviceManager, ) -> Result { let mut dsdt_data = Vec::new(); @@ -93,7 +93,9 @@ impl<'a> AcpiTableWriter<'a> { // Architecture specific DSDT data setup_arch_dsdt(&mut dsdt_data); - pci_segment.append_aml_bytes(&mut dsdt_data); + if let Some(pci_segment) = pci_segment { + pci_segment.append_aml_bytes(&mut dsdt_data); + } let mut dsdt = Dsdt::new(OEM_ID, *b"FCVMDSDT", OEM_REVISION, dsdt_data); self.write_acpi_table(&mut dsdt) @@ -130,12 +132,17 @@ impl<'a> AcpiTableWriter<'a> { /// Build the XSDT table for the guest /// /// Currently, we pass to the guest just FADT and MADT tables.
- fn build_xsdt(&mut self, fadt_addr: u64, madt_addr: u64, mcfg_addr: u64) -> Result { + fn build_xsdt(&mut self, fadt_addr: u64, madt_addr: u64, mcfg_addr: Option) -> Result { + let tables = if let Some(mcfg_addr) = mcfg_addr { + vec![fadt_addr, madt_addr, mcfg_addr] + } else { + vec![fadt_addr, madt_addr] + }; let mut xsdt = Xsdt::new( OEM_ID, *b"FCMVXSDT", OEM_REVISION, - vec![fadt_addr, madt_addr, mcfg_addr], + tables, ); self.write_acpi_table(&mut xsdt) } @@ -181,7 +188,7 @@ pub(crate) fn create_acpi_tables( resource_allocator: &mut ResourceAllocator, mmio_device_manager: &MMIODeviceManager, acpi_device_manager: &ACPIDeviceManager, - pci_segment: &PciSegment, + pci_segment: Option<&PciSegment>, pci_mmio_config_addr: u64, vcpus: &[Vcpu], ) -> Result<(), AcpiError> { @@ -193,7 +200,7 @@ pub(crate) fn create_acpi_tables( let dsdt_addr = writer.build_dsdt(mmio_device_manager, pci_segment, acpi_device_manager)?; let fadt_addr = writer.build_fadt(dsdt_addr)?; let madt_addr = writer.build_madt(vcpus.len().try_into().unwrap())?; - let mcfg_addr = writer.build_mcfg(pci_mmio_config_addr)?; + let mcfg_addr = pci_segment.map(|_| writer.build_mcfg(pci_mmio_config_addr)).transpose()?; let xsdt_addr = writer.build_xsdt(fadt_addr, madt_addr, mcfg_addr)?; writer.build_rsdp(xsdt_addr) } diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index cb5d70d0f8f..542ad724937 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -23,7 +23,7 @@ use linux_loader::loader::elf::Elf as Loader; #[cfg(target_arch = "aarch64")] use linux_loader::loader::pe::PE as Loader; use linux_loader::loader::KernelLoader; -use pci::{DeviceRelocation, PciBarConfiguration, PciBarRegionType, PciBdf, PciDevice, VfioPciDevice}; +use pci::{DeviceRelocation, PciBarConfiguration, PciBarRegionType, PciBdf, PciDevice, VfioPciDevice, VfioPciError}; use seccompiler::BpfThreadMap; use userfaultfd::Uffd; use utils::time::TimestampUs; @@ -150,6 +150,10 @@ pub enum StartMicrovmError { /// Error 
cloning Vcpu fds #[cfg(feature = "gdb")] VcpuFdCloneError(#[from] crate::vstate::vcpu::CopyKvmFdError), + /// Error creating Vfio device + VfioError(vfio_ioctls::VfioError), + /// Error setting up Vfio PCI device + VfioPciError(VfioPciError), /// TODO Unknown, } @@ -242,15 +246,11 @@ fn add_pci_device( } fn add_vfio_device( - vm: Arc>, + vmm: &mut Vmm, fd: DeviceFd, - pci_segment: &PciSegment, - dev_manager: &mut MMIODeviceManager, - pio_manager: &mut PortIODeviceManager, - interrupt_manager: Arc>, - memory: GuestMemoryMmap, - allocator: Arc> -) { + device_path: &Path, +) -> Result<(), StartMicrovmError>{ + let pci_segment = vmm.pci_segment.as_ref().expect("pci should be enabled"); // We need to shift the device id since the 3 first bits // are dedicated to the PCI function, and we know we don't @@ -280,29 +280,24 @@ fn add_vfio_device( // 3. The conversion here extracts the raw fd and then turns the raw fd into a DeviceFd // of the same (correct) type. let vfio_container = Arc::new( - VfioContainer::new(Some(Arc::new(VfioDeviceFd::new_from_kvm(unsafe { DeviceFd::from_raw_fd(dup_device_fd) })))).unwrap(), + VfioContainer::new(Some(Arc::new(VfioDeviceFd::new_from_kvm(unsafe { DeviceFd::from_raw_fd(dup_device_fd) })))) + .map_err(StartMicrovmError::VfioError)?, ); - let vfio_device = VfioDevice::new( - // T4 GPU on g4dn.metal intance. 
- // Path::new("/sys/bus/pci/drivers/vfio-pci/0000:18:00.0"), - Path::new("/sys/bus/pci/devices/0000:18:00.0/"), - // Path::new("/sys/bus/pci/drivers/vfio-pci/0000:bf:00.1"), - Arc::clone(&vfio_container), - ) - .unwrap(); - + let vfio_device = VfioDevice::new(device_path, Arc::clone(&vfio_container)) + .map_err(StartMicrovmError::VfioError)?; let vfio_pci_device = BusDevice::VfioPciDevice(VfioPciDevice::new( pci_device_id.to_string(), - vm, + vmm.extra_fd.as_ref().expect("pci should be enabled").clone(), vfio_device, vfio_container.clone(), - interrupt_manager, + vmm.msi_interrupt_manager.as_ref().expect("pci should be enabled").clone(), None, false, pci_device_bdf.into(), Arc::new(move || { + // TODO use allocator for memory slots static mut CURRENT: u32 = 1; unsafe { CURRENT += 1; @@ -317,14 +312,14 @@ fn add_vfio_device( add_pci_device( vfio_pci_device.clone(), pci_segment, - dev_manager, - pio_manager, - allocator.clone(), + &mut vmm.mmio_device_manager, + &mut vmm.pio_device_manager, + vmm.allocator.as_ref().expect("pci should be enabled").clone(), pci_device_bdf.into() ).unwrap(); // Register DMA mapping in IOMMU. - for (_index, region) in memory.iter().enumerate() { + for (_index, region) in vmm.guest_memory.iter().enumerate() { info!( "Mapping DMA for {:x} len {:x} at hva {:x}", region.start_addr().0, @@ -340,15 +335,9 @@ fn add_vfio_device( region.len() as u64, // memory.get_host_address(region.start_addr()).unwrap() as u64, region.as_ptr() as u64 - ).unwrap(); - // vfio_container.vfio_dma_map( - // region.start_addr().0, - // region.len() as u64, - // memory.get_host_address(region.start_addr()).unwrap() as u64, - // ) + ).map_err(StartMicrovmError::VfioPciError)?; } - - // Need to register bus mappings ? + Ok(()) } // The MMIO address space size is subtracted with 64k. 
This is done for the @@ -369,6 +358,7 @@ fn create_vmm_and_vcpus( track_dirty_pages: bool, vcpu_count: u8, kvm_capabilities: Vec, + pci_enabled: bool, ) -> Result<(Vmm, Vec), StartMicrovmError> { use self::StartMicrovmError::*; @@ -385,46 +375,6 @@ fn create_vmm_and_vcpus( .map_err(VmmError::EventFd) .map_err(Internal)?; - // Create a system resources allocator. - const NUM_IOAPIC_PINS: usize = 24; - const X86_64_IRQ_BASE: u32 = 5; - - let allocator = Arc::new(Mutex::new( - SystemAllocator::new( - #[cfg(target_arch = "x86_64")] - { - GuestAddress(0) - }, - #[cfg(target_arch = "x86_64")] - { - 1 << 16 - }, - GuestAddress(0), - mmio_address_space_size(46), - // GuestAddress(crate::arch::MEM_32BIT_DEVICES_START), - // crate::arch::MEM_32BIT_DEVICES_SIZE, - #[cfg(target_arch = "x86_64")] - vec![GsiApic::new( - X86_64_IRQ_BASE, - NUM_IOAPIC_PINS as u32 - X86_64_IRQ_BASE, - )], - ) - .unwrap() - )); - - let vm_fd = Arc::new(Mutex::new(extra_fd)); - // First we create the MSI interrupt manager, the legacy one is created - // later, after the IOAPIC device creation. - // The reason we create the MSI one first is because the IOAPIC needs it, - // and then the legacy interrupt manager needs an IOAPIC. So we're - // handling a linear dependency chain: - // msi_interrupt_manager <- IOAPIC <- legacy_interrupt_manager. - let msi_interrupt_manager: Arc> = - Arc::new(MsiInterruptManager::new( - Arc::clone(&allocator), - Arc::clone(&vm_fd), - )); - let resource_allocator = ResourceAllocator::new()?; // Instantiate the MMIO device manager. @@ -433,29 +383,6 @@ fn create_vmm_and_vcpus( // Instantiate ACPI device manager. 
let acpi_device_manager = ACPIDeviceManager::new(); - // alignment 4 << 10 - let pci_mmio32_allocator = Arc::new(Mutex::new( - AddressAllocator::new(GuestAddress(MEM_32BIT_DEVICES_START), MEM_32BIT_DEVICES_SIZE).unwrap(), - )); - - // alignment 4 << 30 - let pci_mmio64_allocator = Arc::new(Mutex::new( - AddressAllocator::new( - GuestAddress(0), - mmio_address_space_size(46), - ).unwrap() - )); - - // TODO: allocate GSI for legacy interrupts - // let irqs = resource_allocator.allocate_gsi(8).unwrap(); - // let mut pci_irq_slots: [u8; 32] = [0; 32]; - // for i in 0..32 { - // pci_irq_slots[i] = irqs[i % 8] as u8; - // } - let pci_irq_slots: [u8; 32] = [(NUM_IOAPIC_PINS-1) as u8; 32]; - - - // For x86_64 we need to create the interrupt controller before calling `KVM_CREATE_VCPUS` // while on aarch64 we need to do it the other way around. #[cfg(target_arch = "x86_64")] @@ -481,44 +408,97 @@ fn create_vmm_and_vcpus( (vcpus, pio_dev_mgr) }; - - let address_manager = Arc::new(AddressManager{ - allocator: allocator.clone(), - io_bus: Arc::new(pio_device_manager.io_bus.clone()), - mmio_bus: Arc::new(mmio_device_manager.bus.clone()), - vm: vm_fd, - pci_mmio32_allocators: vec!(pci_mmio32_allocator.clone()), - pci_mmio64_allocators: vec!(pci_mmio64_allocator.clone()), - }); - let pci_segment = PciSegment::new( - 0, - 0, - pci_mmio32_allocator, - pci_mmio64_allocator, - &mut mmio_device_manager.bus, - &pci_irq_slots, - address_manager, - ).unwrap(); + let (pci_segment, msi_interrupt_manager, allocator, extra_fd) = if pci_enabled { - let pci_config_io = Arc::new(Mutex::new(BusDevice::PioPciBus(PciConfigIo::new(Arc::clone(&pci_segment.pci_bus))))); - pio_device_manager.put_pci_bus(pci_config_io); - pio_device_manager.register_devices(vm.fd()).unwrap(); + // Create a system resources allocator. 
+ // TODO: use ResourceAllocator + const NUM_IOAPIC_PINS: usize = 24; + const X86_64_IRQ_BASE: u32 = 5; + + let allocator = Arc::new(Mutex::new( + SystemAllocator::new( + #[cfg(target_arch = "x86_64")] + { + GuestAddress(0) + }, + #[cfg(target_arch = "x86_64")] + { + 1 << 16 + }, + GuestAddress(0), + mmio_address_space_size(46), + // GuestAddress(crate::arch::MEM_32BIT_DEVICES_START), + // crate::arch::MEM_32BIT_DEVICES_SIZE, + #[cfg(target_arch = "x86_64")] + vec![GsiApic::new( + X86_64_IRQ_BASE, + NUM_IOAPIC_PINS as u32 - X86_64_IRQ_BASE, + )], + ) + .unwrap() + )); + let vm_fd = Arc::new(Mutex::new(extra_fd)); + // First we create the MSI interrupt manager, the legacy one is created + // later, after the IOAPIC device creation. + // The reason we create the MSI one first is because the IOAPIC needs it, + // and then the legacy interrupt manager needs an IOAPIC. So we're + // handling a linear dependency chain: + // msi_interrupt_manager <- IOAPIC <- legacy_interrupt_manager. + let msi_interrupt_manager: Arc> = + Arc::new(MsiInterruptManager::new( + Arc::clone(&allocator), + Arc::clone(&vm_fd), + )); - // // Create passthru device for a GPU. 
- // let device_fd = create_passthrough_device(vm.fd()); + // alignment 4 << 10 + let pci_mmio32_allocator = Arc::new(Mutex::new( + AddressAllocator::new(GuestAddress(MEM_32BIT_DEVICES_START), MEM_32BIT_DEVICES_SIZE).unwrap(), + )); - // add_vfio_device( - // Arc::clone(&vm_fd), - // device_fd, - // &pci_segment, - // &mut mmio_device_manager, - // &mut pio_device_manager, - // Arc::clone(&msi_interrupt_manager), - // guest_memory.clone(), - // Arc::clone(&allocator) - // ); + // alignment 4 << 30 + let pci_mmio64_allocator = Arc::new(Mutex::new( + AddressAllocator::new( + GuestAddress(0), + mmio_address_space_size(46), + ).unwrap() + )); + + // TODO: allocate GSI for legacy interrupts + // let irqs = resource_allocator.allocate_gsi(8).unwrap(); + // let mut pci_irq_slots: [u8; 32] = [0; 32]; + // for i in 0..32 { + // pci_irq_slots[i] = irqs[i % 8] as u8; + // } + let pci_irq_slots: [u8; 32] = [(NUM_IOAPIC_PINS-1) as u8; 32]; + + let address_manager = Arc::new(AddressManager{ + allocator: allocator.clone(), + io_bus: Arc::new(pio_device_manager.io_bus.clone()), + mmio_bus: Arc::new(mmio_device_manager.bus.clone()), + vm: vm_fd.clone(), + pci_mmio32_allocators: vec!(pci_mmio32_allocator.clone()), + pci_mmio64_allocators: vec!(pci_mmio64_allocator.clone()), + }); + let pci_segment = PciSegment::new( + 0, + 0, + pci_mmio32_allocator, + pci_mmio64_allocator, + &mut mmio_device_manager.bus, + &pci_irq_slots, + address_manager, + ).unwrap(); + let pci_config_io = Arc::new(Mutex::new(BusDevice::PioPciBus(PciConfigIo::new(Arc::clone(&pci_segment.pci_bus))))); + pio_device_manager.put_pci_bus(pci_config_io); + + (Some(pci_segment), Some(msi_interrupt_manager), Some(allocator), Some(vm_fd)) + } else { + (None, None, None, None) + }; + + pio_device_manager.register_devices(vm.fd()).unwrap(); // On aarch64, the vCPUs need to be created (i.e call KVM_CREATE_VCPU) before setting up the // IRQ chip because the `KVM_CREATE_VCPU` ioctl will return error if the IRQCHIP @@ -545,6 
+525,7 @@ fn create_vmm_and_vcpus( #[cfg(target_arch = "x86_64")] pio_device_manager, acpi_device_manager, + extra_fd, pci_segment, msi_interrupt_manager, allocator, @@ -593,6 +574,7 @@ pub fn build_microvm_for_boot( vm_resources.vm_config.track_dirty_pages, vm_resources.vm_config.vcpu_count, cpu_template.kvm_capabilities.clone(), + vm_resources.pci_config.as_ref().map(|x| x.enabled).unwrap_or(false), )?; #[cfg(feature = "gdb")] @@ -639,6 +621,12 @@ pub fn build_microvm_for_boot( attach_entropy_device(&mut vmm, &mut boot_cmdline, entropy, event_manager)?; } + if let Some(vfio_devices) = vm_resources.pci_config.as_ref().map(|x| x.vfio_devices.as_ref()).flatten() { + for vfio_device in vfio_devices { + attach_vfio_device(&mut vmm, Path::new(&vfio_device.path))?; + } + } + #[cfg(target_arch = "aarch64")] attach_legacy_devices_aarch64(event_manager, &mut vmm, &mut boot_cmdline).map_err(Internal)?; @@ -783,6 +771,7 @@ pub fn build_microvm_from_snapshot( vm_resources.vm_config.track_dirty_pages, vm_resources.vm_config.vcpu_count, microvm_state.vm_state.kvm_cap_modifiers.clone(), + vm_resources.pci_config.as_ref().map(|x| x.enabled).unwrap_or(false), )?; #[cfg(target_arch = "x86_64")] @@ -1146,7 +1135,7 @@ pub fn configure_system_for_boot( &mut vmm.resource_allocator, &vmm.mmio_device_manager, &vmm.acpi_device_manager, - &vmm.pci_segment, + vmm.pci_segment.as_ref(), PCI_MMCONFIG_START, vcpus, )?; @@ -1180,6 +1169,22 @@ fn attach_virtio_device( device: Arc>, cmdline: &mut LoaderKernelCmdline, is_vhost_user: bool, +) -> Result<(), StartMicrovmError> { + if vmm.pci_segment.is_some() { + attach_virtio_pci_device(event_manager, vmm, id, device) + } else { + attach_virtio_mmio_device(event_manager, vmm, id, device, cmdline, is_vhost_user) + } +} + +/// Attaches a VirtioDevice device to the device manager and event manager. 
+fn attach_virtio_mmio_device( + event_manager: &mut EventManager, + vmm: &mut Vmm, + id: String, + device: Arc>, + cmdline: &mut LoaderKernelCmdline, + is_vhost_user: bool, ) -> Result<(), StartMicrovmError> { use self::StartMicrovmError::*; @@ -1204,13 +1209,11 @@ fn attach_virtio_pci_device>, - cmdline: &mut LoaderKernelCmdline, - is_vhost_user: bool, ) -> Result<(), StartMicrovmError>{ event_manager.add_subscriber(device.clone()); - - let pci_segment_id = vmm.pci_segment.id; - let pci_device_bdf = vmm.pci_segment.next_device_bdf().map_err(|_| StartMicrovmError::Unknown)?; + let pci_segment = vmm.pci_segment.as_ref().expect("pci should be enabled"); + let pci_segment_id = pci_segment.id; + let pci_device_bdf = pci_segment.next_device_bdf().map_err(|_| StartMicrovmError::Unknown)?; // Allows support for one MSI-X vector per queue. It also adds 1 // as we need to take into account the dedicated vector to notify @@ -1226,7 +1229,7 @@ fn attach_virtio_pci_device>> + Debug>( (locked.id().to_string(), locked.is_vhost_user()) }; // The device mutex mustn't be locked here otherwise it will deadlock. - attach_virtio_pci_device( + attach_virtio_device( event_manager, vmm, id, @@ -1355,7 +1358,7 @@ fn attach_net_devices<'a, I: Iterator>> + Debug>( for net_device in net_devices { let id = net_device.lock().expect("Poisoned lock").id().clone(); // The device mutex mustn't be locked here otherwise it will deadlock. - attach_virtio_pci_device(event_manager, vmm, id, net_device.clone(), cmdline, false)?; + attach_virtio_device(event_manager, vmm, id, net_device.clone(), cmdline, false)?; } Ok(()) } @@ -1368,7 +1371,7 @@ fn attach_unixsock_vsock_device( ) -> Result<(), StartMicrovmError> { let id = String::from(unix_vsock.lock().expect("Poisoned lock").id()); // The device mutex mustn't be locked here otherwise it will deadlock. 
- attach_virtio_pci_device(event_manager, vmm, id, unix_vsock.clone(), cmdline, false) + attach_virtio_device(event_manager, vmm, id, unix_vsock.clone(), cmdline, false) } fn attach_balloon_device( @@ -1379,7 +1382,20 @@ fn attach_balloon_device( ) -> Result<(), StartMicrovmError> { let id = String::from(balloon.lock().expect("Poisoned lock").id()); // The device mutex mustn't be locked here otherwise it will deadlock. - attach_virtio_pci_device(event_manager, vmm, id, balloon.clone(), cmdline, false) + attach_virtio_device(event_manager, vmm, id, balloon.clone(), cmdline, false) +} + +fn attach_vfio_device( + vmm: &mut Vmm, + device_path: &Path +) -> Result<(), StartMicrovmError> { + let device_fd = create_passthrough_device(vmm.vm.fd()); + + add_vfio_device( + vmm, + device_fd, + device_path, + ) } // Adds `O_NONBLOCK` to the stdout flags. @@ -1572,11 +1588,11 @@ pub mod tests { ) .unwrap() )); - + let vm_fd = Arc::new(Mutex::new(extra_fd)); let msi_interrupt_manager: Arc> = Arc::new(MsiInterruptManager::new( Arc::clone(&allocator), - Arc::new(Mutex::new(extra_fd)), + vm_fd.clone(), )); Vmm { @@ -1593,9 +1609,10 @@ pub mod tests { #[cfg(target_arch = "x86_64")] pio_device_manager, acpi_device_manager, - pci_segment, - msi_interrupt_manager, - allocator, + extra_fd: Some(vm_fd), + pci_segment: Some(pci_segment), + msi_interrupt_manager: Some(msi_interrupt_manager), + allocator: Some(allocator), } } diff --git a/src/vmm/src/lib.rs b/src/vmm/src/lib.rs index 11f81281f85..3b4007063a3 100644 --- a/src/vmm/src/lib.rs +++ b/src/vmm/src/lib.rs @@ -337,9 +337,12 @@ pub struct Vmm { #[cfg(target_arch = "x86_64")] pio_device_manager: PortIODeviceManager, acpi_device_manager: ACPIDeviceManager, - pci_segment: PciSegment, - msi_interrupt_manager: Arc>, - allocator: Arc>, + + // PCI-related + extra_fd: Option>>, + pci_segment: Option, + msi_interrupt_manager: Option>>, + allocator: Option>>, } impl Vmm { diff --git a/src/vmm/src/resources.rs b/src/vmm/src/resources.rs index 
923225c6a8a..3b319b1aab1 100644 --- a/src/vmm/src/resources.rs +++ b/src/vmm/src/resources.rs @@ -28,6 +28,7 @@ use crate::vmm_config::metrics::{init_metrics, MetricsConfig, MetricsConfigError use crate::vmm_config::mmds::{MmdsConfig, MmdsConfigError}; use crate::vmm_config::net::*; use crate::vmm_config::vsock::*; +use crate::vmm_config::pci::PciConfig; use crate::vstate::memory::{GuestMemoryExtension, GuestMemoryMmap, MemoryError}; /// Errors encountered when configuring microVM resources. @@ -89,6 +90,8 @@ pub struct VmmConfig { #[cfg(feature = "gdb")] #[serde(rename = "gdb-socket")] gdb_socket_addr: Option, + #[serde(rename = "pci")] + pci_config: Option, } /// A data structure that encapsulates the device configurations @@ -120,6 +123,7 @@ pub struct VmResources { #[cfg(feature = "gdb")] /// Configures the location of the GDB socket pub gdb_socket_addr: Option, + pub pci_config: Option, } impl VmResources { @@ -176,6 +180,11 @@ impl VmResources { resources.set_balloon_device(balloon_config)?; } + if let Some(pci_config) = vmm_config.pci_config { + resources.pci_config = Some(pci_config.clone()); + } + + // Init the data store from file, if present. if let Some(data) = metadata_json { resources.locked_mmds_or_default().put_data( @@ -531,6 +540,7 @@ impl From<&VmResources> for VmmConfig { entropy_device: resources.entropy.config(), #[cfg(feature = "gdb")] gdb_socket_addr: resources.gdb_socket_addr.clone(), + pci_config: resources.pci_config.clone(), // TODO snapshot-restore support } } } @@ -642,6 +652,7 @@ mod tests { entropy: Default::default(), #[cfg(feature = "gdb")] gdb_socket_addr: None, + pci_config: None, } } diff --git a/src/vmm/src/vmm_config/mod.rs b/src/vmm/src/vmm_config/mod.rs index c7afc5fc65f..0e355fe2502 100644 --- a/src/vmm/src/vmm_config/mod.rs +++ b/src/vmm/src/vmm_config/mod.rs @@ -30,6 +30,8 @@ pub mod metrics; pub mod mmds; /// Wrapper for configuring the network devices attached to the microVM. 
pub mod net; +/// Configuration for PCI +pub mod pci; /// Wrapper for configuring microVM snapshots and the microVM state. pub mod snapshot; /// Wrapper for configuring the vsock devices attached to the microVM. diff --git a/src/vmm/src/vmm_config/pci.rs b/src/vmm/src/vmm_config/pci.rs new file mode 100644 index 00000000000..4bd4ecd67d1 --- /dev/null +++ b/src/vmm/src/vmm_config/pci.rs @@ -0,0 +1,13 @@ +use serde::{Deserialize, Serialize}; + +#[derive(Clone, Debug, PartialEq, Eq, Deserialize, Serialize)] +pub struct PciConfig { + pub enabled: bool, + pub vfio_devices: Option>, +} + +#[derive(Clone, Debug, PartialEq, Eq, Deserialize, Serialize)] +pub struct VfioDeviceConfig { + // sysfs path of the device + pub path: String, +} \ No newline at end of file From b4f750f4ee8ed169504d20930fddbc59b142cfcf Mon Sep 17 00:00:00 2001 From: Riccardo Mancini Date: Fri, 25 Oct 2024 19:03:45 +0100 Subject: [PATCH 22/22] vfio-pci: support multiple devices Fix the support for multiple devices. I managed to get 8 nvidia gpus up and running inside a firecracker VM. 
Signed-off-by: Riccardo Mancini --- src/vmm/src/builder.rs | 36 +++++++++++++----------------------- 1 file changed, 13 insertions(+), 23 deletions(-) diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index 542ad724937..2f16315fb3d 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -247,8 +247,9 @@ fn add_pci_device( fn add_vfio_device( vmm: &mut Vmm, - fd: DeviceFd, + fd: &DeviceFd, device_path: &Path, + memory_slot: Arc u32 + Send + Sync>, ) -> Result<(), StartMicrovmError>{ let pci_segment = vmm.pci_segment.as_ref().expect("pci should be enabled"); @@ -296,14 +297,7 @@ fn add_vfio_device( None, false, pci_device_bdf.into(), - Arc::new(move || { - // TODO use allocator for memory slots - static mut CURRENT: u32 = 1; - unsafe { - CURRENT += 1; - CURRENT - } - }), + memory_slot, None ).unwrap()); @@ -622,8 +616,17 @@ pub fn build_microvm_for_boot( } if let Some(vfio_devices) = vm_resources.pci_config.as_ref().map(|x| x.vfio_devices.as_ref()).flatten() { + let device_fd = create_passthrough_device(vmm.vm.fd()); + let memory_slot = Arc::new(move || { + // TODO use allocator for memory slots + static mut CURRENT: u32 = 1; + unsafe { + CURRENT += 1; + CURRENT + } + }); for vfio_device in vfio_devices { - attach_vfio_device(&mut vmm, Path::new(&vfio_device.path))?; + add_vfio_device(&mut vmm, &device_fd, Path::new(&vfio_device.path), memory_slot.clone())?; } } @@ -1385,19 +1388,6 @@ fn attach_balloon_device( attach_virtio_device(event_manager, vmm, id, balloon.clone(), cmdline, false) } -fn attach_vfio_device( - vmm: &mut Vmm, - device_path: &Path -) -> Result<(), StartMicrovmError> { - let device_fd = create_passthrough_device(vmm.vm.fd()); - - add_vfio_device( - vmm, - device_fd, - device_path, - ) -} - // Adds `O_NONBLOCK` to the stdout flags. pub(crate) fn set_stdout_nonblocking() { // SAFETY: Call is safe since parameters are valid.