From 1adc89cd4ce736b6cd4dda3c394a7658d3ab576b Mon Sep 17 00:00:00 2001 From: Jon Lange Date: Sun, 15 Dec 2024 22:28:34 -0800 Subject: [PATCH] platform/native: use SIPI for AP startup Native platforms that do not run under Hyper-V must use SIPI to start additional processors. This requires the use of a trampoline page in the low 1 MB of the address space to perform the transition to 64-bit code and the fully correct execution context. Signed-off-by: Jon Lange --- bootlib/src/kernel_launch.rs | 19 +++++++ igvmbuilder/src/igvm_builder.rs | 7 +++ igvmbuilder/src/main.rs | 1 + igvmbuilder/src/sipi.rs | 88 +++++++++++++++++++++++++++++++++ kernel/src/cpu/control_regs.rs | 12 +++++ kernel/src/cpu/efer.rs | 6 +++ kernel/src/cpu/idt/common.rs | 39 +++++++++++---- kernel/src/cpu/percpu.rs | 8 ++- kernel/src/cpu/smp.rs | 71 +++++++++++++++++++++++++- kernel/src/platform/native.rs | 41 ++++++++++++++- kernel/src/svsm.rs | 6 +-- 11 files changed, 279 insertions(+), 19 deletions(-) create mode 100644 igvmbuilder/src/sipi.rs diff --git a/bootlib/src/kernel_launch.rs b/bootlib/src/kernel_launch.rs index 19559ca25..edf9d8974 100644 --- a/bootlib/src/kernel_launch.rs +++ b/bootlib/src/kernel_launch.rs @@ -79,3 +79,22 @@ pub struct Stage2LaunchInfo { pub igvm_params: u32, pub _reserved: u32, } + +#[repr(C)] +#[derive(Clone, Copy, Debug)] +pub struct ApStartContext { + // All fields of this context must remain in the same order because they + // are referenced from assembly. + pub cr0: usize, + pub cr3: usize, + pub cr4: usize, + pub efer: usize, + pub start_rip: usize, + pub rsp: usize, + pub initial_rip: usize, + pub transition_cr3: u32, + pub context_size: u32, +} + +// The SIPI stub is placed immediately below the stage 2 heap area.
+pub const SIPI_STUB_GPA: u32 = 0xF000; diff --git a/igvmbuilder/src/igvm_builder.rs b/igvmbuilder/src/igvm_builder.rs index 367405078..883ad5bca 100644 --- a/igvmbuilder/src/igvm_builder.rs +++ b/igvmbuilder/src/igvm_builder.rs @@ -28,6 +28,7 @@ use crate::cpuid::SnpCpuidPage; use crate::firmware::{parse_firmware, Firmware}; use crate::paging::construct_init_page_tables; use crate::platform::PlatformMask; +use crate::sipi::add_sipi_stub; use crate::stage2_stack::Stage2Stack; use crate::vmsa::{construct_native_start_context, construct_start_context, construct_vmsa}; use crate::GpaMap; @@ -527,6 +528,12 @@ impl IgvmBuilder { ); } + // If the target includes a non-isolated platform, then insert the + // SIPI startup stub. + if COMPATIBILITY_MASK.contains(ANY_NATIVE_COMPATIBILITY_MASK) { + add_sipi_stub(ANY_NATIVE_COMPATIBILITY_MASK, &mut self.directives); + } + Ok(()) } diff --git a/igvmbuilder/src/main.rs b/igvmbuilder/src/main.rs index 7423a8cbf..81755d558 100644 --- a/igvmbuilder/src/main.rs +++ b/igvmbuilder/src/main.rs @@ -18,6 +18,7 @@ mod igvm_firmware; mod ovmf_firmware; mod paging; mod platform; +mod sipi; mod stage2_stack; mod vmsa; diff --git a/igvmbuilder/src/sipi.rs b/igvmbuilder/src/sipi.rs new file mode 100644 index 000000000..eddf393c2 --- /dev/null +++ b/igvmbuilder/src/sipi.rs @@ -0,0 +1,88 @@ +// SPDX-License-Identifier: MIT OR Apache-2.0 +// +// Copyright (c) 2024 Microsoft Corporation +// +// Author: Jon Lange + +use bootlib::kernel_launch::SIPI_STUB_GPA; +use igvm::IgvmDirectiveHeader; +use igvm_defs::{IgvmPageDataFlags, IgvmPageDataType, PAGE_SIZE_4K}; + +pub fn add_sipi_stub(compatibility_mask: u32, directives: &mut Vec<IgvmDirectiveHeader>) { + // The SIPI stub is the code that is required on native platforms to + // transition the processor out of real mode and into 64-bit mode when APs + // are started. It includes 16-bit code, 32-bit code, and 64-bit code.
+ // For simplicity, to avoid having to invoke multiple build elements to + // produce a number of separate, small code modules that are stitched + // together, this routine (somewhat awkwardly) simply just captures the + // required code bytes as a constant array, since this code is small and + // will almost never change. The assembly code and corresponding + // disassembly are listed here for reference. + // + // F000: 0F 20 C0 mov eax, cr0 + // F003: 80 C8 01 or al, 1 + // F006: 0F 22 C0 mov cr0, eax + // F009: 2E 66 0F 01 16 1A 00 lgdt cs:[001A] + // F010: EA 40 F0 08 00 jmp 0008:F040 + // F015: CC int 3 + // F016: CC int 3 + // F017: CC int 3 + // F018: CC int 3 + // F019: CC int 3 + // F01A: 1F 00 20 F0 00 00 + // + // GDT: + // F020: 00 00 00 00 00 00 00 00 // null selector + // F028: FF FF 00 00 00 9B CF 00 // 32-bit code + // F030: FF FF 00 00 00 9B AF 00 // 64-bit code + // F038: FF FF 00 00 00 93 CF 00 // data + // + // F040: 66 B8 18 00 mov ax, 18h + // F044: 8E D8 mov ds, ax + // F046: 8E D0 mov ss, ax + // F048: 8E C0 mov es, ax + // F04A: 8B 05 F8 FF 00 00 mov eax, [FFF8] // page table + // F050: 0F 22 D8 mov cr3, eax + // F053: B9 80 00 00 C0 mov ecx, C0000080h + // F058: 0F 32 rdmsr + // F05A: 0F BA E8 08 bts eax, 8 // EFER_LME + // F05E: 0F 30 wrmsr + // F060: 0F 20 E0 mov eax, cr4 + // F063: 83 C8 20 or eax, 20h // CR4_PAE + // F066: 0F 22 E0 mov cr4, eax + // F069: 0F 20 C0 mov eax, cr0 + // F06C: 0F BA E8 1F bts eax, 1Fh + // F070: 0F 22 C0 mov cr0, eax // CR0_PG + // F073: BF 00 00 01 00 mov edi, 10000 + // F078: 2B 3D FC FF 00 00 sub edi, [FFFC] // context size + // F07E: EA 85 F0 00 00 10 00 jmp 0010:F085 + // F085: FF 25 65 0F 00 00 jmp [FFF0] // start routine + // F08B: + + let code_bytes: &[u8] = &[ + 0x0F, 0x20, 0xC0, 0x80, 0xC8, 0x01, 0x0F, 0x22, 0xC0, 0x2E, 0x66, 0x0F, 0x01, 0x16, 0x1A, + 0x00, 0xEA, 0x40, 0xF0, 0x08, 0x00, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 0x1F, 0x00, 0x20, 0xF0, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0xFF, 0xFF, 0x00, 0x00, 0x00, + 0x9B, 0xCF, 0x00, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x9B, 0xAF, 0x00, 0xFF, 0xFF, 0x00, 0x00, + 0x00, 0x93, 0xCF, 0x00, 0x66, 0xB8, 0x18, 0x00, 0x8E, 0xD8, 0x8E, 0xD0, 0x8E, 0xC0, 0x8B, + 0x05, 0xF8, 0xFF, 0x00, 0x00, 0x0F, 0x22, 0xD8, 0xB9, 0x80, 0x00, 0x00, 0xC0, 0x0F, 0x32, + 0x0F, 0xBA, 0xE8, 0x08, 0x0F, 0x30, 0x0F, 0x20, 0xE0, 0x83, 0xC8, 0x20, 0x0F, 0x22, 0xE0, + 0x0F, 0x20, 0xC0, 0x0F, 0xBA, 0xE8, 0x1F, 0x0F, 0x22, 0xC0, 0xBF, 0x00, 0x00, 0x01, 0x00, + 0x2B, 0x3D, 0xFC, 0xFF, 0x00, 0x00, 0xEA, 0x85, 0xF0, 0x00, 0x00, 0x10, 0x00, 0xFF, 0x25, + 0x65, 0x0F, 0x00, 0x00, + ]; + + let mut page_data = Vec::<u8>::new(); + page_data.extend_from_slice(code_bytes); + + // Fill the remainder of the page with INT 3. + page_data.resize(PAGE_SIZE_4K.try_into().unwrap(), 0xCC); + + directives.push(IgvmDirectiveHeader::PageData { + gpa: SIPI_STUB_GPA as u64, + compatibility_mask, + flags: IgvmPageDataFlags::new(), + data_type: IgvmPageDataType::NORMAL, + data: page_data, + }); +} diff --git a/kernel/src/cpu/control_regs.rs b/kernel/src/cpu/control_regs.rs index d8452ae8a..1dddae5e7 100644 --- a/kernel/src/cpu/control_regs.rs +++ b/kernel/src/cpu/control_regs.rs @@ -111,6 +111,12 @@ bitflags! { } } +impl From<usize> for CR0Flags { + fn from(bits: usize) -> Self { + CR0Flags::from_bits_truncate(bits as u64) + } +} + #[inline] pub fn read_cr0() -> CR0Flags { let cr0: u64; @@ -214,6 +220,12 @@ bitflags!
{ } } +impl From<usize> for CR4Flags { + fn from(bits: usize) -> Self { + CR4Flags::from_bits_truncate(bits as u64) + } +} + #[inline] pub fn read_cr4() -> CR4Flags { let cr4: u64; diff --git a/kernel/src/cpu/efer.rs b/kernel/src/cpu/efer.rs index 681db39a1..33d25908c 100644 --- a/kernel/src/cpu/efer.rs +++ b/kernel/src/cpu/efer.rs @@ -32,3 +32,9 @@ pub fn write_efer(efer: EFERFlags) { let val = efer.bits(); write_msr(EFER, val); } + +impl From<usize> for EFERFlags { + fn from(bits: usize) -> Self { + EFERFlags::from_bits_truncate(bits as u64) + } +} diff --git a/kernel/src/cpu/idt/common.rs b/kernel/src/cpu/idt/common.rs index 87c5d41e4..cbd6d7b97 100644 --- a/kernel/src/cpu/idt/common.rs +++ b/kernel/src/cpu/idt/common.rs @@ -20,6 +20,7 @@ use crate::types::{Bytes, SVSM_CS}; use alloc::boxed::Box; use core::arch::{asm, global_asm}; use core::mem; +use core::ops::Deref; pub const DE_VECTOR: usize = 0; pub const DB_VECTOR: usize = 1; @@ -339,6 +340,23 @@ impl IDT { self } + + /// Load an IDT. + /// # Safety + /// The caller must guarantee that the IDT lifetime is static so that + /// its entries are always available to the CPU. + pub unsafe fn load(&self) { + let desc: IdtDesc = IdtDesc { + size: (IDT_ENTRIES * 16) as u16, + address: VirtAddr::from(self.entries.as_ptr()), + }; + + // SAFETY: Inline assembly to load an IDT. `'static` lifetime ensures + // that address is always available for the CPU. + unsafe { + asm!("lidt (%rax)", in("rax") &desc, options(att_syntax)); + } + } } impl Default for IDT { @@ -348,23 +366,24 @@ impl Default for IDT { } impl WriteLockGuard<'static, IDT> { - /// Load an IDT. Its lifetime must be static so that its entries are - /// always available to the CPU. pub fn load(&self) { - let desc: IdtDesc = IdtDesc { - size: (IDT_ENTRIES * 16) as u16, - address: VirtAddr::from(self.entries.as_ptr()), - }; - - // SAFETY: Inline assembly to load an IDT. `'static` lifetime ensures - // that address is always available for the CPU.
+ // SAFETY: the lifetime of the lock guard is static, so the safety + // requirements of IDT::load are met. unsafe { - asm!("lidt (%rax)", in("rax") &desc, options(att_syntax)); + self.deref().load(); } } } impl ReadLockGuard<'static, IDT> { + pub fn load(&self) { + // SAFETY: the lifetime of the lock guard is static, so the safety + // requirements of IDT::load are met. + unsafe { + self.deref().load(); + } + } + pub fn base_limit(&self) -> (u64, u16) { let base: *const IDT = core::ptr::from_ref(self); let limit = (IDT_ENTRIES * mem::size_of::()) as u16; diff --git a/kernel/src/cpu/percpu.rs b/kernel/src/cpu/percpu.rs index 0113ecce7..852752e80 100644 --- a/kernel/src/cpu/percpu.rs +++ b/kernel/src/cpu/percpu.rs @@ -876,10 +876,14 @@ impl PerCpu { Ok(()) } - pub fn load_tss(&self) { + pub fn load_gdt_tss(&self, init_gdt: bool) { // Create a temporary GDT to use to configure the TSS. let mut gdt = GDT::new(); gdt.load(); + // Load the GDT selectors if requested. + if init_gdt { + gdt.load_selectors(); + } gdt.load_tss(&self.tss); } @@ -892,7 +896,7 @@ impl PerCpu { // SAFETY: along with the page table we are also uploading the right // TSS and ISST to ensure a memory safe execution state unsafe { self.get_pgtable().load() }; - self.load_tss(); + self.load_gdt_tss(false); if is_cet_ss_supported() { self.load_isst(); } diff --git a/kernel/src/cpu/smp.rs b/kernel/src/cpu/smp.rs index 2b0c72087..89f47c8da 100644 --- a/kernel/src/cpu/smp.rs +++ b/kernel/src/cpu/smp.rs @@ -8,18 +8,22 @@ extern crate alloc; use crate::acpi::tables::ACPICPUInfo; use crate::address::Address; +use crate::cpu::idt::idt; use crate::cpu::percpu::{this_cpu, this_cpu_shared, PerCpu}; use crate::cpu::shadow_stack::{is_cet_ss_supported, SCetFlags, MODE_64BIT, S_CET}; use crate::cpu::sse::sse_init; use crate::enable_shadow_stacks; use crate::error::SvsmError; -use crate::platform::SvsmPlatform; -use crate::platform::SVSM_PLATFORM; +use crate::hyperv; +use crate::platform::{SvsmPlatform,
SVSM_PLATFORM}; use crate::requests::{request_loop, request_processing_main}; use crate::task::{schedule_init, start_kernel_task}; use crate::utils::immut_after_init::immut_after_init_set_multithreaded; use alloc::string::String; +use bootlib::kernel_launch::ApStartContext; +use core::arch::global_asm; +use core::mem; fn start_cpu(platform: &dyn SvsmPlatform, apic_id: u32) -> Result<(), SvsmError> { let start_rip: u64 = (start_ap as *const u8) as u64; @@ -46,6 +50,69 @@ pub fn start_secondary_cpus(platform: &dyn SvsmPlatform, cpus: &[ACPICPUInfo]) { log::info!("Brought {} AP(s) online", count); } +#[no_mangle] +fn start_ap_setup() { + // Initialize the GDT, TSS, and IDT. + this_cpu().load_gdt_tss(true); + idt().load(); +} + +extern "C" { + fn start_ap_indirect(); +} + +global_asm!( + r#" + .globl start_ap_indirect + start_ap_indirect: + /* Load fields from the context structure */ + movq (%rdi), %r8 /* CR0 */ + movq 8(%rdi), %r9 /* CR3 */ + movq 16(%rdi), %r10 /* CR4 */ + movl 24(%rdi), %eax /* Low bits of EFER */ + movl 28(%rdi), %edx /* High bits of EFER */ + movq 32(%rdi), %r11 /* Start RIP */ + movq 40(%rdi), %rsp /* Initial RSP */ + + /* Switch to the target environment. This will remove the transition + * environment and context structure from the address space. */ + movq %r8, %cr0 + movq %r10, %cr4 + movl $0xC0000080, %ecx /* EFER */ + wrmsr + movq %r9, %cr3 + + /* Save the start RIP on the stack. */ + pushq %r11 + + /* Call a startup function to complete setup in the local + * environment. */ + call start_ap_setup + + /* Begin execution from the starting RIP, which is at the top of the + * stack. 
*/ + ret + "#, + options(att_syntax) +); + +pub fn create_ap_start_context( + initial_context: &hyperv::HvInitialVpContext, + transition_cr3: u32, +) -> ApStartContext { + ApStartContext { + cr0: initial_context.cr0.try_into().unwrap(), + cr3: initial_context.cr3.try_into().unwrap(), + cr4: initial_context.cr4.try_into().unwrap(), + efer: initial_context.efer.try_into().unwrap(), + start_rip: initial_context.rip.try_into().unwrap(), + rsp: initial_context.rsp.try_into().unwrap(), + transition_cr3, + initial_rip: start_ap_indirect as usize, + context_size: mem::size_of::<ApStartContext>() as u32, + } +} + #[no_mangle] fn start_ap() { let percpu = this_cpu(); diff --git a/kernel/src/platform/native.rs b/kernel/src/platform/native.rs index 8d3b5b5da..003196a76 100644 --- a/kernel/src/platform/native.rs +++ b/kernel/src/platform/native.rs @@ -6,17 +6,24 @@ use crate::address::{PhysAddr, VirtAddr}; use crate::console::init_svsm_console; +use crate::cpu::apic::{ApicIcr, IcrMessageType}; +use crate::cpu::control_regs::read_cr3; use crate::cpu::cpuid::CpuidResult; use crate::cpu::msr::{read_msr, write_msr}; use crate::cpu::percpu::PerCpu; +use crate::cpu::smp::create_ap_start_context; use crate::error::SvsmError; use crate::hyperv; use crate::hyperv::{hyperv_setup_hypercalls, hyperv_start_cpu, is_hyperv_hypervisor}; use crate::io::{IOPort, DEFAULT_IO_DRIVER}; +use crate::mm::PerCPUPageMappingGuard; use crate::platform::{PageEncryptionMasks, PageStateChangeOp, PageValidateOp, SvsmPlatform}; -use crate::types::PageSize; +use crate::types::{PageSize, PAGE_SIZE}; use crate::utils::MemoryRegion; +use bootlib::kernel_launch::{ApStartContext, SIPI_STUB_GPA}; +use core::{mem, ptr}; + #[cfg(debug_assertions)] use crate::mm::virt_to_phys; @@ -32,6 +39,7 @@ const APIC_MSR_ICR: u32 = 0x830; #[derive(Clone, Copy, Debug)] pub struct NativePlatform { is_hyperv: bool, + transition_cr3: u32, } impl NativePlatform { @@ -43,6 +51,7 @@ impl NativePlatform { } Self { is_hyperv: is_hyperv_hypervisor(), +
transition_cr3: u64::from(read_cr3()).try_into().unwrap(), } } } @@ -191,6 +200,34 @@ impl SvsmPlatform for NativePlatform { return hyperv_start_cpu(cpu, context); } - todo!(); + // Translate this context into an AP start context and place it in + // the AP startup transition page. + let ap_context = create_ap_start_context(context, self.transition_cr3); + + let context_pa = PhysAddr::new(SIPI_STUB_GPA as usize); + let context_mapping = PerCPUPageMappingGuard::create_4k(context_pa)?; + + // SAFETY: the address of the transition page was made valid when the + // `PerCPUPageMappingGuard` was created. + unsafe { + let size = mem::size_of::<ApStartContext>(); + let context_va = context_mapping.virt_addr() + PAGE_SIZE - size; + let context_ptr = context_va.as_mut_ptr::<ApStartContext>(); + ptr::copy_nonoverlapping(&ap_context, context_ptr, 1); + } + + // Now that the AP startup transition page has been configured, send + // INIT-SIPI to start the processor. No second SIPI is required when + // running virtualized. + let icr = ApicIcr::new().with_destination(cpu.shared().apic_id()); + let init_icr = icr.with_message_type(IcrMessageType::Init); + self.post_irq(init_icr.into())?; + let sipi_vector = SIPI_STUB_GPA >> 12; + let sipi_icr = icr + .with_message_type(IcrMessageType::Sipi) + .with_vector(sipi_vector.try_into().unwrap()); + self.post_irq(sipi_icr.into())?; + + Ok(()) } } diff --git a/kernel/src/svsm.rs b/kernel/src/svsm.rs index bed9936ed..9701cd8c2 100755 --- a/kernel/src/svsm.rs +++ b/kernel/src/svsm.rs @@ -297,9 +297,6 @@ pub extern "C" fn svsm_main() { populate_ram_fs(LAUNCH_INFO.kernel_fs_start, LAUNCH_INFO.kernel_fs_end) .expect("Failed to unpack FS archive"); - invalidate_early_boot_memory(&**SVSM_PLATFORM, &config, launch_info) - .expect("Failed to invalidate early boot memory"); - let cpus = config.load_cpu_info().expect("Failed to load ACPI tables"); let mut nr_cpus = 0; @@ -313,6 +310,9 @@ pub extern "C" fn svsm_main() { start_secondary_cpus(&**SVSM_PLATFORM, &cpus); +
invalidate_early_boot_memory(&**SVSM_PLATFORM, &config, launch_info) + .expect("Failed to invalidate early boot memory"); + if let Err(e) = SVSM_PLATFORM.prepare_fw(&config, new_kernel_region(&LAUNCH_INFO)) { panic!("Failed to prepare guest FW: {e:#?}"); }