From 9943e0d4ff196e766851d03bb81c816eb75b93c5 Mon Sep 17 00:00:00 2001 From: Ashish Kalra Date: Tue, 3 Sep 2024 19:10:33 +0000 Subject: [PATCH] x86/sev: Fix host kdump support for SNP With active SNP VMs, SNP_SHUTDOWN_EX invoked during panic notifiers causes crashkernel boot failure with the following signature: [ 563.497112] sysrq: Trigger a crash [ 563.508415] Kernel panic - not syncing: sysrq triggered crash [ 563.522002] CPU: 10 UID: 0 PID: 4661 Comm: bash Kdump: loaded Not tainted 6.11.0-rc3-next-20240813-snp-host-f2a41ff576cc-dirty #61 [ 563.549762] Hardware name: AMD Corporation ETHANOL_X/ETHANOL_X, BIOS RXM100AB 10/17/2022 [ 563.566266] Call Trace: [ 563.576430] [ 563.585932] dump_stack_lvl+0x2b/0x90 [ 563.597244] dump_stack+0x14/0x20 [ 563.608141] panic+0x3b9/0x400 [ 563.618801] ? srso_alias_return_thunk+0x5/0xfbef5 [ 563.631271] sysrq_handle_crash+0x19/0x20 [ 563.642696] __handle_sysrq+0xf9/0x290 [ 563.653691] ? srso_alias_return_thunk+0x5/0xfbef5 [ 563.666126] write_sysrq_trigger+0x60/0x80 ... ... [ 564.186804] in panic [ 564.194287] in panic_other_cpus_shutdown [ 564.203674] kexec: in crash_smp_send_stop [ 564.213205] kexec: in kdump_nmi_shootdown_cpus [ 564.224338] Kernel Offset: 0x35a00000 from 0xffffffff81000000 (relocation range: 0xffffffff80000000-0xffffffffbfffffff) [ 564.282209] in snp_shutdown_on_panic after decommission, wbinvd + df_flush required [ 564.462217] ccp 0000:23:00.1: SEV-SNP DF_FLUSH failed with error 14 [ 564.676920] kexec: in native_machine_crash_shutdown early console in extract_kernel input_data: 0x000000007410d2cc input_len: 0x0000000000ce98b2 output: 0x0000000071600000 output_len: 0x000000000379eb8c kernel_total_size: 0x0000000002c30000 needed_size: 0x0000000003800000 trampoline_32bit: 0x0000000000000000 Invalid physical address chosen! Physical KASLR disabled: no suitable memory region! Virtual KASLR using RDRAND RDTSC... Decompressing Linux... Parsing ELF... Performing relocations... done. 
Booting the kernel (entry_offset: 0x0000000000000bda). [ 0.000000] Linux version 6.11.0-rc3-next-20240813-snp-host-f2a41ff576cc-dirty (amd@ethanolx7e2ehost) (gcc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0, GNU ld (GNU Binutils) 2.40) #61 SMP Mon Aug 19 19:59:02 UTC 2024 [ 0.000000] Command line: BOOT_IMAGE=/vmlinuz-6.11.0-rc3-next-20240813-snp-host-f2a41ff576cc-dirty root=UUID=4b87a03b-0e78-42ca-a8ad-997e63bba4e0 ro console=tty0 console=ttyS0,115200n8 earlyprintk=ttyS0,115200n8 amd_iommu_dump=1 reset_devices systemd.unit=kdump-tools-dump.service nr_cpus=1 irqpoll nousb elfcorehdr=1916276K [ 0.000000] KERNEL supported cpus: ... ... [ 1.671804] AMD-Vi: Using global IVHD EFR:0x841f77e022094ace, EFR2:0x0 [ 1.679835] AMD-Vi: Translation is already enabled - trying to copy translation structures [ 1.689363] AMD-Vi: Copied DEV table from previous kernel. [ 1.864369] AMD-Vi: Completion-Wait loop timed out [ 2.038289] AMD-Vi: Completion-Wait loop timed out [ 2.212215] AMD-Vi: Completion-Wait loop timed out [ 2.386141] AMD-Vi: Completion-Wait loop timed out [ 2.560068] AMD-Vi: Completion-Wait loop timed out [ 2.733997] AMD-Vi: Completion-Wait loop timed out [ 2.907927] AMD-Vi: Completion-Wait loop timed out [ 3.081855] AMD-Vi: Completion-Wait loop timed out [ 3.225500] AMD-Vi: Completion-Wait loop timed out [ 3.231083] ..TIMER: vector=0x30 apic1=0 pin1=2 apic2=-1 pin2=-1 d out [ 3.579592] AMD-Vi: Completion-Wait loop timed out [ 3.753164] AMD-Vi: Completion-Wait loop timed out [ 3.815762] Kernel panic - not syncing: timer doesn't work through Interrupt-remapped IO-APIC [ 3.825347] CPU: 0 UID: 0 PID: 0 Comm: swapper/0 Not tainted 6.11.0-rc3-next-20240813-snp-host-f2a41ff576cc-dirty #61 [ 3.837188] Hardware name: AMD Corporation ETHANOL_X/ETHANOL_X, BIOS RXM100AB 10/17/2022 [ 3.846215] Call Trace: [ 3.848939] [ 3.851277] dump_stack_lvl+0x2b/0x90 [ 3.855354] dump_stack+0x14/0x20 [ 3.859050] panic+0x3b9/0x400 [ 3.862454] panic_if_irq_remap+0x21/0x30 [ 3.866925] 
setup_IO_APIC+0x8aa/0xa50 [ 3.871106] ? __pfx_amd_iommu_enable_faulting+0x10/0x10 [ 3.877032] ? __cpuhp_setup_state+0x5e/0xd0 [ 3.881793] apic_intr_mode_init+0x6a/0xf0 [ 3.886360] x86_late_time_init+0x28/0x40 [ 3.890832] start_kernel+0x6a8/0xb50 [ 3.894914] x86_64_start_reservations+0x1c/0x30 [ 3.900064] x86_64_start_kernel+0xbf/0x110 [ 3.904729] ? setup_ghcb+0x12/0x130 [ 3.908716] common_startup_64+0x13e/0x141 [ 3.913283] [ 3.915715] in panic [ 3.918149] in panic_other_cpus_shutdown [ 3.922523] ---[ end Kernel panic - not syncing: timer doesn't work through Interrupt-remapped IO-APIC ]--- This happens as SNP_SHUTDOWN_EX fails when SNP VMs are active as the firmware checks every encryption-capable ASID to verify that it is not in use by a guest and a DF_FLUSH is not required. If a DF_FLUSH is required, the firmware returns DFFLUSH_REQUIRED. To fix this, added support to do SNP_DECOMMISSION of all active SNP VMs in the panic notifier before doing SNP_SHUTDOWN_EX, but then SNP_DECOMMISSION tags all CPUs on which guest has been activated to do a WBINVD. This causes SNP_DF_FLUSH command failure with the following flow: SNP_DECOMMISSION -> SNP_SHUTDOWN_EX -> SNP_DF_FLUSH -> failure with WBINVD_REQUIRED. When panic notifier is invoked all other CPUs have already been shutdown, so it is not possible to do a wbinvd_on_all_cpus() after SNP_DECOMMISSION has been executed. This eventually causes SNP_SHUTDOWN_EX to fail after SNP_DECOMMISSION. Adding fix to do SNP_DECOMMISSION and subsequent WBINVD on all CPUs during NMI shutdown of CPUs as part of disabling virtualization on all CPUs via cpu_emergency_disable_virtualization -> svm_emergency_disable(). 
SNP_DECOMMISSION unbinds the ASID from SNP context and marks the ASID as unusable and then transitions the SNP guest context page to a firmware page and SNP_SHUTDOWN_EX transitions all pages associated with the IOMMU to reclaim state which the hypervisor then transitions to hypervisor state, all these page state changes are in the RMP table, so there is no loss of guest data as such and the complete host memory is captured with the crashkernel boot. There are no processes which are being killed and host/guest memory is not being altered or modified in any way. This fixes and enables crashkernel/kdump on SNP host. v2: - rename all instances of decommision to decommission - created a new function sev_emergency_disable() which is exported from sev.c and calls __snp_decommission_all() to do SNP_DECOMMISSION - added more information to commit message Fixes: c3b86e61b756 ("x86/cpufeatures: Enable/unmask SEV-SNP CPU feature") Signed-off-by: Ashish Kalra --- arch/x86/kvm/svm/sev.c | 133 +++++++++++++++++++++++++++++++++++++++++ arch/x86/kvm/svm/svm.c | 2 + arch/x86/kvm/svm/svm.h | 3 +- 3 files changed, 137 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 714c517dd4b72b..30f286a3afb041 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -89,6 +90,8 @@ static unsigned int nr_asids; static unsigned long *sev_asid_bitmap; static unsigned long *sev_reclaim_asid_bitmap; +static DEFINE_SPINLOCK(snp_decommission_lock); +static void **snp_asid_to_gctx_pages_map; static int snp_decommission_context(struct kvm *kvm); struct enc_region { @@ -2248,6 +2251,9 @@ static int snp_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp) goto e_free_context; } + if (snp_asid_to_gctx_pages_map) + snp_asid_to_gctx_pages_map[sev_get_asid(kvm)] = sev->snp_context; + return 0; e_free_context: @@ -2884,9 +2890,126 @@ static int 
snp_decommission_context(struct kvm *kvm) snp_free_firmware_page(sev->snp_context); sev->snp_context = NULL; + if (snp_asid_to_gctx_pages_map) + snp_asid_to_gctx_pages_map[sev_get_asid(kvm)] = NULL; + return 0; } +static void __snp_decommission_all(void) +{ + struct sev_data_snp_addr data = {}; + int ret, asid; + + if (!snp_asid_to_gctx_pages_map) + return; + + for (asid = 1; asid < min_sev_asid; asid++) { + if (snp_asid_to_gctx_pages_map[asid]) { + data.address = __sme_pa(snp_asid_to_gctx_pages_map[asid]); + ret = sev_do_cmd(SEV_CMD_SNP_DECOMMISSION, &data, NULL); + if (!ret) { + snp_free_firmware_page(snp_asid_to_gctx_pages_map[asid]); + snp_asid_to_gctx_pages_map[asid] = NULL; + } + } + } +} + +/* + * NOTE: called in NMI context from svm_emergency_disable(). + */ +void sev_emergency_disable(void) +{ + static atomic_t waiting_for_cpus_synchronized; + static bool synchronize_cpus_initiated; + static bool snp_decommission_handled; + static atomic_t cpus_synchronized; + + if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP)) + return; + + /* + * SNP_SHUTDOWN_EX fails when SNP VMs are active as the firmware checks + * every encryption-capable ASID to verify that it is not in use by a + * guest and a DF_FLUSH is not required. If a DF_FLUSH is required, + * the firmware returns DFFLUSH_REQUIRED. To address this, SNP_DECOMMISSION + * is required to shutdown all active SNP VMs, but SNP_DECOMMISSION tags all + * CPUs that guest was activated on to do a WBINVD. When panic notifier + * is invoked all other CPUs have already been shutdown, so it is not + * possible to do a wbinvd_on_all_cpus() after SNP_DECOMMISSION has been + * executed. This eventually causes SNP_SHUTDOWN_EX to fail after + * SNP_DECOMMISSION. To fix this, do SNP_DECOMMISSION and subsequent WBINVD + * on all CPUs during NMI shutdown of CPUs as part of disabling + * virtualization on all CPUs via cpu_emergency_disable_virtualization(). 
+ */ + + spin_lock(&snp_decommission_lock); + + /* + * exit early for call from native_machine_crash_shutdown() + * as SNP_DECOMMISSION has already been done as part of + * NMI shutdown of the CPUs. + */ + if (snp_decommission_handled) { + spin_unlock(&snp_decommission_lock); + return; + } + + /* + * Synchronize all CPUs handling NMI before issuing + * SNP_DECOMMISSION. + */ + if (!synchronize_cpus_initiated) { + /* + * one CPU handling panic, the other CPU is initiator for + * CPU synchronization. + */ + atomic_set(&waiting_for_cpus_synchronized, num_online_cpus() - 2); + synchronize_cpus_initiated = true; + /* + * Ensure CPU synchronization parameters are setup before dropping + * the lock to let other CPUs continue to reach synchronization. + */ + wmb(); + + spin_unlock(&snp_decommission_lock); + + /* + * This will not cause system to hang forever as the CPU + * handling panic waits for maximum one second for + * other CPUs to stop in nmi_shootdown_cpus(). + */ + while (atomic_read(&waiting_for_cpus_synchronized) > 0) + mdelay(1); + + /* Reacquire the lock once CPUs are synchronized */ + spin_lock(&snp_decommission_lock); + + atomic_set(&cpus_synchronized, 1); + } else { + atomic_dec(&waiting_for_cpus_synchronized); + /* + * drop the lock to let other CPUs continue to reach + * synchronization. 
+ */ + spin_unlock(&snp_decommission_lock); + + while (atomic_read(&cpus_synchronized) == 0) + mdelay(1); + + /* Try to re-acquire lock after CPUs are synchronized */ + spin_lock(&snp_decommission_lock); + } + + if (!snp_decommission_handled) { + __snp_decommission_all(); + snp_decommission_handled = true; + } + spin_unlock(&snp_decommission_lock); + wbinvd(); +} + void sev_vm_destroy(struct kvm *kvm) { struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; @@ -3052,6 +3175,13 @@ void __init sev_hardware_setup(void) sev_es_supported = true; sev_snp_supported = sev_snp_enabled && cc_platform_has(CC_ATTR_HOST_SEV_SNP); + if (sev_snp_supported) { + snp_asid_to_gctx_pages_map = kmalloc_array(min_sev_asid, + sizeof(void *), + GFP_KERNEL | __GFP_ZERO); + if (!snp_asid_to_gctx_pages_map) + pr_warn("Could not allocate SNP asid to guest context map\n"); + } out: if (boot_cpu_has(X86_FEATURE_SEV)) pr_info("SEV %s (ASIDs %u - %u)\n", @@ -3094,6 +3224,9 @@ void sev_hardware_unsetup(void) misc_cg_set_capacity(MISC_CG_RES_SEV, 0); misc_cg_set_capacity(MISC_CG_RES_SEV_ES, 0); + + kfree(snp_asid_to_gctx_pages_map); + snp_asid_to_gctx_pages_map = NULL; } int sev_cpu_init(struct svm_cpu_data *sd) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index a04f6627b23717..42846f2bb280b2 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -597,6 +597,8 @@ static void svm_emergency_disable(void) kvm_rebooting = true; kvm_cpu_svm_disable(); + + sev_emergency_disable(); } static void svm_hardware_disable(void) diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 76107c7d0595d7..cc4f21f45def26 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -749,6 +749,7 @@ void sev_snp_init_protected_guest_state(struct kvm_vcpu *vcpu); int sev_gmem_prepare(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, int max_order); void sev_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end); int sev_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn); +void 
sev_emergency_disable(void); #else static inline struct page *snp_safe_alloc_page_node(int node, gfp_t gfp) { return 0; } @@ -779,7 +780,7 @@ static inline int sev_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn) { return 0; } - +static inline void sev_emergency_disable(void) {} #endif /* vmenter.S */