From a6efe33cc5945c8d435d2441ecc4e0ca7c49e040 Mon Sep 17 00:00:00 2001
From: Alexandre Ghiti
Date: Wed, 17 Jul 2024 08:01:22 +0200
Subject: [PATCH 1/4] riscv: Add ISA extension parsing for Svvptc

Add support for parsing the Svvptc string in the riscv,isa string.

Signed-off-by: Alexandre Ghiti
Reviewed-by: Conor Dooley
Link: https://lore.kernel.org/r/20240717060125.139416-2-alexghiti@rivosinc.com
Signed-off-by: Palmer Dabbelt
---
 arch/riscv/include/asm/hwcap.h | 1 +
 arch/riscv/kernel/cpufeature.c | 1 +
 2 files changed, 2 insertions(+)

diff --git a/arch/riscv/include/asm/hwcap.h b/arch/riscv/include/asm/hwcap.h
index e17d0078a65116..6dd0dd8beb3033 100644
--- a/arch/riscv/include/asm/hwcap.h
+++ b/arch/riscv/include/asm/hwcap.h
@@ -81,6 +81,7 @@
 #define RISCV_ISA_EXT_ZTSO              72
 #define RISCV_ISA_EXT_ZACAS             73
 #define RISCV_ISA_EXT_XANDESPMU         74
+#define RISCV_ISA_EXT_SVVPTC            75
 
 #define RISCV_ISA_EXT_XLINUXENVCFG      127
 
diff --git a/arch/riscv/kernel/cpufeature.c b/arch/riscv/kernel/cpufeature.c
index 5ef48cb20ee119..60780d246743e3 100644
--- a/arch/riscv/kernel/cpufeature.c
+++ b/arch/riscv/kernel/cpufeature.c
@@ -305,6 +305,7 @@ const struct riscv_isa_ext_data riscv_isa_ext[] = {
         __RISCV_ISA_EXT_DATA(svnapot, RISCV_ISA_EXT_SVNAPOT),
         __RISCV_ISA_EXT_DATA(svpbmt, RISCV_ISA_EXT_SVPBMT),
         __RISCV_ISA_EXT_DATA(xandespmu, RISCV_ISA_EXT_XANDESPMU),
+        __RISCV_ISA_EXT_DATA(svvptc, RISCV_ISA_EXT_SVVPTC),
 };
 
 const size_t riscv_isa_ext_count = ARRAY_SIZE(riscv_isa_ext);

From d25599b5933fb5f89d4b4c720564d613a795f502 Mon Sep 17 00:00:00 2001
From: Alexandre Ghiti
Date: Wed, 17 Jul 2024 08:01:23 +0200
Subject: [PATCH 2/4] dt-bindings: riscv: Add Svvptc ISA extension description

Add a description for the Svvptc ISA extension, which was ratified
recently.

Signed-off-by: Alexandre Ghiti
Acked-by: Conor Dooley
Link: https://lore.kernel.org/r/20240717060125.139416-3-alexghiti@rivosinc.com
Signed-off-by: Palmer Dabbelt
---
 Documentation/devicetree/bindings/riscv/extensions.yaml | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/Documentation/devicetree/bindings/riscv/extensions.yaml b/Documentation/devicetree/bindings/riscv/extensions.yaml
index 468c646247aa5c..b52375bea5123d 100644
--- a/Documentation/devicetree/bindings/riscv/extensions.yaml
+++ b/Documentation/devicetree/bindings/riscv/extensions.yaml
@@ -171,6 +171,13 @@ properties:
             memory types as ratified in the 20191213 version of the
             privileged ISA specification.
 
+        - const: svvptc
+          description:
+            The standard Svvptc supervisor-level extension for
+            address-translation cache behaviour with respect to invalid entries
+            as ratified at commit 4a69197e5617 ("Update to ratified state") of
+            riscv-svvptc.
+
         - const: zacas
           description: |
             The Zacas extension for Atomic Compare-and-Swap (CAS) instructions

From 503638e0babf364061bc50fca5103b00a56cc50a Mon Sep 17 00:00:00 2001
From: Alexandre Ghiti
Date: Wed, 17 Jul 2024 08:01:24 +0200
Subject: [PATCH 3/4] riscv: Stop emitting preventive sfence.vma for new vmalloc mappings

In 6.5, we removed the vmalloc fault path because that can't work (see
[1] [2]). Then, in order to make sure that new page table entries were
seen by the page table walker, we had to preventively emit a sfence.vma
on all harts [3], but this solution is very costly since it relies on
IPIs. And even then, we could end up in a loop of vmalloc faults if a
vmalloc allocation is done in the IPI path (for example if it is traced,
see [4]), which could result in a kernel stack overflow.
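
The replacement mechanism introduced below can be summarised with a rough,
userspace C model of the per-cpu "new vmalloc mapping" bookkeeping. The helper
names are illustrative only; the real fault-side check is done in assembly at
the very top of handle_exception() (see the entry.S hunk below):

  /*
   * Illustrative model only: one bit per cpu. flush_cache_vmap() sets
   * every bit when a new vmalloc/module mapping is created; each cpu's
   * trap path tests and clears its own bit, emits a single local
   * sfence.vma and retries the faulting access.
   */
  #include <stdbool.h>
  #include <stdint.h>

  #define NR_CPUS       64
  #define BITS_PER_LONG 64

  static uint64_t new_vmalloc[NR_CPUS / BITS_PER_LONG + 1];

  static void mark_new_vmalloc_mapping(void)                   /* vmalloc side */
  {
          for (unsigned int i = 0; i < NR_CPUS / BITS_PER_LONG + 1; ++i)
                  new_vmalloc[i] = ~0ULL;
  }

  static bool fault_explained_by_new_vmalloc(unsigned int cpu) /* trap side */
  {
          uint64_t *word = &new_vmalloc[cpu / BITS_PER_LONG];
          uint64_t mask = 1ULL << (cpu % BITS_PER_LONG);

          if (!(*word & mask))
                  return false;   /* a genuine fault, handle it normally */

          *word &= ~mask;         /* the kernel uses amoxor.d for this */
          /* sfence.vma goes here when the uarch caches invalid entries */
          return true;            /* simply retry the faulting access */
  }
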
These preventive sfence.vma needed to be emitted because:

- if the uarch caches invalid entries, the new mapping may not be
  observed by the page table walker and an invalidation may be needed.
- if the uarch does not cache invalid entries, a reordered access could
  "miss" the new mapping and trap: in that case, we would only need to
  retry the access, no sfence.vma is required.

So this patch removes those preventive sfence.vma and instead handles
the possible (and unlikely) exceptions. And since the kernel stack
mappings lie in the vmalloc area, this handling must be done very early
when the trap is taken, at the very beginning of handle_exception():
this also rules out vmalloc allocations in the fault path.

Link: https://lore.kernel.org/linux-riscv/20230531093817.665799-1-bjorn@kernel.org/ [1]
Link: https://lore.kernel.org/linux-riscv/20230801090927.2018653-1-dylan@andestech.com [2]
Link: https://lore.kernel.org/linux-riscv/20230725132246.817726-1-alexghiti@rivosinc.com/ [3]
Link: https://lore.kernel.org/lkml/20200508144043.13893-1-joro@8bytes.org/ [4]
Signed-off-by: Alexandre Ghiti
Reviewed-by: Yunhui Cui
Link: https://lore.kernel.org/r/20240717060125.139416-4-alexghiti@rivosinc.com
Signed-off-by: Palmer Dabbelt
---
 arch/riscv/include/asm/cacheflush.h  | 18 +++++-
 arch/riscv/include/asm/thread_info.h |  7 +++
 arch/riscv/kernel/asm-offsets.c      |  7 +++
 arch/riscv/kernel/entry.S            | 87 ++++++++++++++++++++++++++++
 arch/riscv/mm/init.c                 |  2 +
 5 files changed, 120 insertions(+), 1 deletion(-)

diff --git a/arch/riscv/include/asm/cacheflush.h b/arch/riscv/include/asm/cacheflush.h
index ce79c558a4c85a..8de73f91bfa371 100644
--- a/arch/riscv/include/asm/cacheflush.h
+++ b/arch/riscv/include/asm/cacheflush.h
@@ -46,7 +46,23 @@ do {                                                    \
 } while (0)
 
 #ifdef CONFIG_64BIT
-#define flush_cache_vmap(start, end)            flush_tlb_kernel_range(start, end)
+extern u64 new_vmalloc[NR_CPUS / sizeof(u64) + 1];
+extern char _end[];
+#define flush_cache_vmap flush_cache_vmap
+static inline void flush_cache_vmap(unsigned long start, unsigned long end)
+{
+        if (is_vmalloc_or_module_addr((void *)start)) {
+                int i;
+
+                /*
+                 * We don't care if concurrently a cpu resets this value since
+                 * the only place this can happen is in handle_exception() where
+                 * an sfence.vma is emitted.
+                 */
+                for (i = 0; i < ARRAY_SIZE(new_vmalloc); ++i)
+                        new_vmalloc[i] = -1ULL;
+        }
+}
 #define flush_cache_vmap_early(start, end)      local_flush_tlb_kernel_range(start, end)
 #endif
diff --git a/arch/riscv/include/asm/thread_info.h b/arch/riscv/include/asm/thread_info.h
index 5d473343634b9d..0ddf1123b5ba26 100644
--- a/arch/riscv/include/asm/thread_info.h
+++ b/arch/riscv/include/asm/thread_info.h
@@ -60,6 +60,13 @@ struct thread_info {
         void            *scs_base;
         void            *scs_sp;
 #endif
+#ifdef CONFIG_64BIT
+        /*
+         * Used in handle_exception() to save a0, a1 and a2 before knowing if we
+         * can access the kernel stack.
+         */
+        unsigned long a0, a1, a2;
+#endif
 };
 
 #ifdef CONFIG_SHADOW_CALL_STACK
diff --git a/arch/riscv/kernel/asm-offsets.c b/arch/riscv/kernel/asm-offsets.c
index b09ca5f944f77b..e94180ba432f59 100644
--- a/arch/riscv/kernel/asm-offsets.c
+++ b/arch/riscv/kernel/asm-offsets.c
@@ -36,6 +36,8 @@ void asm_offsets(void)
         OFFSET(TASK_THREAD_S9, task_struct, thread.s[9]);
         OFFSET(TASK_THREAD_S10, task_struct, thread.s[10]);
         OFFSET(TASK_THREAD_S11, task_struct, thread.s[11]);
+
+        OFFSET(TASK_TI_CPU, task_struct, thread_info.cpu);
         OFFSET(TASK_TI_FLAGS, task_struct, thread_info.flags);
         OFFSET(TASK_TI_PREEMPT_COUNT, task_struct, thread_info.preempt_count);
         OFFSET(TASK_TI_KERNEL_SP, task_struct, thread_info.kernel_sp);
@@ -43,6 +45,11 @@ void asm_offsets(void)
 #ifdef CONFIG_SHADOW_CALL_STACK
         OFFSET(TASK_TI_SCS_SP, task_struct, thread_info.scs_sp);
 #endif
+#ifdef CONFIG_64BIT
+        OFFSET(TASK_TI_A0, task_struct, thread_info.a0);
+        OFFSET(TASK_TI_A1, task_struct, thread_info.a1);
+        OFFSET(TASK_TI_A2, task_struct, thread_info.a2);
+#endif
         OFFSET(TASK_TI_CPU_NUM, task_struct, thread_info.cpu);
 
         OFFSET(TASK_THREAD_F0, task_struct, thread.fstate.f[0]);
diff --git a/arch/riscv/kernel/entry.S b/arch/riscv/kernel/entry.S
index 68a24cf9481afe..d80b90f99bc184 100644
--- a/arch/riscv/kernel/entry.S
+++ b/arch/riscv/kernel/entry.S
@@ -19,6 +19,79 @@
 
         .section .irqentry.text, "ax"
 
+.macro new_vmalloc_check
+        REG_S   a0, TASK_TI_A0(tp)
+        csrr    a0, CSR_CAUSE
+        /* Exclude IRQs */
+        blt     a0, zero, _new_vmalloc_restore_context_a0
+
+        REG_S   a1, TASK_TI_A1(tp)
+        /* Only check new_vmalloc if we are in page/protection fault */
+        li      a1, EXC_LOAD_PAGE_FAULT
+        beq     a0, a1, _new_vmalloc_kernel_address
+        li      a1, EXC_STORE_PAGE_FAULT
+        beq     a0, a1, _new_vmalloc_kernel_address
+        li      a1, EXC_INST_PAGE_FAULT
+        bne     a0, a1, _new_vmalloc_restore_context_a1
+
+_new_vmalloc_kernel_address:
+        /* Is it a kernel address? */
+        csrr    a0, CSR_TVAL
+        bge     a0, zero, _new_vmalloc_restore_context_a1
+
+        /* Check if a new vmalloc mapping appeared that could explain the trap */
+        REG_S   a2, TASK_TI_A2(tp)
+        /*
+         * Computes:
+         * a0 = &new_vmalloc[BIT_WORD(cpu)]
+         * a1 = BIT_MASK(cpu)
+         */
+        REG_L   a2, TASK_TI_CPU(tp)
+        /*
+         * Compute the new_vmalloc element position:
+         * (cpu / 64) * 8 = (cpu >> 6) << 3
+         */
+        srli    a1, a2, 6
+        slli    a1, a1, 3
+        la      a0, new_vmalloc
+        add     a0, a0, a1
+        /*
+         * Compute the bit position in the new_vmalloc element:
+         * bit_pos = cpu % 64 = cpu - (cpu / 64) * 64 = cpu - ((cpu >> 6) << 6)
+         *         = cpu - (((cpu >> 6) << 3) << 3)
+         */
+        slli    a1, a1, 3
+        sub     a1, a2, a1
+        /* Compute the "get mask": 1 << bit_pos */
+        li      a2, 1
+        sll     a1, a2, a1
+
+        /* Check the value of new_vmalloc for this cpu */
+        REG_L   a2, 0(a0)
+        and     a2, a2, a1
+        beq     a2, zero, _new_vmalloc_restore_context
+
+        /* Atomically reset the current cpu bit in new_vmalloc */
+        amoxor.d        a0, a1, (a0)
+
+        /* Only emit a sfence.vma if the uarch caches invalid entries */
+        ALTERNATIVE("sfence.vma", "nop", 0, RISCV_ISA_EXT_SVVPTC, 1)
+
+        REG_L   a0, TASK_TI_A0(tp)
+        REG_L   a1, TASK_TI_A1(tp)
+        REG_L   a2, TASK_TI_A2(tp)
+        csrw    CSR_SCRATCH, x0
+        sret
+
+_new_vmalloc_restore_context:
+        REG_L   a2, TASK_TI_A2(tp)
+_new_vmalloc_restore_context_a1:
+        REG_L   a1, TASK_TI_A1(tp)
+_new_vmalloc_restore_context_a0:
+        REG_L   a0, TASK_TI_A0(tp)
+.endm
+
+
 SYM_CODE_START(handle_exception)
         /*
          * If coming from userspace, preserve the user thread pointer and load
@@ -30,6 +103,20 @@ SYM_CODE_START(handle_exception)
 
 .Lrestore_kernel_tpsp:
         csrr tp, CSR_SCRATCH
+
+#ifdef CONFIG_64BIT
+        /*
+         * The RISC-V kernel does not eagerly emit a sfence.vma after each
+         * new vmalloc mapping, which may result in exceptions:
+         * - if the uarch caches invalid entries, the new mapping would not be
+         *   observed by the page table walker and an invalidation is needed.
+         * - if the uarch does not cache invalid entries, a reordered access
+         *   could "miss" the new mapping and trap: in that case, we only need
+         *   to retry the access, no sfence.vma is required.
+         */
+        new_vmalloc_check
+#endif
+
         REG_S sp, TASK_TI_KERNEL_SP(tp)
 
 #ifdef CONFIG_VMAP_STACK
diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
index e3218d65f21d5a..337dab7d5fd713 100644
--- a/arch/riscv/mm/init.c
+++ b/arch/riscv/mm/init.c
@@ -36,6 +36,8 @@
 
 #include "../kernel/head.h"
 
+u64 new_vmalloc[NR_CPUS / sizeof(u64) + 1];
+
 struct kernel_mapping kernel_map __ro_after_init;
 EXPORT_SYMBOL(kernel_map);
 #ifdef CONFIG_XIP_KERNEL

From 7a21b2e370dab780ddb3aa80f2a4c8ff97bddccc Mon Sep 17 00:00:00 2001
From: Alexandre Ghiti
Date: Wed, 17 Jul 2024 08:01:25 +0200
Subject: [PATCH 4/4] riscv: Stop emitting preventive sfence.vma for new userspace mappings with Svvptc

The preventive sfence.vma were emitted because new mappings must be made
visible to the page table walker. Svvptc guarantees that this will happen
within a bounded timeframe, so there is no need to emit an sfence.vma on
uarchs that implement this extension: we will instead take gratuitous
(but very unlikely) page faults, similarly to x86 and arm64.

This drastically reduces the number of sfence.vma emitted:

* Ubuntu boot to login:
  Before: ~630k sfence.vma
  After:  ~200k sfence.vma

* ltp - mmapstress01
  Before: ~45k
  After:  ~6.3k

* lmbench - lat_pagefault
  Before: ~665k
  After:  832 (!)

* lmbench - lat_mmap
  Before: ~546k
  After:  718 (!)
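
The flushes are skipped with the asm goto + ALTERNATIVE pattern visible in the
hunks below: at boot, alternatives patching turns the "nop" into a jump to the
svvptc label on cpus that advertise Svvptc, so the flush is bypassed without
any runtime conditional. A minimal sketch of that pattern (maybe_flush() is a
hypothetical name; the snippet assumes the kernel's <asm/alternative.h>,
<asm/hwcap.h> and <asm/tlbflush.h> definitions):

  /* Hypothetical helper, shown only to illustrate the pattern. */
  static inline void maybe_flush(unsigned long address)
  {
          asm goto(ALTERNATIVE("nop", "j %l[svvptc]", 0, RISCV_ISA_EXT_SVVPTC, 1)
                   : : : : svvptc);

          /* Reached only when the cpu does not implement Svvptc. */
          local_flush_tlb_page(address);
  svvptc:
          return;
  }
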
Signed-off-by: Alexandre Ghiti
Link: https://lore.kernel.org/r/20240717060125.139416-5-alexghiti@rivosinc.com
Signed-off-by: Palmer Dabbelt
---
 arch/riscv/include/asm/pgtable.h | 16 +++++++++++++++-
 arch/riscv/mm/pgtable.c          | 13 +++++++++++++
 2 files changed, 28 insertions(+), 1 deletion(-)

diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
index aad8b8ca51f120..4ed85329317d5a 100644
--- a/arch/riscv/include/asm/pgtable.h
+++ b/arch/riscv/include/asm/pgtable.h
@@ -476,6 +476,9 @@ static inline void update_mmu_cache_range(struct vm_fault *vmf,
                 struct vm_area_struct *vma, unsigned long address,
                 pte_t *ptep, unsigned int nr)
 {
+        asm goto(ALTERNATIVE("nop", "j %l[svvptc]", 0, RISCV_ISA_EXT_SVVPTC, 1)
+                 : : : : svvptc);
+
         /*
          * The kernel assumes that TLBs don't cache invalid entries, but
          * in RISC-V, SFENCE.VMA specifies an ordering constraint, not a
@@ -485,12 +488,23 @@ static inline void update_mmu_cache_range(struct vm_fault *vmf,
          */
         while (nr--)
                 local_flush_tlb_page(address + nr * PAGE_SIZE);
+
+svvptc:;
+        /*
+         * Svvptc guarantees that the new valid pte will be visible within
+         * a bounded timeframe, so when the uarch does not cache invalid
+         * entries, we don't have to do anything.
+         */
 }
 
 #define update_mmu_cache(vma, addr, ptep) \
         update_mmu_cache_range(NULL, vma, addr, ptep, 1)
 
 #define __HAVE_ARCH_UPDATE_MMU_TLB
-#define update_mmu_tlb update_mmu_cache
+static inline void update_mmu_tlb(struct vm_area_struct *vma,
+                                  unsigned long address, pte_t *ptep)
+{
+        flush_tlb_range(vma, address, address + PAGE_SIZE);
+}
 
 static inline void update_mmu_cache_pmd(struct vm_area_struct *vma,
                 unsigned long address, pmd_t *pmdp)
diff --git a/arch/riscv/mm/pgtable.c b/arch/riscv/mm/pgtable.c
index 533ec9055fa0da..4ae67324f99233 100644
--- a/arch/riscv/mm/pgtable.c
+++ b/arch/riscv/mm/pgtable.c
@@ -9,6 +9,9 @@ int ptep_set_access_flags(struct vm_area_struct *vma,
                           unsigned long address, pte_t *ptep,
                           pte_t entry, int dirty)
 {
+        asm goto(ALTERNATIVE("nop", "j %l[svvptc]", 0, RISCV_ISA_EXT_SVVPTC, 1)
+                 : : : : svvptc);
+
         if (!pte_same(ptep_get(ptep), entry))
                 __set_pte_at(vma->vm_mm, ptep, entry);
         /*
@@ -16,6 +19,16 @@ int ptep_set_access_flags(struct vm_area_struct *vma,
          * the case that the PTE changed and the spurious fault case.
          */
         return true;
+
+svvptc:
+        if (!pte_same(ptep_get(ptep), entry)) {
+                __set_pte_at(vma->vm_mm, ptep, entry);
+                /* Here, only uarchs without Svadu are impacted */
+                flush_tlb_page(vma, address);
+                return true;
+        }
+
+        return false;
 }
 
 int ptep_test_and_clear_young(struct vm_area_struct *vma,
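
Outside of this series, other kernel code can likewise key behaviour on the
extension parsed in patch 1. A minimal sketch, assuming the existing
riscv_has_extension_unlikely() helper from <asm/cpufeature.h> (the function
below is illustrative and not part of the series):

  #include <asm/cpufeature.h>
  #include <asm/hwcap.h>

  /* True when a preventive sfence.vma may still be needed for new PTEs. */
  static bool uarch_may_cache_invalid_entries(void)
  {
          return !riscv_has_extension_unlikely(RISCV_ISA_EXT_SVVPTC);
  }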