From 2790af35f33b3961aa259564de0fc1993b7f3f9d Mon Sep 17 00:00:00 2001
From: Huaqi Fang <578567190@qq.com>
Date: Wed, 28 Oct 2020 11:35:48 +0800
Subject: [PATCH 1/4] [arch][riscv][nuclei] Add Nuclei RISC-V processor
 support

* This PR adds support for the Nuclei RISC-V processor, which uses the
  CLIC interrupt system instead of the PLIC and CLINT interrupt systems
* This PR also uses the Nuclei NMSIS and SDK,
  see https://github.com/Nuclei-Software/NMSIS and
  https://github.com/Nuclei-Software/nuclei-sdk
* In this PR, the context switch is done using the CLIC software
  interrupt
* For the RISC-V CLIC spec, please check https://github.com/riscv/riscv-fast-interrupt/blob/master/clic.adoc
* To evaluate this support, the Nuclei HummingBird RISC-V SoC
  is supported; please check https://nucleisys.com/developboard.php
  for an introduction to this board

Signed-off-by: Huaqi Fang <578567190@qq.com>
---
 arch/riscv/arch.c                             |    49 +
 arch/riscv/asm.S                              |   381 +
 arch/riscv/exceptions.c                       |    14 +
 arch/riscv/include/arch/arch_thread.h         |    40 +
 arch/riscv/include/arch/riscv.h               |     5 +
 arch/riscv/linker-twosegment.ld               |    14 +-
 arch/riscv/rules.mk                           |    13 +
 arch/riscv/start.S                            |     4 +-
 arch/riscv/thread.c                           |    82 +-
 arch/riscv/time.c                             |    73 +
 arch/riscv/vectab.S                           |    52 +
 .../NMSIS/Core/Include/core_compatiable.h     |   232 +
 .../NMSIS/Core/Include/core_feature_base.h    |  1177 +
 .../NMSIS/Core/Include/core_feature_cache.h   |   124 +
 .../NMSIS/Core/Include/core_feature_dsp.h     | 18659 ++++++++++++++++
 .../NMSIS/Core/Include/core_feature_eclic.h   |   897 +
 .../NMSIS/Core/Include/core_feature_fpu.h     |   304 +
 .../NMSIS/Core/Include/core_feature_pmp.h     |   260 +
 .../NMSIS/Core/Include/core_feature_timer.h   |   364 +
 .../NMSIS/Core/Include/nmsis_compiler.h       |    37 +
 .../nuclei/NMSIS/Core/Include/nmsis_core.h    |    87 +
 .../nuclei/NMSIS/Core/Include/nmsis_gcc.h     |   265 +
 .../nuclei/NMSIS/Core/Include/nmsis_version.h |    87 +
 .../nuclei/NMSIS/Core/Include/riscv_bits.h    |    92 +
 .../NMSIS/Core/Include/riscv_encoding.h       |   617 +
 external/arch/riscv/nuclei/NMSIS/rules.mk     |     3 +
 external/platform/hbird/NMSIS/hbird.h         |   459 +
 external/platform/hbird/NMSIS/rules.mk        |     4 +
 external/platform/hbird/NMSIS/system_hbird.h  |    79 +
 external/platform/hbird/inc/hbird_gpio.h      |    56 +
 external/platform/hbird/inc/hbird_uart.h      |    75 +
 external/platform/hbird/inc/nuclei_sdk_soc.h  |    17 +
 external/platform/hbird/rules.mk              |    15 +
 external/platform/hbird/src/hbird_common.c    |    69 +
 external/platform/hbird/src/hbird_gpio.c      |   180 +
 external/platform/hbird/src/hbird_uart.c      |   105 +
 external/platform/hbird/src/system_hbird.c    |   402 +
 platform/nuclei-hbird/platform.c              |    36 +
 platform/nuclei-hbird/platform_p.h            |    16 +
 platform/nuclei-hbird/rules.mk                |    25 +
 platform/nuclei-hbird/uart.c                  |    93 +
 platform/nuclei-hbird/vectab.c                |    94 +
 project/nuclei-hbird.mk                       |     8 +
 project/target/nuclei-hbird.mk                |     2 +
 .../nuclei-hbird/include/board_hbird_eval.h   |    37 +
 target/nuclei-hbird/include/nuclei_sdk_hal.h  |    20 +
 .../include/platform/nuclei-hbird.h           |    28 +
 target/nuclei-hbird/openocd_hbird.cfg         |    50 +
 target/nuclei-hbird/rules.mk                  |    34 +
 target/nuclei-hbird/target.c                  |    24 +
 50 files changed, 25845 insertions(+), 15 deletions(-)
 create mode 100644 arch/riscv/vectab.S
 create mode 100644 external/arch/riscv/nuclei/NMSIS/Core/Include/core_compatiable.h
 create mode 100644 external/arch/riscv/nuclei/NMSIS/Core/Include/core_feature_base.h
 create mode 100644 external/arch/riscv/nuclei/NMSIS/Core/Include/core_feature_cache.h
 create mode 100644 external/arch/riscv/nuclei/NMSIS/Core/Include/core_feature_dsp.h
 create mode 100644 external/arch/riscv/nuclei/NMSIS/Core/Include/core_feature_eclic.h
 create mode 100644 external/arch/riscv/nuclei/NMSIS/Core/Include/core_feature_fpu.h
 create mode 100644 external/arch/riscv/nuclei/NMSIS/Core/Include/core_feature_pmp.h
 create mode 100644 external/arch/riscv/nuclei/NMSIS/Core/Include/core_feature_timer.h
 create mode 100644 external/arch/riscv/nuclei/NMSIS/Core/Include/nmsis_compiler.h
 create mode 100644 external/arch/riscv/nuclei/NMSIS/Core/Include/nmsis_core.h
 create mode 100644 external/arch/riscv/nuclei/NMSIS/Core/Include/nmsis_gcc.h
 create mode 100644 external/arch/riscv/nuclei/NMSIS/Core/Include/nmsis_version.h
 create mode 100644 external/arch/riscv/nuclei/NMSIS/Core/Include/riscv_bits.h
 create mode 100644 external/arch/riscv/nuclei/NMSIS/Core/Include/riscv_encoding.h
 create mode 100644 external/arch/riscv/nuclei/NMSIS/rules.mk
 create mode 100644 external/platform/hbird/NMSIS/hbird.h
 create mode 100644 external/platform/hbird/NMSIS/rules.mk
 create mode 100644 external/platform/hbird/NMSIS/system_hbird.h
 create mode 100644 external/platform/hbird/inc/hbird_gpio.h
 create mode 100644 external/platform/hbird/inc/hbird_uart.h
 create mode 100644 external/platform/hbird/inc/nuclei_sdk_soc.h
 create mode 100644 external/platform/hbird/rules.mk
 create mode 100644 external/platform/hbird/src/hbird_common.c
 create mode 100644 external/platform/hbird/src/hbird_gpio.c
 create mode 100644 external/platform/hbird/src/hbird_uart.c
 create mode 100644 external/platform/hbird/src/system_hbird.c
 create mode 100644 platform/nuclei-hbird/platform.c
 create mode 100644 platform/nuclei-hbird/platform_p.h
 create mode 100644 platform/nuclei-hbird/rules.mk
 create mode 100644 platform/nuclei-hbird/uart.c
 create mode 100644 platform/nuclei-hbird/vectab.c
 create mode 100644 project/nuclei-hbird.mk
 create mode 100644 project/target/nuclei-hbird.mk
 create mode 100644 target/nuclei-hbird/include/board_hbird_eval.h
 create mode 100644 target/nuclei-hbird/include/nuclei_sdk_hal.h
 create mode 100644 target/nuclei-hbird/include/platform/nuclei-hbird.h
 create mode 100644 target/nuclei-hbird/openocd_hbird.cfg
 create mode 100644 target/nuclei-hbird/rules.mk
 create mode 100644 target/nuclei-hbird/target.c

diff --git a/arch/riscv/arch.c b/arch/riscv/arch.c
index fc1e4773e..913ff2ec1 100644
--- a/arch/riscv/arch.c
+++ b/arch/riscv/arch.c
@@ -18,6 +18,11 @@
 
 #include "riscv_priv.h"
 
+#ifdef RISCV_VARIANT_NUCLEI
+#include <riscv_encoding.h>
+volatile unsigned long riscv_reschedule = 0;
+#endif
+
 #define LOCAL_TRACE 0
 
 // per cpu structure, pointed to by xscratch
@@ -35,6 +40,36 @@ void riscv_configure_percpu_early(uint hart_id) {
 
 // first C level code to initialize each cpu
 void riscv_early_init_percpu(void) {
+#ifdef RISCV_VARIANT_NUCLEI
+    extern void *vectab;
+    extern void exc_entry(void);
+    extern void irq_entry(void);
+    extern void _premain_init(void);
+    extern void platform_init_timer(void);
+    extern unsigned long default_stack_top;
+    unsigned long entry_tmp = 0;
+
+    // set nmi exception to mtvec
+    riscv_csr_set(CSR_MMISC_CTL, MMISC_CTL_NMI_CAUSE_FFF);
+    // set clic vector base
+    riscv_csr_write(CSR_MTVT, (uintptr_t)&vectab);
+    entry_tmp = ((unsigned long)irq_entry) | 0x1;
+    // set clic non-vector irq entry
+    riscv_csr_write(CSR_MTVT2, (uintptr_t)entry_tmp);
+    entry_tmp = (((unsigned long)exc_entry) & (~(0x3FUL))) | 0x3;
+    // set exception entry and enable clic mode
+    riscv_csr_write(CSR_MTVEC, (uintptr_t)entry_tmp);
+    // enable cycle and instret counter
+    riscv_csr_set(mcounteren, 0x5);
+
+    _premain_init();
+
+    platform_init_timer();
+    // disable interrupts (clear the xstatus IE bit), just in case
+    riscv_csr_clear(RISCV_CSR_XSTATUS, RISCV_CSR_XSTATUS_IE);
+    // set csr mscratch for interrupt stack usage
+    riscv_csr_write(CSR_MSCRATCH, (uintptr_t)&default_stack_top);
+#else
     // set the top level exception handler
     riscv_csr_write(RISCV_CSR_XTVEC, (uintptr_t)&riscv_exception_entry);
 
@@ -44,6 +79,7 @@ void riscv_early_init_percpu(void) {
 
     // enable cycle counter (disabled for now, unimplemented on sifive-e)
     //riscv_csr_set(mcounteren, 1);
+#endif
 }
 
 // called very early just after entering C code on boot processor
@@ -62,6 +98,18 @@ void riscv_init_percpu(void) {
     riscv_csr_set(RISCV_CSR_XIE, RISCV_CSR_XIE_EIE);
 }
 
+#ifdef RISCV_VARIANT_NUCLEI
+void riscv_clic_irq_entry(void) {
+    THREAD_STATS_INC(interrupts);
+}
+
+void riscv_clic_irq_exit(bool reschedule) {
+    if (reschedule != INT_NO_RESCHEDULE) {
+        riscv_reschedule = reschedule;
+    }
+}
+#endif
+
 // called later once the kernel is running before platform and target init
 void arch_init(void) {
     riscv_init_percpu();
@@ -95,6 +143,7 @@ void arch_init(void) {
 void arch_idle(void) {
     // let the platform/target disable wfi
 #if !RISCV_DISABLE_WFI
+    // no console output here: arch_idle() is called repeatedly while idle
     __asm__ volatile("wfi");
 #endif
 }
diff --git a/arch/riscv/asm.S b/arch/riscv/asm.S
index f5f201d51..54cfa83b7 100644
--- a/arch/riscv/asm.S
+++ b/arch/riscv/asm.S
@@ -5,6 +5,7 @@
  * license that can be found in the LICENSE file or at
  * https://opensource.org/licenses/MIT
  */
+#ifndef RISCV_VARIANT_NUCLEI
 #include <lk/asm.h>
 #include <arch/riscv.h>
 #include <arch/riscv/asm.h>
@@ -104,3 +105,383 @@ FUNCTION(riscv_exception_entry)
 
     RISCV_XRET
 END_FUNCTION(riscv_exception_entry)
+#endif
+
+#ifdef RISCV_VARIANT_NUCLEI
+#include <riscv_encoding.h>
+
+#ifndef __riscv_32e
+#define portRegNum          30
+#else
+#define portRegNum          14
+#endif
+
+#define portCONTEXT_SIZE    ( portRegNum * REGBYTES )
+
+    .extern rt_interrupt_from_thread
+    .extern rt_interrupt_to_thread
+
+.align 8
+
+/**
+ * \brief  Global interrupt disabled
+ * \details
+ *  This function disable global interrupt.
+ * \remarks
+ *  - All the interrupt requests will be ignored by CPU.
+ */
+.macro DISABLE_MIE
+    csrc CSR_MSTATUS, MSTATUS_MIE
+.endm
+
+/**
+ * \brief  Macro for context save
+ * \details
+ * This macro saves the ABI-defined caller-saved registers on the stack.
+ * \remarks
+ * - Use this macro to save the context on entry to an interrupt
+ * or exception
+*/
+/* Save caller registers */
+.macro SAVE_CONTEXT
+    csrrw sp, CSR_MSCRATCHCSWL, sp
+    /* Allocate stack space for context saving */
+#ifndef __riscv_32e
+    addi sp, sp, -20*REGBYTES
+#else
+    addi sp, sp, -14*REGBYTES
+#endif /* __riscv_32e */
+
+    STORE x1, 0*REGBYTES(sp)
+    /* STORE x4, 1*REGBYTES(sp) */
+    STORE x5, 2*REGBYTES(sp)
+    STORE x6, 3*REGBYTES(sp)
+    STORE x7, 4*REGBYTES(sp)
+    STORE x10, 5*REGBYTES(sp)
+    STORE x11, 6*REGBYTES(sp)
+    STORE x12, 7*REGBYTES(sp)
+    STORE x13, 8*REGBYTES(sp)
+    STORE x14, 9*REGBYTES(sp)
+    STORE x15, 10*REGBYTES(sp)
+#ifndef __riscv_32e
+    STORE x16, 14*REGBYTES(sp)
+    STORE x17, 15*REGBYTES(sp)
+    STORE x28, 16*REGBYTES(sp)
+    STORE x29, 17*REGBYTES(sp)
+    STORE x30, 18*REGBYTES(sp)
+    STORE x31, 19*REGBYTES(sp)
+#endif /* __riscv_32e */
+.endm
+
+/**
+ * \brief  Macro for restoring caller-saved registers
+ * \details
+ * This macro restores the ABI-defined caller-saved registers from the stack.
+ * \remarks
+ * - Use this macro to restore the context before returning
+ * from an interrupt or exception
+ */
+/* Restore caller registers */
+.macro RESTORE_CONTEXT
+    LOAD x1, 0*REGBYTES(sp)
+    /* LOAD x4, 1*REGBYTES(sp) */
+    LOAD x5, 2*REGBYTES(sp)
+    LOAD x6, 3*REGBYTES(sp)
+    LOAD x7, 4*REGBYTES(sp)
+    LOAD x10, 5*REGBYTES(sp)
+    LOAD x11, 6*REGBYTES(sp)
+    LOAD x12, 7*REGBYTES(sp)
+    LOAD x13, 8*REGBYTES(sp)
+    LOAD x14, 9*REGBYTES(sp)
+    LOAD x15, 10*REGBYTES(sp)
+#ifndef __riscv_32e
+    LOAD x16, 14*REGBYTES(sp)
+    LOAD x17, 15*REGBYTES(sp)
+    LOAD x28, 16*REGBYTES(sp)
+    LOAD x29, 17*REGBYTES(sp)
+    LOAD x30, 18*REGBYTES(sp)
+    LOAD x31, 19*REGBYTES(sp)
+
+    /* De-allocate the stack space */
+    addi sp, sp, 20*REGBYTES
+#else
+    /* De-allocate the stack space */
+    addi sp, sp, 14*REGBYTES
+#endif /* __riscv_32e */
+    csrrw sp, CSR_MSCRATCHCSWL, sp
+.endm
+
+/**
+ * \brief  Macro for save necessary CSRs to stack
+ * \details
+ * This macro store MCAUSE, MEPC, MSUBM to stack.
+ */
+.macro SAVE_CSR_CONTEXT
+    /* Store CSR mcause to stack using pushmcause (reloaded from 11*REGBYTES(sp) in RESTORE_CSR_CONTEXT) */
+    csrrwi  x0, CSR_PUSHMCAUSE, 11
+    /* Store CSR mepc to stack using pushmepc (reloaded from 12*REGBYTES(sp) in RESTORE_CSR_CONTEXT) */
+    csrrwi  x0, CSR_PUSHMEPC, 12
+    /* Store CSR msubm to stack using pushmsubm (reloaded from 13*REGBYTES(sp) in RESTORE_CSR_CONTEXT) */
+    csrrwi  x0, CSR_PUSHMSUBM, 13
+.endm
+
+/**
+ * \brief  Macro for restore necessary CSRs from stack
+ * \details
+ * This macro restore MSUBM, MEPC, MCAUSE from stack.
+ */
+.macro RESTORE_CSR_CONTEXT
+    LOAD x5,  13*REGBYTES(sp)
+    csrw CSR_MSUBM, x5
+    LOAD x5,  12*REGBYTES(sp)
+    csrw CSR_MEPC, x5
+    LOAD x5,  11*REGBYTES(sp)
+    csrw CSR_MCAUSE, x5
+.endm
+
+/**
+ * \brief  Exception/NMI Entry
+ * \details
+ * This function provide common entry functions for exception/nmi.
+ * \remarks
+ * This function provide a default exception/nmi entry.
+ * ABI defined caller save register and some CSR registers
+ * to be saved before enter interrupt handler and be restored before return.
+ */
+.section .text.trap
+/* In CLIC mode, the exception entry must be 64-byte aligned */
+.align 6
+.global exc_entry
+exc_entry:
+    /* Save the caller saving registers (context) */
+    SAVE_CONTEXT
+    /* Save the necessary CSR registers */
+    SAVE_CSR_CONTEXT
+
+    /*
+     * Set the exception handler function arguments
+     * argument 1: mcause value
+     * argument 2: current stack point(SP) value
+     */
+    csrr a0, mcause
+    mv a1, sp
+    /*
+     * TODO: Call the exception handler function
+     * By default, the function template is provided in
+     * system_Device.c, you can adjust it as you want
+     */
+    call core_exception_handler
+
+    /* Restore the necessary CSR registers */
+    RESTORE_CSR_CONTEXT
+    /* Restore the caller saving registers (context) */
+    RESTORE_CONTEXT
+
+    /* Return to regular code */
+    mret
+
+/**
+ * \brief  Non-Vector Interrupt Entry
+ * \details
+ * This function provide common entry functions for handling
+ * non-vector interrupts
+ * \remarks
+ * This function provide a default non-vector interrupt entry.
+ * ABI defined caller save register and some CSR registers need
+ * to be saved before enter interrupt handler and be restored before return.
+ */
+.section      .text.irq
+/* In CLIC mode, the interrupt entry must be 4bytes aligned */
+.align 2
+.global irq_entry
+/* This label will be set to MTVT2 register */
+irq_entry:
+    /* Save the caller saving registers (context) */
+    SAVE_CONTEXT
+    /* Save the necessary CSR registers */
+    SAVE_CSR_CONTEXT
+
+    /* This special CSR read/write operation, which is actually
+     * claim the CLIC to find its pending highest ID, if the ID
+     * is not 0, then automatically enable the mstatus.MIE, and
+     * jump to its vector-entry-label, and update the link register
+     */
+    csrrw ra, CSR_JALMNXTI, ra
+
+    /* Critical section with interrupts disabled */
+    DISABLE_MIE
+
+    jal riscv_irq_exit
+
+    /* Restore the necessary CSR registers */
+    RESTORE_CSR_CONTEXT
+    /* Restore the caller saving registers (context) */
+    RESTORE_CONTEXT
+
+    /* Return to regular code */
+    mret
+
+/* Default Handler for Exceptions / Interrupts */
+.global default_intexc_handler
+.weak default_intexc_handler
+Undef_Handler:
+default_intexc_handler:
+1:
+    j 1b
+
+    .global arch_context_start
+
+/* Start the first task.  This also clears the bit that indicates the FPU is
+    in use in case the FPU was used before the scheduler was started - which
+    would otherwise result in the unnecessary leaving of space in the stack
+    for lazy saving of FPU registers. */
+.align 3
+arch_context_start:
+    /* Setup Interrupt Stack using
+       The stack that was used by main()
+       before the scheduler is started is
+       no longer required after the scheduler is started.
+       Interrupt stack pointer is stored in CSR_MSCRATCH */
+    LOAD sp, 0x0(a0)                /* Read sp from first TCB member(a0) */
+
+    /* Pop PC from stack and set MEPC */
+    LOAD t0,  0  * REGBYTES(sp)
+    csrw CSR_MEPC, t0
+    /* Pop mstatus from stack and set it */
+    LOAD t0,  (portRegNum - 1)  * REGBYTES(sp)
+    csrw CSR_MSTATUS, t0
+    /* Interrupt still disable here */
+    /* Restore Registers from Stack */
+    LOAD x1,  1  * REGBYTES(sp)    /* RA */
+    LOAD x5,  2  * REGBYTES(sp)
+    LOAD x6,  3  * REGBYTES(sp)
+    LOAD x7,  4  * REGBYTES(sp)
+    LOAD x8,  5  * REGBYTES(sp)
+    LOAD x9,  6  * REGBYTES(sp)
+    LOAD x10, 7  * REGBYTES(sp)
+    LOAD x11, 8  * REGBYTES(sp)
+    LOAD x12, 9  * REGBYTES(sp)
+    LOAD x13, 10 * REGBYTES(sp)
+    LOAD x14, 11 * REGBYTES(sp)
+    LOAD x15, 12 * REGBYTES(sp)
+#ifndef __riscv_32e
+    LOAD x16, 13 * REGBYTES(sp)
+    LOAD x17, 14 * REGBYTES(sp)
+    LOAD x18, 15 * REGBYTES(sp)
+    LOAD x19, 16 * REGBYTES(sp)
+    LOAD x20, 17 * REGBYTES(sp)
+    LOAD x21, 18 * REGBYTES(sp)
+    LOAD x22, 19 * REGBYTES(sp)
+    LOAD x23, 20 * REGBYTES(sp)
+    LOAD x24, 21 * REGBYTES(sp)
+    LOAD x25, 22 * REGBYTES(sp)
+    LOAD x26, 23 * REGBYTES(sp)
+    LOAD x27, 24 * REGBYTES(sp)
+    LOAD x28, 25 * REGBYTES(sp)
+    LOAD x29, 26 * REGBYTES(sp)
+    LOAD x30, 27 * REGBYTES(sp)
+    LOAD x31, 28 * REGBYTES(sp)
+#endif
+
+    addi sp, sp, portCONTEXT_SIZE
+
+    mret
+
+.align 2
+.global riscv_msip_handler
+riscv_msip_handler:
+    addi sp, sp, -portCONTEXT_SIZE
+    STORE x1,  1  * REGBYTES(sp)    /* RA */
+    STORE x5,  2  * REGBYTES(sp)
+    STORE x6,  3  * REGBYTES(sp)
+    STORE x7,  4  * REGBYTES(sp)
+    STORE x8,  5  * REGBYTES(sp)
+    STORE x9,  6  * REGBYTES(sp)
+    STORE x10, 7  * REGBYTES(sp)
+    STORE x11, 8  * REGBYTES(sp)
+    STORE x12, 9  * REGBYTES(sp)
+    STORE x13, 10 * REGBYTES(sp)
+    STORE x14, 11 * REGBYTES(sp)
+    STORE x15, 12 * REGBYTES(sp)
+#ifndef __riscv_32e
+    STORE x16, 13 * REGBYTES(sp)
+    STORE x17, 14 * REGBYTES(sp)
+    STORE x18, 15 * REGBYTES(sp)
+    STORE x19, 16 * REGBYTES(sp)
+    STORE x20, 17 * REGBYTES(sp)
+    STORE x21, 18 * REGBYTES(sp)
+    STORE x22, 19 * REGBYTES(sp)
+    STORE x23, 20 * REGBYTES(sp)
+    STORE x24, 21 * REGBYTES(sp)
+    STORE x25, 22 * REGBYTES(sp)
+    STORE x26, 23 * REGBYTES(sp)
+    STORE x27, 24 * REGBYTES(sp)
+    STORE x28, 25 * REGBYTES(sp)
+    STORE x29, 26 * REGBYTES(sp)
+    STORE x30, 27 * REGBYTES(sp)
+    STORE x31, 28 * REGBYTES(sp)
+#endif
+    /* Push mstatus to stack */
+    csrr t0, CSR_MSTATUS
+    STORE t0,  (portRegNum - 1)  * REGBYTES(sp)
+
+    /* Push additional registers */
+
+    /* Store sp to task stack */
+    LOAD t0, rt_interrupt_from_thread
+    STORE sp, 0(t0)
+
+    csrr t0, CSR_MEPC
+    STORE t0, 0(sp)
+
+    jal riscv_msip_process
+
+    /* Switch task context */
+    LOAD t0, rt_interrupt_to_thread
+    LOAD sp, 0x0(t0)
+
+    /* Pop PC from stack and set MEPC */
+    LOAD t0,  0  * REGBYTES(sp)
+    csrw CSR_MEPC, t0
+    /* Pop additional registers */
+
+    /* Pop mstatus from stack and set it */
+    LOAD t0,  (portRegNum - 1)  * REGBYTES(sp)
+    csrw CSR_MSTATUS, t0
+    /* Interrupt still disable here */
+    /* Restore Registers from Stack */
+    LOAD x1,  1  * REGBYTES(sp)    /* RA */
+    LOAD x5,  2  * REGBYTES(sp)
+    LOAD x6,  3  * REGBYTES(sp)
+    LOAD x7,  4  * REGBYTES(sp)
+    LOAD x8,  5  * REGBYTES(sp)
+    LOAD x9,  6  * REGBYTES(sp)
+    LOAD x10, 7  * REGBYTES(sp)
+    LOAD x11, 8  * REGBYTES(sp)
+    LOAD x12, 9  * REGBYTES(sp)
+    LOAD x13, 10 * REGBYTES(sp)
+    LOAD x14, 11 * REGBYTES(sp)
+    LOAD x15, 12 * REGBYTES(sp)
+#ifndef __riscv_32e
+    LOAD x16, 13 * REGBYTES(sp)
+    LOAD x17, 14 * REGBYTES(sp)
+    LOAD x18, 15 * REGBYTES(sp)
+    LOAD x19, 16 * REGBYTES(sp)
+    LOAD x20, 17 * REGBYTES(sp)
+    LOAD x21, 18 * REGBYTES(sp)
+    LOAD x22, 19 * REGBYTES(sp)
+    LOAD x23, 20 * REGBYTES(sp)
+    LOAD x24, 21 * REGBYTES(sp)
+    LOAD x25, 22 * REGBYTES(sp)
+    LOAD x26, 23 * REGBYTES(sp)
+    LOAD x27, 24 * REGBYTES(sp)
+    LOAD x28, 25 * REGBYTES(sp)
+    LOAD x29, 26 * REGBYTES(sp)
+    LOAD x30, 27 * REGBYTES(sp)
+    LOAD x31, 28 * REGBYTES(sp)
+#endif
+
+    addi sp, sp, portCONTEXT_SIZE
+    mret
+
+#endif
\ No newline at end of file
diff --git a/arch/riscv/exceptions.c b/arch/riscv/exceptions.c
index f8fd67733..7e3d55a13 100644
--- a/arch/riscv/exceptions.c
+++ b/arch/riscv/exceptions.c
@@ -13,6 +13,7 @@
 
 #define LOCAL_TRACE 0
 
+#ifndef RISCV_VARIANT_NUCLEI
 // keep in sync with asm.S
 struct riscv_short_iframe {
     ulong  epc;
@@ -118,3 +119,16 @@ void riscv_exception_handler(long cause, ulong epc, struct riscv_short_iframe *f
         thread_preempt();
     }
 }
+#else
+extern volatile unsigned long riscv_reschedule;
+extern volatile unsigned long rt_preemt_flag;
+void riscv_irq_exit(void)
+{
+    if (riscv_reschedule != INT_NO_RESCHEDULE) {
+        riscv_reschedule = INT_NO_RESCHEDULE;
+        rt_preemt_flag = 1;
+        thread_preempt();
+        rt_preemt_flag = 0;
+    }
+}
+#endif
diff --git a/arch/riscv/include/arch/arch_thread.h b/arch/riscv/include/arch/arch_thread.h
index 78291792f..6a445ac91 100644
--- a/arch/riscv/include/arch/arch_thread.h
+++ b/arch/riscv/include/arch/arch_thread.h
@@ -9,6 +9,45 @@
 
 #include <sys/types.h>
 
+#ifdef RISCV_VARIANT_NUCLEI
+struct riscv_context_switch_frame {
+    unsigned long epc;        /* epc - epc    - program counter                     */
+    unsigned long ra;         /* x1  - ra     - return address for jumps            */
+    unsigned long t0;         /* x5  - t0     - temporary register 0                */
+    unsigned long t1;         /* x6  - t1     - temporary register 1                */
+    unsigned long t2;         /* x7  - t2     - temporary register 2                */
+    unsigned long s0_fp;      /* x8  - s0/fp  - saved register 0 or frame pointer   */
+    unsigned long s1;         /* x9  - s1     - saved register 1                    */
+    unsigned long a0;         /* x10 - a0     - return value or function argument 0 */
+    unsigned long a1;         /* x11 - a1     - return value or function argument 1 */
+    unsigned long a2;         /* x12 - a2     - function argument 2                 */
+    unsigned long a3;         /* x13 - a3     - function argument 3                 */
+    unsigned long a4;         /* x14 - a4     - function argument 4                 */
+    unsigned long a5;         /* x15 - a5     - function argument 5                 */
+#ifndef __riscv_32e
+    unsigned long a6;         /* x16 - a6     - function argument 6                 */
+    unsigned long a7;         /* x17 - a7     - function argument 7                 */
+    unsigned long s2;         /* x18 - s2     - saved register 2                    */
+    unsigned long s3;         /* x19 - s3     - saved register 3                    */
+    unsigned long s4;         /* x20 - s4     - saved register 4                    */
+    unsigned long s5;         /* x21 - s5     - saved register 5                    */
+    unsigned long s6;         /* x22 - s6     - saved register 6                    */
+    unsigned long s7;         /* x23 - s7     - saved register 7                    */
+    unsigned long s8;         /* x24 - s8     - saved register 8                    */
+    unsigned long s9;         /* x25 - s9     - saved register 9                    */
+    unsigned long s10;        /* x26 - s10    - saved register 10                   */
+    unsigned long s11;        /* x27 - s11    - saved register 11                   */
+    unsigned long t3;         /* x28 - t3     - temporary register 3                */
+    unsigned long t4;         /* x29 - t4     - temporary register 4                */
+    unsigned long t5;         /* x30 - t5     - temporary register 5                */
+    unsigned long t6;         /* x31 - t6     - temporary register 6                */
+#endif
+    unsigned long mstatus;    /*              - machine status register             */
+};
+struct arch_thread {
+    struct riscv_context_switch_frame *cs_frame;
+};
+#else
 struct riscv_context_switch_frame {
     unsigned long ra; // return address (x1)
     unsigned long sp; // stack pointer (x2)
@@ -31,6 +70,7 @@ struct riscv_context_switch_frame {
 struct arch_thread {
     struct riscv_context_switch_frame cs_frame;
 };
+#endif
 
 void riscv_context_switch(struct riscv_context_switch_frame *oldcs,
                           struct riscv_context_switch_frame *newcs);
diff --git a/arch/riscv/include/arch/riscv.h b/arch/riscv/include/arch/riscv.h
index 7bda2f20b..44dda4bbb 100644
--- a/arch/riscv/include/arch/riscv.h
+++ b/arch/riscv/include/arch/riscv.h
@@ -170,6 +170,11 @@ static inline uint riscv_current_hart(void) {
 
 void riscv_set_secondary_count(int count);
 
+#ifdef RISCV_VARIANT_NUCLEI
+void riscv_clic_irq_entry(void);
+void riscv_clic_irq_exit(bool reschedule);
+#endif /* RISCV_VARIANT_NUCLEI */
+
 void riscv_exception_entry(void);
 enum handler_return riscv_timer_exception(void);
 
diff --git a/arch/riscv/linker-twosegment.ld b/arch/riscv/linker-twosegment.ld
index b2eb2ac39..b81033243 100644
--- a/arch/riscv/linker-twosegment.ld
+++ b/arch/riscv/linker-twosegment.ld
@@ -18,12 +18,13 @@ SECTIONS
 {
     . = %ROMBASE%;
 
-    _start = .;
     __rom_start = .;
 
     /* text/read-only data */
     /* set the load address to physical MEMBASE */
     .text : {
+        KEEP(*(.text.boot.vectab1))
+        KEEP(*(.text.boot.vectab2))
         KEEP(*(.text.boot))
         *(.text .text*)
         *(.gnu.linkonce.t.*)
@@ -61,17 +62,14 @@ SECTIONS
         __dtor_end = .;
         *(.got*)
         *(.dynamic)
-    } :data
-
-    /* Try to put sdata and sbss near each other by putting sdata at the end of the data segment
-     * and sbss at the start of the bss segment. This maximizes reach of things referenced off of
-     * the global pointer. */
-    .sdata : {
+        . = ALIGN(8);
         __global_pointer$ = . + (4K / 2);
         /* Question: should we put srodata here on multi seg binaries? */
         *(.srodata.cst16) *(.srodata.cst8) *(.srodata.cst4) *(.srodata.cst2) *(.srodata .srodata.*)
         *(.sdata .sdata.* .gnu.linkonce.s.*)
-    }
+    } :data
+
+
 
     . = ALIGN(%BITS% / 8);
     __data_end = .;
diff --git a/arch/riscv/rules.mk b/arch/riscv/rules.mk
index 809f348d8..3f93eebde 100644
--- a/arch/riscv/rules.mk
+++ b/arch/riscv/rules.mk
@@ -30,8 +30,20 @@ endif
 
 SUBARCH ?= 32
 
+# Different vendor variant of riscv
+VARIANT ?=
+
 RISCV_MODE ?= machine
 
+ifeq ($(VARIANT),nuclei)
+MODULE_DEPS += \
+	arch/riscv/nuclei/NMSIS
+
+MODULE_SRCS += $(LOCAL_DIR)/vectab.S
+
+GLOBAL_DEFINES += RISCV_VARIANT_NUCLEI=1
+endif
+
 ifeq ($(strip $(RISCV_MODE)),machine)
 $(info RISCV: Machine Mode)
 GLOBAL_DEFINES += RISCV_M_MODE=1
@@ -154,6 +166,7 @@ WITH_LINKER_GC ?= 0
 endif
 
 LIBGCC := $(shell $(TOOLCHAIN_PREFIX)gcc $(GLOBAL_COMPILEFLAGS) $(ARCH_COMPILEFLAGS) $(GLOBAL_CFLAGS) -print-libgcc-file-name)
+LIBGCC += $(shell $(TOOLCHAIN_PREFIX)gcc $(GLOBAL_COMPILEFLAGS) $(ARCH_COMPILEFLAGS) $(GLOBAL_CFLAGS) -print-file-name=libstdc++.a)
 $(info LIBGCC = $(LIBGCC))
 
 # potentially generated files that should be cleaned out with clean make rule
diff --git a/arch/riscv/start.S b/arch/riscv/start.S
index 7ffe777b0..3416775e9 100644
--- a/arch/riscv/start.S
+++ b/arch/riscv/start.S
@@ -218,9 +218,9 @@ END_FUNCTION(_mmu_init)
 
 .bss
 .align 4
-LOCAL_DATA(default_stack)
+DATA(default_stack)
     .skip ARCH_DEFAULT_STACK_SIZE * RISCV_MAX_HARTS;
-LOCAL_DATA(default_stack_top)
+DATA(default_stack_top)
 
 // put boot status in .data so it doesn't get paved over during BSS initialization
 .data
diff --git a/arch/riscv/thread.c b/arch/riscv/thread.c
index 15715c9cc..2014237ed 100644
--- a/arch/riscv/thread.c
+++ b/arch/riscv/thread.c
@@ -5,6 +5,9 @@
  * license that can be found in the LICENSE file or at
  * https://opensource.org/licenses/MIT
  */
+#ifdef RISCV_VARIANT_NUCLEI
+#include <nuclei_sdk_soc.h>
+#endif
 #include <assert.h>
 #include <lk/debug.h>
 #include <lk/trace.h>
@@ -16,6 +19,19 @@
 
 #define LOCAL_TRACE 0
 
+#ifdef RISCV_VARIANT_NUCLEI
+#define portINITIAL_MSTATUS ( MSTATUS_MPP | MSTATUS_FS_INITIAL)
+extern void arch_context_start(unsigned long sp);
+static void riscv_idle_thread(void);
+static void riscv_trigger_preempt(void);
+volatile unsigned long  rt_interrupt_from_thread = 0;
+volatile unsigned long  rt_interrupt_to_thread   = 0;
+volatile unsigned long rt_realswitch_flag = 0;
+volatile unsigned long rt_preemt_flag = 0;
+
+static uint8_t initial_stack[1024] __SECTION(".bss.prebss.initial_stack") __ALIGNED(8);
+#endif
+
 struct thread *_current_thread;
 
 static void initial_thread_func(void) __NO_RETURN;
@@ -32,7 +48,7 @@ static void initial_thread_func(void) {
     /* release the thread lock that was implicitly held across the reschedule */
     spin_unlock(&thread_lock);
     arch_enable_ints();
-
+    
     int ret = ct->entry(ct->arg);
 
     LTRACEF("thread %p exiting with %d\n", ct, ret);
@@ -42,14 +58,22 @@ static void initial_thread_func(void) {
 
 void arch_thread_initialize(thread_t *t) {
     /* zero out the thread context */
-    memset(&t->arch.cs_frame, 0, sizeof(t->arch.cs_frame));
 
     /* make sure the top of the stack is 16 byte aligned */
     vaddr_t stack_top = ROUNDDOWN((vaddr_t)t->stack + t->stack_size, 16);
-
+#ifndef RISCV_VARIANT_NUCLEI
+    memset(&t->arch.cs_frame, 0, sizeof(t->arch.cs_frame));
     t->arch.cs_frame.sp = stack_top;
     t->arch.cs_frame.ra = (vaddr_t)&initial_thread_func;
-
+#else
+    extern void arch_idle(void);
+    stack_top -= sizeof(struct riscv_context_switch_frame);
+    t->arch.cs_frame = (struct riscv_context_switch_frame *)stack_top;
+    t->arch.cs_frame->ra = (unsigned long)&arch_idle;
+    t->arch.cs_frame->a0 = (unsigned long)t->arg;
+    t->arch.cs_frame->epc = (unsigned long)&initial_thread_func;
+    t->arch.cs_frame->mstatus = (unsigned long)portINITIAL_MSTATUS;
+#endif
     LTRACEF("t %p (%s) stack top %#lx entry %p arg %p\n", t, t->name, stack_top, t->entry, t->arg);
 }
 
@@ -57,14 +81,62 @@ void arch_context_switch(thread_t *oldthread, thread_t *newthread) {
     DEBUG_ASSERT(arch_ints_disabled());
 
     LTRACEF("old %p (%s), new %p (%s)\n", oldthread, oldthread->name, newthread, newthread->name);
-
+#ifdef RISCV_VARIANT_NUCLEI
+    LTRACEF("old %s (sp %p), new %s (sp %p)\n", oldthread->name, oldthread->arch.cs_frame, newthread->name, newthread->arch.cs_frame);
+    if (oldthread->stack_size > 0) {
+        if (newthread->stack_size == 0) { // target has no stack yet: give it initial_stack and the idle entry
+            newthread->stack = initial_stack;
+            newthread->stack_size = sizeof(initial_stack);
+            newthread->entry= riscv_idle_thread;
+            arch_thread_initialize(newthread);
+        }
+        if (rt_realswitch_flag == 0) { // keep the first 'from' slot until riscv_msip_process() clears the flag
+            rt_interrupt_from_thread = (unsigned long)&(oldthread->arch.cs_frame);
+        }
+        rt_realswitch_flag = 1;
+        rt_interrupt_to_thread = (unsigned long)&(newthread->arch.cs_frame);
+        LTRACEF("int %lu, from pc %#lx, to pc %#lx\n", rt_realswitch_flag, oldthread->arch.cs_frame->epc, newthread->arch.cs_frame->epc);
+        riscv_trigger_preempt(); // raise the software interrupt that requests the switch
+        if (rt_preemt_flag == 0) { // flag is only set while riscv_irq_exit() runs thread_preempt()
+            spin_unlock(&thread_lock);
+            arch_enable_ints();
+        }
+    } else { // First task started
+        arch_context_start((unsigned long)&(newthread->arch.cs_frame));
+        // never return
+    }
+
+#else
     riscv_context_switch(&oldthread->arch.cs_frame, &newthread->arch.cs_frame);
+#endif
 }
 
 void arch_dump_thread(thread_t *t) {
     if (t->state != THREAD_RUNNING) {
         dprintf(INFO, "\tarch: ");
+#ifdef RISCV_VARIANT_NUCLEI
+        dprintf(INFO, "sp %#lx\n", (unsigned long)t->arch.cs_frame); // cs_frame is a pointer here, so cast it for %lx
+#else
         dprintf(INFO, "sp %#lx\n", t->arch.cs_frame.sp);
+#endif
     }
 }
 
+#ifdef RISCV_VARIANT_NUCLEI
+void riscv_msip_process(void) {
+    /* Clear Software IRQ, A MUST */
+    SysTimer_ClearSWIRQ();
+    /* Clear the real switch flag */
+    rt_realswitch_flag = 0;
+}
+
+static void riscv_trigger_preempt(void) {
+    /* Set a software interrupt(SWI) request to request a context switch. */
+    SysTimer_SetSWIRQ();
+    __RWMB();
+}
+static void riscv_idle_thread(void) {
+    for (;;)
+        arch_idle();
+}
+#endif
\ No newline at end of file
diff --git a/arch/riscv/time.c b/arch/riscv/time.c
index 2e78b93b5..f1b0adf4a 100644
--- a/arch/riscv/time.c
+++ b/arch/riscv/time.c
@@ -5,6 +5,9 @@
  * license that can be found in the LICENSE file or at
  * https://opensource.org/licenses/MIT
  */
+#ifdef RISCV_VARIANT_NUCLEI
+#include <nuclei_sdk_soc.h>
+#endif
 #include <lk/reg.h>
 #include <lk/debug.h>
 #include <lk/trace.h>
@@ -22,6 +25,75 @@
 static platform_timer_callback timer_cb;
 static void *timer_arg;
 
+#ifdef RISCV_VARIANT_NUCLEI
+#define configKERNEL_INTERRUPT_PRIORITY     0
+/* Arm the timer: store callback/arg, then pass the interval (scaled to ticks) to SysTick_Reload(). */
+status_t platform_set_oneshot_timer (platform_timer_callback callback, void *arg, lk_time_t interval) {
+    LTRACEF("cb %p, arg %p, interval %u\n", callback, arg, interval);
+
+    // disable timer irq while the callback/argument pair is replaced
+    ECLIC_DisableIRQ(SysTimer_IRQn);
+    timer_cb = callback;
+    timer_arg = arg;
+
+    // enable the timer irq
+    ECLIC_EnableIRQ(SysTimer_IRQn);
+
+    // convert interval to ticks (rate is presumably ticks/s and interval ms, hence the /1000);
+    // widen to 64 bits first so the product cannot wrap in 32-bit arithmetic
+    uint64_t ticks = (((uint64_t)interval * ARCH_RISCV_MTIME_RATE) / 1000u);
+    SysTick_Reload(ticks);
+
+    return NO_ERROR;
+}
+
+lk_bigtime_t current_time_hires(void) {
+#if ARCH_RISCV_MTIME_RATE >= 10000000
+    return SysTimer_GetLoadValue() / (ARCH_RISCV_MTIME_RATE / 1000000u); // timer value divided by (rate / 1e6)
+#else
+    return current_time() * 1000llu; // hack to deal with slow clocks: scale the current_time() result by 1000
+#endif
+}
+
+lk_time_t current_time(void) {
+    return SysTimer_GetLoadValue() / (ARCH_RISCV_MTIME_RATE / 1000u); // timer value / (rate / 1000), all integer division; presumably ticks -> ms, TODO confirm
+}
+
+void platform_stop_timer(void) {
+    ECLIC_DisableIRQ(SysTimer_IRQn); // only disables the SysTimer IRQ through the ECLIC call; no timer register is written here
+}
+
+void platform_init_timer(void)
+{
+    ECLIC_DisableIRQ(SysTimer_IRQn); // not enabled in this function; platform_set_oneshot_timer() enables it
+    ECLIC_SetLevelIRQ(SysTimer_IRQn, configKERNEL_INTERRUPT_PRIORITY);
+    ECLIC_SetShvIRQ(SysTimer_IRQn, ECLIC_NON_VECTOR_INTERRUPT); // timer interrupt: non-vectored mode
+
+    /* Software interrupt (SysTimerSW): vectored mode at level configKERNEL_INTERRUPT_PRIORITY (0); clear any request, then enable it. */
+    ECLIC_SetShvIRQ(SysTimerSW_IRQn, ECLIC_VECTOR_INTERRUPT);
+    ECLIC_SetLevelIRQ(SysTimerSW_IRQn, configKERNEL_INTERRUPT_PRIORITY);
+    SysTimer_ClearSWIRQ();
+    ECLIC_EnableIRQ(SysTimerSW_IRQn);
+}
+
+enum handler_return riscv_mtip_handler(void) {
+    LTRACEF("tick\n");
+    /* One-shot behaviour: disable the timer IRQ here; platform_set_oneshot_timer() enables it again. */
+    ECLIC_DisableIRQ(SysTimer_IRQn);
+
+    enum handler_return ret = INT_NO_RESCHEDULE;
+    unsigned long state;
+    state = riscv_csr_read_clear(RISCV_CSR_XSTATUS, RISCV_CSR_XSTATUS_IE) & RISCV_CSR_XSTATUS_IE; /* keep only the previous IE bit */
+    if (timer_cb) {
+        ret = timer_cb(timer_arg, current_time());
+    }
+    riscv_clic_irq_exit(ret); /* hands the callback's reschedule verdict to the CLIC exit helper */
+    riscv_csr_set(RISCV_CSR_XSTATUS, state); /* sets IE again only if it was set on entry (state is 0 otherwise) */
+
+    return ret;
+}
+
+#else
 status_t platform_set_oneshot_timer (platform_timer_callback callback, void *arg, lk_time_t interval) {
     LTRACEF("cb %p, arg %p, interval %u\n", callback, arg, interval);
 
@@ -75,3 +147,4 @@ enum handler_return riscv_timer_exception(void) {
 
     return ret;
 }
+#endif
\ No newline at end of file
diff --git a/arch/riscv/vectab.S b/arch/riscv/vectab.S
new file mode 100644
index 000000000..b5cf8f3a7
--- /dev/null
+++ b/arch/riscv/vectab.S
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2020 Nuclei System Technology
+ *
+ * Use of this source code is governed by a MIT-style
+ * license that can be found in the LICENSE file or at
+ * https://opensource.org/licenses/MIT
+ */
+#ifdef RISCV_VARIANT_NUCLEI
+
+#include "riscv_encoding.h"
+
+.macro DECLARE_INT_HANDLER  INT_HDL_NAME
+#if defined(__riscv_xlen) && (__riscv_xlen == 32)
+    .word \INT_HDL_NAME     /* __riscv_xlen == 32: emit the handler symbol as a .word */
+#else
+    .dword \INT_HDL_NAME    /* otherwise: emit the handler symbol as a .dword */
+#endif
+.endm
+
+    .section .text.boot.vectab1
+
+    .weak  riscv_msip_handler
+    .weak  riscv_mtip_handler
+
+    .global vectab
+vectab:
+    j _start                                                /* 0: Reserved slot; holds a jump to _start, taken on reset in ILM/FlashXIP mode. */
+    .align LOG_REGBYTES                                     /*    Align the following entries: 4 bytes for RV32, 8 bytes for RV64 */
+    DECLARE_INT_HANDLER     default_intexc_handler          /* 1: Reserved */
+    DECLARE_INT_HANDLER     default_intexc_handler          /* 2: Reserved */
+    DECLARE_INT_HANDLER     riscv_msip_handler              /* 3: Machine software interrupt */
+
+    DECLARE_INT_HANDLER     default_intexc_handler          /* 4: Reserved */
+    DECLARE_INT_HANDLER     default_intexc_handler          /* 5: Reserved */
+    DECLARE_INT_HANDLER     default_intexc_handler          /* 6: Reserved */
+    DECLARE_INT_HANDLER     riscv_mtip_handler              /* 7: Machine timer interrupt */
+
+    DECLARE_INT_HANDLER     default_intexc_handler          /* 8: Reserved */
+    DECLARE_INT_HANDLER     default_intexc_handler          /* 9: Reserved */
+    DECLARE_INT_HANDLER     default_intexc_handler          /* 10: Reserved */
+    DECLARE_INT_HANDLER     default_intexc_handler          /* 11: Reserved */
+
+    DECLARE_INT_HANDLER     default_intexc_handler          /* 12: Reserved */
+    DECLARE_INT_HANDLER     default_intexc_handler          /* 13: Reserved */
+    DECLARE_INT_HANDLER     default_intexc_handler          /* 14: Reserved */
+    DECLARE_INT_HANDLER     default_intexc_handler          /* 15: Reserved */
+
+    DECLARE_INT_HANDLER     default_intexc_handler          /* 16: Reserved */
+    DECLARE_INT_HANDLER     default_intexc_handler          /* 17: Reserved */
+    DECLARE_INT_HANDLER     default_intexc_handler          /* 18: Reserved */
+
+#endif
\ No newline at end of file
diff --git a/external/arch/riscv/nuclei/NMSIS/Core/Include/core_compatiable.h b/external/arch/riscv/nuclei/NMSIS/Core/Include/core_compatiable.h
new file mode 100644
index 000000000..40a9198e1
--- /dev/null
+++ b/external/arch/riscv/nuclei/NMSIS/Core/Include/core_compatiable.h
@@ -0,0 +1,232 @@
+/*
+ * Copyright (c) 2019 Nuclei Limited. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __CORE_COMPATIABLE_H__
+#define __CORE_COMPATIABLE_H__
+/*!
+ * @file     core_compatiable.h
+ * @brief    ARM compatible function definitions header file
+ */
+#ifdef __cplusplus
+ extern "C" {
+#endif
+
+/* ===== ARM Compatible Functions ===== */
+/**
+ * \defgroup NMSIS_Core_ARMCompatiable_Functions   ARM Compatible Functions
+ * \ingroup  NMSIS_Core
+ * \brief    A few functions that are compatible with ARM CMSIS-Core.
+ * \details
+ *
+ * Here we provide a few functions that are compatible with ARM CMSIS-Core,
+ * mostly used in the DSP and NN library.
+ * @{
+ */
+/** \brief Instruction Synchronization Barrier, compatible with ARM (expands to __RWMB()) */
+#define __ISB()                             __RWMB()
+
+/** \brief Data Synchronization Barrier, compatible with ARM (expands to __RWMB()) */
+#define __DSB()                             __RWMB()
+
+/** \brief Data Memory Barrier, compatible with ARM (expands to __RWMB()) */
+#define __DMB()                             __RWMB()
+
+/** \brief LDRT Unprivileged (8 bit), ARM compatible */
+#define __LDRBT(ptr)                        __LB((ptr))
+/** \brief LDRT Unprivileged (16 bit), ARM compatible */
+#define __LDRHT(ptr)                        __LH((ptr))
+/** \brief LDRT Unprivileged (32 bit), ARM compatible */
+#define __LDRT(ptr)                         __LW((ptr))
+
+/** \brief STRT Unprivileged (8 bit), ARM compatible. NOTE(review): only ptr is forwarded, there is no value operand; CMSIS-Core's __STRBT takes (value, ptr) - confirm against __SB */
+#define __STRBT(ptr)                        __SB((ptr))
+/** \brief STRT Unprivileged (16 bit), ARM compatible. NOTE(review): only ptr is forwarded, there is no value operand - confirm against __SH */
+#define __STRHT(ptr)                        __SH((ptr))
+/** \brief STRT Unprivileged (32 bit), ARM compatible. NOTE(review): only ptr is forwarded, there is no value operand - confirm against __SW */
+#define __STRT(ptr)                         __SW((ptr))
+
+/* ===== Saturation Operations ===== */
+/**
+ * \brief   Signed Saturate
+ * \details Clamps val to [-2^(sat-1), 2^(sat-1)-1]; the C fallback returns val unchanged when sat is outside 1..32.
+ * \param [in]    val  Value to be saturated
+ * \param [in]    sat  Bit position to saturate to (1..32)
+ * \return             Saturated value
+ */
+#if defined(__DSP_PRESENT) && (__DSP_PRESENT == 1)
+#define __SSAT(val, sat)          __RV_SCLIP32((val), ((sat) - 1))
+#else
+__STATIC_FORCEINLINE int32_t __SSAT(int32_t val, uint32_t sat)
+{
+    if ((sat >= 1U) && (sat <= 32U)) {
+        const int32_t max = (int32_t)((1U << (sat - 1U)) - 1U);
+        const int32_t min = -1 - max ;
+        if (val > max) {
+            return max;
+        } else if (val < min) {
+            return min;
+        }
+    }
+    return val;
+}
+#endif
+
+/**
+ * \brief   Unsigned Saturate
+ * \details Clamps val to [0, 2^sat - 1]; the C fallback returns val converted to uint32_t when sat is above 31.
+ * \param [in]    val  Value to be saturated
+ * \param [in]    sat  Bit position to saturate to (0..31)
+ * \return             Saturated value
+ */
+#if defined(__DSP_PRESENT) && (__DSP_PRESENT == 1)
+#define __USAT(val, sat)        __RV_UCLIP32((val), (sat)) /* pass sat itself: upper bound is 2^sat - 1, and sat == 0 is a documented input */
+#else
+__STATIC_FORCEINLINE uint32_t __USAT(int32_t val, uint32_t sat)
+{
+    if (sat <= 31U) {
+        const uint32_t max = ((1U << sat) - 1U);
+        if (val > (int32_t)max) {
+            return max;
+        } else if (val < 0) {
+            return 0U;
+        }
+    }
+    return (uint32_t)val;
+}
+#endif
+
+/* ===== Data Processing Operations ===== */
+/**
+ * \brief   Reverse byte order (32 bit)
+ * \details Swaps the four bytes of a 32-bit unsigned value end for end.
+ * For example, 0x12345678 becomes 0x78563412.
+ * \param [in]    value  Value to reverse
+ * \return               Reversed value
+ */
+__STATIC_FORCEINLINE uint32_t __REV(uint32_t value)
+{
+    /* Move each byte to its mirrored position, one byte per term. */
+    const uint32_t byte3_to_0 = value >> 24;
+    const uint32_t byte2_to_1 = (value >> 8) & 0x0000ff00U;
+    const uint32_t byte1_to_2 = (value << 8) & 0x00ff0000U;
+    const uint32_t byte0_to_3 = value << 24;
+
+    return byte3_to_0 | byte2_to_1 | byte1_to_2 | byte0_to_3;
+}
+
+/**
+ * \brief   Reverse byte order within each halfword (2 x 16 bit)
+ * \details Swaps the two bytes inside the upper halfword and inside the lower halfword.
+ * For example, 0x12345678 becomes 0x34127856.
+ * \param [in]    value  Value to reverse
+ * \return               Reversed value
+ */
+__STATIC_FORCEINLINE uint32_t __REV16(uint32_t value)
+{
+    uint32_t result;
+    result =  ((value & 0xff000000) >> 8)
+        | ((value & 0x00ff0000) << 8 )      /* byte 2 mask: exactly eight hex digits */
+        | ((value & 0x0000ff00) >> 8 )
+        | ((value & 0x000000ff) << 8) ;
+
+    return result;
+}
+
+/**
+ * \brief   Reverse byte order (16 bit), signed result
+ * \details Swaps the two bytes of a 16-bit value
+ * and hands the result back as a signed 16-bit number.
+ * For example, 0x0080 becomes 0x8000.
+ * \param [in]    value  Value to reverse
+ * \return               Reversed value
+ */
+__STATIC_FORCEINLINE int16_t __REVSH(int16_t value)
+{
+    const uint16_t bits = (uint16_t)value;  /* work on the raw 16-bit pattern */
+    const uint16_t swapped = (uint16_t)((bits >> 8) | (bits << 8));
+    return (int16_t)swapped;
+}
+
+/**
+ * \brief   Rotate Right in unsigned value (32 bit)
+ * \details Rotates a 32-bit value to the right; bits shifted out at
+ * the bottom re-enter at the top. The count is taken modulo 32.
+ * \param [in]    op1  Value to rotate
+ * \param [in]    op2  Number of Bits to rotate(0-31)
+ * \return               Rotated value
+ */
+__STATIC_FORCEINLINE uint32_t __ROR(uint32_t op1, uint32_t op2)
+{
+    const uint32_t amount = op2 & 0x1FU;    /* only the low five bits select the rotation */
+
+    /* A zero rotation is answered directly so the left shift below never uses a count of 32. */
+    return (amount == 0U) ? op1
+                          : ((op1 >> amount) | (op1 << (32U - amount)));
+}
+
+/**
+ * \brief   Reverse bit order of value
+ * \details Mirrors the 32 bits of the given value (bit 0 <-> bit 31, and so on).
+ * \param [in]    value  Value to reverse
+ * \return               Reversed value
+ */
+#if defined(__DSP_PRESENT) && (__DSP_PRESENT == 1)
+#define __RBIT(value)           __RV_BITREVI((value), 31)
+#else
+__STATIC_FORCEINLINE uint32_t __RBIT(uint32_t value)
+{
+    uint32_t reversed = value;              /* starts out holding the LSB of the input */
+    uint32_t remaining = value >> 1U;       /* input bits not yet consumed */
+    uint32_t pad = (4U * 8U) - 1U;          /* left shift still owed once the loop stops */
+
+    while (remaining != 0U) {
+        reversed = (reversed << 1U) | (remaining & 1U);
+        remaining >>= 1U;
+        pad--;
+    }
+    /* The loop stops early when the upper input bits are zero; make up for it here. */
+    return reversed << pad;
+}
+#endif /* defined(__DSP_PRESENT) && (__DSP_PRESENT == 1) */
+
+/**
+ * \brief   Count leading zeros
+ * \details Counts how many consecutive zero bits precede the highest set bit (32 for an input of 0).
+ * \param [in]  data  Value to count the leading zeros
+ * \return             number of leading zeros in value
+ */
+#if defined(__DSP_PRESENT) && (__DSP_PRESENT == 1)
+#define __CLZ(data)         __RV_CLZ32(data)
+#else
+__STATIC_FORCEINLINE uint8_t __CLZ(uint32_t data)
+{
+    uint8_t zeros = 0;
+    uint32_t probe = 0x80000000U;           /* walks from bit 31 downwards */
+    while ((probe != 0U) && ((data & probe) == 0U)) {
+          probe >>= 1;
+          zeros++;
+    }
+    return zeros;
+}
+#endif /* defined(__DSP_PRESENT) && (__DSP_PRESENT == 1) */
+
+/** @} */ /* End of Doxygen Group NMSIS_Core_ARMCompatiable_Functions */
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* __CORE_COMPATIABLE_H__ */
diff --git a/external/arch/riscv/nuclei/NMSIS/Core/Include/core_feature_base.h b/external/arch/riscv/nuclei/NMSIS/Core/Include/core_feature_base.h
new file mode 100644
index 000000000..5f351a336
--- /dev/null
+++ b/external/arch/riscv/nuclei/NMSIS/Core/Include/core_feature_base.h
@@ -0,0 +1,1177 @@
+/*
+ * Copyright (c) 2019 Nuclei Limited. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __CORE_FEATURE_BASE__
+#define __CORE_FEATURE_BASE__
+/*!
+ * @file     core_feature_base.h
+ * @brief    Base core feature API for Nuclei N/NX Core
+ */
+#include <stdint.h>
+#include "riscv_encoding.h"
+
+#ifdef __cplusplus
+ extern "C" {
+#endif
+
+/**
+ * \defgroup NMSIS_Core_Registers     Register Define and Type Definitions
+ * \brief   Type definitions and defines for core registers.
+ *
+ * @{
+ */
+#ifndef __RISCV_XLEN
+  /** \brief Width of an integer register in bits (either 32 or 64); taken from __riscv_xlen, or 32 when that macro is not defined */
+  #ifndef __riscv_xlen
+    #define __RISCV_XLEN    32
+  #else
+    #define __RISCV_XLEN    __riscv_xlen
+  #endif
+#endif /* __RISCV_XLEN */
+
+/** \brief Type of a Control and Status Register (CSR): uint64_t when __RISCV_XLEN is 64, uint32_t for 32 and for any other value */
+#if __RISCV_XLEN == 32
+  typedef uint32_t rv_csr_t;
+#elif __RISCV_XLEN == 64
+  typedef uint64_t rv_csr_t;
+#else
+  typedef uint32_t rv_csr_t;
+#endif
+/** @} */ /* End of Doxygen Group NMSIS_Core_Registers */
+/**
+ * \defgroup NMSIS_Core_Base_Registers     Base Register Define and Type Definitions
+ * \ingroup NMSIS_Core_Registers
+ * \brief   Type definitions and defines for base core registers.
+ *
+ * @{
+ */
+/**
+ * \brief  Union type to access MISA register: member b gives the individual bit-fields, member d the whole CSR value.
+ */
+typedef union {
+    struct {
+        rv_csr_t a:1;                           /*!< bit:     0  Atomic extension */
+        rv_csr_t b:1;                           /*!< bit:     1  Tentatively reserved for Bit-Manipulation extension */
+        rv_csr_t c:1;                           /*!< bit:     2  Compressed extension */
+        rv_csr_t d:1;                           /*!< bit:     3  Double-precision floating-point extension */
+        rv_csr_t e:1;                           /*!< bit:     4  RV32E base ISA */
+        rv_csr_t f:1;                           /*!< bit:     5  Single-precision floating-point extension */
+        rv_csr_t g:1;                           /*!< bit:     6  Additional standard extensions present */
+        rv_csr_t h:1;                           /*!< bit:     7  Hypervisor extension */
+        rv_csr_t i:1;                           /*!< bit:     8  RV32I/64I/128I base ISA */
+        rv_csr_t j:1;                           /*!< bit:     9  Tentatively reserved for Dynamically Translated Languages extension */
+        rv_csr_t _reserved1:1;                  /*!< bit:     10 Reserved  */
+        rv_csr_t l:1;                           /*!< bit:     11 Tentatively reserved for Decimal Floating-Point extension  */
+        rv_csr_t m:1;                           /*!< bit:     12 Integer Multiply/Divide extension */
+        rv_csr_t n:1;                           /*!< bit:     13 User-level interrupts supported  */
+        rv_csr_t _reserved2:1;                  /*!< bit:     14 Reserved  */
+        rv_csr_t p:1;                           /*!< bit:     15 Tentatively reserved for Packed-SIMD extension  */
+        rv_csr_t q:1;                           /*!< bit:     16 Quad-precision floating-point extension  */
+        rv_csr_t _resreved3:1;                  /*!< bit:     17 Reserved (sic: the field name is spelled _resreved3)  */
+        rv_csr_t s:1;                           /*!< bit:     18 Supervisor mode implemented  */
+        rv_csr_t t:1;                           /*!< bit:     19 Tentatively reserved for Transactional Memory extension  */
+        rv_csr_t u:1;                           /*!< bit:     20 User mode implemented  */
+        rv_csr_t v:1;                           /*!< bit:     21 Tentatively reserved for Vector extension  */
+        rv_csr_t _reserved4:1;                  /*!< bit:     22 Reserved  */
+        rv_csr_t x:1;                           /*!< bit:     23 Non-standard extensions present  */
+#if defined(__RISCV_XLEN) && __RISCV_XLEN == 64
+        rv_csr_t _reserved5:38;                 /*!< bit:     24..61 Reserved  */
+        rv_csr_t mxl:2;                         /*!< bit:     62..63 Machine XLEN  */
+#else
+        rv_csr_t _reserved5:6;                  /*!< bit:     24..29 Reserved  */
+        rv_csr_t mxl:2;                         /*!< bit:     30..31 Machine XLEN  */
+#endif
+    } b;                                        /*!< Structure used for bit  access */
+    rv_csr_t d;                                 /*!< Type      used for csr data access */
+} CSR_MISA_Type;
+
+/**
+ * \brief  Union type to access MSTATUS configure register: member b gives the bit-fields, member d the whole CSR value.
+ */
+typedef union {
+    struct {
+#if defined(__RISCV_XLEN) && __RISCV_XLEN == 64
+        rv_csr_t _reserved0:3;                  /*!< bit:     0..2  Reserved */
+        rv_csr_t mie:1;                         /*!< bit:     3  Machine mode interrupt enable flag */
+        rv_csr_t _reserved1:3;                  /*!< bit:     4..6  Reserved */
+        rv_csr_t mpie:1;                        /*!< bit:     7  mirror of MIE flag */
+        rv_csr_t _reserved2:3;                  /*!< bit:     8..10  Reserved */
+        rv_csr_t mpp:2;                         /*!< bit:     11..12 mirror of Privilege Mode */
+        rv_csr_t fs:2;                          /*!< bit:     13..14 FS status flag */
+        rv_csr_t xs:2;                          /*!< bit:     15..16 XS status flag */
+        rv_csr_t mprv:1;                        /*!< bit:     17 Machine mode PMP */
+        rv_csr_t _reserved3:14;                 /*!< bit:     18..31 Reserved */
+        rv_csr_t uxl:2;                         /*!< bit:     32..33 user mode xlen */
+        rv_csr_t _reserved6:29;                 /*!< bit:     34..62 Reserved  */
+        rv_csr_t sd:1;                          /*!< bit:     63 Dirty status for XS or FS */
+#else
+        rv_csr_t _reserved0:1;                  /*!< bit:     0  Reserved */
+        rv_csr_t sie:1;                         /*!< bit:     1  supervisor interrupt enable flag */
+        rv_csr_t _reserved1:1;                  /*!< bit:     2  Reserved */
+        rv_csr_t mie:1;                         /*!< bit:     3  Machine mode interrupt enable flag */
+        rv_csr_t _reserved2:1;                  /*!< bit:     4  Reserved */
+        rv_csr_t spie:1;                        /*!< bit:     5  Supervisor Privilege mode interrupt enable flag */
+        rv_csr_t _reserved3:1;                  /*!< bit:     6  Reserved */
+        rv_csr_t mpie:1;                        /*!< bit:     7  mirror of MIE flag */
+        rv_csr_t _reserved4:3;                  /*!< bit:     8..10  Reserved */
+        rv_csr_t mpp:2;                         /*!< bit:     11..12 mirror of Privilege Mode */
+        rv_csr_t fs:2;                          /*!< bit:     13..14 FS status flag */
+        rv_csr_t xs:2;                          /*!< bit:     15..16 XS status flag */
+        rv_csr_t mprv:1;                        /*!< bit:     17 Machine mode PMP */
+        rv_csr_t sum:1;                         /*!< bit:     18 Supervisor Mode load and store protection */
+        rv_csr_t _reserved6:12;                 /*!< bit:     19..30 Reserved  */
+        rv_csr_t sd:1;                          /*!< bit:     31 Dirty status for XS or FS */
+#endif
+    } b;                                        /*!< Structure used for bit  access */
+    rv_csr_t d;                                 /*!< Type      used for csr data access */
+} CSR_MSTATUS_Type;
+
+/**
+ * \brief  Union type to access MTVEC configure register: member b gives the bit-fields, member d the whole CSR value.
+ */
+typedef union {
+    struct {
+        rv_csr_t mode:6;                        /*!< bit:     0..5   interrupt mode control */
+#if defined(__RISCV_XLEN) && __RISCV_XLEN == 64
+        rv_csr_t addr:58;                       /*!< bit:     6..63  mtvec address */
+#else
+        rv_csr_t addr:26;                       /*!< bit:     6..31  mtvec address */
+#endif
+    } b;                                        /*!< Structure used for bit  access */
+    rv_csr_t d;                                 /*!< Type      used for csr data access */
+} CSR_MTVEC_Type;
+
+/**
+ * \brief  Union type to access MCAUSE configure register: member b gives the bit-fields, member d the whole CSR value.
+ */
+typedef union {
+    struct {
+        rv_csr_t exccode:12;                    /*!< bit:     11..0  exception or interrupt code */
+        rv_csr_t _reserved0:4;                  /*!< bit:     15..12  Reserved */
+        rv_csr_t mpil:8;                        /*!< bit:     23..16  Previous interrupt level */
+        rv_csr_t _reserved1:3;                  /*!< bit:     26..24  Reserved */
+        rv_csr_t mpie:1;                        /*!< bit:     27  Interrupt enable flag before entering the interrupt */
+        rv_csr_t mpp:2;                         /*!< bit:     29..28  Privilege mode flag before entering the interrupt */
+        rv_csr_t minhv:1;                       /*!< bit:     30  Machine interrupt vector table */
+#if defined(__RISCV_XLEN) && __RISCV_XLEN == 64
+        rv_csr_t _reserved2:32;                 /*!< bit:     31..62  Reserved */
+        rv_csr_t interrupt:1;                   /*!< bit:     63  trap type. 0 means exception and 1 means interrupt */
+#else
+        rv_csr_t interrupt:1;                   /*!< bit:     31  trap type. 0 means exception and 1 means interrupt */
+#endif
+    } b;                                        /*!< Structure used for bit  access */
+    rv_csr_t d;                                 /*!< Type      used for csr data access */
+} CSR_MCAUSE_Type;
+
+/**
+ * \brief  Union type to access MCOUNTINHIBIT configure register: member b gives the bit-fields, member d the whole CSR value.
+ */
+typedef union {
+    struct {
+        rv_csr_t cy:1;                          /*!< bit:     0     1 means disable mcycle counter */
+        rv_csr_t _reserved0:1;                  /*!< bit:     1     Reserved */
+        rv_csr_t ir:1;                          /*!< bit:     2     1 means disable minstret counter */
+#if defined(__RISCV_XLEN) && __RISCV_XLEN == 64
+        rv_csr_t _reserved1:61;                 /*!< bit:     3..63 Reserved */
+#else
+        rv_csr_t _reserved1:29;                 /*!< bit:     3..31 Reserved */
+#endif
+    } b;                                        /*!< Structure used for bit  access */
+    rv_csr_t d;                                 /*!< Type      used for csr data access */
+} CSR_MCOUNTINHIBIT_Type;
+
+/**
+ * \brief  Union type to access msubm configure register: member b gives the bit-fields, member d the whole CSR value.
+ */
+typedef union {
+    struct {
+        rv_csr_t _reserved0:6;                  /*!< bit:     0..5   Reserved */
+        rv_csr_t typ:2;                         /*!< bit:     6..7   current trap type */
+        rv_csr_t ptyp:2;                        /*!< bit:     8..9   previous trap type */
+#if defined(__RISCV_XLEN) && __RISCV_XLEN == 64
+        rv_csr_t _reserved1:54;                 /*!< bit:     10..63 Reserved */
+#else
+        rv_csr_t _reserved1:22;                 /*!< bit:     10..31 Reserved */
+#endif
+    } b;                                        /*!< Structure used for bit  access */
+    rv_csr_t d;                                 /*!< Type      used for csr data access */
+} CSR_MSUBM_Type;
+
+/**
+ * \brief  Union type to access MMISC_CTRL configure register: member b gives the bit-fields, member d the whole CSR value.
+ */
+typedef union {
+    struct {
+        rv_csr_t _reserved0:3;                  /*!< bit:     0..2  Reserved */
+        rv_csr_t bpu:1;                         /*!< bit:     3     dynamic prediction enable flag */
+        rv_csr_t _reserved1:2;                  /*!< bit:     4..5  Reserved */
+        rv_csr_t misalign:1;                    /*!< bit:     6     misaligned access support flag */
+        rv_csr_t _reserved2:2;                  /*!< bit:     7..8  Reserved */
+        rv_csr_t nmi_cause:1;                   /*!< bit:     9     mnvec control and nmi mcause exccode */
+#if defined(__RISCV_XLEN) && __RISCV_XLEN == 64
+        rv_csr_t _reserved3:54;                 /*!< bit:     10..63 Reserved */
+#else
+        rv_csr_t _reserved3:22;                 /*!< bit:     10..31 Reserved */
+#endif
+    } b;                                        /*!< Structure used for bit  access */
+    rv_csr_t d;                                 /*!< Type      used for csr data access */
+} CSR_MMISCCTRL_Type;
+
+
+/**
+ * \brief  Union type to access MSAVESTATUS configure register: member b gives the bit-fields; the whole-value member is named w here (d in the sibling unions).
+ */
+typedef union {
+    struct {
+        rv_csr_t mpie1:1;                       /*!< bit:     0     interrupt enable flag of first level NMI/exception nesting */
+        rv_csr_t mpp1:2;                        /*!< bit:     1..2  privilege mode of first level NMI/exception nesting */
+        rv_csr_t _reserved0:3;                  /*!< bit:     3..5  Reserved */
+        rv_csr_t ptyp1:2;                       /*!< bit:     6..7  NMI/exception type before first nesting */
+        rv_csr_t mpie2:1;                       /*!< bit:     8     interrupt enable flag of second level NMI/exception nesting */
+        rv_csr_t mpp2:2;                        /*!< bit:     9..10 privilege mode of second level NMI/exception nesting */
+        rv_csr_t _reserved1:3;                  /*!< bit:     11..13     Reserved */
+        rv_csr_t ptyp2:2;                       /*!< bit:     14..15     NMI/exception type before second nesting */
+#if defined(__RISCV_XLEN) && __RISCV_XLEN == 64
+        rv_csr_t _reserved2:48;                 /*!< bit:     16..63 Reserved*/
+#else
+        rv_csr_t _reserved2:16;                 /*!< bit:     16..31 Reserved*/
+#endif
+    } b;                                        /*!< Structure used for bit  access */
+    rv_csr_t w;                                 /*!< Type      used for csr data access */
+} CSR_MSAVESTATUS_Type;
+/** @} */ /* End of Doxygen Group NMSIS_Core_Base_Registers */
+
+/* ###########################  Core Function Access  ########################### */
+/**
+ * \defgroup NMSIS_Core_CSR_Register_Access    Core CSR Register Access
+ * \ingroup  NMSIS_Core
+ * \brief    Functions to access the Core CSR Registers
+ * \details
+ *
+ * The following functions or macros provide access to Core CSR registers.
+ * - \ref NMSIS_Core_CSR_Encoding
+ * - \ref NMSIS_Core_CSR_Registers
+ *   @{
+ */
+
+
+#ifndef __ASSEMBLY__
+
+/**
+ * \brief CSR operation Macro for csrrw instruction.
+ * \details
+ * Converts val to rv_csr_t in __v, issues csrrw with __v as both the
+ * source and the destination operand, then yields __v (the old CSR value)
+ * \param csr   CSR macro definition defined in
+ *              \ref NMSIS_Core_CSR_Registers, eg. \ref CSR_MSTATUS
+ * \param val   value to store into the CSR register
+ * \return the CSR register value before it was written
+ */
+#define __RV_CSR_SWAP(csr, val)                                 \
+    ({                                                          \
+        register rv_csr_t __v = (rv_csr_t)(val);                \
+        __ASM volatile("csrrw %0, " STRINGIFY(csr) ", %1"       \
+                     : "=r"(__v)                                \
+                     : "rK"(__v)                                \
+                     : "memory");                               \
+        __v;                                                    \
+    })
+
+/**
+ * \brief CSR operation Macro for csrr instruction.
+ * \details
+ * Issues csrr into the local __v and yields __v as the value of the expression
+ * \param csr   CSR macro definition defined in
+ *              \ref NMSIS_Core_CSR_Registers, eg. \ref CSR_MSTATUS
+ * \return the CSR register value
+ */
+#define __RV_CSR_READ(csr)                                      \
+    ({                                                          \
+        register rv_csr_t __v;                                  \
+        __ASM volatile("csrr %0, " STRINGIFY(csr)               \
+                     : "=r"(__v)                                \
+                     :                                          \
+                     : "memory");                               \
+        __v;                                                    \
+    })
+
+/**
+ * \brief CSR operation Macro for csrw instruction.
+ * \details
+ * Converts val to rv_csr_t and writes it to the csr register with csrw; yields no value
+ * \param csr   CSR macro definition defined in
+ *              \ref NMSIS_Core_CSR_Registers, eg. \ref CSR_MSTATUS
+ * \param val   value to store into the CSR register
+ */
+#define __RV_CSR_WRITE(csr, val)                                \
+    ({                                                          \
+        register rv_csr_t __v = (rv_csr_t)(val);                \
+        __ASM volatile("csrw " STRINGIFY(csr) ", %0"            \
+                     :                                          \
+                     : "rK"(__v)                                \
+                     : "memory");                               \
+    })
+
+/**
+ * \brief CSR operation Macro for csrrs instruction.
+ * \details
+ * Read the content of the csr register into __v,
+ * then set the csr register to (old value | val), then return the old value
+ * \param csr   CSR macro definition defined in
+ *              \ref NMSIS_Core_CSR_Registers, eg. \ref CSR_MSTATUS
+ * \param val   Mask value to be used with the csrrs instruction
+ * \return the CSR register value before it was written
+ */
+#define __RV_CSR_READ_SET(csr, val)                             \
+    ({                                                          \
+        register rv_csr_t __v = (rv_csr_t)(val);                \
+        __ASM volatile("csrrs %0, " STRINGIFY(csr) ", %1"       \
+                     : "=r"(__v)                                \
+                     : "rK"(__v)                                \
+                     : "memory");                               \
+        __v;                                                    \
+    })
+
+/**
+ * \brief CSR operation Macro for csrs instruction.
+ * \details
+ * Set the csr register to (csr_content | val); yields no value
+ * \param csr   CSR macro definition defined in
+ *              \ref NMSIS_Core_CSR_Registers, eg. \ref CSR_MSTATUS
+ * \param val   Mask value to be used with the csrs instruction
+ */
+#define __RV_CSR_SET(csr, val)                                  \
+    ({                                                          \
+        register rv_csr_t __v = (rv_csr_t)(val);                \
+        __ASM volatile("csrs " STRINGIFY(csr) ", %0"            \
+                     :                                          \
+                     : "rK"(__v)                                \
+                     : "memory");                               \
+    })
+
+/**
+ * \brief CSR operation Macro for csrrc instruction.
+ * \details
+ * Issues one csrrc: the old content of the csr register is read into __v and
+ * the csr register becomes (old content & ~val); the macro evaluates to __v.
+ * \param csr   CSR macro definition defined in
+ *              \ref NMSIS_Core_CSR_Registers, eg. \ref CSR_MSTATUS
+ * \param val   Mask value to be used with csrrc instruction (cast to rv_csr_t)
+ * \return the CSR register value before it was written
+ */
+#define __RV_CSR_READ_CLEAR(csr, val)                           \
+    ({                                                          \
+        register rv_csr_t __v = (rv_csr_t)(val);                \
+        __ASM volatile("csrrc %0, " STRINGIFY(csr) ", %1"       \
+                     : "=r"(__v)                                \
+                     : "rK"(__v)                                \
+                     : "memory");                               \
+        __v;                                                    \
+    })
+
+/**
+ * \brief CSR operation Macro for csrc instruction.
+ * \details
+ * Set csr register to be csr_content & ~val; the old content is not returned
+ * \param csr   CSR macro definition defined in
+ *              \ref NMSIS_Core_CSR_Registers, eg. \ref CSR_MSTATUS
+ * \param val   Mask value to be used with csrc instruction (cast to rv_csr_t)
+ */
+#define __RV_CSR_CLEAR(csr, val)                                \
+    ({                                                          \
+        register rv_csr_t __v = (rv_csr_t)(val);                \
+        __ASM volatile("csrc " STRINGIFY(csr) ", %0"            \
+                     :                                          \
+                     : "rK"(__v)                                \
+                     : "memory");                               \
+    })
+#endif /* __ASSEMBLY__ */
+
+/**
+ * \brief   Enable IRQ Interrupts
+ * \details Enables IRQ interrupts by setting the MSTATUS_MIE bit in
+ *          CSR_MSTATUS through \ref __RV_CSR_SET.
+ * \remarks Can only be executed in Privileged modes.
+ */
+__STATIC_FORCEINLINE void __enable_irq(void)
+{
+    __RV_CSR_SET(CSR_MSTATUS, MSTATUS_MIE);
+}
+
+/**
+ * \brief   Disable IRQ Interrupts
+ * \details Disables IRQ interrupts by clearing the MSTATUS_MIE bit in
+ *          CSR_MSTATUS through \ref __RV_CSR_CLEAR.
+ * \remarks Can only be executed in Privileged modes.
+ */
+__STATIC_FORCEINLINE void __disable_irq(void)
+{
+    __RV_CSR_CLEAR(CSR_MSTATUS, MSTATUS_MIE);
+}
+
+/**
+ * \brief   Read whole 64 bits value of mcycle counter
+ * \details This function will read the whole 64 bits of MCYCLE register.
+ *          On RV32 the value spans MCYCLEH:MCYCLE, so the high half is sampled
+ *          before and after the low half and the read is retried until both agree.
+ * \return  The whole 64 bits value of MCYCLE
+ * \remarks It will work for both RV32 and RV64 to get full 64bits value of MCYCLE
+ */
+__STATIC_FORCEINLINE uint64_t __get_rv_cycle(void)
+{
+#if __RISCV_XLEN == 32
+    uint32_t hi, lo, hi_check;
+
+    /* A changed high half means MCYCLE carried over while lo was sampled. */
+    do {
+        hi = __RV_CSR_READ(CSR_MCYCLEH);
+        lo = __RV_CSR_READ(CSR_MCYCLE);
+        hi_check = __RV_CSR_READ(CSR_MCYCLEH);
+    } while (hi != hi_check);
+    return (((uint64_t)hi) << 32) | lo;
+#elif __RISCV_XLEN == 64
+    return (uint64_t)__RV_CSR_READ(CSR_MCYCLE);
+#else // TODO Need cover for XLEN=128 case in future
+    return (uint64_t)__RV_CSR_READ(CSR_MCYCLE);
+#endif
+}
+
+/**
+ * \brief   Read whole 64 bits value of machine instruction-retired counter
+ * \details This function will read the whole 64 bits of MINSTRET register.
+ *          On RV32 the value spans MINSTRETH:MINSTRET, so the high half is sampled
+ *          before and after the low half and the read is retried until both agree.
+ * \return  The whole 64 bits value of MINSTRET
+ * \remarks It will work for both RV32 and RV64 to get full 64bits value of MINSTRET
+ */
+__STATIC_FORCEINLINE uint64_t __get_rv_instret(void)
+{
+#if __RISCV_XLEN == 32
+    uint32_t hi, lo, hi_check;
+
+    /* A changed high half means MINSTRET carried over while lo was sampled. */
+    do {
+        hi = __RV_CSR_READ(CSR_MINSTRETH);
+        lo = __RV_CSR_READ(CSR_MINSTRET);
+        hi_check = __RV_CSR_READ(CSR_MINSTRETH);
+    } while (hi != hi_check);
+    return (((uint64_t)hi) << 32) | lo;
+#elif __RISCV_XLEN == 64
+    return (uint64_t)__RV_CSR_READ(CSR_MINSTRET);
+#else // TODO Need cover for XLEN=128 case in future
+    return (uint64_t)__RV_CSR_READ(CSR_MINSTRET);
+#endif
+}
+
+/**
+ * \brief   Read whole 64 bits value of real-time clock
+ * \details This function will read the whole 64 bits of TIME register.
+ *          On RV32 the value spans TIMEH:TIME, so the high half is sampled
+ *          before and after the low half and the read is retried until both agree.
+ * \return  The whole 64 bits value of TIME CSR
+ * \remarks It will work for both RV32 and RV64 to get full 64bits value of TIME
+ * \attention only available when user mode available
+ */
+__STATIC_FORCEINLINE uint64_t __get_rv_time(void)
+{
+#if __RISCV_XLEN == 32
+    uint32_t hi, lo, hi_check;
+
+    /* A changed high half means TIME carried over while lo was sampled. */
+    do {
+        hi = __RV_CSR_READ(CSR_TIMEH);
+        lo = __RV_CSR_READ(CSR_TIME);
+        hi_check = __RV_CSR_READ(CSR_TIMEH);
+    } while (hi != hi_check);
+    return (((uint64_t)hi) << 32) | lo;
+#elif __RISCV_XLEN == 64
+    return (uint64_t)__RV_CSR_READ(CSR_TIME);
+#else // TODO Need cover for XLEN=128 case in future
+    return (uint64_t)__RV_CSR_READ(CSR_TIME);
+#endif
+}
+
+/** @} */ /* End of Doxygen Group NMSIS_Core_CSR_Register_Access */
+
+/* ###########################  CPU Intrinsic Functions ########################### */
+/**
+ * \defgroup NMSIS_Core_CPU_Intrinsic   Intrinsic Functions for CPU Instructions
+ * \ingroup  NMSIS_Core
+ * \brief    Functions that generate RISC-V CPU instructions.
+ * \details
+ *
+ * The following functions generate specified RISC-V instructions that cannot be directly accessed by compiler.
+ *   @{
+ */
+
+/**
+ * \brief   NOP Instruction
+ * \details
+ * Emits a single nop instruction, which performs no operation.
+ * This instruction can be used for code alignment purposes.
+ */
+__STATIC_FORCEINLINE void __NOP(void)
+{
+    __ASM volatile("nop");
+}
+
+/**
+ * \brief   Wait For Interrupt
+ * \details
+ * Wait For Interrupt is executed by clearing CSR_WFE.WFE and then issuing WFI.
+ * It suspends execution until an interrupt, NMI or Debug request happens.
+ * When Core is woken up by interrupt, if
+ * 1. mstatus.MIE == 1(interrupt enabled), Core will enter ISR code
+ * 2. mstatus.MIE == 0(interrupt disabled), Core will resume previous execution
+ */
+__STATIC_FORCEINLINE void __WFI(void)
+{
+    __RV_CSR_CLEAR(CSR_WFE, WFE_WFE);
+    __ASM volatile("wfi");
+}
+
+/**
+ * \brief   Wait For Event
+ * \details
+ * Wait For Event is executed by setting CSR_WFE.WFE, issuing WFI, and clearing
+ * CSR_WFE.WFE again afterwards. It suspends execution until an event, NMI or
+ * Debug request happens. When Core is woken up, it resumes previous execution.
+ */
+__STATIC_FORCEINLINE void __WFE(void)
+{
+    __RV_CSR_SET(CSR_WFE, WFE_WFE);
+    __ASM volatile("wfi");
+    __RV_CSR_CLEAR(CSR_WFE, WFE_WFE);
+}
+
+/**
+ * \brief   Breakpoint Instruction
+ * \details
+ * Emits a single ebreak instruction, which causes the processor to enter
+ * Debug state. Debug tools can use this to investigate system state
+ * when the instruction at a particular address is reached.
+ */
+__STATIC_FORCEINLINE void __EBREAK(void)
+{
+    __ASM volatile("ebreak");
+}
+
+/**
+ * \brief   Environment Call Instruction
+ * \details
+ * Emits a single ecall instruction, which is used to make a service
+ * request to the execution environment. No arguments are set up here.
+ */
+__STATIC_FORCEINLINE void __ECALL(void)
+{
+    __ASM volatile("ecall");
+}
+
+/**
+ * \brief WFI Sleep Mode enumeration, the argument type of \ref __set_wfi_sleepmode
+ */
+typedef enum WFI_SleepMode {
+    WFI_SHALLOW_SLEEP = 0,      /*!< Shallow sleep mode, the core_clk will be powered off */
+    WFI_DEEP_SLEEP = 1          /*!< Deep sleep mode, the core_clk and core_ano_clk will be powered off */
+} WFI_SleepMode_Type;
+
+/**
+ * \brief   Set Sleep mode of WFI
+ * \details
+ * Write \p mode to the SLEEPVALUE CSR register, which controls the
+ * WFI Sleep mode.
+ * \param[in] mode      The sleep mode to be set, see \ref WFI_SleepMode_Type
+ */
+__STATIC_FORCEINLINE void __set_wfi_sleepmode(WFI_SleepMode_Type mode)
+{
+    __RV_CSR_WRITE(CSR_SLEEPVALUE, mode);
+}
+
+/**
+ * \brief   Send TX Event
+ * \details
+ * Set bit 0 of the CSR TXEVT to send a TX Event.
+ * The Core will output signal tx_evt as output event signal.
+ */
+__STATIC_FORCEINLINE void __TXEVT(void)
+{
+    __RV_CSR_SET(CSR_TXEVT, 0x1);
+}
+
+/**
+ * \brief   Enable MCYCLE counter
+ * \details
+ * Clear the CY bit (MCOUNTINHIBIT_CY) of MCOUNTINHIBIT to 0 to enable MCYCLE Counter
+ */
+__STATIC_FORCEINLINE void __enable_mcycle_counter(void)
+{
+    __RV_CSR_CLEAR(CSR_MCOUNTINHIBIT, MCOUNTINHIBIT_CY);
+}
+
+/**
+ * \brief   Disable MCYCLE counter
+ * \details
+ * Set the CY bit (MCOUNTINHIBIT_CY) of MCOUNTINHIBIT to 1 to disable MCYCLE Counter
+ */
+__STATIC_FORCEINLINE void __disable_mcycle_counter(void)
+{
+    __RV_CSR_SET(CSR_MCOUNTINHIBIT, MCOUNTINHIBIT_CY);
+}
+
+/**
+ * \brief   Enable MINSTRET counter
+ * \details
+ * Clear the IR bit (MCOUNTINHIBIT_IR) of MCOUNTINHIBIT to 0 to enable MINSTRET Counter
+ */
+__STATIC_FORCEINLINE void __enable_minstret_counter(void)
+{
+    __RV_CSR_CLEAR(CSR_MCOUNTINHIBIT, MCOUNTINHIBIT_IR);
+}
+
+/**
+ * \brief   Disable MINSTRET counter
+ * \details
+ * Set the IR bit (MCOUNTINHIBIT_IR) of MCOUNTINHIBIT to 1 to disable MINSTRET Counter
+ */
+__STATIC_FORCEINLINE void __disable_minstret_counter(void)
+{
+    __RV_CSR_SET(CSR_MCOUNTINHIBIT, MCOUNTINHIBIT_IR);
+}
+
+/**
+ * \brief   Enable MCYCLE & MINSTRET counter
+ * \details
+ * Clear the IR and CY bits of MCOUNTINHIBIT to 0 to enable MINSTRET & MCYCLE Counter
+ */
+__STATIC_FORCEINLINE void __enable_all_counter(void)
+{
+    __RV_CSR_CLEAR(CSR_MCOUNTINHIBIT, MCOUNTINHIBIT_IR|MCOUNTINHIBIT_CY);
+}
+
+/**
+ * \brief   Disable MCYCLE & MINSTRET counter
+ * \details
+ * Set the IR and CY bits of MCOUNTINHIBIT to 1 to disable MINSTRET & MCYCLE Counter
+ */
+__STATIC_FORCEINLINE void __disable_all_counter(void)
+{
+    __RV_CSR_SET(CSR_MCOUNTINHIBIT, MCOUNTINHIBIT_IR|MCOUNTINHIBIT_CY);
+}
+
+/**
+ * \brief Execute fence instruction, p -> pred, s -> succ
+ * \details
+ * The FENCE instruction ensures that all memory accesses from instructions preceding
+ * the fence in program order (the `predecessor set`) appear earlier in the global memory order than
+ * memory accesses from instructions appearing after the fence in program order (the `successor set`).
+ * See The RISC-V Instruction Set Manual. Both arguments are stringified, so pass bare tokens.
+ * \param p     predecessor set, such as iorw, rw, r, w
+ * \param s     successor set, such as iorw, rw, r, w
+ **/
+#define __FENCE(p, s) __ASM volatile ("fence " #p "," #s : : : "memory")
+
+/**
+ * \brief   Fence.i Instruction
+ * \details
+ * The FENCE.I instruction is used to synchronize the instruction and data
+ * streams. The "memory" clobber keeps the compiler from moving stores across it.
+ */
+__STATIC_FORCEINLINE void __FENCE_I(void)
+{
+    __ASM volatile("fence.i" : : : "memory");
+}
+
+/** \brief Read & Write Memory barrier: fence iorw,iorw (device I/O and memory accesses) */
+#define __RWMB()        __FENCE(iorw,iorw)
+
+/** \brief Read Memory barrier: fence ir,ir (device input and memory reads) */
+#define __RMB()         __FENCE(ir,ir)
+
+/** \brief Write Memory barrier: fence ow,ow (device output and memory writes) */
+#define __WMB()         __FENCE(ow,ow)
+
+/** \brief SMP Read & Write Memory barrier: fence rw,rw (memory accesses only) */
+#define __SMP_RWMB()    __FENCE(rw,rw)
+
+/** \brief SMP Read Memory barrier: fence r,r (memory reads only) */
+#define __SMP_RMB()     __FENCE(r,r)
+
+/** \brief SMP Write Memory barrier: fence w,w (memory writes only) */
+#define __SMP_WMB()     __FENCE(w,w)
+
+/** \brief CPU relax for busy loop: empty asm with "memory" clobber, i.e. a compiler barrier only */
+#define __CPU_RELAX()   __ASM volatile ("" : : : "memory")
+
+
+/* ===== Load/Store Operations ===== */
+/**
+ * \brief  Load 8bit value from address (8 bit)
+ * \details Load 8 bit value with one lb instruction; the asm carries no "memory" clobber.
+ * \param [in]    addr  Address pointer to data
+ * \return              value of type uint8_t at (*addr)
+ */
+__STATIC_FORCEINLINE uint8_t __LB(volatile void *addr)
+{
+    uint8_t result;
+
+    __ASM volatile ("lb %0, 0(%1)" : "=r" (result) : "r" (addr));
+    return result;
+}
+
+/**
+ * \brief  Load 16bit value from address (16 bit)
+ * \details Load 16 bit value with one lh instruction; the asm carries no "memory" clobber.
+ * \param [in]    addr  Address pointer to data
+ * \return              value of type uint16_t at (*addr)
+ */
+__STATIC_FORCEINLINE uint16_t __LH(volatile void *addr)
+{
+    uint16_t result;
+
+    __ASM volatile ("lh %0, 0(%1)" : "=r" (result) : "r" (addr));
+    return result;
+}
+
+/**
+ * \brief  Load 32bit value from address (32 bit)
+ * \details Load 32 bit value with one lw instruction; the asm carries no "memory" clobber.
+ * \param [in]    addr  Address pointer to data
+ * \return              value of type uint32_t at (*addr)
+ */
+__STATIC_FORCEINLINE uint32_t __LW(volatile void *addr)
+{
+    uint32_t result;
+
+    __ASM volatile ("lw %0, 0(%1)" : "=r" (result) : "r" (addr));
+    return result;
+}
+
+#if __RISCV_XLEN != 32
+/**
+ * \brief  Load 64bit value from address (64 bit)
+ * \details Load 64 bit value with one ld instruction; the asm carries no "memory" clobber.
+ * \param [in]    addr  Address pointer to data
+ * \return              value of type uint64_t at (*addr)
+ * \remarks Only compiled when __RISCV_XLEN is not 32
+ */
+__STATIC_FORCEINLINE uint64_t __LD(volatile void *addr)
+{
+    uint64_t result;
+    __ASM volatile ("ld %0, 0(%1)" : "=r" (result) : "r" (addr));
+    return result;
+}
+#endif
+
+/**
+ * \brief  Write 8bit value to address (8 bit)
+ * \details Write 8 bit value with one sb instruction; the asm carries no "memory" clobber.
+ * \param [in]    addr  Address pointer to data
+ * \param [in]    val   Value to set
+ */
+__STATIC_FORCEINLINE void __SB(volatile void *addr, uint8_t val)
+{
+    __ASM volatile ("sb %0, 0(%1)" : : "r" (val), "r" (addr));
+}
+
+/**
+ * \brief  Write 16bit value to address (16 bit)
+ * \details Write 16 bit value with one sh instruction; the asm carries no "memory" clobber.
+ * \param [in]    addr  Address pointer to data
+ * \param [in]    val   Value to set
+ */
+__STATIC_FORCEINLINE void __SH(volatile void *addr, uint16_t val)
+{
+    __ASM volatile ("sh %0, 0(%1)" : : "r" (val), "r" (addr));
+}
+
+/**
+ * \brief  Write 32bit value to address (32 bit)
+ * \details Write 32 bit value with one sw instruction; the asm carries no "memory" clobber.
+ * \param [in]    addr  Address pointer to data
+ * \param [in]    val   Value to set
+ */
+__STATIC_FORCEINLINE void __SW(volatile void *addr, uint32_t val)
+{
+    __ASM volatile ("sw %0, 0(%1)" : : "r" (val), "r" (addr));
+}
+
+#if __RISCV_XLEN != 32
+/**
+ * \brief  Write 64bit value to address (64 bit)
+ * \details Write 64 bit value with one sd instruction; the asm carries no "memory" clobber.
+ * \param [in]    addr  Address pointer to data
+ * \param [in]    val   Value to set
+ */
+__STATIC_FORCEINLINE void __SD(volatile void *addr, uint64_t val)
+{
+    __ASM volatile ("sd %0, 0(%1)" : : "r" (val), "r" (addr));
+}
+#endif
+
+/**
+ * \brief  Compare and Swap 32bit value using LR and SC
+ * \details Load-reserve the word at \p addr; if it equals \p oldval, try to
+ * store-conditional \p newval and retry from the load when the store fails.
+ * If the loaded word differs from \p oldval nothing is stored.
+ * The swap succeeded if and only if the return value equals \p oldval.
+ * \param [in]    addr      Address pointer to data, address need to be 4byte aligned
+ * \param [in]    oldval    Value expected to be found at the address
+ * \param [in]    newval    New value to be stored into the address
+ * \return  return the initial value in memory (the value loaded by lr.w)
+ */
+__STATIC_FORCEINLINE uint32_t __CAS_W(volatile uint32_t *addr, uint32_t oldval, uint32_t newval)
+{
+    register uint32_t result;
+    register uint32_t rc;
+
+    __ASM volatile (                                \
+            "0:     lr.w %0, %2      \n"            \
+            "       bne  %0, %z3, 1f \n"            \
+            "       sc.w %1, %z4, %2 \n"            \
+            "       bnez %1, 0b      \n"            \
+            "1:\n"                                  \
+            : "=&r"(result), "=&r"(rc), "+A"(*addr) \
+            : "r"(oldval), "r"(newval)              \
+            : "memory");
+    return result;
+}
+
+/**
+ * \brief  Atomic Swap 32bit value into memory
+ * \details Atomically swap new 32bit value into memory using amoswap.w.
+ * \param [in]    addr      Address pointer to data, address need to be 4byte aligned
+ * \param [in]    newval    New value to be stored into the address
+ * \return  return the original value in memory (the value fetched by amoswap.w)
+ */
+__STATIC_FORCEINLINE uint32_t __AMOSWAP_W(volatile uint32_t *addr, uint32_t newval)
+{
+    register uint32_t result;
+
+    __ASM volatile ("amoswap.w %0, %2, %1" : \
+            "=r"(result), "+A"(*addr) : "r"(newval) : "memory");
+    return result;
+}
+
+/**
+ * \brief  Atomic Add with 32bit value
+ * \details Atomically ADD 32bit value with value in memory using amoadd.w.
+ * \param [in]    addr   Address pointer to data, address need to be 4byte aligned
+ * \param [in]    value  value to be ADDed
+ * \return  return memory value + add value (wraps on overflow like amoadd.w)
+ */
+__STATIC_FORCEINLINE int32_t __AMOADD_W(volatile int32_t *addr, int32_t value)
+{
+    register int32_t result;
+    __ASM volatile ("amoadd.w %0, %2, %1" : \
+            "=r"(result), "+A"(*addr) : "r"(value) : "memory");
+    /* Use the fetched old value; re-reading *addr would race with other writers. */
+    return (int32_t)((uint32_t)result + (uint32_t)value);
+}
+
+/**
+ * \brief  Atomic And with 32bit value
+ * \details Atomically AND 32bit value with value in memory using amoand.w.
+ * \param [in]    addr   Address pointer to data, address need to be 4byte aligned
+ * \param [in]    value  value to be ANDed
+ * \return  return memory value & and value
+ */
+__STATIC_FORCEINLINE int32_t __AMOAND_W(volatile int32_t *addr, int32_t value)
+{
+    register int32_t result;
+    __ASM volatile ("amoand.w %0, %2, %1" : \
+            "=r"(result), "+A"(*addr) : "r"(value) : "memory");
+    /* Use the fetched old value; re-reading *addr would race with other writers. */
+    return result & value;
+}
+
+/**
+ * \brief  Atomic OR with 32bit value
+ * \details Atomically OR 32bit value with value in memory using amoor.w.
+ * \param [in]    addr   Address pointer to data, address need to be 4byte aligned
+ * \param [in]    value  value to be ORed
+ * \return  return memory value | or value
+ */
+__STATIC_FORCEINLINE int32_t __AMOOR_W(volatile int32_t *addr, int32_t value)
+{
+    register int32_t result;
+    __ASM volatile ("amoor.w %0, %2, %1" : \
+            "=r"(result), "+A"(*addr) : "r"(value) : "memory");
+    /* Use the fetched old value; re-reading *addr would race with other writers. */
+    return result | value;
+}
+
+/**
+ * \brief  Atomic XOR with 32bit value
+ * \details Atomically XOR 32bit value with value in memory using amoxor.w.
+ * \param [in]    addr   Address pointer to data, address need to be 4byte aligned
+ * \param [in]    value  value to be XORed
+ * \return  return memory value ^ xor value
+ */
+__STATIC_FORCEINLINE int32_t __AMOXOR_W(volatile int32_t *addr, int32_t value)
+{
+    register int32_t result;
+    __ASM volatile ("amoxor.w %0, %2, %1" : \
+            "=r"(result), "+A"(*addr) : "r"(value) : "memory");
+    /* Use the fetched old value; re-reading *addr would race with other writers. */
+    return result ^ value;
+}
+
+/**
+ * \brief  Atomic unsigned MAX with 32bit value
+ * \details Atomically unsigned max compare 32bit value with value in memory using amomaxu.w.
+ * \param [in]    addr   Address pointer to data, address need to be 4byte aligned
+ * \param [in]    value  value to be compared
+ * \return  return the bigger value
+ */
+__STATIC_FORCEINLINE uint32_t __AMOMAXU_W(volatile uint32_t *addr, uint32_t value)
+{
+    register uint32_t result;
+    __ASM volatile ("amomaxu.w %0, %2, %1" : \
+            "=r"(result), "+A"(*addr) : "r"(value) : "memory");
+    /* Use the fetched old value; re-reading *addr would race with other writers. */
+    return (result > value) ? result : value;
+}
+
+/**
+ * \brief  Atomic signed MAX with 32bit value
+ * \details Atomically signed max compare 32bit value with value in memory using amomax.w.
+ * \param [in]    addr   Address pointer to data, address need to be 4byte aligned
+ * \param [in]    value  value to be compared
+ * \return the bigger value
+ */
+__STATIC_FORCEINLINE int32_t __AMOMAX_W(volatile int32_t *addr, int32_t value)
+{
+    register int32_t result;
+    __ASM volatile ("amomax.w %0, %2, %1" : \
+            "=r"(result), "+A"(*addr) : "r"(value) : "memory");
+    /* Use the fetched old value; re-reading *addr would race with other writers. */
+    return (result > value) ? result : value;
+}
+
+/**
+ * \brief  Atomic unsigned MIN with 32bit value
+ * \details Atomically unsigned min compare 32bit value with value in memory using amominu.w.
+ * \param [in]    addr   Address pointer to data, address need to be 4byte aligned
+ * \param [in]    value  value to be compared
+ * \return the smaller value
+ */
+__STATIC_FORCEINLINE uint32_t __AMOMINU_W(volatile uint32_t *addr, uint32_t value)
+{
+    register uint32_t result;
+    __ASM volatile ("amominu.w %0, %2, %1" : \
+            "=r"(result), "+A"(*addr) : "r"(value) : "memory");
+    /* Use the fetched old value; re-reading *addr would race with other writers. */
+    return (result < value) ? result : value;
+}
+
+/**
+ * \brief  Atomic signed MIN with 32bit value
+ * \details Atomically signed min compare 32bit value with value in memory using amomin.w.
+ * \param [in]    addr   Address pointer to data, address need to be 4byte aligned
+ * \param [in]    value  value to be compared
+ * \return  the smaller value
+ */
+__STATIC_FORCEINLINE int32_t __AMOMIN_W(volatile int32_t *addr, int32_t value)
+{
+    register int32_t result;
+    __ASM volatile ("amomin.w %0, %2, %1" : \
+            "=r"(result), "+A"(*addr) : "r"(value) : "memory");
+    /* Use the fetched old value; re-reading *addr would race with other writers. */
+    return (result < value) ? result : value;
+}
+
+#if __RISCV_XLEN == 64
+/**
+ * \brief  Compare and Swap 64bit value using LR and SC
+ * \details Load-reserve the doubleword at \p addr; if it equals \p oldval, try to
+ * store-conditional \p newval and retry from the load when the store fails.
+ * If the loaded doubleword differs from \p oldval nothing is stored.
+ * The swap succeeded if and only if the return value equals \p oldval.
+ * \param [in]    addr      Address pointer to data, address need to be 8byte aligned
+ * \param [in]    oldval    Value expected to be found at the address
+ * \param [in]    newval    New value to be stored into the address
+ * \return  return the initial value in memory (the value loaded by lr.d)
+ */
+__STATIC_FORCEINLINE uint64_t __CAS_D(volatile uint64_t *addr, uint64_t oldval, uint64_t newval)
+{
+    register uint64_t result;
+    register uint64_t rc;
+
+    __ASM volatile (                                \
+            "0:     lr.d %0, %2      \n"            \
+            "       bne  %0, %z3, 1f \n"            \
+            "       sc.d %1, %z4, %2 \n"            \
+            "       bnez %1, 0b      \n"            \
+            "1:\n"                                  \
+            : "=&r"(result), "=&r"(rc), "+A"(*addr) \
+            : "r"(oldval), "r"(newval)              \
+            : "memory");
+    return result;
+}
+
+/**
+ * \brief  Atomic Swap 64bit value into memory
+ * \details Atomically swap new 64bit value into memory using amoswap.d.
+ * \param [in]    addr      Address pointer to data, address need to be 8byte aligned
+ * \param [in]    newval    New value to be stored into the address
+ * \return  return the original value in memory (the value fetched by amoswap.d)
+ */
+__STATIC_FORCEINLINE uint64_t __AMOSWAP_D(volatile uint64_t *addr, uint64_t newval)
+{
+    register uint64_t result;
+
+    __ASM volatile ("amoswap.d %0, %2, %1" : \
+            "=r"(result), "+A"(*addr) : "r"(newval) : "memory");
+    return result;
+}
+
+/**
+ * \brief  Atomic Add with 64bit value
+ * \details Atomically ADD 64bit value with value in memory using amoadd.d.
+ * \param [in]    addr   Address pointer to data, address need to be 8byte aligned
+ * \param [in]    value  value to be ADDed
+ * \return  return memory value + add value (wraps on overflow like amoadd.d)
+ */
+__STATIC_FORCEINLINE int64_t __AMOADD_D(volatile int64_t *addr, int64_t value)
+{
+    register int64_t result;
+    __ASM volatile ("amoadd.d %0, %2, %1" : \
+            "=r"(result), "+A"(*addr) : "r"(value) : "memory");
+    /* Use the fetched old value; re-reading *addr would race with other writers. */
+    return (int64_t)((uint64_t)result + (uint64_t)value);
+}
+
+/**
+ * \brief  Atomic And with 64bit value
+ * \details Atomically AND 64bit value with value in memory using amoand.d.
+ * \param [in]    addr   Address pointer to data, address need to be 8byte aligned
+ * \param [in]    value  value to be ANDed
+ * \return  return memory value & and value
+ */
+__STATIC_FORCEINLINE int64_t __AMOAND_D(volatile int64_t *addr, int64_t value)
+{
+    register int64_t result;
+    __ASM volatile ("amoand.d %0, %2, %1" : \
+            "=r"(result), "+A"(*addr) : "r"(value) : "memory");
+    /* Use the fetched old value; re-reading *addr would race with other writers. */
+    return result & value;
+}
+
+/**
+ * \brief  Atomic OR with 64bit value
+ * \details Atomically OR 64bit value with value in memory using amoor.d.
+ * \param [in]    addr   Address pointer to data, address need to be 8byte aligned
+ * \param [in]    value  value to be ORed
+ * \return  return memory value | or value
+ */
+__STATIC_FORCEINLINE int64_t __AMOOR_D(volatile int64_t *addr, int64_t value)
+{
+    register int64_t result;
+    __ASM volatile ("amoor.d %0, %2, %1" : \
+            "=r"(result), "+A"(*addr) : "r"(value) : "memory");
+    /* Use the fetched old value; re-reading *addr would race with other writers. */
+    return result | value;
+}
+
+/**
+ * \brief  Atomic XOR with 64bit value
+ * \details Atomically XOR 64bit value with value in memory using amoxor.d.
+ * \param [in]    addr   Address pointer to data, address need to be 8byte aligned
+ * \param [in]    value  value to be XORed
+ * \return  return memory value ^ xor value
+ */
+__STATIC_FORCEINLINE int64_t __AMOXOR_D(volatile int64_t *addr, int64_t value)
+{
+    register int64_t result;
+    __ASM volatile ("amoxor.d %0, %2, %1" : \
+            "=r"(result), "+A"(*addr) : "r"(value) : "memory");
+    /* Use the fetched old value; re-reading *addr would race with other writers. */
+    return result ^ value;
+}
+
+/**
+ * \brief  Atomic unsigned MAX with 64bit value
+ * \details Atomically unsigned max compare 64bit value with value in memory using amomaxu.d.
+ * \param [in]    addr   Address pointer to data, address need to be 8byte aligned
+ * \param [in]    value  value to be compared
+ * \return  return the bigger value
+ */
+__STATIC_FORCEINLINE uint64_t __AMOMAXU_D(volatile uint64_t *addr, uint64_t value)
+{
+    register uint64_t result;
+    __ASM volatile ("amomaxu.d %0, %2, %1" : \
+            "=r"(result), "+A"(*addr) : "r"(value) : "memory");
+    /* Use the fetched old value; re-reading *addr would race with other writers. */
+    return (result > value) ? result : value;
+}
+
+/**
+ * \brief  Atomic signed MAX with 64bit value
+ * \details Atomically signed max compare 64bit value with value in memory using amomax.d.
+ * \param [in]    addr   Address pointer to data, address need to be 8byte aligned
+ * \param [in]    value  value to be compared
+ * \return the bigger value
+ */
+__STATIC_FORCEINLINE int64_t __AMOMAX_D(volatile int64_t *addr, int64_t value)
+{
+    register int64_t result;
+    __ASM volatile ("amomax.d %0, %2, %1" : \
+            "=r"(result), "+A"(*addr) : "r"(value) : "memory");
+    /* Use the fetched old value; re-reading *addr would race with other writers. */
+    return (result > value) ? result : value;
+}
+
+/**
+ * \brief  Atomic unsigned MIN with 64bit value
+ * \details Atomically unsigned min compare 64bit value with value in memory using amominu.d.
+ * \param [in]    addr   Address pointer to data, address need to be 8byte aligned
+ * \param [in]    value  value to be compared
+ * \return the smaller value
+ */
+__STATIC_FORCEINLINE uint64_t __AMOMINU_D(volatile uint64_t *addr, uint64_t value)
+{
+    register uint64_t result;
+    __ASM volatile ("amominu.d %0, %2, %1" : \
+            "=r"(result), "+A"(*addr) : "r"(value) : "memory");
+    /* Use the fetched old value; re-reading *addr would race with other writers. */
+    return (result < value) ? result : value;
+}
+
+/**
+ * \brief  Atomic signed MIN with 64bit value
+ * \details Atomically signed min compare 64bit value with value in memory using amomin.d.
+ * \param [in]    addr   Address pointer to data, address need to be 8byte aligned
+ * \param [in]    value  value to be compared
+ * \return  the smaller value
+ */
+__STATIC_FORCEINLINE int64_t __AMOMIN_D(volatile int64_t *addr, int64_t value)
+{
+    register int64_t result;
+    __ASM volatile ("amomin.d %0, %2, %1" : \
+            "=r"(result), "+A"(*addr) : "r"(value) : "memory");
+    /* Use the fetched old value; re-reading *addr would race with other writers. */
+    return (result < value) ? result : value;
+}
+#endif /* __RISCV_XLEN == 64  */
+
+/** @} */ /* End of Doxygen Group NMSIS_Core_CPU_Intrinsic */
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* __CORE_FEATURE_BASE__ */
diff --git a/external/arch/riscv/nuclei/NMSIS/Core/Include/core_feature_cache.h b/external/arch/riscv/nuclei/NMSIS/Core/Include/core_feature_cache.h
new file mode 100644
index 000000000..38b9eb972
--- /dev/null
+++ b/external/arch/riscv/nuclei/NMSIS/Core/Include/core_feature_cache.h
@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) 2019 Nuclei Limited. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __CORE_FEATURE_CACHE_H__
+#define __CORE_FEATURE_CACHE_H__
+/*!
+ * @file     core_feature_cache.h
+ * @brief    Cache feature API header file for Nuclei N/NX Core
+ */
+/*
+ * Cache Feature Configuration Macro:
+ * 1. __ICACHE_PRESENT:  Define whether I-Cache Unit is present or not.
+ *   * 0: Not present
+ *   * 1: Present
+ * 2. __DCACHE_PRESENT:  Define whether D-Cache Unit is present or not.
+ *   * 0: Not present
+ *   * 1: Present
+ */
+#ifdef __cplusplus
+ extern "C" {
+#endif
+
+#if defined(__ICACHE_PRESENT) && (__ICACHE_PRESENT == 1)
+
+/* ##########################  Cache functions  #################################### */
+/**
+ * \defgroup NMSIS_Core_Cache       Cache Functions
+ * \brief    Functions that configure Instruction and Data Cache.
+ * @{
+ */
+
+/** @} */ /* End of Doxygen Group NMSIS_Core_Cache */
+
+/**
+ * \defgroup NMSIS_Core_ICache      I-Cache Functions
+ * \ingroup  NMSIS_Core_Cache
+ * \brief    Functions that configure Instruction Cache.
+ * @{
+ */
+/**
+ * \brief  Enable ICache
+ * \details
+ * This function enables the I-Cache by setting CSR_MCACHE_CTL_IE in CSR_MCACHE_CTL.
+ * \remarks
+ * - The \ref CSR_MCACHE_CTL register controls whether the I-Cache is enabled.
+ * \sa
+ * - \ref DisableICache
+*/
+__STATIC_FORCEINLINE void EnableICache (void)
+{
+    __RV_CSR_SET(CSR_MCACHE_CTL, CSR_MCACHE_CTL_IE);
+}
+
+/**
+ * \brief  Disable ICache
+ * \details
+ * This function disables the I-Cache by clearing CSR_MCACHE_CTL_IE in CSR_MCACHE_CTL.
+ * \remarks
+ * - The \ref CSR_MCACHE_CTL register controls whether the I-Cache is enabled.
+ * \sa
+ * - \ref EnableICache
+ */
+__STATIC_FORCEINLINE void DisableICache (void)
+{
+    __RV_CSR_CLEAR(CSR_MCACHE_CTL, CSR_MCACHE_CTL_IE);
+}
+/** @} */ /* End of Doxygen Group NMSIS_Core_ICache */
+#endif /* defined(__ICACHE_PRESENT) && (__ICACHE_PRESENT == 1) */
+
+#if defined(__DCACHE_PRESENT) && (__DCACHE_PRESENT == 1)
+/**
+ * \defgroup NMSIS_Core_DCache      D-Cache Functions
+ * \ingroup  NMSIS_Core_Cache
+ * \brief    Functions that configure Data Cache.
+ * @{
+ */
+/**
+ * \brief  Enable DCache
+ * \details
+ * This function enables the D-Cache by setting CSR_MCACHE_CTL_DE in CSR_MCACHE_CTL.
+ * \remarks
+ * - The \ref CSR_MCACHE_CTL register controls whether the D-Cache is enabled.
+ * \sa
+ * - \ref DisableDCache
+*/
+__STATIC_FORCEINLINE void EnableDCache (void)
+{
+    __RV_CSR_SET(CSR_MCACHE_CTL, CSR_MCACHE_CTL_DE);
+}
+
+/**
+ * \brief  Disable DCache
+ * \details
+ * This function disables the D-Cache by clearing CSR_MCACHE_CTL_DE in CSR_MCACHE_CTL.
+ * \remarks
+ * - The \ref CSR_MCACHE_CTL register controls whether the D-Cache is enabled.
+ * \sa
+ * - \ref EnableDCache
+ */
+__STATIC_FORCEINLINE void DisableDCache (void)
+{
+    __RV_CSR_CLEAR(CSR_MCACHE_CTL, CSR_MCACHE_CTL_DE);
+}
+/** @} */ /* End of Doxygen Group NMSIS_Core_DCache */
+#endif /* defined(__DCACHE_PRESENT) && (__DCACHE_PRESENT == 1) */
+
+#ifdef __cplusplus
+}
+#endif
+#endif /** __CORE_FEATURE_CACHE_H__ */
diff --git a/external/arch/riscv/nuclei/NMSIS/Core/Include/core_feature_dsp.h b/external/arch/riscv/nuclei/NMSIS/Core/Include/core_feature_dsp.h
new file mode 100644
index 000000000..4d41e553e
--- /dev/null
+++ b/external/arch/riscv/nuclei/NMSIS/Core/Include/core_feature_dsp.h
@@ -0,0 +1,18659 @@
+/*
+ * Copyright (c) 2019 Nuclei Limited. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __CORE_FEATURE_DSP__
+#define __CORE_FEATURE_DSP__
+
+/*!
+ * @file     core_feature_dsp.h
+ * @brief    DSP feature API header file for Nuclei N/NX Core
+ */
+/*
+ * DSP Feature Configuration Macro:
+ * 1. __DSP_PRESENT:  Define whether Digital Signal Processing Unit(DSP) is present or not
+ *   * 0: Not present
+ *   * 1: Present
+ */
+#ifdef __cplusplus
+ extern "C" {
+#endif
+
+#if defined(__DSP_PRESENT) && (__DSP_PRESENT == 1)
+
+/* ###########################  CPU SIMD DSP Intrinsic Functions ########################### */
+/**
+ * \defgroup NMSIS_Core_DSP_Intrinsic   Intrinsic Functions for SIMD Instructions
+ * \ingroup  NMSIS_Core
+ * \brief    Functions that generate RISC-V DSP SIMD instructions.
+ * \details
+ *
+ * The following functions generate specified RISC-V SIMD instructions that cannot be directly accessed by compiler.
+ * * **DSP ISA Extension Instruction Summary**
+ *   + **Shorthand Definitions**
+ *     - r.H == r.H1: r[31:16], r.L == r.H0: r[15:0]
+ *     - r.B3: r[31:24], r.B2: r[23:16], r.B1: r[15:8], r.B0: r[7:0]
+ *     - r.B[x]: r[(x*8+7):(x*8+0)]
+ *     - r.H[x]: r[(x*16+15):(x*16+0)]
+ *     - r.W[x]: r[(x*32+31):(x*32+0)]
+ *     - r[xU]: the upper 32-bit of a 64-bit number; xU represents the GPR number that contains this upper part 32-bit value.
+ *     - r[xL]: the lower 32-bit of a 64-bit number; xL represents the GPR number that contains this lower part 32-bit value.
+ *     - r[xU].r[xL]: a 64-bit number that is formed from a pair of GPRs.
+ *     - s>>: signed arithmetic right shift
+ *     - u>>: unsigned logical right shift
+ *     - SAT.Qn(): Saturate to the range of [-2^n, 2^n-1], if saturation happens, set PSW.OV.
+ *     - SAT.Um(): Saturate to the range of [0, 2^m-1], if saturation happens, set PSW.OV.
+ *     - RUND(): Indicate `rounding`, i.e., add 1 to the most significant discarded bit for right shift or MSW-type multiplication instructions.
+ *     - Sign or Zero Extending functions:
+ *       - SEm(data): Sign-Extend data to m-bit.
+ *       - ZEm(data): Zero-Extend data to m-bit.
+ *     - ABS(x): Calculate the absolute value of `x`.
+ *     - CONCAT(x,y): Concatenate `x` and `y` to form a value.
+ *     - u<: Unsigned less than comparison.
+ *     - u<=: Unsigned less than or equal comparison.
+ *     - u>: Unsigned greater than comparison.
+ *     - s*: Signed multiplication.
+ *     - u*: Unsigned multiplication.
+ *
+ *   @{
+ */
+/** @} */ /* End of Doxygen Group NMSIS_Core_DSP_Intrinsic */
+
+
+/**
+ * \defgroup NMSIS_Core_DSP_Intrinsic_SIMD_DATA_PROCESS      SIMD Data Processing Instructions
+ * \ingroup  NMSIS_Core_DSP_Intrinsic
+ * \brief    SIMD Data Processing Instructions
+ * \details
+ */
+
+/**
+ * \defgroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB      SIMD 16-bit Add/Subtract Instructions
+ * \ingroup  NMSIS_Core_DSP_Intrinsic_SIMD_DATA_PROCESS
+ * \brief    SIMD 16-bit Add/Subtract Instructions
+ * \details
+ * Based on the combination of the types of the two 16-bit arithmetic operations, the SIMD 16-bit
+ * add/subtract instructions can be classified into 6 main categories: Addition (two 16-bit addition),
+ * Subtraction (two 16-bit subtraction), Crossed Add & Sub (one addition and one subtraction), and
+ * Crossed Sub & Add (one subtraction and one addition), Straight Add & Sub (one addition and one
+ * subtraction), and Straight Sub & Add (one subtraction and one addition).
+ * Based on the way of how an overflow condition is handled, the SIMD 16-bit add/subtract
+ * instructions can be classified into 5 groups: Wrap-around (dropping overflow), Signed Halving
+ * (keeping overflow by dropping 1 LSB bit), Unsigned Halving, Signed Saturation (clipping overflow),
+ * and Unsigned Saturation.
+ * Together, there are 30 SIMD 16-bit add/subtract instructions.
+ */
+
+/**
+ * \defgroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_ADDSUB      SIMD 8-bit Addition & Subtraction Instructions
+ * \ingroup  NMSIS_Core_DSP_Intrinsic_SIMD_DATA_PROCESS
+ * \brief    SIMD 8-bit Addition & Subtraction Instructions
+ * \details
+ * Based on the types of the four 8-bit arithmetic operations, the SIMD 8-bit add/subtract instructions
+ * can be classified into 2 main categories: Addition (four 8-bit addition), and Subtraction (four 8-bit
+ * subtraction).
+ * Based on the way of how an overflow condition is handled for signed or unsigned operation, the
+ * SIMD 8-bit add/subtract instructions can be classified into 5 groups: Wrap-around (dropping
+ * overflow), Signed Halving (keeping overflow by dropping 1 LSB bit), Unsigned Halving, Signed
+ * Saturation (clipping overflow), and Unsigned Saturation.
+ * Together, there are 10 SIMD 8-bit add/subtract instructions.
+ */
+
+/**
+ * \defgroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_SHIFT      SIMD 16-bit Shift Instructions
+ * \ingroup  NMSIS_Core_DSP_Intrinsic_SIMD_DATA_PROCESS
+ * \brief    SIMD 16-bit Shift Instructions
+ * \details
+ * there are 14 SIMD 16-bit shift instructions.
+ */
+
+/**
+ * \defgroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_SHIFT      SIMD 8-bit Shift Instructions
+ * \ingroup  NMSIS_Core_DSP_Intrinsic_SIMD_DATA_PROCESS
+ * \brief    SIMD 8-bit Shift Instructions
+ * \details
+ *  there are 14 SIMD 8-bit shift instructions.
+ */
+
+/**
+ * \defgroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_CMP      SIMD 16-bit Compare Instructions
+ * \ingroup  NMSIS_Core_DSP_Intrinsic_SIMD_DATA_PROCESS
+ * \brief    SIMD 16-bit Compare Instructions
+ * \details
+ *  there are 5 SIMD 16-bit Compare instructions.
+ */
+
+/**
+ * \defgroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_CMP      SIMD 8-bit Compare Instructions
+ * \ingroup  NMSIS_Core_DSP_Intrinsic_SIMD_DATA_PROCESS
+ * \brief    SIMD 8-bit Compare Instructions
+ * \details
+ *  there are 5  SIMD 8-bit Compare instructions.
+ */
+
+/**
+ * \defgroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_MULTIPLY      SIMD 16-bit Multiply Instructions
+ * \ingroup  NMSIS_Core_DSP_Intrinsic_SIMD_DATA_PROCESS
+ * \brief    SIMD 16-bit Multiply Instructions
+ * \details
+ * there are 6 SIMD 16-bit Multiply instructions.
+ */
+
+/**
+ * \defgroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_MULTIPLY      SIMD 8-bit Multiply Instructions
+ * \ingroup  NMSIS_Core_DSP_Intrinsic_SIMD_DATA_PROCESS
+ * \brief    SIMD 8-bit Multiply Instructions
+ * \details
+ *  there are 6 SIMD 8-bit Multiply instructions.
+ */
+
+/**
+ * \defgroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_MISC      SIMD 16-bit Miscellaneous Instructions
+ * \ingroup  NMSIS_Core_DSP_Intrinsic_SIMD_DATA_PROCESS
+ * \brief    SIMD 16-bit Miscellaneous Instructions
+ * \details
+ *  there are 10 SIMD 16-bit Misc instructions.
+ */
+
+/**
+ * \defgroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_MISC      SIMD 8-bit Miscellaneous Instructions
+ * \ingroup  NMSIS_Core_DSP_Intrinsic_SIMD_DATA_PROCESS
+ * \brief    SIMD 8-bit Miscellaneous Instructions
+ * \details
+ *  there are 10 SIMD 8-bit Miscellaneous instructions.
+ */
+
+/**
+ * \defgroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_UNPACK      SIMD 8-bit Unpacking Instructions
+ * \ingroup  NMSIS_Core_DSP_Intrinsic_SIMD_DATA_PROCESS
+ * \brief    SIMD 8-bit Unpacking Instructions
+ * \details
+ *  there are 8 SIMD 8-bit Unpacking instructions.
+ */
+
+/**
+ * \defgroup NMSIS_Core_DSP_Intrinsic_NON_SIMD      Non-SIMD Instructions
+ * \ingroup  NMSIS_Core_DSP_Intrinsic
+ * \brief    Non-SIMD Instructions
+ * \details
+ */
+
+/**
+ * \defgroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_Q15_SAT_ALU      Non-SIMD Q15 saturation ALU Instructions
+ * \ingroup  NMSIS_Core_DSP_Intrinsic_NON_SIMD
+ * \brief    Non-SIMD Q15 saturation ALU Instructions
+ * \details
+ * there are 7 Non-SIMD Q15 saturation ALU Instructions
+ */
+
+/**
+ * \defgroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_Q31_SAT_ALU      Non-SIMD Q31 saturation ALU Instructions
+ * \ingroup  NMSIS_Core_DSP_Intrinsic_NON_SIMD
+ * \brief    Non-SIMD Q31 saturation ALU Instructions
+ * \details
+ *  there are Non-SIMD Q31 saturation ALU Instructions
+ */
+
+/**
+ * \defgroup NMSIS_Core_DSP_Intrinsic_32B_COMPUTATION      32-bit Computation Instructions
+ * \ingroup  NMSIS_Core_DSP_Intrinsic_NON_SIMD
+ * \brief    32-bit Computation Instructions
+ * \details
+ * there are 8 32-bit Computation Instructions
+ */
+
+/**
+ * \defgroup NMSIS_Core_DSP_Intrinsic_OV_FLAG_SC      OV (Overflow) flag Set/Clear Instructions
+ * \ingroup  NMSIS_Core_DSP_Intrinsic_NON_SIMD
+ * \brief    OV (Overflow) flag Set/Clear Instructions
+ * \details
+ * The following table lists the user instructions related to Overflow (OV) flag manipulation. there are 2 OV (Overflow) flag Set/Clear Instructions
+ */
+
+/**
+ * \defgroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_MISC      Non-SIMD Miscellaneous Instructions
+ * \ingroup  NMSIS_Core_DSP_Intrinsic_NON_SIMD
+ * \brief    Non-SIMD Miscellaneous Instructions
+ * \details
+ * There are 13 Miscellaneous Instructions here.
+ */
+
+/**
+ * \defgroup NMSIS_Core_DSP_Intrinsic_PART_SIMD_DATA_PROCESS      Partial-SIMD Data Processing Instructions
+ * \ingroup  NMSIS_Core_DSP_Intrinsic
+ * \brief    Partial-SIMD Data Processing Instructions
+ * \details
+ */
+
+/**
+ * \defgroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_PACK      SIMD 16-bit Packing Instructions
+ * \ingroup  NMSIS_Core_DSP_Intrinsic_PART_SIMD_DATA_PROCESS
+ * \brief    SIMD 16-bit Packing Instructions
+ * \details
+ * there are 4 SIMD16-bit Packing Instructions.
+ */
+
+/**
+ * \defgroup NMSIS_Core_DSP_Intrinsic_SIGNED_MSW_32X32_MAC      Signed MSW 32x32 Multiply and Add Instructions
+ * \ingroup  NMSIS_Core_DSP_Intrinsic_PART_SIMD_DATA_PROCESS
+ * \brief    Signed MSW 32x32 Multiply and Add Instructions
+ * \details
+ *  there are 8 Signed MSW 32x32 Multiply and Add Instructions
+ */
+
+/**
+ * \defgroup NMSIS_Core_DSP_Intrinsic_SIGNED_MSW_32X16_MAC      Signed MSW 32x16 Multiply and Add Instructions
+ * \ingroup  NMSIS_Core_DSP_Intrinsic_PART_SIMD_DATA_PROCESS
+ * \brief    Signed MSW 32x16 Multiply and Add Instructions
+ * \details
+ * there are 15 Signed MSW 32x16 Multiply and Add Instructions
+ */
+
+/**
+ * \defgroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_32B_ADDSUB      Signed 16-bit Multiply 32-bit Add/Subtract Instructions
+ * \ingroup  NMSIS_Core_DSP_Intrinsic_PART_SIMD_DATA_PROCESS
+ * \brief    Signed 16-bit Multiply 32-bit Add/Subtract Instructions
+ * \details
+ *  there are 18 Signed 16-bit Multiply 32-bit Add/Subtract Instructions
+ */
+
+/**
+ * \defgroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_64B_ADDSUB      Signed 16-bit Multiply 64-bit Add/Subtract Instructions
+ * \ingroup  NMSIS_Core_DSP_Intrinsic_PART_SIMD_DATA_PROCESS
+ * \brief    Signed 16-bit Multiply 64-bit Add/Subtract Instructions
+ * \details
+ *  there is Signed 16-bit Multiply 64-bit Add/Subtract Instructions
+ */
+
+/**
+ * \defgroup NMSIS_Core_DSP_Intrinsic_PART_SIMD_MISC      Partial-SIMD Miscellaneous Instructions
+ * \ingroup  NMSIS_Core_DSP_Intrinsic_PART_SIMD_DATA_PROCESS
+ * \brief    Partial-SIMD Miscellaneous Instructions
+ * \details
+ *  there are  7 Partial-SIMD Miscellaneous Instructions
+ */
+
+/**
+ * \defgroup NMSIS_Core_DSP_Intrinsic_8B_MULT_32B_ADD      8-bit Multiply with 32-bit Add Instructions
+ * \ingroup  NMSIS_Core_DSP_Intrinsic_PART_SIMD_DATA_PROCESS
+ * \brief    8-bit Multiply with 32-bit Add Instructions
+ * \details
+ * there are  3 8-bit Multiply with 32-bit Add Instructions
+ */
+
+/**
+ * \defgroup NMSIS_Core_DSP_Intrinsic_64B_PROFILE      64-bit Profile Instructions
+ * \ingroup  NMSIS_Core_DSP_Intrinsic
+ * \brief    64-bit Profile Instructions
+ * \details
+ */
+
+/**
+ * \defgroup NMSIS_Core_DSP_Intrinsic_64B_ADDSUB      64-bit Addition & Subtraction Instructions
+ * \ingroup  NMSIS_Core_DSP_Intrinsic_64B_PROFILE
+ * \brief    64-bit Addition & Subtraction Instructions
+ * \details
+ * there are 10 64-bit Addition & Subtraction Instructions.
+ */
+
+/**
+ * \defgroup NMSIS_Core_DSP_Intrinsic_32B_MULT_64B_ADDSUB      32-bit Multiply with 64-bit Add/Subtract Instructions
+ * \ingroup  NMSIS_Core_DSP_Intrinsic_64B_PROFILE
+ * \brief    32-bit Multiply with 64-bit Add/Subtract Instructions
+ * \details
+ *  there are 32-bit Multiply 64-bit Add/Subtract Instructions
+ */
+
+/**
+ * \defgroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_64B_ADDSUB      Signed 16-bit Multiply with 64-bit Add/Subtract Instructions
+ * \ingroup  NMSIS_Core_DSP_Intrinsic_64B_PROFILE
+ * \brief    Signed 16-bit Multiply with 64-bit Add/Subtract Instructions
+ * \details
+ * there are 10 Signed 16-bit Multiply with 64-bit Add/Subtract Instructions
+ */
+
+/**
+ * \defgroup NMSIS_Core_DSP_Intrinsic_RV64_ONLY      RV64 Only Instructions
+ * \ingroup  NMSIS_Core_DSP_Intrinsic
+ * \brief    RV64 Only Instructions
+ * \details
+ */
+
+/**
+ * \defgroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB      (RV64 Only) SIMD 32-bit Add/Subtract Instructions
+ * \ingroup  NMSIS_Core_DSP_Intrinsic_RV64_ONLY
+ * \brief    (RV64 Only) SIMD 32-bit Add/Subtract Instructions
+ * \details
+ * The following tables list instructions that are only present in RV64.
+ * There are 30 SIMD 32-bit addition or subtraction instructions.
+ */
+
+/**
+ * \defgroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_SHIFT      (RV64 Only) SIMD 32-bit Shift Instructions
+ * \ingroup  NMSIS_Core_DSP_Intrinsic_RV64_ONLY
+ * \brief    (RV64 Only) SIMD 32-bit Shift Instructions
+ * \details
+ *  there are 14 (RV64 Only) SIMD 32-bit Shift Instructions
+ */
+
+/**
+ * \defgroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_MISC      (RV64 Only) SIMD 32-bit Miscellaneous Instructions
+ * \ingroup  NMSIS_Core_DSP_Intrinsic_RV64_ONLY
+ * \brief    (RV64 Only) SIMD 32-bit Miscellaneous Instructions
+ * \details
+ * there are 5  (RV64 Only) SIMD 32-bit Miscellaneous Instructions
+ */
+
+/**
+ * \defgroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_Q15_SAT_MULT      (RV64 Only) SIMD Q15 Saturating Multiply Instructions
+ * \ingroup  NMSIS_Core_DSP_Intrinsic_RV64_ONLY
+ * \brief    (RV64 Only) SIMD Q15 Saturating Multiply Instructions
+ * \details
+ *  there are 9 (RV64 Only) SIMD Q15 saturating Multiply Instructions
+ */
+
+/**
+ * \defgroup NMSIS_Core_DSP_Intrinsic_RV64_32B_MULT      (RV64 Only) 32-bit Multiply Instructions
+ * \ingroup  NMSIS_Core_DSP_Intrinsic_RV64_ONLY
+ * \brief    (RV64 Only) 32-bit Multiply Instructions
+ * \details
+ *  there are 3 (RV64 Only) 32-bit Multiply Instructions
+ */
+
+/**
+ * \defgroup NMSIS_Core_DSP_Intrinsic_RV64_32B_MULT_ADD      (RV64 Only) 32-bit Multiply & Add Instructions
+ * \ingroup  NMSIS_Core_DSP_Intrinsic_RV64_ONLY
+ * \brief    (RV64 Only) 32-bit Multiply & Add Instructions
+ * \details
+ *  there are  3 (RV64 Only) 32-bit Multiply & Add Instructions
+ */
+
+/**
+ * \defgroup NMSIS_Core_DSP_Intrinsic_RV64_32B_PARALLEL_MAC      (RV64 Only) 32-bit Parallel Multiply & Add Instructions
+ * \ingroup  NMSIS_Core_DSP_Intrinsic_RV64_ONLY
+ * \brief    (RV64 Only) 32-bit Parallel Multiply & Add Instructions
+ * \details
+ * there are 12 (RV64 Only) 32-bit Parallel Multiply & Add Instructions
+ */
+
+/**
+ * \defgroup NMSIS_Core_DSP_Intrinsic_RV64_NON_SIMD_32B_SHIFT      (RV64 Only) Non-SIMD 32-bit Shift Instructions
+ * \ingroup  NMSIS_Core_DSP_Intrinsic_RV64_ONLY
+ * \brief    (RV64 Only) Non-SIMD 32-bit Shift Instructions
+ * \details
+ *  there are 1  (RV64 Only) Non-SIMD 32-bit Shift Instructions
+ */
+
+/**
+ * \defgroup NMSIS_Core_DSP_Intrinsic_RV64_32B_PACK      32-bit Packing Instructions
+ * \ingroup  NMSIS_Core_DSP_Intrinsic_RV64_ONLY
+ * \brief    32-bit Packing Instructions
+ * \details
+ *  There are four 32-bit packing instructions here
+ */
+
+/* ===== Inline Function Start for 3.1. ADD8 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_ADDSUB
+ * \brief ADD8 (SIMD 8-bit Addition)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * ADD8 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 8-bit integer element additions simultaneously.
+ *
+ * **Description**:\n
+ * This instruction adds the 8-bit integer elements in Rs1 with the 8-bit integer elements
+ * in Rs2, and then writes the 8-bit element results to Rd.
+ *
+ * **Note**:\n
+ * This instruction can be used for either signed or unsigned addition.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.B[x] = Rs1.B[x] + Rs2.B[x];
+ * for RV32: x=3...0,
+ * for RV64: x=7...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_ADD8(unsigned long a, unsigned long b)
+{
+    unsigned long result; /* bound to %0 (Rd); a -> %1 (Rs1), b -> %2 (Rs2) */
+    __ASM volatile("add8 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b));
+    return result;
+}
+/* ===== Inline Function End for 3.1. ADD8 ===== */
+
+/* ===== Inline Function Start for 3.2. ADD16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB
+ * \brief ADD16 (SIMD 16-bit Addition)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * ADD16 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 16-bit integer element additions simultaneously.
+ *
+ * **Description**:\n
+ * This instruction adds the 16-bit integer elements in Rs1 with the 16-bit integer
+ * elements in Rs2, and then writes the 16-bit element results to Rd.
+ *
+ * **Note**:\n
+ * This instruction can be used for either signed or unsigned addition.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.H[x] = Rs1.H[x] + Rs2.H[x];
+ * for RV32: x=1...0,
+ * for RV64: x=3...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_ADD16(unsigned long a, unsigned long b)
+{
+    unsigned long result; /* bound to %0 (Rd); a -> %1 (Rs1), b -> %2 (Rs2) */
+    __ASM volatile("add16 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b));
+    return result;
+}
+/* ===== Inline Function End for 3.2. ADD16 ===== */
+
+/* ===== Inline Function Start for 3.3. ADD64 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_64B_ADDSUB
+ * \brief ADD64 (64-bit Addition)
+ * \details
+ * **Type**: 64-bit Profile
+ *
+ * **Syntax**:\n
+ * ~~~
+ * ADD64 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Add two 64-bit signed or unsigned integers.
+ *
+ * **RV32 Description**:\n
+ * This instruction adds the 64-bit integer of an even/odd pair of registers specified
+ * by Rs1(4,1) with the 64-bit integer of an even/odd pair of registers specified by Rs2(4,1), and then
+ * writes the 64-bit result to an even/odd pair of registers specified by Rd(4,1).
+ * Rx(4,1), i.e., value d, determines the even/odd pair group of two registers. Specifically, the register
+ * pair includes register 2d and 2d+1.
+ * The odd `2d+1` register of the pair contains the high 32-bit of the result and the even `2d` register
+ * of the pair contains the low 32-bit of the result.
+ *
+ * **RV64 Description**:\n
+ * This instruction has the same behavior as the ADD instruction in RV64I.
+ *
+ * **Note**:\n
+ * This instruction can be used for either signed or unsigned addition.
+ *
+ * **Operations**:\n
+ * ~~~
+ * RV32:
+ *  t_L = CONCAT(Rd(4,1),1'b0); t_H = CONCAT(Rd(4,1),1'b1);
+ *  a_L = CONCAT(Rs1(4,1),1'b0); a_H = CONCAT(Rs1(4,1),1'b1);
+ *  b_L = CONCAT(Rs2(4,1),1'b0); b_H = CONCAT(Rs2(4,1),1'b1);
+ *  R[t_H].R[t_L] = R[a_H].R[a_L] + R[b_H].R[b_L];
+ * RV64:
+ *  Rd = Rs1 + Rs2;
+ * ~~~
+ *
+ * \param [in]  a    unsigned long long type of value stored in a
+ * \param [in]  b    unsigned long long type of value stored in b
+ * \return value stored in unsigned long long type
+ */
+__STATIC_FORCEINLINE unsigned long long __RV_ADD64(unsigned long long a, unsigned long long b)
+{
+    unsigned long long result; /* bound to %0 (Rd); a -> %1 (Rs1), b -> %2 (Rs2) */
+    __ASM volatile("add64 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b));
+    return result;
+}
+/* ===== Inline Function End for 3.3. ADD64 ===== */
+
+/* ===== Inline Function Start for 3.4. AVE ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_MISC
+ * \brief AVE (Average with Rounding)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * AVE Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Calculate the average of the contents of two general registers.
+ *
+ * **Description**:\n
+ * This instruction calculates the average value of two signed integers stored in Rs1 and
+ * Rs2, rounds up a half-integer result to the nearest integer, and writes the result to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Sum = CONCAT(Rs1[MSB],Rs1[MSB:0]) + CONCAT(Rs2[MSB],Rs2[MSB:0]) + 1;
+ * Rd = Sum[(MSB+1):1];
+ * for RV32: MSB=31,
+ * for RV64: MSB=63
+ * ~~~
+ *
+ * \param [in]  a    long type of value stored in a
+ * \param [in]  b    long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_AVE(long a, long b)
+{
+    long result; /* bound to %0 (Rd); a -> %1 (Rs1), b -> %2 (Rs2) */
+    __ASM volatile("ave %0, %1, %2" : "=r"(result) : "r"(a), "r"(b));
+    return result;
+}
+/* ===== Inline Function End for 3.4. AVE ===== */
+
+/* ===== Inline Function Start for 3.5. BITREV ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_MISC
+ * \brief BITREV (Bit Reverse)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * BITREV Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Reverse the bit positions of the source operand within a specified width starting from bit
+ * 0. The reversed width is a variable from a GPR.
+ *
+ * **Description**:\n
+ * This instruction reverses the bit positions of the content of Rs1. The reversed bit width
+ * is calculated as Rs2[4:0]+1 (RV32) or Rs2[5:0]+1 (RV64). The upper bits beyond the reversed width
+ * are filled with zeros. After the bit reverse operation, the result is written to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * msb = Rs2[4:0]; (for RV32)
+ * msb = Rs2[5:0]; (for RV64)
+ * rev[0:msb] = Rs1[msb:0];
+ * Rd = ZE(rev[msb:0]);
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_BITREV(unsigned long a, unsigned long b)
+{
+    unsigned long result; /* bound to %0 (Rd); a -> %1 (Rs1), b -> %2 (Rs2) */
+    __ASM volatile("bitrev %0, %1, %2" : "=r"(result) : "r"(a), "r"(b));
+    return result;
+}
+/* ===== Inline Function End for 3.5. BITREV ===== */
+
+/* ===== Inline Function Start for 3.6. BITREVI ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_MISC
+ * \brief BITREVI (Bit Reverse Immediate)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * (RV32) BITREVI Rd, Rs1, imm[4:0]
+ * (RV64) BITREVI Rd, Rs1, imm[5:0]
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Reverse the bit positions of the source operand within a specified width starting from bit
+ * 0. The reversed width is an immediate value.
+ *
+ * **Description**:\n
+ * This instruction reverses the bit positions of the content of Rs1. The reversed bit width
+ * is calculated as imm[4:0]+1 (RV32) or imm[5:0]+1 (RV64). The upper bits beyond the reversed width
+ * are filled with zeros. After the bit reverse operation, the result is written to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * msb = imm[4:0]; (RV32)
+ * msb = imm[5:0]; (RV64)
+ * rev[0:msb] = Rs1[msb:0];
+ * Rd = ZE(rev[msb:0]);
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    immediate value (imm in the syntax above); the reversed width is b+1
+ * \return value stored in unsigned long type
+ */
+#define __RV_BITREVI(a, b)    \
+    ({    \
+        unsigned long __bitrevi_src = (unsigned long)(a); /* evaluated first, before the output local is in scope */    \
+        unsigned long __bitrevi_out;    \
+        __ASM volatile("bitrevi %0, %1, %2" : "=r"(__bitrevi_out) : "r"(__bitrevi_src), "K"(b));    \
+        __bitrevi_out;    \
+    })
+/* ===== Inline Function End for 3.6. BITREVI ===== */
+
+/* ===== Inline Function Start for 3.7. BPICK ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_MISC
+ * \brief BPICK (Bit-wise Pick)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * BPICK Rd, Rs1, Rs2, Rc
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Select from two source operands based on a bit mask in the third operand.
+ *
+ * **Description**:\n
+ * This instruction selects individual bits from Rs1 or Rs2, based on the bit mask value in
+ * Rc. If a bit in Rc is 1, the corresponding bit is from Rs1; otherwise, the corresponding bit is from Rs2.
+ * The selection results are written to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd[x] = Rc[x]? Rs1[x] : Rs2[x];
+ * for RV32, x=31...0
+ * for RV64, x=63...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \param [in]  c    unsigned long type of value stored in c
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_BPICK(unsigned long a, unsigned long b, unsigned long c)
+{
+    unsigned long result; /* bound to %0 (Rd); a -> %1 (Rs1), b -> %2 (Rs2), c -> %3 (Rc) */
+    __ASM volatile("bpick %0, %1, %2, %3" : "=r"(result) : "r"(a), "r"(b), "r"(c));
+    return result;
+}
+/* ===== Inline Function End for 3.7. BPICK ===== */
+
+/* ===== Inline Function Start for 3.8. CLROV ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_OV_FLAG_SC
+ * \brief CLROV (Clear OV flag)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * CLROV # pseudo mnemonic
+ * ~~~
+ *
+ * **Purpose**:\n
+ * This pseudo instruction is an alias for the `CSRRCI x0, ucode, 1` instruction.
+ * The intrinsic takes no arguments and returns nothing; it only emits the
+ * `clrov` mnemonic.
+ */
+__STATIC_FORCEINLINE void __RV_CLROV(void)
+{
+    __ASM volatile("clrov ");
+}
+/* ===== Inline Function End for 3.8. CLROV ===== */
+
+/* ===== Inline Function Start for 3.9. CLRS8 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_MISC
+ * \brief CLRS8 (SIMD 8-bit Count Leading Redundant Sign)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * CLRS8 Rd, Rs1
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Count the number of redundant sign bits of the 8-bit elements of a general register.
+ *
+ * **Description**:\n
+ * Starting from the bits next to the sign bits of the 8-bit elements of Rs1, this instruction
+ * counts the number of redundant sign bits and writes the result to the corresponding 8-bit elements
+ * of Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * snum[x] = Rs1.B[x];
+ * cnt[x] = 0;
+ * for (i = 6 to 0) {
+ *   if (snum[x](i) == snum[x](7)) {
+ *     cnt[x] = cnt[x] + 1;
+ *   } else {
+ *     break;
+ *   }
+ * }
+ * Rd.B[x] = cnt[x];
+ * for RV32: x=3...0
+ * for RV64: x=7...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_CLRS8(unsigned long a)
+{
+    unsigned long result; /* bound to %0 (Rd); a -> %1 (Rs1) */
+    __ASM volatile("clrs8 %0, %1" : "=r"(result) : "r"(a));
+    return result;
+}
+/* ===== Inline Function End for 3.9. CLRS8 ===== */
+
+/* ===== Inline Function Start for 3.10. CLRS16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_MISC
+ * \brief CLRS16 (SIMD 16-bit Count Leading Redundant Sign)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * CLRS16 Rd, Rs1
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Count the number of redundant sign bits of the 16-bit elements of a general register.
+ *
+ * **Description**:\n
+ * Starting from the bits next to the sign bits of the 16-bit elements of Rs1, this
+ * instruction counts the number of redundant sign bits and writes the result to the corresponding 16-
+ * bit elements of Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * snum[x] = Rs1.H[x];
+ * cnt[x] = 0;
+ * for (i = 14 to 0) {
+ *   if (snum[x](i) == snum[x](15)) {
+ *     cnt[x] = cnt[x] + 1;
+ *   } else {
+ *     break;
+ *   }
+ * }
+ * Rd.H[x] = cnt[x];
+ * for RV32: x=1...0
+ * for RV64: x=3...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_CLRS16(unsigned long a)
+{
+    unsigned long result; /* bound to %0 (Rd); a -> %1 (Rs1) */
+    __ASM volatile("clrs16 %0, %1" : "=r"(result) : "r"(a));
+    return result;
+}
+/* ===== Inline Function End for 3.10. CLRS16 ===== */
+
+/* ===== Inline Function Start for 3.11. CLRS32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_PART_SIMD_MISC
+ * \brief CLRS32 (SIMD 32-bit Count Leading Redundant Sign)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * CLRS32 Rd, Rs1
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Count the number of redundant sign bits of the 32-bit elements of a general register.
+ *
+ * **Description**:\n
+ * Starting from the bits next to the sign bits of the 32-bit elements of Rs1, this
+ * instruction counts the number of redundant sign bits and writes the result to the corresponding 32-
+ * bit elements of Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * snum[x] = Rs1.W[x];
+ * cnt[x] = 0;
+ * for (i = 30 to 0) {
+ *   if (snum[x](i) == snum[x](31)) {
+ *     cnt[x] = cnt[x] + 1;
+ *   } else {
+ *     break;
+ *   }
+ * }
+ * Rd.W[x] = cnt[x];
+ * for RV32: x=0
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_CLRS32(unsigned long a)
+{
+    unsigned long result; /* bound to %0 (Rd); a -> %1 (Rs1) */
+    __ASM volatile("clrs32 %0, %1" : "=r"(result) : "r"(a));
+    return result;
+}
+/* ===== Inline Function End for 3.11. CLRS32 ===== */
+
+/* ===== Inline Function Start for 3.12. CLO8 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_MISC
+ * \brief CLO8 (SIMD 8-bit Count Leading One)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * CLO8 Rd, Rs1
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Count the number of leading one bits of the 8-bit elements of a general register.
+ *
+ * **Description**:\n
+ * Starting from the most significant bits of the 8-bit elements of Rs1, this instruction
+ * counts the number of leading one bits and writes the results to the corresponding 8-bit elements of
+ * Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * snum[x] = Rs1.B[x];
+ * cnt[x] = 0;
+ * for (i = 7 to 0) {
+ *   if (snum[x](i) == 1) {
+ *     cnt[x] = cnt[x] + 1;
+ *   } else {
+ *     break;
+ *   }
+ * }
+ * Rd.B[x] = cnt[x];
+ * for RV32: x=3...0
+ * for RV64: x=7...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_CLO8(unsigned long a)
+{
+    unsigned long result; /* bound to %0 (Rd); a -> %1 (Rs1) */
+    __ASM volatile("clo8 %0, %1" : "=r"(result) : "r"(a));
+    return result;
+}
+/* ===== Inline Function End for 3.12. CLO8 ===== */
+
+/* ===== Inline Function Start for 3.13. CLO16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_MISC
+ * \brief CLO16 (SIMD 16-bit Count Leading One)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * CLO16 Rd, Rs1
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Count the number of leading one bits of the 16-bit elements of a general register.
+ *
+ * **Description**:\n
+ * Starting from the most significant bits of the 16-bit elements of Rs1, this instruction
+ * counts the number of leading one bits and writes the results to the corresponding 16-bit elements
+ * of Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * snum[x] = Rs1.H[x];
+ * cnt[x] = 0;
+ * for (i = 15 to 0) {
+ *   if (snum[x](i) == 1) {
+ *     cnt[x] = cnt[x] + 1;
+ *   } else {
+ *     break;
+ *   }
+ * }
+ * Rd.H[x] = cnt[x];
+ * for RV32: x=1...0
+ * for RV64: x=3...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_CLO16(unsigned long a)
+{
+    unsigned long rd; /* operand %0 (Rd); a is operand %1 (Rs1) */
+    __ASM volatile("clo16 %0, %1" : "=r"(rd) : "r"(a));
+    return rd;
+}
+/* ===== Inline Function End for 3.13. CLO16 ===== */
+
+/* ===== Inline Function Start for 3.14. CLO32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_PART_SIMD_MISC
+ * \brief CLO32 (SIMD 32-bit Count Leading One)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * CLO32 Rd, Rs1
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Count the number of leading one bits of the 32-bit elements of a general register.
+ *
+ * **Description**:\n
+ * Starting from the most significant bits of the 32-bit elements of Rs1, this instruction
+ * counts the number of leading one bits and writes the results to the corresponding 32-bit elements
+ * of Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * snum[x] = Rs1.W[x];
+ * cnt[x] = 0;
+ * for (i = 31 to 0) {
+ *   if (snum[x](i) == 1) {
+ *     cnt[x] = cnt[x] + 1;
+ *   } else {
+ *     break;
+ *   }
+ * }
+ * Rd.W[x] = cnt[x];
+ * for RV32: x=0
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_CLO32(unsigned long a)
+{
+    unsigned long rd; /* operand %0 (Rd); a is operand %1 (Rs1) */
+    __ASM volatile("clo32 %0, %1" : "=r"(rd) : "r"(a));
+    return rd;
+}
+/* ===== Inline Function End for 3.14. CLO32 ===== */
+
+/* ===== Inline Function Start for 3.15. CLZ8 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_MISC
+ * \brief CLZ8 (SIMD 8-bit Count Leading Zero)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * CLZ8 Rd, Rs1
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Count the number of leading zero bits of the 8-bit elements of a general register.
+ *
+ * **Description**:\n
+ * Starting from the most significant bits of the 8-bit elements of Rs1, this instruction
+ * counts the number of leading zero bits and writes the results to the corresponding 8-bit elements of
+ * Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * snum[x] = Rs1.B[x];
+ * cnt[x] = 0;
+ * for (i = 7 to 0) {
+ *   if (snum[x](i) == 0) {
+ *     cnt[x] = cnt[x] + 1;
+ *   } else {
+ *     break;
+ *   }
+ * }
+ * Rd.B[x] = cnt[x];
+ * for RV32: x=3...0
+ * for RV64: x=7...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_CLZ8(unsigned long a)
+{
+    unsigned long rd; /* operand %0 (Rd); a is operand %1 (Rs1) */
+    __ASM volatile("clz8 %0, %1" : "=r"(rd) : "r"(a));
+    return rd;
+}
+/* ===== Inline Function End for 3.15. CLZ8 ===== */
+
+/* ===== Inline Function Start for 3.16. CLZ16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_MISC
+ * \brief CLZ16 (SIMD 16-bit Count Leading Zero)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * CLZ16 Rd, Rs1
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Count the number of leading zero bits of the 16-bit elements of a general register.
+ *
+ * **Description**:\n
+ * Starting from the most significant bits of the 16-bit elements of Rs1, this instruction
+ * counts the number of leading zero bits and writes the results to the corresponding 16-bit elements
+ * of Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * snum[x] = Rs1.H[x];
+ * cnt[x] = 0;
+ * for (i = 15 to 0) {
+ *   if (snum[x](i) == 0) {
+ *     cnt[x] = cnt[x] + 1;
+ *   } else {
+ *     break;
+ *   }
+ * }
+ * Rd.H[x] = cnt[x];
+ * for RV32: x=1...0
+ * for RV64: x=3...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_CLZ16(unsigned long a)
+{
+    unsigned long rd; /* operand %0 (Rd); a is operand %1 (Rs1) */
+    __ASM volatile("clz16 %0, %1" : "=r"(rd) : "r"(a));
+    return rd;
+}
+/* ===== Inline Function End for 3.16. CLZ16 ===== */
+
+/* ===== Inline Function Start for 3.17. CLZ32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_PART_SIMD_MISC
+ * \brief CLZ32 (SIMD 32-bit Count Leading Zero)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * CLZ32 Rd, Rs1
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Count the number of leading zero bits of the 32-bit elements of a general register.
+ *
+ * **Description**:\n
+ * Starting from the most significant bits of the 32-bit elements of Rs1, this instruction
+ * counts the number of leading zero bits and writes the results to the corresponding 32-bit elements
+ * of Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * snum[x] = Rs1.W[x];
+ * cnt[x] = 0;
+ * for (i = 31 to 0) {
+ *   if (snum[x](i) == 0) {
+ *     cnt[x] = cnt[x] + 1;
+ *   } else {
+ *     break;
+ *   }
+ * }
+ * Rd.W[x] = cnt[x];
+ * for RV32: x=0
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_CLZ32(unsigned long a)
+{
+    unsigned long rd; /* operand %0 (Rd); a is operand %1 (Rs1) */
+    __ASM volatile("clz32 %0, %1" : "=r"(rd) : "r"(a));
+    return rd;
+}
+/* ===== Inline Function End for 3.17. CLZ32 ===== */
+
+/* ===== Inline Function Start for 3.18. CMPEQ8 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_CMP
+ * \brief CMPEQ8 (SIMD 8-bit Integer Compare Equal)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * CMPEQ8 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 8-bit integer elements equal comparisons simultaneously.
+ *
+ * **Description**:\n
+ * This instruction compares the 8-bit integer elements in Rs1 with the 8-bit integer
+ * elements in Rs2 to see if they are equal. If they are equal, the result is 0xFF; otherwise, the result is
+ * 0x0. The 8-bit element comparison results are written to Rd.
+ *
+ * **Note**:\n
+ * This instruction can be used for either signed or unsigned numbers.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.B[x] = (Rs1.B[x] == Rs2.B[x])? 0xff : 0x0;
+ * for RV32: x=3...0,
+ * for RV64: x=7...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_CMPEQ8(unsigned long a, unsigned long b)
+{
+    unsigned long rd; /* operand %0 (Rd); a, b are operands %1, %2 (Rs1, Rs2) */
+    __ASM volatile("cmpeq8 %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.18. CMPEQ8 ===== */
+
+/* ===== Inline Function Start for 3.19. CMPEQ16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_CMP
+ * \brief CMPEQ16 (SIMD 16-bit Integer Compare Equal)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * CMPEQ16 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 16-bit integer elements equal comparisons simultaneously.
+ *
+ * **Description**:\n
+ * This instruction compares the 16-bit integer elements in Rs1 with the 16-bit integer
+ * elements in Rs2 to see if they are equal. If they are equal, the result is 0xFFFF; otherwise, the result
+ * is 0x0. The 16-bit element comparison results are written to Rd.
+ *
+ * **Note**:\n
+ * This instruction can be used for either signed or unsigned numbers.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.H[x] = (Rs1.H[x] == Rs2.H[x])? 0xffff : 0x0;
+ * for RV32: x=1...0,
+ * for RV64: x=3...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_CMPEQ16(unsigned long a, unsigned long b)
+{
+    unsigned long rd; /* operand %0 (Rd); a, b are operands %1, %2 (Rs1, Rs2) */
+    __ASM volatile("cmpeq16 %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.19. CMPEQ16 ===== */
+
+/* ===== Inline Function Start for 3.20. CRAS16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB
+ * \brief CRAS16 (SIMD 16-bit Cross Addition & Subtraction)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * CRAS16 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 16-bit integer element addition and 16-bit integer element subtraction in a 32-bit
+ * chunk simultaneously. Operands are from crossed positions in 32-bit chunks.
+ *
+ * **Description**:\n
+ * This instruction adds the 16-bit integer element in [31:16] of 32-bit chunks in Rs1 with
+ * the 16-bit integer element in [15:0] of 32-bit chunks in Rs2, and writes the result to [31:16] of 32-bit
+ * chunks in Rd; at the same time, it subtracts the 16-bit integer element in [31:16] of 32-bit chunks in
+ * Rs2 from the 16-bit integer element in [15:0] of 32-bit chunks in Rs1, and writes the result to [15:0] of 32-
+ * bit chunks in Rd.
+ *
+ * **Note**:\n
+ * This instruction can be used for either signed or unsigned operations.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.W[x][31:16] = Rs1.W[x][31:16] + Rs2.W[x][15:0];
+ * Rd.W[x][15:0] = Rs1.W[x][15:0] - Rs2.W[x][31:16];
+ * for RV32, x=0
+ * for RV64, x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_CRAS16(unsigned long a, unsigned long b)
+{
+    unsigned long rd; /* operand %0 (Rd); a, b are operands %1, %2 (Rs1, Rs2) */
+    __ASM volatile("cras16 %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.20. CRAS16 ===== */
+
+/* ===== Inline Function Start for 3.21. CRSA16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB
+ * \brief CRSA16 (SIMD 16-bit Cross Subtraction & Addition)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * CRSA16 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 16-bit integer element subtraction and 16-bit integer element addition in a 32-bit
+ * chunk simultaneously. Operands are from crossed positions in 32-bit chunks.
+ *
+ * **Description**:\n
+ * This instruction subtracts the 16-bit integer element in [15:0] of 32-bit chunks in Rs2
+ * from the 16-bit integer element in [31:16] of 32-bit chunks in Rs1, and writes the result to [31:16] of
+ * 32-bit chunks in Rd; at the same time, it adds the 16-bit integer element in [31:16] of 32-bit chunks
+ * in Rs2 with the 16-bit integer element in [15:0] of 32-bit chunks in Rs1, and writes the result to
+ * [15:0] of 32-bit chunks in Rd.
+ *
+ * **Note**:\n
+ * This instruction can be used for either signed or unsigned operations.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.W[x][31:16] = Rs1.W[x][31:16] - Rs2.W[x][15:0];
+ * Rd.W[x][15:0] = Rs1.W[x][15:0] + Rs2.W[x][31:16];
+ * for RV32, x=0
+ * for RV64, x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_CRSA16(unsigned long a, unsigned long b)
+{
+    unsigned long rd; /* operand %0 (Rd); a, b are operands %1, %2 (Rs1, Rs2) */
+    __ASM volatile("crsa16 %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.21. CRSA16 ===== */
+
+/* ===== Inline Function Start for 3.22. INSB ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_MISC
+ * \brief INSB (Insert Byte)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * (RV32) INSB Rd, Rs1, imm[1:0]
+ * (RV64) INSB Rd, Rs1, imm[2:0]
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Insert byte 0 of a 32-bit or 64-bit register into one of the byte elements of another register.
+ *
+ * **Description**:\n
+ * This instruction inserts byte 0 of Rs1 into byte `imm[1:0]` (RV32) or `imm[2:0]` (RV64)
+ * of Rd. The other bytes of Rd are not written, so Rd is both an input and an output.
+ *
+ * **Operations**:\n
+ * ~~~
+ * bpos = imm[1:0]; (RV32)
+ * bpos = imm[2:0]; (RV64)
+ * Rd.B[bpos] = Rs1.B[0];
+ * ~~~
+ *
+ * \param [in]  t    initial value of Rd, cast to unsigned long; read and written by the instruction
+ * \param [in]  a    value for Rs1, cast to unsigned long; its byte 0 is the byte inserted
+ * \param [in]  b    byte position; passed with the immediate asm constraint "K", so it must be a constant
+ * \return the updated copy of t (the macro's local, not the caller's argument), as unsigned long
+ */
+#define __RV_INSB(t, a, b)    \
+    ({    \
+        register unsigned long __t = (unsigned long)(t);    \
+        register unsigned long __a = (unsigned long)(a);    \
+        __ASM volatile("insb %0, %1, %2" : "+r"(__t) : "r"(__a), "K"(b));    \
+        __t;    \
+    })
+/* ===== Inline Function End for 3.22. INSB ===== */
+
+/* ===== Inline Function Start for 3.23. KABS8 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_MISC
+ * \brief KABS8 (SIMD 8-bit Saturating Absolute)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KABS8 Rd, Rs1
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Get the absolute value of 8-bit signed integer elements simultaneously.
+ *
+ * **Description**:\n
+ * This instruction calculates the absolute value of 8-bit signed integer elements stored
+ * in Rs1 and writes the element results to Rd. If the input number is 0x80, this instruction generates
+ * 0x7f as the output and sets the OV bit to 1.
+ *
+ * **Operations**:\n
+ * ~~~
+ * src = Rs1.B[x];
+ * if (src == 0x80) {
+ *   src = 0x7f;
+ *   OV = 1;
+ * } else if (src[7] == 1) {
+ *   src = -src;
+ * }
+ * Rd.B[x] = src;
+ * for RV32: x=3...0,
+ * for RV64: x=7...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_KABS8(unsigned long a)
+{
+    unsigned long rd; /* operand %0 (Rd); a is operand %1 (Rs1) */
+    __ASM volatile("kabs8 %0, %1" : "=r"(rd) : "r"(a));
+    return rd;
+}
+/* ===== Inline Function End for 3.23. KABS8 ===== */
+
+/* ===== Inline Function Start for 3.24. KABS16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_MISC
+ * \brief KABS16 (SIMD 16-bit Saturating Absolute)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KABS16 Rd, Rs1
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Get the absolute value of 16-bit signed integer elements simultaneously.
+ *
+ * **Description**:\n
+ * This instruction calculates the absolute value of 16-bit signed integer elements stored
+ * in Rs1 and writes the element results to Rd. If the input number is 0x8000, this instruction
+ * generates 0x7fff as the output and sets the OV bit to 1.
+ *
+ * **Operations**:\n
+ * ~~~
+ * src = Rs1.H[x];
+ * if (src == 0x8000) {
+ *   src = 0x7fff;
+ *   OV = 1;
+ * } else if (src[15] == 1) {
+ *   src = -src;
+ * }
+ * Rd.H[x] = src;
+ * for RV32: x=1...0,
+ * for RV64: x=3...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_KABS16(unsigned long a)
+{
+    unsigned long rd; /* operand %0 (Rd); a is operand %1 (Rs1) */
+    __ASM volatile("kabs16 %0, %1" : "=r"(rd) : "r"(a));
+    return rd;
+}
+/* ===== Inline Function End for 3.24. KABS16 ===== */
+
+/* ===== Inline Function Start for 3.25. KABSW ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_Q31_SAT_ALU
+ * \brief KABSW (Scalar 32-bit Absolute Value with Saturation)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KABSW Rd, Rs1
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Get the absolute value of a signed 32-bit integer in a general register.
+ *
+ * **Description**:\n
+ * This instruction calculates the absolute value of a signed 32-bit integer stored in Rs1.
+ * The result is sign-extended (for RV64) and written to Rd. This instruction with the minimum
+ * negative integer input of 0x80000000 will produce a saturated output of maximum positive integer
+ * of 0x7fffffff and the OV flag will be set to 1.
+ *
+ * **Operations**:\n
+ * ~~~
+ * if (Rs1.W[0] >= 0) {
+ *   res = Rs1.W[0];
+ * } else {
+ *   if (Rs1.W[0] == 0x80000000) {
+ *     res = 0x7fffffff;
+ *     OV = 1;
+ *   } else {
+ *     res = -Rs1.W[0];
+ *   }
+ * }
+ * Rd = SE32(res);
+ * ~~~
+ *
+ * \param [in]  a    signed long type of value stored in a
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_KABSW(signed long a)
+{
+    unsigned long rd; /* operand %0 (Rd); a is operand %1 (Rs1) */
+    __ASM volatile("kabsw %0, %1" : "=r"(rd) : "r"(a));
+    return rd;
+}
+/* ===== Inline Function End for 3.25. KABSW ===== */
+
+/* ===== Inline Function Start for 3.26. KADD8 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_ADDSUB
+ * \brief KADD8 (SIMD 8-bit Signed Saturating Addition)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KADD8 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 8-bit signed integer element saturating additions simultaneously.
+ *
+ * **Description**:\n
+ * This instruction adds the 8-bit signed integer elements in Rs1 with the 8-bit signed
+ * integer elements in Rs2. If any of the results are beyond the Q7 number range (-2^7 <= Q7 <= 2^7-1), they
+ * are saturated to the range and the OV bit is set to 1. The saturated results are written to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * res[x] = Rs1.B[x] + Rs2.B[x];
+ * if (res[x] > 127) {
+ *   res[x] = 127;
+ *   OV = 1;
+ * } else if (res[x] < -128) {
+ *   res[x] = -128;
+ *   OV = 1;
+ * }
+ * Rd.B[x] = res[x];
+ * for RV32: x=3...0,
+ * for RV64: x=7...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_KADD8(unsigned long a, unsigned long b)
+{
+    unsigned long rd; /* operand %0 (Rd); a, b are operands %1, %2 (Rs1, Rs2) */
+    __ASM volatile("kadd8 %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.26. KADD8 ===== */
+
+/* ===== Inline Function Start for 3.27. KADD16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB
+ * \brief KADD16 (SIMD 16-bit Signed Saturating Addition)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KADD16 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 16-bit signed integer element saturating additions simultaneously.
+ *
+ * **Description**:\n
+ * This instruction adds the 16-bit signed integer elements in Rs1 with the 16-bit signed
+ * integer elements in Rs2. If any of the results are beyond the Q15 number range (-2^15 <= Q15 <= 2^15-1),
+ * they are saturated to the range and the OV bit is set to 1. The saturated results are written to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * res[x] = Rs1.H[x] + Rs2.H[x];
+ * if (res[x] > 32767) {
+ *   res[x] = 32767;
+ *   OV = 1;
+ * } else if (res[x] < -32768) {
+ *   res[x] = -32768;
+ *   OV = 1;
+ * }
+ * Rd.H[x] = res[x];
+ * for RV32: x=1...0,
+ * for RV64: x=3...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_KADD16(unsigned long a, unsigned long b)
+{
+    unsigned long rd; /* operand %0 (Rd); a, b are operands %1, %2 (Rs1, Rs2) */
+    __ASM volatile("kadd16 %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.27. KADD16 ===== */
+
+/* ===== Inline Function Start for 3.28. KADD64 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_64B_ADDSUB
+ * \brief KADD64 (64-bit Signed Saturating Addition)
+ * \details
+ * **Type**: DSP (64-bit Profile)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KADD64 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Add two 64-bit signed integers. The result is saturated to the Q63 range.
+ *
+ * **RV32 Description**:\n
+ * This instruction adds the 64-bit signed integer of an even/odd pair of registers
+ * specified by Rs1(4,1) with the 64-bit signed integer of an even/odd pair of registers specified by
+ * Rs2(4,1). If the 64-bit result is beyond the Q63 number range (-2^63 <= Q63 <= 2^63-1), it is saturated to the
+ * range and the OV bit is set to 1. The saturated result is written to an even/odd pair of registers
+ * specified by Rd(4,1).
+ * Rx(4,1), i.e., value d, determines the even/odd pair group of two registers. Specifically, the register
+ * pair includes register 2d and 2d+1.
+ * The odd `2d+1` register of the pair contains the high 32-bit of the result and the even `2d` register
+ * of the pair contains the low 32-bit of the result.
+ *
+ * **RV64 Description**:\n
+ * This instruction adds the 64-bit signed integer in Rs1 with the 64-bit signed
+ * integer in Rs2. If the result is beyond the Q63 number range (-2^63 <= Q63 <= 2^63-1), it is saturated to the
+ * range and the OV bit is set to 1. The saturated result is written to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * RV32:
+ *  t_L = CONCAT(Rd(4,1),1'b0); t_H = CONCAT(Rd(4,1),1'b1);
+ *  a_L = CONCAT(Rs1(4,1),1'b0); a_H = CONCAT(Rs1(4,1),1'b1);
+ *  b_L = CONCAT(Rs2(4,1),1'b0); b_H = CONCAT(Rs2(4,1),1'b1);
+ *  result = R[a_H].R[a_L] + R[b_H].R[b_L];
+ *  if (result > (2^63)-1) {
+ *    result = (2^63)-1; OV = 1;
+ *  } else if (result < -2^63) {
+ *    result = -2^63; OV = 1;
+ *  }
+ *  R[t_H].R[t_L] = result;
+ * RV64:
+ *  result = Rs1 + Rs2;
+ *  if (result > (2^63)-1) {
+ *    result = (2^63)-1; OV = 1;
+ *  } else if (result < -2^63) {
+ *    result = -2^63; OV = 1;
+ *  }
+ *  Rd = result;
+ * ~~~
+ *
+ * \param [in]  a    long long type of value stored in a
+ * \param [in]  b    long long type of value stored in b
+ * \return value stored in long long type
+ */
+__STATIC_FORCEINLINE long long __RV_KADD64(long long a, long long b)
+{
+    long long rd; /* operand %0 (Rd); a, b are operands %1, %2 (Rs1, Rs2) */
+    __ASM volatile("kadd64 %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.28. KADD64 ===== */
+
+/* ===== Inline Function Start for 3.29. KADDH ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_Q15_SAT_ALU
+ * \brief KADDH (Signed Addition with Q15 Saturation)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KADDH Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Add the signed lower 32-bit content of two registers with Q15 saturation.
+ *
+ * **Description**:\n
+ * The signed lower 32-bit content of Rs1 is added with the signed lower 32-bit content of
+ * Rs2. And the result is saturated to the 16-bit signed integer range of [-2^15, 2^15-1] and then sign-
+ * extended and written to Rd. If saturation happens, this instruction sets the OV flag.
+ *
+ * **Operations**:\n
+ * ~~~
+ * tmp = Rs1.W[0] + Rs2.W[0];
+ * if (tmp > 32767) {
+ *   res = 32767;
+ *   OV = 1;
+ * } else if (tmp < -32768) {
+ *   res = -32768;
+ *   OV = 1;
+ * } else {
+ *   res = tmp;
+ * }
+ * Rd = SE(res[15:0]);
+ * ~~~
+ *
+ * \param [in]  a    int type of value stored in a
+ * \param [in]  b    int type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_KADDH(int a, int b)
+{
+    long rd; /* operand %0 (Rd); a, b are operands %1, %2 (Rs1, Rs2) */
+    __ASM volatile("kaddh %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.29. KADDH ===== */
+
+/* ===== Inline Function Start for 3.30. KADDW ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_Q31_SAT_ALU
+ * \brief KADDW (Signed Addition with Q31 Saturation)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KADDW Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Add the lower 32-bit signed content of two registers with Q31 saturation.
+ *
+ * **Description**:\n
+ * The lower 32-bit signed content of Rs1 is added with the lower 32-bit signed content of
+ * Rs2. And the result is saturated to the 32-bit signed integer range of [-2^31, 2^31-1] and then sign-
+ * extended and written to Rd. If saturation happens, this instruction sets the OV flag.
+ *
+ * **Operations**:\n
+ * ~~~
+ * tmp = Rs1.W[0] + Rs2.W[0];
+ * if (tmp > (2^31)-1) {
+ *   res = (2^31)-1;
+ *   OV = 1;
+ * } else if (tmp < -2^31) {
+ *   res = -2^31;
+ *   OV = 1;
+ * } else {
+ *   res = tmp;
+ * }
+ * Rd = res[31:0]; // RV32
+ * Rd = SE(res[31:0]); // RV64
+ * ~~~
+ *
+ * \param [in]  a    int type of value stored in a
+ * \param [in]  b    int type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_KADDW(int a, int b)
+{
+    long rd; /* operand %0 (Rd); a, b are operands %1, %2 (Rs1, Rs2) */
+    __ASM volatile("kaddw %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.30. KADDW ===== */
+
+/* ===== Inline Function Start for 3.31. KCRAS16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB
+ * \brief KCRAS16 (SIMD 16-bit Signed Saturating Cross Addition & Subtraction)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KCRAS16 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 16-bit signed integer element saturating addition and 16-bit signed integer element
+ * saturating subtraction in a 32-bit chunk simultaneously. Operands are from crossed positions in 32-
+ * bit chunks.
+ *
+ * **Description**:\n
+ * This instruction adds the 16-bit signed integer element in [31:16] of 32-bit chunks in
+ * Rs1 with the 16-bit signed integer element in [15:0] of 32-bit chunks in Rs2; at the same time, it
+ * subtracts the 16-bit signed integer element in [31:16] of 32-bit chunks in Rs2 from the 16-bit signed
+ * integer element in [15:0] of 32-bit chunks in Rs1. If any of the results are beyond the Q15 number
+ * range (-2^15 <= Q15 <= 2^15-1), they are saturated to the range and the OV bit is set to 1. The saturated
+ * results are written to [31:16] of 32-bit chunks in Rd for addition and [15:0] of 32-bit chunks in Rd for
+ * subtraction.
+ *
+ * **Operations**:\n
+ * ~~~
+ * res1 = Rs1.W[x][31:16] + Rs2.W[x][15:0];
+ * res2 = Rs1.W[x][15:0] - Rs2.W[x][31:16];
+ * for (res in [res1, res2]) {
+ *   if (res > (2^15)-1) {
+ *     res = (2^15)-1;
+ *     OV = 1;
+ *   } else if (res < -2^15) {
+ *     res = -2^15;
+ *     OV = 1;
+ *   }
+ * }
+ * Rd.W[x][31:16] = res1;
+ * Rd.W[x][15:0] = res2;
+ * for RV32, x=0
+ * for RV64, x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_KCRAS16(unsigned long a, unsigned long b)
+{
+    unsigned long rd; /* operand %0 (Rd); a, b are operands %1, %2 (Rs1, Rs2) */
+    __ASM volatile("kcras16 %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.31. KCRAS16 ===== */
+
+/* ===== Inline Function Start for 3.32. KCRSA16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB
+ * \brief KCRSA16 (SIMD 16-bit Signed Saturating Cross Subtraction & Addition)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KCRSA16 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 16-bit signed integer element saturating subtraction and 16-bit signed integer element
+ * saturating addition in a 32-bit chunk simultaneously. Operands are from crossed positions in 32-bit
+ * chunks.
+ *
+ * **Description**:\n
+ * This instruction subtracts the 16-bit signed integer element in [15:0] of 32-bit chunks
+ * in Rs2 from the 16-bit signed integer element in [31:16] of 32-bit chunks in Rs1; at the same time, it
+ * adds the 16-bit signed integer element in [31:16] of 32-bit chunks in Rs2 with the 16-bit signed
+ * integer element in [15:0] of 32-bit chunks in Rs1. If any of the results are beyond the Q15 number
+ * range (-2^15 <= Q15 <= 2^15-1), they are saturated to the range and the OV bit is set to 1. The saturated
+ * results are written to [31:16] of 32-bit chunks in Rd for subtraction and [15:0] of 32-bit chunks in Rd
+ * for addition.
+ *
+ * **Operations**:\n
+ * ~~~
+ * res1 = Rs1.W[x][31:16] - Rs2.W[x][15:0];
+ * res2 = Rs1.W[x][15:0] + Rs2.W[x][31:16];
+ * for (res in [res1, res2]) {
+ *   if (res > (2^15)-1) {
+ *     res = (2^15)-1;
+ *     OV = 1;
+ *   } else if (res < -2^15) {
+ *     res = -2^15;
+ *     OV = 1;
+ *   }
+ * }
+ * Rd.W[x][31:16] = res1;
+ * Rd.W[x][15:0] = res2;
+ * for RV32, x=0
+ * for RV64, x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_KCRSA16(unsigned long a, unsigned long b)
+{
+    unsigned long rd; /* operand %0 (Rd); a, b are operands %1, %2 (Rs1, Rs2) */
+    __ASM volatile("kcrsa16 %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.32. KCRSA16 ===== */
+
+/* ===== Inline Function Start for 3.33.1. KDMBB ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_Q31_SAT_ALU
+ * \brief KDMBB (Signed Saturating Double Multiply B16 x B16)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KDMxy Rd, Rs1, Rs2 (xy = BB, BT, TT)
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the signed Q15 integer contents of two 16-bit data in the corresponding portion
+ * of the lower 32-bit chunk in registers and then double and saturate the Q31 result. The result is
+ * written into the destination register for RV32 or sign-extended to 64-bits and written into the
+ * destination register for RV64. If saturation happens, an overflow flag OV will be set.
+ *
+ * **Description**:\n
+ * Multiply the top or bottom 16-bit Q15 content of the lower 32-bit portion in Rs1 with
+ * the top or bottom 16-bit Q15 content of the lower 32-bit portion in Rs2. The Q30 result is then
+ * doubled and saturated into a Q31 value. The Q31 value is then written into Rd (sign-extended in
+ * RV64). When both the two Q15 inputs are 0x8000, saturation will happen. The result will be
+ * saturated to 0x7FFFFFFF and the overflow flag OV will be set.
+ *
+ * **Operations**:\n
+ * ~~~
+ * aop = Rs1.H[0]; bop = Rs2.H[0]; // KDMBB
+ * aop = Rs1.H[0]; bop = Rs2.H[1]; // KDMBT
+ * aop = Rs1.H[1]; bop = Rs2.H[1]; // KDMTT
+ * if (0x8000 != aop || 0x8000 != bop) {
+ *   Mresult = aop * bop;
+ *   resQ31 = Mresult << 1;
+ *   Rd = resQ31; // RV32
+ *   Rd = SE(resQ31); // RV64
+ * } else {
+ *   resQ31 = 0x7FFFFFFF;
+ *   Rd = resQ31; // RV32
+ *   Rd = SE(resQ31); // RV64
+ *   OV = 1;
+ * }
+ * ~~~
+ *
+ * \param [in]  a    unsigned int type of value stored in a
+ * \param [in]  b    unsigned int type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_KDMBB(unsigned int a, unsigned int b)
+{
+    long rd; /* operand %0 (Rd); a, b are operands %1, %2 (Rs1, Rs2) */
+    __ASM volatile("kdmbb %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.33.1. KDMBB ===== */
+
+/* ===== Inline Function Start for 3.33.2. KDMBT ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_Q31_SAT_ALU
+ * \brief KDMBT (Signed Saturating Double Multiply B16 x T16)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KDMxy Rd, Rs1, Rs2 (xy = BB, BT, TT)
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the signed Q15 integer contents of two 16-bit data in the corresponding portion
+ * of the lower 32-bit chunk in registers and then double and saturate the Q31 result. The result is
+ * written into the destination register for RV32 or sign-extended to 64-bits and written into the
+ * destination register for RV64. If saturation happens, an overflow flag OV will be set.
+ *
+ * **Description**:\n
+ * Multiply the top or bottom 16-bit Q15 content of the lower 32-bit portion in Rs1 with
+ * the top or bottom 16-bit Q15 content of the lower 32-bit portion in Rs2. The Q30 result is then
+ * doubled and saturated into a Q31 value. The Q31 value is then written into Rd (sign-extended in
+ * RV64). When both the two Q15 inputs are 0x8000, saturation will happen. The result will be
+ * saturated to 0x7FFFFFFF and the overflow flag OV will be set.
+ *
+ * **Operations**:\n
+ * ~~~
+ * aop = Rs1.H[0]; bop = Rs2.H[0]; // KDMBB
+ * aop = Rs1.H[0]; bop = Rs2.H[1]; // KDMBT
+ * aop = Rs1.H[1]; bop = Rs2.H[1]; // KDMTT
+ * if (0x8000 != aop || 0x8000 != bop) {
+ *   Mresult = aop * bop;
+ *   resQ31 = Mresult << 1;
+ *   Rd = resQ31; // RV32
+ *   Rd = SE(resQ31); // RV64
+ * } else {
+ *   resQ31 = 0x7FFFFFFF;
+ *   Rd = resQ31; // RV32
+ *   Rd = SE(resQ31); // RV64
+ *   OV = 1;
+ * }
+ * ~~~
+ *
+ * \param [in]  a    unsigned int type of value stored in a
+ * \param [in]  b    unsigned int type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_KDMBT(unsigned int a, unsigned int b)
+{
+    /* Receives the destination register (operand %0) of kdmbt;
+     * a and b are handed over as the two source registers (%1, %2). */
+    register long rd;
+
+    __ASM volatile("kdmbt %0, %1, %2"
+                   : "=r"(rd)
+                   : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.33.2. KDMBT ===== */
+
+/* ===== Inline Function Start for 3.33.3. KDMTT ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_Q31_SAT_ALU
+ * \brief KDMTT (Signed Saturating Double Multiply T16 x T16)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KDMxy Rd, Rs1, Rs2 (xy = BB, BT, TT)
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the signed Q15 integer contents of two 16-bit data in the corresponding portion
+ * of the lower 32-bit chunk in registers and then double and saturate the Q31 result. The result is
+ * written into the destination register for RV32 or sign-extended to 64-bits and written into the
+ * destination register for RV64. If saturation happens, an overflow flag OV will be set.
+ *
+ * **Description**:\n
+ * Multiply the top or bottom 16-bit Q15 content of the lower 32-bit portion in Rs1 with
+ * the top or bottom 16-bit Q15 content of the lower 32-bit portion in Rs2. The Q30 result is then
+ * doubled and saturated into a Q31 value. The Q31 value is then written into Rd (sign-extended in
+ * RV64). When both the two Q15 inputs are 0x8000, saturation will happen. The result will be
+ * saturated to 0x7FFFFFFF and the overflow flag OV will be set.
+ *
+ * **Operations**:\n
+ * ~~~
+ * aop = Rs1.H[0]; bop = Rs2.H[0]; // KDMBB
+ * aop = Rs1.H[0]; bop = Rs2.H[1]; // KDMBT
+ * aop = Rs1.H[1]; bop = Rs2.H[1]; // KDMTT
+ * If (0x8000 != aop | 0x8000 != bop) {
+ *   Mresult = aop * bop;
+ *   resQ31 = Mresult << 1;
+ *   Rd = resQ31; // RV32
+ *   Rd = SE(resQ31); // RV64
+ * } else {
+ *   resQ31 = 0x7FFFFFFF;
+ *   Rd = resQ31; // RV32
+ *   Rd = SE(resQ31); // RV64
+ *   OV = 1;
+ * }
+ * ~~~
+ *
+ * \param [in]  a    unsigned int type of value stored in a
+ * \param [in]  b    unsigned int type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_KDMTT(unsigned int a, unsigned int b)
+{
+    /* Receives the destination register (operand %0) of kdmtt;
+     * a and b are handed over as the two source registers (%1, %2). */
+    register long rd;
+
+    __ASM volatile("kdmtt %0, %1, %2"
+                   : "=r"(rd)
+                   : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.33.3. KDMTT ===== */
+
+/* ===== Inline Function Start for 3.34.1. KDMABB ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_Q31_SAT_ALU
+ * \brief KDMABB (Signed Saturating Double Multiply Addition B16 x B16)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KDMAxy Rd, Rs1, Rs2 (xy = BB, BT, TT)
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the signed Q15 integer contents of two 16-bit data in the corresponding portion
+ * of the lower 32-bit chunk in registers and then double and saturate the Q31 result, add the result
+ * with the sign-extended lower 32-bit chunk destination register and write the saturated addition
+ * result into the destination register. If saturation happens, an overflow flag OV will be set.
+ *
+ * **Description**:\n
+ * Multiply the top or bottom 16-bit Q15 content of the lower 32-bit portion in Rs1 with
+ * the top or bottom 16-bit Q15 content of the lower 32-bit portion in Rs2. The Q30 result is then
+ * doubled and saturated into a Q31 value. The Q31 value is then added with the content of Rd. If the
+ * addition result is beyond the Q31 number range (-2^31 <= Q31 <= 2^31-1), it is saturated to the range and
+ * the OV flag is set to 1. The result after saturation is written to Rd.
+ * When both the two Q15 inputs are 0x8000, saturation will happen and the overflow flag OV will be
+ * set.
+ *
+ * **Operations**:\n
+ * ~~~
+ * aop = Rs1.H[0]; bop = Rs2.H[0]; // KDMABB
+ * aop = Rs1.H[0]; bop = Rs2.H[1]; // KDMABT
+ * aop = Rs1.H[1]; bop = Rs2.H[1]; // KDMATT
+ * If (0x8000 != aop | 0x8000 != bop) {
+ *   Mresult = aop * bop;
+ *   resQ31 = Mresult << 1;
+ * } else {
+ *   resQ31 = 0x7FFFFFFF;
+ *   OV = 1;
+ * }
+ * resadd = Rd + resQ31; // RV32
+ * resadd = Rd.W[0] + resQ31; // RV64
+ * if (resadd > (2^31)-1) {
+ *   resadd = (2^31)-1;
+ *   OV = 1;
+ * } else if (resadd < -2^31) {
+ *   resadd = -2^31;
+ *   OV = 1;
+ * }
+ * Rd = resadd; // RV32
+ * Rd = SE(resadd); // RV64
+ * ~~~
+ *
+ * \param [in]  t    long type of value stored in t
+ * \param [in]  a    unsigned int type of value stored in a
+ * \param [in]  b    unsigned int type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_KDMABB(long t, unsigned int a, unsigned int b)
+{
+    /* The "+r" operand is read and written by kdmabb: it enters holding
+     * the caller's t and leaves holding the value that is returned. */
+    long acc = t;
+
+    __ASM volatile("kdmabb %0, %1, %2"
+                   : "+r"(acc)
+                   : "r"(a), "r"(b));
+    return acc;
+}
+/* ===== Inline Function End for 3.34.1. KDMABB ===== */
+
+/* ===== Inline Function Start for 3.34.2. KDMABT ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_Q31_SAT_ALU
+ * \brief KDMABT (Signed Saturating Double Multiply Addition B16 x T16)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KDMAxy Rd, Rs1, Rs2 (xy = BB, BT, TT)
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the signed Q15 integer contents of two 16-bit data in the corresponding portion
+ * of the lower 32-bit chunk in registers and then double and saturate the Q31 result, add the result
+ * with the sign-extended lower 32-bit chunk destination register and write the saturated addition
+ * result into the destination register. If saturation happens, an overflow flag OV will be set.
+ *
+ * **Description**:\n
+ * Multiply the top or bottom 16-bit Q15 content of the lower 32-bit portion in Rs1 with
+ * the top or bottom 16-bit Q15 content of the lower 32-bit portion in Rs2. The Q30 result is then
+ * doubled and saturated into a Q31 value. The Q31 value is then added with the content of Rd. If the
+ * addition result is beyond the Q31 number range (-2^31 <= Q31 <= 2^31-1), it is saturated to the range and
+ * the OV flag is set to 1. The result after saturation is written to Rd.
+ * When both the two Q15 inputs are 0x8000, saturation will happen and the overflow flag OV will be
+ * set.
+ *
+ * **Operations**:\n
+ * ~~~
+ * aop = Rs1.H[0]; bop = Rs2.H[0]; // KDMABB
+ * aop = Rs1.H[0]; bop = Rs2.H[1]; // KDMABT
+ * aop = Rs1.H[1]; bop = Rs2.H[1]; // KDMATT
+ * If (0x8000 != aop | 0x8000 != bop) {
+ *   Mresult = aop * bop;
+ *   resQ31 = Mresult << 1;
+ * } else {
+ *   resQ31 = 0x7FFFFFFF;
+ *   OV = 1;
+ * }
+ * resadd = Rd + resQ31; // RV32
+ * resadd = Rd.W[0] + resQ31; // RV64
+ * if (resadd > (2^31)-1) {
+ *   resadd = (2^31)-1;
+ *   OV = 1;
+ * } else if (resadd < -2^31) {
+ *   resadd = -2^31;
+ *   OV = 1;
+ * }
+ * Rd = resadd; // RV32
+ * Rd = SE(resadd); // RV64
+ * ~~~
+ *
+ * \param [in]  t    long type of value stored in t
+ * \param [in]  a    unsigned int type of value stored in a
+ * \param [in]  b    unsigned int type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_KDMABT(long t, unsigned int a, unsigned int b)
+{
+    /* The "+r" operand is read and written by kdmabt: it enters holding
+     * the caller's t and leaves holding the value that is returned. */
+    long acc = t;
+
+    __ASM volatile("kdmabt %0, %1, %2"
+                   : "+r"(acc)
+                   : "r"(a), "r"(b));
+    return acc;
+}
+/* ===== Inline Function End for 3.34.2. KDMABT ===== */
+
+/* ===== Inline Function Start for 3.34.3. KDMATT ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_Q31_SAT_ALU
+ * \brief KDMATT (Signed Saturating Double Multiply Addition T16 x T16)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KDMAxy Rd, Rs1, Rs2 (xy = BB, BT, TT)
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the signed Q15 integer contents of two 16-bit data in the corresponding portion
+ * of the lower 32-bit chunk in registers and then double and saturate the Q31 result, add the result
+ * with the sign-extended lower 32-bit chunk destination register and write the saturated addition
+ * result into the destination register. If saturation happens, an overflow flag OV will be set.
+ *
+ * **Description**:\n
+ * Multiply the top or bottom 16-bit Q15 content of the lower 32-bit portion in Rs1 with
+ * the top or bottom 16-bit Q15 content of the lower 32-bit portion in Rs2. The Q30 result is then
+ * doubled and saturated into a Q31 value. The Q31 value is then added with the content of Rd. If the
+ * addition result is beyond the Q31 number range (-2^31 <= Q31 <= 2^31-1), it is saturated to the range and
+ * the OV flag is set to 1. The result after saturation is written to Rd.
+ * When both the two Q15 inputs are 0x8000, saturation will happen and the overflow flag OV will be
+ * set.
+ *
+ * **Operations**:\n
+ * ~~~
+ * aop = Rs1.H[0]; bop = Rs2.H[0]; // KDMABB
+ * aop = Rs1.H[0]; bop = Rs2.H[1]; // KDMABT
+ * aop = Rs1.H[1]; bop = Rs2.H[1]; // KDMATT
+ * If (0x8000 != aop | 0x8000 != bop) {
+ *   Mresult = aop * bop;
+ *   resQ31 = Mresult << 1;
+ * } else {
+ *   resQ31 = 0x7FFFFFFF;
+ *   OV = 1;
+ * }
+ * resadd = Rd + resQ31; // RV32
+ * resadd = Rd.W[0] + resQ31; // RV64
+ * if (resadd > (2^31)-1) {
+ *   resadd = (2^31)-1;
+ *   OV = 1;
+ * } else if (resadd < -2^31) {
+ *   resadd = -2^31;
+ *   OV = 1;
+ * }
+ * Rd = resadd; // RV32
+ * Rd = SE(resadd); // RV64
+ * ~~~
+ *
+ * \param [in]  t    long type of value stored in t
+ * \param [in]  a    unsigned int type of value stored in a
+ * \param [in]  b    unsigned int type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_KDMATT(long t, unsigned int a, unsigned int b)
+{
+    /* The "+r" operand is read and written by kdmatt: it enters holding
+     * the caller's t and leaves holding the value that is returned. */
+    long acc = t;
+
+    __ASM volatile("kdmatt %0, %1, %2"
+                   : "+r"(acc)
+                   : "r"(a), "r"(b));
+    return acc;
+}
+/* ===== Inline Function End for 3.34.3. KDMATT ===== */
+
+/* ===== Inline Function Start for 3.35.1. KHM8 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_MULTIPLY
+ * \brief KHM8 (SIMD Signed Saturating Q7 Multiply)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KHM8 Rd, Rs1, Rs2
+ * KHMX8 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do Q7xQ7 element multiplications simultaneously. The Q14 results are then reduced to Q7
+ * numbers again.
+ *
+ * **Description**:\n
+ * For the `KHM8` instruction, multiply the top 8-bit Q7 content of 16-bit chunks in Rs1
+ * with the top 8-bit Q7 content of 16-bit chunks in Rs2. At the same time, multiply the bottom 8-bit Q7
+ * content of 16-bit chunks in Rs1 with the bottom 8-bit Q7 content of 16-bit chunks in Rs2.
+ * For the `KHMX8` instruction, multiply the top 8-bit Q7 content of 16-bit chunks in Rs1 with the
+ * bottom 8-bit Q7 content of 16-bit chunks in Rs2. At the same time, multiply the bottom 8-bit Q7
+ * content of 16-bit chunks in Rs1 with the top 8-bit Q7 content of 16-bit chunks in Rs2.
+ * The Q14 results are then right-shifted 7-bits and saturated into Q7 values. The Q7 results are then
+ * written into Rd. When both the two Q7 inputs of a multiplication are 0x80, saturation will happen.
+ * The result will be saturated to 0x7F and the overflow flag OV will be set.
+ *
+ * **Operations**:\n
+ * ~~~
+ * if (is `KHM8`) {
+ *   op1t = Rs1.B[x+1]; op2t = Rs2.B[x+1]; // top
+ *   op1b = Rs1.B[x]; op2b = Rs2.B[x]; // bottom
+ * } else if (is `KHMX8`) {
+ *   op1t = Rs1.B[x+1]; op2t = Rs2.B[x]; // Rs1 top
+ *   op1b = Rs1.B[x]; op2b = Rs2.B[x+1]; // Rs1 bottom
+ * }
+ * for ((aop,bop,res) in [(op1t,op2t,rest), (op1b,op2b,resb)]) {
+ *   if (0x80 != aop | 0x80 != bop) {
+ *     res = (aop s* bop) >> 7;
+ *   } else {
+ *     res= 0x7F;
+ *     OV = 1;
+ *   }
+ * }
+ * Rd.H[x/2] = concat(rest, resb);
+ * for RV32, x=0,2
+ * for RV64, x=0,2,4,6
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_KHM8(unsigned long a, unsigned long b)
+{
+    /* Receives the destination register (operand %0) of khm8;
+     * a and b are handed over as the two source registers (%1, %2). */
+    register unsigned long rd;
+
+    __ASM volatile("khm8 %0, %1, %2"
+                   : "=r"(rd)
+                   : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.35.1. KHM8 ===== */
+
+/* ===== Inline Function Start for 3.35.2. KHMX8 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_MULTIPLY
+ * \brief KHMX8 (SIMD Signed Saturating Crossed Q7 Multiply)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KHM8 Rd, Rs1, Rs2
+ * KHMX8 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do Q7xQ7 element multiplications simultaneously. The Q14 results are then reduced to Q7
+ * numbers again.
+ *
+ * **Description**:\n
+ * For the `KHM8` instruction, multiply the top 8-bit Q7 content of 16-bit chunks in Rs1
+ * with the top 8-bit Q7 content of 16-bit chunks in Rs2. At the same time, multiply the bottom 8-bit Q7
+ * content of 16-bit chunks in Rs1 with the bottom 8-bit Q7 content of 16-bit chunks in Rs2.
+ * For the `KHMX8` instruction, multiply the top 8-bit Q7 content of 16-bit chunks in Rs1 with the
+ * bottom 8-bit Q7 content of 16-bit chunks in Rs2. At the same time, multiply the bottom 8-bit Q7
+ * content of 16-bit chunks in Rs1 with the top 8-bit Q7 content of 16-bit chunks in Rs2.
+ * The Q14 results are then right-shifted 7-bits and saturated into Q7 values. The Q7 results are then
+ * written into Rd. When both the two Q7 inputs of a multiplication are 0x80, saturation will happen.
+ * The result will be saturated to 0x7F and the overflow flag OV will be set.
+ *
+ * **Operations**:\n
+ * ~~~
+ * if (is `KHM8`) {
+ *   op1t = Rs1.B[x+1]; op2t = Rs2.B[x+1]; // top
+ *   op1b = Rs1.B[x]; op2b = Rs2.B[x]; // bottom
+ * } else if (is `KHMX8`) {
+ *   op1t = Rs1.B[x+1]; op2t = Rs2.B[x]; // Rs1 top
+ *   op1b = Rs1.B[x]; op2b = Rs2.B[x+1]; // Rs1 bottom
+ * }
+ * for ((aop,bop,res) in [(op1t,op2t,rest), (op1b,op2b,resb)]) {
+ *   if (0x80 != aop | 0x80 != bop) {
+ *     res = (aop s* bop) >> 7;
+ *   } else {
+ *     res= 0x7F;
+ *     OV = 1;
+ *   }
+ * }
+ * Rd.H[x/2] = concat(rest, resb);
+ * for RV32, x=0,2
+ * for RV64, x=0,2,4,6
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_KHMX8(unsigned long a, unsigned long b)
+{
+    /* Receives the destination register (operand %0) of khmx8;
+     * a and b are handed over as the two source registers (%1, %2). */
+    register unsigned long rd;
+
+    __ASM volatile("khmx8 %0, %1, %2"
+                   : "=r"(rd)
+                   : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.35.2. KHMX8 ===== */
+
+/* ===== Inline Function Start for 3.36.1. KHM16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_MULTIPLY
+ * \brief KHM16 (SIMD Signed Saturating Q15 Multiply)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KHM16 Rd, Rs1, Rs2
+ * KHMX16 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do Q15xQ15 element multiplications simultaneously. The Q30 results are then reduced to
+ * Q15 numbers again.
+ *
+ * **Description**:\n
+ * For the `KHM16` instruction, multiply the top 16-bit Q15 content of 32-bit chunks in
+ * Rs1 with the top 16-bit Q15 content of 32-bit chunks in Rs2. At the same time, multiply the bottom
+ * 16-bit Q15 content of 32-bit chunks in Rs1 with the bottom 16-bit Q15 content of 32-bit chunks in
+ * Rs2.
+ * For the `KHMX16` instruction, multiply the top 16-bit Q15 content of 32-bit chunks in Rs1 with the
+ * bottom 16-bit Q15 content of 32-bit chunks in Rs2. At the same time, multiply the bottom 16-bit Q15
+ * content of 32-bit chunks in Rs1 with the top 16-bit Q15 content of 32-bit chunks in Rs2.
+ * The Q30 results are then right-shifted 15-bits and saturated into Q15 values. The Q15 results are
+ * then written into Rd. When both the two Q15 inputs of a multiplication are 0x8000, saturation will
+ * happen. The result will be saturated to 0x7FFF and the overflow flag OV will be set.
+ *
+ * **Operations**:\n
+ * ~~~
+ * if (is `KHM16`) {
+ *   op1t = Rs1.H[x+1]; op2t = Rs2.H[x+1]; // top
+ *   op1b = Rs1.H[x]; op2b = Rs2.H[x]; // bottom
+ * } else if (is `KHMX16`) {
+ *   op1t = Rs1.H[x+1]; op2t = Rs2.H[x]; // Rs1 top
+ *   op1b = Rs1.H[x]; op2b = Rs2.H[x+1]; // Rs1 bottom
+ * }
+ * for ((aop,bop,res) in [(op1t,op2t,rest), (op1b,op2b,resb)]) {
+ *   if (0x8000 != aop | 0x8000 != bop) {
+ *     res = (aop s* bop) >> 15;
+ *   } else {
+ *     res= 0x7FFF;
+ *     OV = 1;
+ *   }
+ * }
+ * Rd.W[x/2] = concat(rest, resb);
+ * for RV32: x=0
+ * for RV64: x=0,2
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_KHM16(unsigned long a, unsigned long b)
+{
+    /* Receives the destination register (operand %0) of khm16;
+     * a and b are handed over as the two source registers (%1, %2). */
+    register unsigned long rd;
+
+    __ASM volatile("khm16 %0, %1, %2"
+                   : "=r"(rd)
+                   : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.36.1. KHM16 ===== */
+
+/* ===== Inline Function Start for 3.36.2. KHMX16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_MULTIPLY
+ * \brief KHMX16 (SIMD Signed Saturating Crossed Q15 Multiply)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KHM16 Rd, Rs1, Rs2
+ * KHMX16 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do Q15xQ15 element multiplications simultaneously. The Q30 results are then reduced to
+ * Q15 numbers again.
+ *
+ * **Description**:\n
+ * For the `KHM16` instruction, multiply the top 16-bit Q15 content of 32-bit chunks in
+ * Rs1 with the top 16-bit Q15 content of 32-bit chunks in Rs2. At the same time, multiply the bottom
+ * 16-bit Q15 content of 32-bit chunks in Rs1 with the bottom 16-bit Q15 content of 32-bit chunks in
+ * Rs2.
+ * For the `KHMX16` instruction, multiply the top 16-bit Q15 content of 32-bit chunks in Rs1 with the
+ * bottom 16-bit Q15 content of 32-bit chunks in Rs2. At the same time, multiply the bottom 16-bit Q15
+ * content of 32-bit chunks in Rs1 with the top 16-bit Q15 content of 32-bit chunks in Rs2.
+ * The Q30 results are then right-shifted 15-bits and saturated into Q15 values. The Q15 results are
+ * then written into Rd. When both the two Q15 inputs of a multiplication are 0x8000, saturation will
+ * happen. The result will be saturated to 0x7FFF and the overflow flag OV will be set.
+ *
+ * **Operations**:\n
+ * ~~~
+ * if (is `KHM16`) {
+ *   op1t = Rs1.H[x+1]; op2t = Rs2.H[x+1]; // top
+ *   op1b = Rs1.H[x]; op2b = Rs2.H[x]; // bottom
+ * } else if (is `KHMX16`) {
+ *   op1t = Rs1.H[x+1]; op2t = Rs2.H[x]; // Rs1 top
+ *   op1b = Rs1.H[x]; op2b = Rs2.H[x+1]; // Rs1 bottom
+ * }
+ * for ((aop,bop,res) in [(op1t,op2t,rest), (op1b,op2b,resb)]) {
+ *   if (0x8000 != aop | 0x8000 != bop) {
+ *     res = (aop s* bop) >> 15;
+ *   } else {
+ *     res= 0x7FFF;
+ *     OV = 1;
+ *   }
+ * }
+ * Rd.W[x/2] = concat(rest, resb);
+ * for RV32: x=0
+ * for RV64: x=0,2
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_KHMX16(unsigned long a, unsigned long b)
+{
+    /* Receives the destination register (operand %0) of khmx16;
+     * a and b are handed over as the two source registers (%1, %2). */
+    register unsigned long rd;
+
+    __ASM volatile("khmx16 %0, %1, %2"
+                   : "=r"(rd)
+                   : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.36.2. KHMX16 ===== */
+
+/* ===== Inline Function Start for 3.37.1. KHMBB ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_Q15_SAT_ALU
+ * \brief KHMBB (Signed Saturating Half Multiply B16 x B16)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KHMxy Rd, Rs1, Rs2 (xy = BB, BT, TT)
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the signed Q15 number contents of two 16-bit data in the corresponding portion
+ * of the lower 32-bit chunk in registers and then right-shift 15 bits to turn the Q30 result into a Q15
+ * number again and saturate the Q15 result into the destination register. If saturation happens, an
+ * overflow flag OV will be set.
+ *
+ * **Description**:\n
+ * Multiply the top or bottom 16-bit Q15 content of the lower 32-bit portion in Rs1 with
+ * the top or bottom 16-bit Q15 content of the lower 32-bit portion in Rs2. The Q30 result is then right-
+ * shifted 15-bits and saturated into a Q15 value. The Q15 value is then sign-extended and written into
+ * Rd. When both the two Q15 inputs are 0x8000, saturation will happen. The result will be saturated
+ * to 0x7FFF and the overflow flag OV will be set.
+ *
+ * **Operations**:\n
+ * ~~~
+ * aop = Rs1.H[0]; bop = Rs2.H[0]; // KHMBB
+ * aop = Rs1.H[0]; bop = Rs2.H[1]; // KHMBT
+ * aop = Rs1.H[1]; bop = Rs2.H[1]; // KHMTT
+ * If (0x8000 != aop | 0x8000 != bop) {
+ *   Mresult[31:0] = aop * bop;
+ *   res[15:0] = Mresult[30:15];
+ * } else {
+ *   res[15:0] = 0x7FFF;
+ *   OV = 1;
+ * }
+ * Rd = SE32(res[15:0]); // RV32
+ * Rd = SE64(res[15:0]); // RV64
+ * ~~~
+ *
+ * \param [in]  a    unsigned int type of value stored in a
+ * \param [in]  b    unsigned int type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_KHMBB(unsigned int a, unsigned int b)
+{
+    /* Receives the destination register (operand %0) of khmbb;
+     * a and b are handed over as the two source registers (%1, %2). */
+    register long rd;
+
+    __ASM volatile("khmbb %0, %1, %2"
+                   : "=r"(rd)
+                   : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.37.1. KHMBB ===== */
+
+/* ===== Inline Function Start for 3.37.2. KHMBT ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_Q15_SAT_ALU
+ * \brief KHMBT (Signed Saturating Half Multiply B16 x T16)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KHMxy Rd, Rs1, Rs2 (xy = BB, BT, TT)
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the signed Q15 number contents of two 16-bit data in the corresponding portion
+ * of the lower 32-bit chunk in registers and then right-shift 15 bits to turn the Q30 result into a Q15
+ * number again and saturate the Q15 result into the destination register. If saturation happens, an
+ * overflow flag OV will be set.
+ *
+ * **Description**:\n
+ * Multiply the top or bottom 16-bit Q15 content of the lower 32-bit portion in Rs1 with
+ * the top or bottom 16-bit Q15 content of the lower 32-bit portion in Rs2. The Q30 result is then right-
+ * shifted 15-bits and saturated into a Q15 value. The Q15 value is then sign-extended and written into
+ * Rd. When both the two Q15 inputs are 0x8000, saturation will happen. The result will be saturated
+ * to 0x7FFF and the overflow flag OV will be set.
+ *
+ * **Operations**:\n
+ * ~~~
+ * aop = Rs1.H[0]; bop = Rs2.H[0]; // KHMBB
+ * aop = Rs1.H[0]; bop = Rs2.H[1]; // KHMBT
+ * aop = Rs1.H[1]; bop = Rs2.H[1]; // KHMTT
+ * If (0x8000 != aop | 0x8000 != bop) {
+ *   Mresult[31:0] = aop * bop;
+ *   res[15:0] = Mresult[30:15];
+ * } else {
+ *   res[15:0] = 0x7FFF;
+ *   OV = 1;
+ * }
+ * Rd = SE32(res[15:0]); // RV32
+ * Rd = SE64(res[15:0]); // RV64
+ * ~~~
+ *
+ * \param [in]  a    unsigned int type of value stored in a
+ * \param [in]  b    unsigned int type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_KHMBT(unsigned int a, unsigned int b)
+{
+    /* Receives the destination register (operand %0) of khmbt;
+     * a and b are handed over as the two source registers (%1, %2). */
+    register long rd;
+
+    __ASM volatile("khmbt %0, %1, %2"
+                   : "=r"(rd)
+                   : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.37.2. KHMBT ===== */
+
+/* ===== Inline Function Start for 3.37.3. KHMTT ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_Q15_SAT_ALU
+ * \brief KHMTT (Signed Saturating Half Multiply T16 x T16)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KHMxy Rd, Rs1, Rs2 (xy = BB, BT, TT)
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the signed Q15 number contents of two 16-bit data in the corresponding portion
+ * of the lower 32-bit chunk in registers and then right-shift 15 bits to turn the Q30 result into a Q15
+ * number again and saturate the Q15 result into the destination register. If saturation happens, an
+ * overflow flag OV will be set.
+ *
+ * **Description**:\n
+ * Multiply the top or bottom 16-bit Q15 content of the lower 32-bit portion in Rs1 with
+ * the top or bottom 16-bit Q15 content of the lower 32-bit portion in Rs2. The Q30 result is then right-
+ * shifted 15-bits and saturated into a Q15 value. The Q15 value is then sign-extended and written into
+ * Rd. When both the two Q15 inputs are 0x8000, saturation will happen. The result will be saturated
+ * to 0x7FFF and the overflow flag OV will be set.
+ *
+ * **Operations**:\n
+ * ~~~
+ * aop = Rs1.H[0]; bop = Rs2.H[0]; // KHMBB
+ * aop = Rs1.H[0]; bop = Rs2.H[1]; // KHMBT
+ * aop = Rs1.H[1]; bop = Rs2.H[1]; // KHMTT
+ * If (0x8000 != aop | 0x8000 != bop) {
+ *   Mresult[31:0] = aop * bop;
+ *   res[15:0] = Mresult[30:15];
+ * } else {
+ *   res[15:0] = 0x7FFF;
+ *   OV = 1;
+ * }
+ * Rd = SE32(res[15:0]); // RV32
+ * Rd = SE64(res[15:0]); // RV64
+ * ~~~
+ *
+ * \param [in]  a    unsigned int type of value stored in a
+ * \param [in]  b    unsigned int type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_KHMTT(unsigned int a, unsigned int b)
+{
+    /* Receives the destination register (operand %0) of khmtt;
+     * a and b are handed over as the two source registers (%1, %2). */
+    register long rd;
+
+    __ASM volatile("khmtt %0, %1, %2"
+                   : "=r"(rd)
+                   : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.37.3. KHMTT ===== */
+
+/* ===== Inline Function Start for 3.38.1. KMABB ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_32B_ADDSUB
+ * \brief KMABB (SIMD Saturating Signed Multiply Bottom Halfs & Add)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KMABB Rd, Rs1, Rs2
+ * KMABT Rd, Rs1, Rs2
+ * KMATT Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the signed 16-bit content of 32-bit elements in a register with the 16-bit content
+ * of 32-bit elements in another register and add the result to the content of 32-bit elements in the
+ * third register. The addition result may be saturated and is written to the third register.
+ * * KMABB: rd.W[x] + bottom*bottom (per 32-bit element)
+ * * KMABT rd.W[x] + bottom*top (per 32-bit element)
+ * * KMATT rd.W[x] + top*top (per 32-bit element)
+ *
+ * **Description**:\n
+ * For the `KMABB` instruction, it multiplies the bottom 16-bit content of 32-bit elements in Rs1 with
+ * the bottom 16-bit content of 32-bit elements in Rs2.
+ * For the `KMABT` instruction, it multiplies the bottom 16-bit content of 32-bit elements in Rs1 with
+ * the top 16-bit content of 32-bit elements in Rs2.
+ * For the `KMATT` instruction, it multiplies the top 16-bit content of 32-bit elements in Rs1 with the
+ * top 16-bit content of 32-bit elements in Rs2.
+ * The multiplication result is added to the content of 32-bit elements in Rd. If the addition result is
+ * beyond the Q31 number range (-2^31 <= Q31 <= 2^31-1), it is saturated to the range and the OV bit is set to
+ * 1. The results after saturation are written to Rd. The 16-bit contents of Rs1 and Rs2 are treated as
+ * signed integers.
+ *
+ * **Operations**:\n
+ * ~~~
+ * res[x] = Rd.W[x] + (Rs1.W[x].H[0] * Rs2.W[x].H[0]); // KMABB
+ * res[x] = Rd.W[x] + (Rs1.W[x].H[0] * Rs2.W[x].H[1]); // KMABT
+ * res[x] = Rd.W[x] + (Rs1.W[x].H[1] * Rs2.W[x].H[1]); // KMATT
+ * if (res[x] > (2^31)-1) {
+ *   res[x] = (2^31)-1;
+ *   OV = 1;
+ * } else if (res[x] < -2^31) {
+ *   res[x] = -2^31;
+ *   OV = 1;
+ * }
+ * Rd.W[x] = res[x];
+ * for RV32: x=0
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  t    long type of value stored in t
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_KMABB(long t, unsigned long a, unsigned long b)
+{
+    /* The "+r" operand is read and written by kmabb: it enters holding
+     * the caller's t and leaves holding the value that is returned. */
+    long acc = t;
+
+    __ASM volatile("kmabb %0, %1, %2"
+                   : "+r"(acc)
+                   : "r"(a), "r"(b));
+    return acc;
+}
+/* ===== Inline Function End for 3.38.1. KMABB ===== */
+
+/* ===== Inline Function Start for 3.38.2. KMABT ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_32B_ADDSUB
+ * \brief KMABT (SIMD Saturating Signed Multiply Bottom & Top Halfs & Add)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KMABB Rd, Rs1, Rs2
+ * KMABT Rd, Rs1, Rs2
+ * KMATT Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the signed 16-bit content of 32-bit elements in a register with the 16-bit content
+ * of 32-bit elements in another register and add the result to the content of 32-bit elements in the
+ * third register. The addition result may be saturated and is written to the third register.
+ * * KMABB: rd.W[x] + bottom*bottom (per 32-bit element)
+ * * KMABT rd.W[x] + bottom*top (per 32-bit element)
+ * * KMATT rd.W[x] + top*top (per 32-bit element)
+ *
+ * **Description**:\n
+ * For the `KMABB` instruction, it multiplies the bottom 16-bit content of 32-bit elements in Rs1 with
+ * the bottom 16-bit content of 32-bit elements in Rs2.
+ * For the `KMABT` instruction, it multiplies the bottom 16-bit content of 32-bit elements in Rs1 with
+ * the top 16-bit content of 32-bit elements in Rs2.
+ * For the `KMATT` instruction, it multiplies the top 16-bit content of 32-bit elements in Rs1 with the
+ * top 16-bit content of 32-bit elements in Rs2.
+ * The multiplication result is added to the content of 32-bit elements in Rd. If the addition result is
+ * beyond the Q31 number range (-2^31 <= Q31 <= 2^31-1), it is saturated to the range and the OV bit is set to
+ * 1. The results after saturation are written to Rd. The 16-bit contents of Rs1 and Rs2 are treated as
+ * signed integers.
+ *
+ * **Operations**:\n
+ * ~~~
+ * res[x] = Rd.W[x] + (Rs1.W[x].H[0] * Rs2.W[x].H[0]); // KMABB
+ * res[x] = Rd.W[x] + (Rs1.W[x].H[0] * Rs2.W[x].H[1]); // KMABT
+ * res[x] = Rd.W[x] + (Rs1.W[x].H[1] * Rs2.W[x].H[1]); // KMATT
+ * if (res[x] > (2^31)-1) {
+ *   res[x] = (2^31)-1;
+ *   OV = 1;
+ * } else if (res[x] < -2^31) {
+ *   res[x] = -2^31;
+ *   OV = 1;
+ * }
+ * Rd.W[x] = res[x];
+ * for RV32: x=0
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  t    long type of value stored in t
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_KMABT(long t, unsigned long a, unsigned long b)
+{
+    /* The "+r" operand is read and written by kmabt: it enters holding
+     * the caller's t and leaves holding the value that is returned. */
+    long acc = t;
+
+    __ASM volatile("kmabt %0, %1, %2"
+                   : "+r"(acc)
+                   : "r"(a), "r"(b));
+    return acc;
+}
+/* ===== Inline Function End for 3.38.2. KMABT ===== */
+
+/* ===== Inline Function Start for 3.38.3. KMATT ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_32B_ADDSUB
+ * \brief KMATT (SIMD Saturating Signed Multiply Top Halfs & Add)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KMABB Rd, Rs1, Rs2
+ * KMABT Rd, Rs1, Rs2
+ * KMATT Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the signed 16-bit content of 32-bit elements in a register with the 16-bit content
+ * of 32-bit elements in another register and add the result to the content of 32-bit elements in the
+ * third register. The addition result may be saturated and is written to the third register.
+ * * KMABB: rd.W[x] + bottom*bottom (per 32-bit element)
+ * * KMABT rd.W[x] + bottom*top (per 32-bit element)
+ * * KMATT rd.W[x] + top*top (per 32-bit element)
+ *
+ * **Description**:\n
+ * For the `KMABB` instruction, it multiplies the bottom 16-bit content of 32-bit elements in Rs1 with
+ * the bottom 16-bit content of 32-bit elements in Rs2.
+ * For the `KMABT` instruction, it multiplies the bottom 16-bit content of 32-bit elements in Rs1 with
+ * the top 16-bit content of 32-bit elements in Rs2.
+ * For the `KMATT` instruction, it multiplies the top 16-bit content of 32-bit elements in Rs1 with the
+ * top 16-bit content of 32-bit elements in Rs2.
+ * The multiplication result is added to the content of 32-bit elements in Rd. If the addition result is
+ * beyond the Q31 number range (-2^31 <= Q31 <= 2^31-1), it is saturated to the range and the OV bit is set to
+ * 1. The results after saturation are written to Rd. The 16-bit contents of Rs1 and Rs2 are treated as
+ * signed integers.
+ *
+ * **Operations**:\n
+ * ~~~
+ * res[x] = Rd.W[x] + (Rs1.W[x].H[0] * Rs2.W[x].H[0]); // KMABB
+ * res[x] = Rd.W[x] + (Rs1.W[x].H[0] * Rs2.W[x].H[1]); // KMABT
+ * res[x] = Rd.W[x] + (Rs1.W[x].H[1] * Rs2.W[x].H[1]); // KMATT
+ * if (res[x] > (2^31)-1) {
+ *   res[x] = (2^31)-1;
+ *   OV = 1;
+ * } else if (res[x] < -2^31) {
+ *   res[x] = -2^31;
+ *   OV = 1;
+ * }
+ * Rd.W[x] = res[x];
+ * for RV32: x=0
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  t    long type of value stored in t
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_KMATT(long t, unsigned long a, unsigned long b)
+{
+    __ASM volatile("kmatt %[acc], %[rs1], %[rs2]" : [acc] "+r"(t) : [rs1] "r"(a), [rs2] "r"(b));
+    return t; /* t is a read-write ("+r") operand: addend on entry, instruction result on exit */
+}
+/* ===== Inline Function End for 3.38.3. KMATT ===== */
+
+/* ===== Inline Function Start for 3.39.1. KMADA ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_32B_ADDSUB
+ * \brief KMADA (SIMD Saturating Signed Multiply Two Halfs and Two Adds)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KMADA Rd, Rs1, Rs2
+ * KMAXDA Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do two signed 16-bit multiplications from 32-bit elements in two registers; and then adds
+ * the two 32-bit results and 32-bit elements in a third register together. The addition result may be
+ * saturated.
+ * * KMADA: rd.W[x] + top*top + bottom*bottom (per 32-bit element)
+ * * KMAXDA: rd.W[x] + top*bottom + bottom*top (per 32-bit element)
+ *
+ * **Description**:\n
+ * For the `KMADA` instruction, it multiplies the bottom 16-bit content of 32-bit elements in Rs1 with
+ * the bottom 16-bit content of 32-bit elements in Rs2 and then adds the result to the result of
+ * multiplying the top 16-bit content of 32-bit elements in Rs1 with the top 16-bit content of 32-bit
+ * elements in Rs2.
+ * For the `KMAXDA` instruction, it multiplies the top 16-bit content of 32-bit elements in Rs1 with the
+ * bottom 16-bit content of 32-bit elements in Rs2 and then adds the result to the result of multiplying
+ * the bottom 16-bit content of 32-bit elements in Rs1 with the top 16-bit content of 32-bit elements in
+ * Rs2.
+ * The result is added to the content of 32-bit elements in Rd. If the addition result is beyond the Q31
+ * number range (-2^31 <= Q31 <= 2^31-1), it is saturated to the range and the OV bit is set to 1. The 32-bit
+ * results after saturation are written to Rd. The 16-bit contents of Rs1 and Rs2 are treated as signed
+ * integers.
+ *
+ * **Operations**:\n
+ * ~~~
+ * // KMADA
+ * res[x] = Rd.W[x] + (Rs1.W[x].H[1] * Rs2.W[x].H[1]) + (Rs1.W[x].H[0] * Rs2.W[x].H[0]);
+ * // KMAXDA
+ * res[x] = Rd.W[x] + (Rs1.W[x].H[1] * Rs2.W[x].H[0]) + (Rs1.W[x].H[0] * Rs2.W[x].H[1]);
+ * if (res[x] > (2^31)-1) {
+ *   res[x] = (2^31)-1;
+ *   OV = 1;
+ * } else if (res[x] < -2^31) {
+ *   res[x] = -2^31;
+ *   OV = 1;
+ * }
+ * Rd.W[x] = res[x];
+ * for RV32: x=0
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  t    long type of value stored in t
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_KMADA(long t, unsigned long a, unsigned long b)
+{
+    __ASM volatile("kmada %[acc], %[rs1], %[rs2]" : [acc] "+r"(t) : [rs1] "r"(a), [rs2] "r"(b));
+    return t; /* t is a read-write ("+r") operand: addend on entry, instruction result on exit */
+}
+/* ===== Inline Function End for 3.39.1. KMADA ===== */
+
+/* ===== Inline Function Start for 3.39.2. KMAXDA ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_32B_ADDSUB
+ * \brief KMAXDA (SIMD Saturating Signed Crossed Multiply Two Halfs and Two Adds)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KMADA Rd, Rs1, Rs2
+ * KMAXDA Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do two signed 16-bit multiplications from 32-bit elements in two registers; and then adds
+ * the two 32-bit results and 32-bit elements in a third register together. The addition result may be
+ * saturated.
+ * * KMADA: rd.W[x] + top*top + bottom*bottom (per 32-bit element)
+ * * KMAXDA: rd.W[x] + top*bottom + bottom*top (per 32-bit element)
+ *
+ * **Description**:\n
+ * For the `KMADA` instruction, it multiplies the bottom 16-bit content of 32-bit elements in Rs1 with
+ * the bottom 16-bit content of 32-bit elements in Rs2 and then adds the result to the result of
+ * multiplying the top 16-bit content of 32-bit elements in Rs1 with the top 16-bit content of 32-bit
+ * elements in Rs2.
+ * For the `KMAXDA` instruction, it multiplies the top 16-bit content of 32-bit elements in Rs1 with the
+ * bottom 16-bit content of 32-bit elements in Rs2 and then adds the result to the result of multiplying
+ * the bottom 16-bit content of 32-bit elements in Rs1 with the top 16-bit content of 32-bit elements in
+ * Rs2.
+ * The result is added to the content of 32-bit elements in Rd. If the addition result is beyond the Q31
+ * number range (-2^31 <= Q31 <= 2^31-1), it is saturated to the range and the OV bit is set to 1. The 32-bit
+ * results after saturation are written to Rd. The 16-bit contents of Rs1 and Rs2 are treated as signed
+ * integers.
+ *
+ * **Operations**:\n
+ * ~~~
+ * // KMADA
+ * res[x] = Rd.W[x] + (Rs1.W[x].H[1] * Rs2.W[x].H[1]) + (Rs1.W[x].H[0] * Rs2.W[x].H[0]);
+ * // KMAXDA
+ * res[x] = Rd.W[x] + (Rs1.W[x].H[1] * Rs2.W[x].H[0]) + (Rs1.W[x].H[0] * Rs2.W[x].H[1]);
+ * if (res[x] > (2^31)-1) {
+ *   res[x] = (2^31)-1;
+ *   OV = 1;
+ * } else if (res[x] < -2^31) {
+ *   res[x] = -2^31;
+ *   OV = 1;
+ * }
+ * Rd.W[x] = res[x];
+ * for RV32: x=0
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  t    long type of value stored in t
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_KMAXDA(long t, unsigned long a, unsigned long b)
+{
+    __ASM volatile("kmaxda %[acc], %[rs1], %[rs2]" : [acc] "+r"(t) : [rs1] "r"(a), [rs2] "r"(b));
+    return t; /* t is a read-write ("+r") operand: addend on entry, instruction result on exit */
+}
+/* ===== Inline Function End for 3.39.2. KMAXDA ===== */
+
+/* ===== Inline Function Start for 3.40.1. KMADS ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_32B_ADDSUB
+ * \brief KMADS (SIMD Saturating Signed Multiply Two Halfs & Subtract & Add)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KMADS Rd, Rs1, Rs2
+ * KMADRS Rd, Rs1, Rs2
+ * KMAXDS Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do two signed 16-bit multiplications from 32-bit elements in two registers; and then
+ * perform a subtraction operation between the two 32-bit results. Then add the subtraction result to
+ * the corresponding 32-bit elements in a third register. The addition result may be saturated.
+ * * KMADS: rd.W[x] + (top*top - bottom*bottom) (per 32-bit element)
+ * * KMADRS: rd.W[x] + (bottom*bottom - top*top) (per 32-bit element)
+ * * KMAXDS: rd.W[x] + (top*bottom - bottom*top) (per 32-bit element)
+ *
+ * **Description**:\n
+ * For the `KMADS` instruction, it multiplies the bottom 16-bit content of 32-bit elements in Rs1 with
+ * the bottom 16-bit content of 32-bit elements in Rs2 and then subtracts the result from the result of
+ * multiplying the top 16-bit content of 32-bit elements in Rs1 with the top 16-bit content of 32-bit
+ * elements in Rs2.
+ * For the `KMADRS` instruction, it multiplies the top 16-bit content of 32-bit elements in Rs1 with the
+ * top 16-bit content of 32-bit elements in Rs2 and then subtracts the result from the result of
+ * multiplying the bottom 16-bit content of 32-bit elements in Rs1 with the bottom 16-bit content of
+ * 32-bit elements in Rs2.
+ * For the `KMAXDS` instruction, it multiplies the bottom 16-bit content of 32-bit elements in Rs1 with
+ * the top 16-bit content of 32-bit elements in Rs2 and then subtracts the result from the result of
+ * multiplying the top 16-bit content of 32-bit elements in Rs1 with the bottom 16-bit content of 32-bit
+ * elements in Rs2.
+ * The subtraction result is then added to the content of the corresponding 32-bit elements in Rd. If the
+ * addition result is beyond the Q31 number range (-2^31 <= Q31 <= 2^31-1), it is saturated to the range and
+ * the OV bit is set to 1. The 32-bit results after saturation are written to Rd. The 16-bit contents of Rs1
+ * and Rs2 are treated as signed integers.
+ *
+ * **Operations**:\n
+ * ~~~
+ * // KMADS
+ * res[x] = Rd.W[x] + (Rs1.W[x].H[1] * Rs2.W[x].H[1]) - (Rs1.W[x].H[0] * Rs2.W[x].H[0]);
+ * // KMADRS
+ * res[x] = Rd.W[x] + (Rs1.W[x].H[0] * Rs2.W[x].H[0]) - (Rs1.W[x].H[1] * Rs2.W[x].H[1]);
+ * // KMAXDS
+ * res[x] = Rd.W[x] + (Rs1.W[x].H[1] * Rs2.W[x].H[0]) - (Rs1.W[x].H[0] * Rs2.W[x].H[1]);
+ * if (res[x] > (2^31)-1) {
+ *   res[x] = (2^31)-1;
+ *   OV = 1;
+ * } else if (res[x] < -2^31) {
+ *   res[x] = -2^31;
+ *   OV = 1;
+ * }
+ * Rd.W[x] = res[x];
+ * for RV32: x=0
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  t    long type of value stored in t
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_KMADS(long t, unsigned long a, unsigned long b)
+{
+    __ASM volatile("kmads %[acc], %[rs1], %[rs2]" : [acc] "+r"(t) : [rs1] "r"(a), [rs2] "r"(b));
+    return t; /* t is a read-write ("+r") operand: addend on entry, instruction result on exit */
+}
+/* ===== Inline Function End for 3.40.1. KMADS ===== */
+
+/* ===== Inline Function Start for 3.40.2. KMADRS ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_32B_ADDSUB
+ * \brief KMADRS (SIMD Saturating Signed Multiply Two Halfs & Reverse Subtract & Add)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KMADS Rd, Rs1, Rs2
+ * KMADRS Rd, Rs1, Rs2
+ * KMAXDS Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do two signed 16-bit multiplications from 32-bit elements in two registers; and then
+ * perform a subtraction operation between the two 32-bit results. Then add the subtraction result to
+ * the corresponding 32-bit elements in a third register. The addition result may be saturated.
+ * * KMADS: rd.W[x] + (top*top - bottom*bottom) (per 32-bit element)
+ * * KMADRS: rd.W[x] + (bottom*bottom - top*top) (per 32-bit element)
+ * * KMAXDS: rd.W[x] + (top*bottom - bottom*top) (per 32-bit element)
+ *
+ * **Description**:\n
+ * For the `KMADS` instruction, it multiplies the bottom 16-bit content of 32-bit elements in Rs1 with
+ * the bottom 16-bit content of 32-bit elements in Rs2 and then subtracts the result from the result of
+ * multiplying the top 16-bit content of 32-bit elements in Rs1 with the top 16-bit content of 32-bit
+ * elements in Rs2.
+ * For the `KMADRS` instruction, it multiplies the top 16-bit content of 32-bit elements in Rs1 with the
+ * top 16-bit content of 32-bit elements in Rs2 and then subtracts the result from the result of
+ * multiplying the bottom 16-bit content of 32-bit elements in Rs1 with the bottom 16-bit content of
+ * 32-bit elements in Rs2.
+ * For the `KMAXDS` instruction, it multiplies the bottom 16-bit content of 32-bit elements in Rs1 with
+ * the top 16-bit content of 32-bit elements in Rs2 and then subtracts the result from the result of
+ * multiplying the top 16-bit content of 32-bit elements in Rs1 with the bottom 16-bit content of 32-bit
+ * elements in Rs2.
+ * The subtraction result is then added to the content of the corresponding 32-bit elements in Rd. If the
+ * addition result is beyond the Q31 number range (-2^31 <= Q31 <= 2^31-1), it is saturated to the range and
+ * the OV bit is set to 1. The 32-bit results after saturation are written to Rd. The 16-bit contents of Rs1
+ * and Rs2 are treated as signed integers.
+ *
+ * **Operations**:\n
+ * ~~~
+ * // KMADS
+ * res[x] = Rd.W[x] + (Rs1.W[x].H[1] * Rs2.W[x].H[1]) - (Rs1.W[x].H[0] * Rs2.W[x].H[0]);
+ * // KMADRS
+ * res[x] = Rd.W[x] + (Rs1.W[x].H[0] * Rs2.W[x].H[0]) - (Rs1.W[x].H[1] * Rs2.W[x].H[1]);
+ * // KMAXDS
+ * res[x] = Rd.W[x] + (Rs1.W[x].H[1] * Rs2.W[x].H[0]) - (Rs1.W[x].H[0] * Rs2.W[x].H[1]);
+ * if (res[x] > (2^31)-1) {
+ *   res[x] = (2^31)-1;
+ *   OV = 1;
+ * } else if (res[x] < -2^31) {
+ *   res[x] = -2^31;
+ *   OV = 1;
+ * }
+ * Rd.W[x] = res[x];
+ * for RV32: x=0
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  t    long type of value stored in t
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_KMADRS(long t, unsigned long a, unsigned long b)
+{
+    __ASM volatile("kmadrs %[acc], %[rs1], %[rs2]" : [acc] "+r"(t) : [rs1] "r"(a), [rs2] "r"(b));
+    return t; /* t is a read-write ("+r") operand: addend on entry, instruction result on exit */
+}
+/* ===== Inline Function End for 3.40.2. KMADRS ===== */
+
+/* ===== Inline Function Start for 3.40.3. KMAXDS ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_32B_ADDSUB
+ * \brief KMAXDS (SIMD Saturating Signed Crossed Multiply Two Halfs & Subtract & Add)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KMADS Rd, Rs1, Rs2
+ * KMADRS Rd, Rs1, Rs2
+ * KMAXDS Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do two signed 16-bit multiplications from 32-bit elements in two registers; and then
+ * perform a subtraction operation between the two 32-bit results. Then add the subtraction result to
+ * the corresponding 32-bit elements in a third register. The addition result may be saturated.
+ * * KMADS: rd.W[x] + (top*top - bottom*bottom) (per 32-bit element)
+ * * KMADRS: rd.W[x] + (bottom*bottom - top*top) (per 32-bit element)
+ * * KMAXDS: rd.W[x] + (top*bottom - bottom*top) (per 32-bit element)
+ *
+ * **Description**:\n
+ * For the `KMADS` instruction, it multiplies the bottom 16-bit content of 32-bit elements in Rs1 with
+ * the bottom 16-bit content of 32-bit elements in Rs2 and then subtracts the result from the result of
+ * multiplying the top 16-bit content of 32-bit elements in Rs1 with the top 16-bit content of 32-bit
+ * elements in Rs2.
+ * For the `KMADRS` instruction, it multiplies the top 16-bit content of 32-bit elements in Rs1 with the
+ * top 16-bit content of 32-bit elements in Rs2 and then subtracts the result from the result of
+ * multiplying the bottom 16-bit content of 32-bit elements in Rs1 with the bottom 16-bit content of
+ * 32-bit elements in Rs2.
+ * For the `KMAXDS` instruction, it multiplies the bottom 16-bit content of 32-bit elements in Rs1 with
+ * the top 16-bit content of 32-bit elements in Rs2 and then subtracts the result from the result of
+ * multiplying the top 16-bit content of 32-bit elements in Rs1 with the bottom 16-bit content of 32-bit
+ * elements in Rs2.
+ * The subtraction result is then added to the content of the corresponding 32-bit elements in Rd. If the
+ * addition result is beyond the Q31 number range (-2^31 <= Q31 <= 2^31-1), it is saturated to the range and
+ * the OV bit is set to 1. The 32-bit results after saturation are written to Rd. The 16-bit contents of Rs1
+ * and Rs2 are treated as signed integers.
+ *
+ * **Operations**:\n
+ * ~~~
+ * // KMADS
+ * res[x] = Rd.W[x] + (Rs1.W[x].H[1] * Rs2.W[x].H[1]) - (Rs1.W[x].H[0] * Rs2.W[x].H[0]);
+ * // KMADRS
+ * res[x] = Rd.W[x] + (Rs1.W[x].H[0] * Rs2.W[x].H[0]) - (Rs1.W[x].H[1] * Rs2.W[x].H[1]);
+ * // KMAXDS
+ * res[x] = Rd.W[x] + (Rs1.W[x].H[1] * Rs2.W[x].H[0]) - (Rs1.W[x].H[0] * Rs2.W[x].H[1]);
+ * if (res[x] > (2^31)-1) {
+ *   res[x] = (2^31)-1;
+ *   OV = 1;
+ * } else if (res[x] < -2^31) {
+ *   res[x] = -2^31;
+ *   OV = 1;
+ * }
+ * Rd.W[x] = res[x];
+ * for RV32: x=0
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  t    long type of value stored in t
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_KMAXDS(long t, unsigned long a, unsigned long b)
+{
+    __ASM volatile("kmaxds %[acc], %[rs1], %[rs2]" : [acc] "+r"(t) : [rs1] "r"(a), [rs2] "r"(b));
+    return t; /* t is a read-write ("+r") operand: addend on entry, instruction result on exit */
+}
+/* ===== Inline Function End for 3.40.3. KMAXDS ===== */
+
+/* ===== Inline Function Start for 3.41. KMAR64 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_32B_MULT_64B_ADDSUB
+ * \brief KMAR64 (Signed Multiply and Saturating Add to 64-Bit Data)
+ * \details
+ * **Type**: DSP (64-bit Profile)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KMAR64 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the 32-bit signed elements in two registers and add the 64-bit multiplication
+ * results to the 64-bit signed data of a pair of registers (RV32) or a register (RV64). The result is
+ * saturated to the Q63 range and written back to the pair of registers (RV32) or the register (RV64).
+ *
+ * **RV32 Description**:\n
+ * This instruction multiplies the 32-bit signed data of Rs1 with that of Rs2. It adds
+ * the 64-bit multiplication result to the 64-bit signed data of an even/odd pair of registers specified by
+ * Rd(4,1) with unlimited precision. If the 64-bit addition result is beyond the Q63 number range (-2^63 <=
+ * Q63 <= 2^63-1), it is saturated to the range and the OV bit is set to 1. The saturated result is written back
+ * to the even/odd pair of registers specified by Rd(4,1).
+ * Rx(4,1), i.e., value d, determines the even/odd pair group of two registers. Specifically, the register
+ * pair includes register 2d and 2d+1.
+ * The odd `2d+1` register of the pair contains the high 32-bit of the result and the even `2d` register
+ * of the pair contains the low 32-bit of the result.
+ *
+ * **RV64 Description**:\n
+ * This instruction multiplies the 32-bit signed elements of Rs1 with that of Rs2. It
+ * adds the 64-bit multiplication results to the 64-bit signed data of Rd with unlimited precision. If the
+ * 64-bit addition result is beyond the Q63 number range (-2^63 <= Q63 <= 2^63-1), it is saturated to the range
+ * and the OV bit is set to 1. The saturated result is written back to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * RV32:
+ * t_L = CONCAT(Rd(4,1),1'b0); t_H = CONCAT(Rd(4,1),1'b1);
+ * result = R[t_H].R[t_L] + (Rs1 * Rs2);
+ * if (result > (2^63)-1) {
+ *   result = (2^63)-1; OV = 1;
+ * } else if (result < -2^63) {
+ *   result = -2^63; OV = 1;
+ * }
+ * R[t_H].R[t_L] = result;
+ * RV64:
+ * // `result` has unlimited precision
+ * result = Rd + (Rs1.W[0] * Rs2.W[0]) + (Rs1.W[1] * Rs2.W[1]);
+ * if (result > (2^63)-1) {
+ *   result = (2^63)-1; OV = 1;
+ * } else if (result < -2^63) {
+ *   result = -2^63; OV = 1;
+ * }
+ * Rd = result;
+ * ~~~
+ *
+ * \param [in]  t    long long type of value stored in t
+ * \param [in]  a    long type of value stored in a
+ * \param [in]  b    long type of value stored in b
+ * \return value stored in long long type
+ */
+__STATIC_FORCEINLINE long long __RV_KMAR64(long long t, long a, long b)
+{
+    __ASM volatile("kmar64 %0, %1, %2" : "+r"(t) : "r"(a), "r"(b)); /* "+r": t is read as the 64-bit addend and overwritten with the result */
+    return t; /* NOTE(review): on RV32 the instruction addresses t as an even/odd register pair (Rd(4,1) in the description above) - confirm the toolchain allocates 64-bit "r" operands that way */
+}
+/* ===== Inline Function End for 3.41. KMAR64 ===== */
+
+/* ===== Inline Function Start for 3.42.1. KMDA ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_32B_ADDSUB
+ * \brief KMDA (SIMD Signed Multiply Two Halfs and Add)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KMDA Rd, Rs1, Rs2
+ * KMXDA Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do two signed 16-bit multiplications from the 32-bit elements of two registers; and then
+ * adds the two 32-bit results together. The addition result may be saturated.
+ * * KMDA: top*top + bottom*bottom (per 32-bit element)
+ * * KMXDA: top*bottom + bottom*top (per 32-bit element)
+ *
+ * **Description**:\n
+ * For the `KMDA` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1
+ * with the bottom 16-bit content of the 32-bit elements of Rs2 and then adds the result to the result of
+ * multiplying the top 16-bit content of the 32-bit elements of Rs1 with the top 16-bit content of the
+ * 32-bit elements of Rs2.
+ * For the `KMXDA` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1
+ * with the top 16-bit content of the 32-bit elements of Rs2 and then adds the result to the result of
+ * multiplying the top 16-bit content of the 32-bit elements of Rs1 with the bottom 16-bit content of the
+ * 32-bit elements of Rs2.
+ * The addition result is checked for saturation. If saturation happens, the result is saturated to 2^31-1.
+ * The final results are written to Rd. The 16-bit contents are treated as signed integers.
+ *
+ * **Operations**:\n
+ * ~~~
+ * if (Rs1.W[x] != 0x80008000) or (Rs2.W[x] != 0x80008000) {
+ *   Rd.W[x] = (Rs1.W[x].H[1] * Rs2.W[x].H[1]) + (Rs1.W[x].H[0] * Rs2.W[x].H[0]); // KMDA
+ *   Rd.W[x] = (Rs1.W[x].H[1] * Rs2.W[x].H[0]) + (Rs1.W[x].H[0] * Rs2.W[x].H[1]); // KMXDA
+ * } else { Rd.W[x] = 0x7fffffff; OV = 1; } // for RV32: x=0, for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_KMDA(unsigned long a, unsigned long b)
+{
+    long result; /* no `register`: "=r" already requests a general register, and the keyword was removed in C++17 */
+    __ASM volatile("kmda %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); /* "=r": result is write-only, a and b are inputs */
+    return result;
+}
+/* ===== Inline Function End for 3.42.1. KMDA ===== */
+
+/* ===== Inline Function Start for 3.42.2. KMXDA ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_32B_ADDSUB
+ * \brief KMXDA (SIMD Signed Crossed Multiply Two Halfs and Add)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KMDA Rd, Rs1, Rs2
+ * KMXDA Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do two signed 16-bit multiplications from the 32-bit elements of two registers; and then
+ * adds the two 32-bit results together. The addition result may be saturated.
+ * * KMDA: top*top + bottom*bottom (per 32-bit element)
+ * * KMXDA: top*bottom + bottom*top (per 32-bit element)
+ *
+ * **Description**:\n
+ * For the `KMDA` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1
+ * with the bottom 16-bit content of the 32-bit elements of Rs2 and then adds the result to the result of
+ * multiplying the top 16-bit content of the 32-bit elements of Rs1 with the top 16-bit content of the
+ * 32-bit elements of Rs2.
+ * For the `KMXDA` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1
+ * with the top 16-bit content of the 32-bit elements of Rs2 and then adds the result to the result of
+ * multiplying the top 16-bit content of the 32-bit elements of Rs1 with the bottom 16-bit content of the
+ * 32-bit elements of Rs2.
+ * The addition result is checked for saturation. If saturation happens, the result is saturated to 2^31-1.
+ * The final results are written to Rd. The 16-bit contents are treated as signed integers.
+ *
+ * **Operations**:\n
+ * ~~~
+ * if (Rs1.W[x] != 0x80008000) or (Rs2.W[x] != 0x80008000) {
+ *   Rd.W[x] = (Rs1.W[x].H[1] * Rs2.W[x].H[1]) + (Rs1.W[x].H[0] * Rs2.W[x].H[0]); // KMDA
+ *   Rd.W[x] = (Rs1.W[x].H[1] * Rs2.W[x].H[0]) + (Rs1.W[x].H[0] * Rs2.W[x].H[1]); // KMXDA
+ * } else { Rd.W[x] = 0x7fffffff; OV = 1; } // for RV32: x=0, for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_KMXDA(unsigned long a, unsigned long b)
+{
+    long result; /* no `register`: "=r" already requests a general register, and the keyword was removed in C++17 */
+    __ASM volatile("kmxda %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); /* "=r": result is write-only, a and b are inputs */
+    return result;
+}
+/* ===== Inline Function End for 3.42.2. KMXDA ===== */
+
+/* ===== Inline Function Start for 3.43.1. KMMAC ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_MSW_32X32_MAC
+ * \brief KMMAC (SIMD Saturating MSW Signed Multiply Word and Add)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KMMAC Rd, Rs1, Rs2
+ * KMMAC.u Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the signed 32-bit integer elements of two registers and add the most significant
+ * 32-bit results with the signed 32-bit integer elements of a third register. The addition results are
+ * saturated first and then written back to the third register. The `.u` form performs an additional
+ * rounding up operation on the multiplication results before adding the most significant 32-bit part
+ * of the results.
+ *
+ * **Description**:\n
+ * This instruction multiplies the signed 32-bit elements of Rs1 with the signed 32-bit elements of Rs2
+ * and adds the most significant 32-bit multiplication results with the signed 32-bit elements of Rd. If
+ * the addition result is beyond the Q31 number range (-2^31 <= Q31 <= 2^31-1), it is saturated to the range
+ * and the OV bit is set to 1. The results after saturation are written to Rd. The `.u` form of the
+ * instruction additionally rounds up the most significant 32-bit of the 64-bit multiplication results by
+ * adding a 1 to bit 31 of the results.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Mres[x][63:0] = Rs1.W[x] * Rs2.W[x];
+ * if (`.u` form) {
+ *   Round[x][32:0] = Mres[x][63:31] + 1;
+ *   res[x] = Rd.W[x] + Round[x][32:1];
+ * } else {
+ *   res[x] = Rd.W[x] + Mres[x][63:32];
+ * }
+ * if (res[x] > (2^31)-1) {
+ *   res[x] = (2^31)-1;
+ *   OV = 1;
+ * } else if (res[x] < -2^31) {
+ *   res[x] = -2^31;
+ *   OV = 1;
+ * }
+ * Rd.W[x] = res[x];
+ * for RV32: x=0
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  t    long type of value stored in t
+ * \param [in]  a    long type of value stored in a
+ * \param [in]  b    long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_KMMAC(long t, long a, long b)
+{
+    __ASM volatile("kmmac %[acc], %[rs1], %[rs2]" : [acc] "+r"(t) : [rs1] "r"(a), [rs2] "r"(b));
+    return t; /* t is a read-write ("+r") operand: addend on entry, instruction result on exit */
+}
+/* ===== Inline Function End for 3.43.1. KMMAC ===== */
+
+/* ===== Inline Function Start for 3.43.2. KMMAC.u ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_MSW_32X32_MAC
+ * \brief KMMAC.u (SIMD Saturating MSW Signed Multiply Word and Add with Rounding)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KMMAC Rd, Rs1, Rs2
+ * KMMAC.u Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the signed 32-bit integer elements of two registers and add the most significant
+ * 32-bit results with the signed 32-bit integer elements of a third register. The addition results are
+ * saturated first and then written back to the third register. The `.u` form performs an additional
+ * rounding up operation on the multiplication results before adding the most significant 32-bit part
+ * of the results.
+ *
+ * **Description**:\n
+ * This instruction multiplies the signed 32-bit elements of Rs1 with the signed 32-bit elements of Rs2
+ * and adds the most significant 32-bit multiplication results with the signed 32-bit elements of Rd. If
+ * the addition result is beyond the Q31 number range (-2^31 <= Q31 <= 2^31-1), it is saturated to the range
+ * and the OV bit is set to 1. The results after saturation are written to Rd. The `.u` form of the
+ * instruction additionally rounds up the most significant 32-bit of the 64-bit multiplication results by
+ * adding a 1 to bit 31 of the results.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Mres[x][63:0] = Rs1.W[x] * Rs2.W[x];
+ * if (`.u` form) {
+ *   Round[x][32:0] = Mres[x][63:31] + 1;
+ *   res[x] = Rd.W[x] + Round[x][32:1];
+ * } else {
+ *   res[x] = Rd.W[x] + Mres[x][63:32];
+ * }
+ * if (res[x] > (2^31)-1) {
+ *   res[x] = (2^31)-1;
+ *   OV = 1;
+ * } else if (res[x] < -2^31) {
+ *   res[x] = -2^31;
+ *   OV = 1;
+ * }
+ * Rd.W[x] = res[x];
+ * for RV32: x=0
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  t    long type of value stored in t
+ * \param [in]  a    long type of value stored in a
+ * \param [in]  b    long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_KMMAC_U(long t, long a, long b)
+{
+    __ASM volatile("kmmac.u %[acc], %[rs1], %[rs2]" : [acc] "+r"(t) : [rs1] "r"(a), [rs2] "r"(b));
+    return t; /* t is a read-write ("+r") operand: addend on entry, instruction result on exit */
+}
+/* ===== Inline Function End for 3.43.2. KMMAC.u ===== */
+
+/* ===== Inline Function Start for 3.44.1. KMMAWB ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_MSW_32X16_MAC
+ * \brief KMMAWB (SIMD Saturating MSW Signed Multiply Word and Bottom Half and Add)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KMMAWB Rd, Rs1, Rs2
+ * KMMAWB.u Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the signed 32-bit integer elements of one register and the bottom 16-bit of the
+ * corresponding 32-bit elements of another register and add the most significant 32-bit results with
+ * the corresponding signed 32-bit elements of a third register. The addition result is written to the
+ * corresponding 32-bit elements of the third register. The `.u` form rounds up the multiplication
+ * results from the most significant discarded bit before the addition operations.
+ *
+ * **Description**:\n
+ * This instruction multiplies the signed 32-bit elements of Rs1 with the signed bottom 16-bit content
+ * of the corresponding 32-bit elements of Rs2 and adds the most significant 32-bit multiplication
+ * results with the corresponding signed 32-bit elements of Rd. If the addition result is beyond the Q31
+ * number range (-2^31 <= Q31 <= 2^31-1), it is saturated to the range and the OV bit is set to 1. The results
+ * after saturation are written to the corresponding 32-bit elements of Rd. The `.u` form of the
+ * instruction rounds up the most significant 32-bit of the 48-bit multiplication results by adding a 1 to
+ * bit 15 of the result before the addition operations.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Mres[x][47:0] = Rs1.W[x] * Rs2.W[x].H[0];
+ * if (`.u` form) {
+ *   Round[x][32:0] = Mres[x][47:15] + 1;
+ *   res[x] = Rd.W[x] + Round[x][32:1];
+ * } else {
+ *   res[x] = Rd.W[x] + Mres[x][47:16];
+ * }
+ * if (res[x] > (2^31)-1) {
+ *   res[x] = (2^31)-1;
+ *   OV = 1;
+ * } else if (res[x] < -2^31) {
+ *   res[x] = -2^31;
+ *   OV = 1;
+ * }
+ * Rd.W[x] = res[x];
+ * for RV32: x=0
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  t    long type of value stored in t
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_KMMAWB(long t, unsigned long a, unsigned long b)
+{
+    __ASM volatile("kmmawb %[acc], %[rs1], %[rs2]" : [acc] "+r"(t) : [rs1] "r"(a), [rs2] "r"(b));
+    return t; /* t is a read-write ("+r") operand: addend on entry, instruction result on exit */
+}
+/* ===== Inline Function End for 3.44.1. KMMAWB ===== */
+
+/* ===== Inline Function Start for 3.44.2. KMMAWB.u ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_MSW_32X16_MAC
+ * \brief KMMAWB.u (SIMD Saturating MSW Signed Multiply Word and Bottom Half and Add with Rounding)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KMMAWB Rd, Rs1, Rs2
+ * KMMAWB.u Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the signed 32-bit integer elements of one register and the bottom 16-bit of the
+ * corresponding 32-bit elements of another register and add the most significant 32-bit results with
+ * the corresponding signed 32-bit elements of a third register. The addition result is written to the
+ * corresponding 32-bit elements of the third register. The `.u` form rounds up the multiplication
+ * results from the most significant discarded bit before the addition operations.
+ *
+ * **Description**:\n
+ * This instruction multiplies the signed 32-bit elements of Rs1 with the signed bottom 16-bit content
+ * of the corresponding 32-bit elements of Rs2 and adds the most significant 32-bit multiplication
+ * results with the corresponding signed 32-bit elements of Rd. If the addition result is beyond the Q31
+ * number range (-2^31 <= Q31 <= 2^31-1), it is saturated to the range and the OV bit is set to 1. The results
+ * after saturation are written to the corresponding 32-bit elements of Rd. The `.u` form of the
+ * instruction rounds up the most significant 32-bit of the 48-bit multiplication results by adding a 1 to
+ * bit 15 of the result before the addition operations.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Mres[x][47:0] = Rs1.W[x] * Rs2.W[x].H[0];
+ * if (`.u` form) {
+ *   Round[x][32:0] = Mres[x][47:15] + 1;
+ *   res[x] = Rd.W[x] + Round[x][32:1];
+ * } else {
+ *   res[x] = Rd.W[x] + Mres[x][47:16];
+ * }
+ * if (res[x] > (2^31)-1) {
+ *   res[x] = (2^31)-1;
+ *   OV = 1;
+ * } else if (res[x] < -2^31) {
+ *   res[x] = -2^31;
+ *   OV = 1;
+ * }
+ * Rd.W[x] = res[x];
+ * for RV32: x=0
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  t    long type of value stored in t
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_KMMAWB_U(long t, unsigned long a, unsigned long b)
+{
+    __ASM volatile("kmmawb.u %0, %1, %2" : "+r"(t) : "r"(a), "r"(b)); /* "+r": t is both input and output (%0) */
+    return t; /* value left in %0 by the instruction */
+}
+/* ===== Inline Function End for 3.44.2. KMMAWB.u ===== */
+
+/* ===== Inline Function Start for 3.45.1. KMMAWB2 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_MSW_32X16_MAC
+ * \brief KMMAWB2 (SIMD Saturating MSW Signed Multiply Word and Bottom Half & 2 and Add)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KMMAWB2 Rd, Rs1, Rs2
+ * KMMAWB2.u Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the signed 32-bit elements of one register and the bottom 16-bit of the
+ * corresponding 32-bit elements of another register, double the multiplication results and add the
+ * saturated most significant 32-bit results with the corresponding signed 32-bit elements of a third
+ * register. The saturated addition result is written to the corresponding 32-bit elements of the third
+ * register. The `.u` form rounds up the multiplication results from the most significant discarded bit
+ * before the addition operations.
+ *
+ * **Description**:\n
+ * This instruction multiplies the signed 32-bit Q31 elements of Rs1 with the signed bottom 16-bit Q15
+ * content of the corresponding 32-bit elements of Rs2, doubles the Q46 results to Q47 numbers and
+ * adds the saturated most significant 32-bit Q31 multiplication results with the corresponding signed
+ * 32-bit elements of Rd. If the addition result is beyond the Q31 number range (-2^31 <= Q31 <= 2^31-1), it is
+ * saturated to the range and the OV bit is set to 1. The results after saturation are written to the
+ * corresponding 32-bit elements of Rd. The `.u` form of the instruction rounds up the most significant
+ * 32-bit of the 48-bit Q47 multiplication results by adding a 1 to bit 15 (i.e., bit 14 before doubling) of
+ * the result before the addition operations.
+ *
+ * **Operations**:\n
+ * ~~~
+ * if ((Rs1.W[x] == 0x80000000) & (Rs2.W[x].H[0] == 0x8000)) {
+ *   addop.W[x] = 0x7fffffff;
+ *   OV = 1;
+ * } else {
+ *   Mres[x][47:0] = Rs1.W[x] s* Rs2.W[x].H[0];
+ *   if (`.u` form) {
+ *     Mres[x][47:14] = Mres[x][47:14] + 1;
+ *   }
+ *   addop.W[x] = Mres[x][46:15]; // doubling
+ * }
+ * res[x] = Rd.W[x] + addop.W[x];
+ * if (res[x] > (2^31)-1) {
+ *   res[x] = (2^31)-1;
+ *   OV = 1;
+ * } else if (res[x] < -2^31) {
+ *   res[x] = -2^31;
+ *   OV = 1;
+ * }
+ * Rd.W[x] = res[x];
+ * for RV32: x=0
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  t    long type of value stored in t
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_KMMAWB2(long t, unsigned long a, unsigned long b)
+{
+    __ASM volatile("kmmawb2 %0, %1, %2" : "+r"(t) : "r"(a), "r"(b)); /* "+r": t is both input and output (%0) */
+    return t; /* value left in %0 by the instruction */
+}
+/* ===== Inline Function End for 3.45.1. KMMAWB2 ===== */
+
+/* ===== Inline Function Start for 3.45.2. KMMAWB2.u ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_MSW_32X16_MAC
+ * \brief KMMAWB2.u (SIMD Saturating MSW Signed Multiply Word and Bottom Half & 2 and Add with Rounding)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KMMAWB2 Rd, Rs1, Rs2
+ * KMMAWB2.u Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the signed 32-bit elements of one register and the bottom 16-bit of the
+ * corresponding 32-bit elements of another register, double the multiplication results and add the
+ * saturated most significant 32-bit results with the corresponding signed 32-bit elements of a third
+ * register. The saturated addition result is written to the corresponding 32-bit elements of the third
+ * register. The `.u` form rounds up the multiplication results from the most significant discarded bit
+ * before the addition operations.
+ *
+ * **Description**:\n
+ * This instruction multiplies the signed 32-bit Q31 elements of Rs1 with the signed bottom 16-bit Q15
+ * content of the corresponding 32-bit elements of Rs2, doubles the Q46 results to Q47 numbers and
+ * adds the saturated most significant 32-bit Q31 multiplication results with the corresponding signed
+ * 32-bit elements of Rd. If the addition result is beyond the Q31 number range (-2^31 <= Q31 <= 2^31-1), it is
+ * saturated to the range and the OV bit is set to 1. The results after saturation are written to the
+ * corresponding 32-bit elements of Rd. The `.u` form of the instruction rounds up the most significant
+ * 32-bit of the 48-bit Q47 multiplication results by adding a 1 to bit 15 (i.e., bit 14 before doubling) of
+ * the result before the addition operations.
+ *
+ * **Operations**:\n
+ * ~~~
+ * if ((Rs1.W[x] == 0x80000000) & (Rs2.W[x].H[0] == 0x8000)) {
+ *   addop.W[x] = 0x7fffffff;
+ *   OV = 1;
+ * } else {
+ *   Mres[x][47:0] = Rs1.W[x] s* Rs2.W[x].H[0];
+ *   if (`.u` form) {
+ *     Mres[x][47:14] = Mres[x][47:14] + 1;
+ *   }
+ *   addop.W[x] = Mres[x][46:15]; // doubling
+ * }
+ * res[x] = Rd.W[x] + addop.W[x];
+ * if (res[x] > (2^31)-1) {
+ *   res[x] = (2^31)-1;
+ *   OV = 1;
+ * } else if (res[x] < -2^31) {
+ *   res[x] = -2^31;
+ *   OV = 1;
+ * }
+ * Rd.W[x] = res[x];
+ * for RV32: x=0
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  t    long type of value stored in t
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_KMMAWB2_U(long t, unsigned long a, unsigned long b)
+{
+    __ASM volatile("kmmawb2.u %0, %1, %2" : "+r"(t) : "r"(a), "r"(b)); /* "+r": t is both input and output (%0) */
+    return t; /* value left in %0 by the instruction */
+}
+/* ===== Inline Function End for 3.45.2. KMMAWB2.u ===== */
+
+/* ===== Inline Function Start for 3.46.1. KMMAWT ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_MSW_32X16_MAC
+ * \brief KMMAWT (SIMD Saturating MSW Signed Multiply Word and Top Half and Add)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KMMAWT Rd, Rs1, Rs2
+ * KMMAWT.u Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the signed 32-bit integer elements of one register and the signed top 16-bit of the
+ * corresponding 32-bit elements of another register and add the most significant 32-bit results with
+ * the corresponding signed 32-bit elements of a third register. The addition results are written to the
+ * corresponding 32-bit elements of the third register. The `.u` form rounds up the multiplication
+ * results from the most significant discarded bit before the addition operations.
+ *
+ * **Description**:\n
+ * This instruction multiplies the signed 32-bit elements of Rs1 with the signed top 16-bit of the
+ * corresponding 32-bit elements of Rs2 and adds the most significant 32-bit multiplication results
+ * with the corresponding signed 32-bit elements of Rd. If the addition result is beyond the Q31
+ * number range (-2^31 <= Q31 <= 2^31-1), it is saturated to the range and the OV bit is set to 1. The results
+ * after saturation are written to the corresponding 32-bit elements of Rd. The `.u` form of the
+ * instruction rounds up the most significant 32-bit of the 48-bit multiplication results by adding a 1 to
+ * bit 15 of the result before the addition operations.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Mres[x][47:0] = Rs1.W[x] * Rs2.W[x].H[1];
+ * if (`.u` form) {
+ *   Round[x][32:0] = Mres[x][47:15] + 1;
+ *   res[x] = Rd.W[x] + Round[x][32:1];
+ * } else {
+ *   res[x] = Rd.W[x] + Mres[x][47:16];
+ * }
+ * if (res[x] > (2^31)-1) {
+ *   res[x] = (2^31)-1;
+ *   OV = 1;
+ * } else if (res[x] < -2^31) {
+ *   res[x] = -2^31;
+ *   OV = 1;
+ * }
+ * Rd.W[x] = res[x];
+ * for RV32: x=0
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  t    long type of value stored in t
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_KMMAWT(long t, unsigned long a, unsigned long b)
+{
+    __ASM volatile("kmmawt %0, %1, %2" : "+r"(t) : "r"(a), "r"(b)); /* "+r": t is both input and output (%0) */
+    return t; /* value left in %0 by the instruction */
+}
+/* ===== Inline Function End for 3.46.1. KMMAWT ===== */
+
+/* ===== Inline Function Start for 3.46.2. KMMAWT.u ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_MSW_32X16_MAC
+ * \brief KMMAWT.u (SIMD Saturating MSW Signed Multiply Word and Top Half and Add with Rounding)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KMMAWT Rd, Rs1, Rs2
+ * KMMAWT.u Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the signed 32-bit integer elements of one register and the signed top 16-bit of the
+ * corresponding 32-bit elements of another register and add the most significant 32-bit results with
+ * the corresponding signed 32-bit elements of a third register. The addition results are written to the
+ * corresponding 32-bit elements of the third register. The `.u` form rounds up the multiplication
+ * results from the most significant discarded bit before the addition operations.
+ *
+ * **Description**:\n
+ * This instruction multiplies the signed 32-bit elements of Rs1 with the signed top 16-bit of the
+ * corresponding 32-bit elements of Rs2 and adds the most significant 32-bit multiplication results
+ * with the corresponding signed 32-bit elements of Rd. If the addition result is beyond the Q31
+ * number range (-2^31 <= Q31 <= 2^31-1), it is saturated to the range and the OV bit is set to 1. The results
+ * after saturation are written to the corresponding 32-bit elements of Rd. The `.u` form of the
+ * instruction rounds up the most significant 32-bit of the 48-bit multiplication results by adding a 1 to
+ * bit 15 of the result before the addition operations.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Mres[x][47:0] = Rs1.W[x] * Rs2.W[x].H[1];
+ * if (`.u` form) {
+ *   Round[x][32:0] = Mres[x][47:15] + 1;
+ *   res[x] = Rd.W[x] + Round[x][32:1];
+ * } else {
+ *   res[x] = Rd.W[x] + Mres[x][47:16];
+ * }
+ * if (res[x] > (2^31)-1) {
+ *   res[x] = (2^31)-1;
+ *   OV = 1;
+ * } else if (res[x] < -2^31) {
+ *   res[x] = -2^31;
+ *   OV = 1;
+ * }
+ * Rd.W[x] = res[x];
+ * for RV32: x=0
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  t    long type of value stored in t
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_KMMAWT_U(long t, unsigned long a, unsigned long b)
+{
+    __ASM volatile("kmmawt.u %0, %1, %2" : "+r"(t) : "r"(a), "r"(b)); /* "+r": t is both input and output (%0) */
+    return t; /* value left in %0 by the instruction */
+}
+/* ===== Inline Function End for 3.46.2. KMMAWT.u ===== */
+
+/* ===== Inline Function Start for 3.47.1. KMMAWT2 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_MSW_32X16_MAC
+ * \brief KMMAWT2 (SIMD Saturating MSW Signed Multiply Word and Top Half & 2 and Add)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KMMAWT2 Rd, Rs1, Rs2
+ * KMMAWT2.u Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the signed 32-bit elements of one register and the top 16-bit of the
+ * corresponding 32-bit elements of another register, double the multiplication results and add the
+ * saturated most significant 32-bit results with the corresponding signed 32-bit elements of a third
+ * register. The saturated addition result is written to the corresponding 32-bit elements of the third
+ * register. The `.u` form rounds up the multiplication results from the most significant discarded bit
+ * before the addition operations.
+ *
+ * **Description**:\n
+ * This instruction multiplies the signed 32-bit Q31 elements of Rs1 with the signed top 16-bit Q15
+ * content of the corresponding 32-bit elements of Rs2, doubles the Q46 results to Q47 numbers and
+ * adds the saturated most significant 32-bit Q31 multiplication results with the corresponding signed
+ * 32-bit elements of Rd. If the addition result is beyond the Q31 number range (-2^31 <= Q31 <= 2^31-1), it is
+ * saturated to the range and the OV bit is set to 1. The results after saturation are written to the
+ * corresponding 32-bit elements of Rd. The `.u` form of the instruction rounds up the most significant
+ * 32-bit of the 48-bit Q47 multiplication results by adding a 1 to bit 15 (i.e., bit 14 before doubling) of
+ * the result before the addition operations.
+ *
+ * **Operations**:\n
+ * ~~~
+ * if ((Rs1.W[x] == 0x80000000) & (Rs2.W[x].H[1] == 0x8000)) {
+ *   addop.W[x] = 0x7fffffff;
+ *   OV = 1;
+ * } else {
+ *   Mres[x][47:0] = Rs1.W[x] s* Rs2.W[x].H[1];
+ *   if (`.u` form) {
+ *     Mres[x][47:14] = Mres[x][47:14] + 1;
+ *   }
+ *   addop.W[x] = Mres[x][46:15]; // doubling
+ * }
+ * res[x] = Rd.W[x] + addop.W[x];
+ * if (res[x] > (2^31)-1) {
+ *   res[x] = (2^31)-1;
+ *   OV = 1;
+ * } else if (res[x] < -2^31) {
+ *   res[x] = -2^31;
+ *   OV = 1;
+ * }
+ * Rd.W[x] = res[x];
+ * for RV32: x=0
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  t    long type of value stored in t
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_KMMAWT2(long t, unsigned long a, unsigned long b)
+{
+    __ASM volatile("kmmawt2 %0, %1, %2" : "+r"(t) : "r"(a), "r"(b)); /* "+r": t is both input and output (%0) */
+    return t; /* value left in %0 by the instruction */
+}
+/* ===== Inline Function End for 3.47.1. KMMAWT2 ===== */
+
+/* ===== Inline Function Start for 3.47.2. KMMAWT2.u ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_MSW_32X16_MAC
+ * \brief KMMAWT2.u (SIMD Saturating MSW Signed Multiply Word and Top Half & 2 and Add with Rounding)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KMMAWT2 Rd, Rs1, Rs2
+ * KMMAWT2.u Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the signed 32-bit elements of one register and the top 16-bit of the
+ * corresponding 32-bit elements of another register, double the multiplication results and add the
+ * saturated most significant 32-bit results with the corresponding signed 32-bit elements of a third
+ * register. The saturated addition result is written to the corresponding 32-bit elements of the third
+ * register. The `.u` form rounds up the multiplication results from the most significant discarded bit
+ * before the addition operations.
+ *
+ * **Description**:\n
+ * This instruction multiplies the signed 32-bit Q31 elements of Rs1 with the signed top 16-bit Q15
+ * content of the corresponding 32-bit elements of Rs2, doubles the Q46 results to Q47 numbers and
+ * adds the saturated most significant 32-bit Q31 multiplication results with the corresponding signed
+ * 32-bit elements of Rd. If the addition result is beyond the Q31 number range (-2^31 <= Q31 <= 2^31-1), it is
+ * saturated to the range and the OV bit is set to 1. The results after saturation are written to the
+ * corresponding 32-bit elements of Rd. The `.u` form of the instruction rounds up the most significant
+ * 32-bit of the 48-bit Q47 multiplication results by adding a 1 to bit 15 (i.e., bit 14 before doubling) of
+ * the result before the addition operations.
+ *
+ * **Operations**:\n
+ * ~~~
+ * if ((Rs1.W[x] == 0x80000000) & (Rs2.W[x].H[1] == 0x8000)) {
+ *   addop.W[x] = 0x7fffffff;
+ *   OV = 1;
+ * } else {
+ *   Mres[x][47:0] = Rs1.W[x] s* Rs2.W[x].H[1];
+ *   if (`.u` form) {
+ *     Mres[x][47:14] = Mres[x][47:14] + 1;
+ *   }
+ *   addop.W[x] = Mres[x][46:15]; // doubling
+ * }
+ * res[x] = Rd.W[x] + addop.W[x];
+ * if (res[x] > (2^31)-1) {
+ *   res[x] = (2^31)-1;
+ *   OV = 1;
+ * } else if (res[x] < -2^31) {
+ *   res[x] = -2^31;
+ *   OV = 1;
+ * }
+ * Rd.W[x] = res[x];
+ * for RV32: x=0
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  t    long type of value stored in t
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_KMMAWT2_U(long t, unsigned long a, unsigned long b)
+{
+    __ASM volatile("kmmawt2.u %0, %1, %2" : "+r"(t) : "r"(a), "r"(b)); /* "+r": t is both input and output (%0) */
+    return t; /* value left in %0 by the instruction */
+}
+/* ===== Inline Function End for 3.47.2. KMMAWT2.u ===== */
+
+/* ===== Inline Function Start for 3.48.1. KMMSB ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_MSW_32X32_MAC
+ * \brief KMMSB (SIMD Saturating MSW Signed Multiply Word and Subtract)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KMMSB Rd, Rs1, Rs2
+ * KMMSB.u Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the signed 32-bit integer elements of two registers and subtract the most
+ * significant 32-bit results from the signed 32-bit elements of a third register. The subtraction results
+ * are written to the third register. The `.u` form performs an additional rounding up operation on
+ * the multiplication results before subtracting the most significant 32-bit part of the results.
+ *
+ * **Description**:\n
+ * This instruction multiplies the signed 32-bit elements of Rs1 with the signed 32-bit elements of Rs2
+ * and subtracts the most significant 32-bit multiplication results from the signed 32-bit elements of
+ * Rd. If the subtraction result is beyond the Q31 number range (-2^31 <= Q31 <= 2^31-1), it is saturated to the
+ * range and the OV bit is set to 1. The results after saturation are written to Rd. The `.u` form of the
+ * instruction additionally rounds up the most significant 32-bit of the 64-bit multiplication results by
+ * adding a 1 to bit 31 of the results.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Mres[x][63:0] = Rs1.W[x] * Rs2.W[x];
+ * if (`.u` form) {
+ *   Round[x][32:0] = Mres[x][63:31] + 1;
+ *   res[x] = Rd.W[x] - Round[x][32:1];
+ * } else {
+ *   res[x] = Rd.W[x] - Mres[x][63:32];
+ * }
+ * if (res[x] > (2^31)-1) {
+ *   res[x] = (2^31)-1;
+ *   OV = 1;
+ * } else if (res[x] < -2^31) {
+ *   res[x] = -2^31;
+ *   OV = 1;
+ * }
+ * Rd.W[x] = res[x];
+ * for RV32: x=0
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  t    long type of value stored in t
+ * \param [in]  a    long type of value stored in a
+ * \param [in]  b    long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_KMMSB(long t, long a, long b)
+{
+    __ASM volatile("kmmsb %0, %1, %2" : "+r"(t) : "r"(a), "r"(b)); /* "+r": t is both input and output (%0) */
+    return t; /* value left in %0 by the instruction */
+}
+/* ===== Inline Function End for 3.48.1. KMMSB ===== */
+
+/* ===== Inline Function Start for 3.48.2. KMMSB.u ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_MSW_32X32_MAC
+ * \brief KMMSB.u (SIMD Saturating MSW Signed Multiply Word and Subtract with Rounding)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KMMSB Rd, Rs1, Rs2
+ * KMMSB.u Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the signed 32-bit integer elements of two registers and subtract the most
+ * significant 32-bit results from the signed 32-bit elements of a third register. The subtraction results
+ * are written to the third register. The `.u` form performs an additional rounding up operation on
+ * the multiplication results before subtracting the most significant 32-bit part of the results.
+ *
+ * **Description**:\n
+ * This instruction multiplies the signed 32-bit elements of Rs1 with the signed 32-bit elements of Rs2
+ * and subtracts the most significant 32-bit multiplication results from the signed 32-bit elements of
+ * Rd. If the subtraction result is beyond the Q31 number range (-2^31 <= Q31 <= 2^31-1), it is saturated to the
+ * range and the OV bit is set to 1. The results after saturation are written to Rd. The `.u` form of the
+ * instruction additionally rounds up the most significant 32-bit of the 64-bit multiplication results by
+ * adding a 1 to bit 31 of the results.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Mres[x][63:0] = Rs1.W[x] * Rs2.W[x];
+ * if (`.u` form) {
+ *   Round[x][32:0] = Mres[x][63:31] + 1;
+ *   res[x] = Rd.W[x] - Round[x][32:1];
+ * } else {
+ *   res[x] = Rd.W[x] - Mres[x][63:32];
+ * }
+ * if (res[x] > (2^31)-1) {
+ *   res[x] = (2^31)-1;
+ *   OV = 1;
+ * } else if (res[x] < -2^31) {
+ *   res[x] = -2^31;
+ *   OV = 1;
+ * }
+ * Rd.W[x] = res[x];
+ * for RV32: x=0
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  t    long type of value stored in t
+ * \param [in]  a    long type of value stored in a
+ * \param [in]  b    long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_KMMSB_U(long t, long a, long b)
+{
+    __ASM volatile("kmmsb.u %0, %1, %2" : "+r"(t) : "r"(a), "r"(b)); /* "+r": t is both input and output (%0) */
+    return t; /* value left in %0 by the instruction */
+}
+/* ===== Inline Function End for 3.48.2. KMMSB.u ===== */
+
+/* ===== Inline Function Start for 3.49.1. KMMWB2 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_MSW_32X16_MAC
+ * \brief KMMWB2 (SIMD Saturating MSW Signed Multiply Word and Bottom Half & 2)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KMMWB2 Rd, Rs1, Rs2
+ * KMMWB2.u Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the signed 32-bit integer elements of one register and the bottom 16-bit of the
+ * corresponding 32-bit elements of another register, double the multiplication results and write the
+ * saturated most significant 32-bit results to the corresponding 32-bit elements of a register. The `.u`
+ * form rounds up the results from the most significant discarded bit.
+ *
+ * **Description**:\n
+ * This instruction multiplies the signed 32-bit Q31 elements of Rs1 with the signed bottom 16-bit Q15
+ * content of the corresponding 32-bit elements of Rs2, doubles the Q46 results to Q47 numbers and
+ * writes the saturated most significant 32-bit Q31 multiplication results to the corresponding 32-bit
+ * elements of Rd. The `.u` form of the instruction rounds up the most significant 32-bit of the 48-bit
+ * Q47 multiplication results by adding a 1 to bit 15 (i.e., bit 14 before doubling) of the results.
+ *
+ * **Operations**:\n
+ * ~~~
+ * if ((Rs1.W[x] == 0x80000000) & (Rs2.W[x].H[0] == 0x8000)) {
+ *   Rd.W[x] = 0x7fffffff;
+ *   OV = 1;
+ * } else {
+ *   Mres[x][47:0] = Rs1.W[x] s* Rs2.W[x].H[0];
+ *   if (`.u` form) {
+ *     Round[x][32:0] = Mres[x][46:14] + 1;
+ *     Rd.W[x] = Round[x][32:1];
+ *   } else {
+ *     Rd.W[x] = Mres[x][46:15];
+ *   }
+ * }
+ * for RV32: x=0
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  a    long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_KMMWB2(long a, unsigned long b)
+{
+    long result; /* write-only asm output; plain local because the `register` specifier was removed in C++17 */
+    __ASM volatile("kmmwb2 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); /* %0 = result, %1 = a, %2 = b */
+    return result;
+}
+/* ===== Inline Function End for 3.49.1. KMMWB2 ===== */
+
+/* ===== Inline Function Start for 3.49.2. KMMWB2.u ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_MSW_32X16_MAC
+ * \brief KMMWB2.u (SIMD Saturating MSW Signed Multiply Word and Bottom Half & 2 with Rounding)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KMMWB2 Rd, Rs1, Rs2
+ * KMMWB2.u Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the signed 32-bit integer elements of one register and the bottom 16-bit of the
+ * corresponding 32-bit elements of another register, double the multiplication results and write the
+ * saturated most significant 32-bit results to the corresponding 32-bit elements of a register. The `.u`
+ * form rounds up the results from the most significant discarded bit.
+ *
+ * **Description**:\n
+ * This instruction multiplies the signed 32-bit Q31 elements of Rs1 with the signed bottom 16-bit Q15
+ * content of the corresponding 32-bit elements of Rs2, doubles the Q46 results to Q47 numbers and
+ * writes the saturated most significant 32-bit Q31 multiplication results to the corresponding 32-bit
+ * elements of Rd. The `.u` form of the instruction rounds up the most significant 32-bit of the 48-bit
+ * Q47 multiplication results by adding a 1 to bit 15 (i.e., bit 14 before doubling) of the results.
+ *
+ * **Operations**:\n
+ * ~~~
+ * if ((Rs1.W[x] == 0x80000000) & (Rs2.W[x].H[0] == 0x8000)) {
+ *   Rd.W[x] = 0x7fffffff;
+ *   OV = 1;
+ * } else {
+ *   Mres[x][47:0] = Rs1.W[x] s* Rs2.W[x].H[0];
+ *   if (`.u` form) {
+ *     Round[x][32:0] = Mres[x][46:14] + 1;
+ *     Rd.W[x] = Round[x][32:1];
+ *   } else {
+ *     Rd.W[x] = Mres[x][46:15];
+ *   }
+ * }
+ * for RV32: x=0
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  a    long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_KMMWB2_U(long a, unsigned long b)
+{
+    long result; /* write-only asm output; plain local because the `register` specifier was removed in C++17 */
+    __ASM volatile("kmmwb2.u %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); /* %0 = result, %1 = a, %2 = b */
+    return result;
+}
+/* ===== Inline Function End for 3.49.2. KMMWB2.u ===== */
+
+/* ===== Inline Function Start for 3.50.1. KMMWT2 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_MSW_32X16_MAC
+ * \brief KMMWT2 (SIMD Saturating MSW Signed Multiply Word and Top Half & 2)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KMMWT2 Rd, Rs1, Rs2
+ * KMMWT2.u Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the signed 32-bit integer elements of one register and the top 16-bit of the
+ * corresponding 32-bit elements of another register, double the multiplication results and write the
+ * saturated most significant 32-bit results to the corresponding 32-bit elements of a register. The `.u`
+ * form rounds up the results from the most significant discarded bit.
+ *
+ * **Description**:\n
+ * This instruction multiplies the signed 32-bit Q31 elements of Rs1 with the signed top 16-bit Q15
+ * content of the corresponding 32-bit elements of Rs2, doubles the Q46 results to Q47 numbers and
+ * writes the saturated most significant 32-bit Q31 multiplication results to the corresponding 32-bit
+ * elements of Rd. The `.u` form of the instruction rounds up the most significant 32-bit of the 48-bit
+ * Q47 multiplication results by adding a 1 to bit 15 (i.e., bit 14 before doubling) of the results.
+ *
+ * **Operations**:\n
+ * ~~~
+ * if ((Rs1.W[x] == 0x80000000) & (Rs2.W[x].H[1] == 0x8000)) {
+ *   Rd.W[x] = 0x7fffffff;
+ *   OV = 1;
+ * } else {
+ *   Mres[x][47:0] = Rs1.W[x] s* Rs2.W[x].H[1];
+ *   if (`.u` form) {
+ *     Round[x][32:0] = Mres[x][46:14] + 1;
+ *     Rd.W[x] = Round[x][32:1];
+ *   } else {
+ *     Rd.W[x] = Mres[x][46:15];
+ *   }
+ * }
+ * for RV32: x=0
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  a    long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_KMMWT2(long a, unsigned long b)
+{
+    long result; /* write-only asm output; plain local because the `register` specifier was removed in C++17 */
+    __ASM volatile("kmmwt2 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); /* %0 = result, %1 = a, %2 = b */
+    return result;
+}
+/* ===== Inline Function End for 3.50.1. KMMWT2 ===== */
+
+/* ===== Inline Function Start for 3.50.2. KMMWT2.u ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_MSW_32X16_MAC
+ * \brief KMMWT2.u (SIMD Saturating MSW Signed Multiply Word and Top Half & 2 with Rounding)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KMMWT2 Rd, Rs1, Rs2
+ * KMMWT2.u Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the signed 32-bit integer elements of one register and the top 16-bit of the
+ * corresponding 32-bit elements of another register, double the multiplication results and write the
+ * saturated most significant 32-bit results to the corresponding 32-bit elements of a register. The `.u`
+ * form rounds up the results from the most significant discarded bit.
+ *
+ * **Description**:\n
+ * This instruction multiplies the signed 32-bit Q31 elements of Rs1 with the signed top 16-bit Q15
+ * content of the corresponding 32-bit elements of Rs2, doubles the Q46 results to Q47 numbers and
+ * writes the saturated most significant 32-bit Q31 multiplication results to the corresponding 32-bit
+ * elements of Rd. The `.u` form of the instruction rounds up the most significant 32-bit of the 48-bit
+ * Q47 multiplication results by adding a 1 to bit 15 (i.e., bit 14 before doubling) of the results.
+ *
+ * **Operations**:\n
+ * ~~~
+ * if ((Rs1.W[x] == 0x80000000) & (Rs2.W[x].H[1] == 0x8000)) {
+ *   Rd.W[x] = 0x7fffffff;
+ *   OV = 1;
+ * } else {
+ *   Mres[x][47:0] = Rs1.W[x] s* Rs2.W[x].H[1];
+ *   if (`.u` form) {
+ *     Round[x][32:0] = Mres[x][46:14] + 1;
+ *     Rd.W[x] = Round[x][32:1];
+ *   } else {
+ *     Rd.W[x] = Mres[x][46:15];
+ *   }
+ * }
+ * for RV32: x=0
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  a    long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_KMMWT2_U(long a, unsigned long b)
+{
+    long result; /* write-only asm output; plain local because the `register` specifier was removed in C++17 */
+    __ASM volatile("kmmwt2.u %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); /* %0 = result, %1 = a, %2 = b */
+    return result;
+}
+/* ===== Inline Function End for 3.50.2. KMMWT2.u ===== */
+
+/* ===== Inline Function Start for 3.51.1. KMSDA ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_32B_ADDSUB
+ * \brief KMSDA (SIMD Saturating Signed Multiply Two Halfs & Add & Subtract)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KMSDA Rd, Rs1, Rs2
+ * KMSXDA Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do two signed 16-bit multiplications from the 32-bit elements of two registers, and then
+ * subtract the two 32-bit results from the corresponding 32-bit elements of a third register. The
+ * subtraction result may be saturated.
+ * * KMSDA: rd.W[x] - top*top - bottom*bottom (per 32-bit element)
+ * * KMSXDA: rd.W[x] - top*bottom - bottom*top (per 32-bit element)
+ *
+ * **Description**:\n
+ * For the `KMSDA` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1
+ * with the bottom 16-bit content of the 32-bit elements of Rs2 and multiplies the top 16-bit content of
+ * the 32-bit elements of Rs1 with the top 16-bit content of the 32-bit elements of Rs2.
+ * For the `KMSXDA` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1
+ * with the top 16-bit content of the 32-bit elements of Rs2 and multiplies the top 16-bit content of the
+ * 32-bit elements of Rs1 with the bottom 16-bit content of the 32-bit elements of Rs2.
+ * The two 32-bit multiplication results are then subtracted from the content of the corresponding 32-
+ * bit elements of Rd. If the subtraction result is beyond the Q31 number range (-2^31 <= Q31 <= 2^31-1), it is
+ * saturated to the range and the OV bit is set to 1. The results after saturation are written to Rd. The
+ * 16-bit contents are treated as signed integers.
+ *
+ * **Operations**:\n
+ * ~~~
+ * // KMSDA
+ * res[x] = Rd.W[x] - (Rs1.W[x].H[1] * Rs2.W[x].H[1]) - (Rs1.W[x].H[0] * Rs2.W[x].H[0]);
+ * // KMSXDA
+ * res[x] = Rd.W[x] - (Rs1.W[x].H[1] * Rs2.W[x].H[0]) - (Rs1.W[x].H[0] * Rs2.W[x].H[1]);
+ * if (res[x] > (2^31)-1) {
+ *   res[x] = (2^31)-1;
+ *   OV = 1;
+ * } else if (res[x] < -2^31) {
+ *   res[x] = -2^31;
+ *   OV = 1;
+ * }
+ * Rd.W[x] = res[x];
+ * for RV32: x=0
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  t    long type of value stored in t
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_KMSDA(long t, unsigned long a, unsigned long b)
+{
+    __ASM volatile("kmsda %0, %1, %2" : "+r"(t) : "r"(a), "r"(b)); /* "+r": t is read as Rd and overwritten in place */
+    return t; /* accumulator as left by the instruction */
+}
+/* ===== Inline Function End for 3.51.1. KMSDA ===== */
+
+/* ===== Inline Function Start for 3.51.2. KMSXDA ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_32B_ADDSUB
+ * \brief KMSXDA (SIMD Saturating Signed Crossed Multiply Two Halfs & Add & Subtract)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KMSDA Rd, Rs1, Rs2
+ * KMSXDA Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do two signed 16-bit multiplications from the 32-bit elements of two registers; and then
+ * subtracts the two 32-bit results from the corresponding 32-bit elements of a third register. The
+ * subtraction result may be saturated.
+ * * KMSDA: rd.W[x] - top*top - bottom*bottom (per 32-bit element)
+ * * KMSXDA: rd.W[x] - top*bottom - bottom*top (per 32-bit element)
+ *
+ * **Description**:\n
+ * For the `KMSDA` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1
+ * with the bottom 16-bit content of the 32-bit elements of Rs2 and multiplies the top 16-bit content of
+ * the 32-bit elements of Rs1 with the top 16-bit content of the 32-bit elements of Rs2.
+ * For the `KMSXDA` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1
+ * with the top 16-bit content of the 32-bit elements of Rs2 and multiplies the top 16-bit content of the
+ * 32-bit elements of Rs1 with the bottom 16-bit content of the 32-bit elements of Rs2.
+ * The two 32-bit multiplication results are then subtracted from the content of the corresponding 32-
+ * bit elements of Rd. If the subtraction result is beyond the Q31 number range (-2^31 <= Q31 <= 2^31-1), it is
+ * saturated to the range and the OV bit is set to 1. The results after saturation are written to Rd. The
+ * 16-bit contents are treated as signed integers.
+ *
+ * **Operations**:\n
+ * ~~~
+ * // KMSDA
+ * res[x] = Rd.W[x] - (Rs1.W[x].H[1] * Rs2.W[x].H[1]) - (Rs1.W[x].H[0] * Rs2.W[x].H[0]);
+ * // KMSXDA
+ * res[x] = Rd.W[x] - (Rs1.W[x].H[1] * Rs2.W[x].H[0]) - (Rs1.W[x].H[0] * Rs2.W[x].H[1]);
+ * if (res[x] > (2^31)-1) {
+ *   res[x] = (2^31)-1;
+ *   OV = 1;
+ * } else if (res[x] < -2^31) {
+ *   res[x] = -2^31;
+ *   OV = 1;
+ * }
+ * Rd.W[x] = res[x];
+ * for RV32: x=0
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  t    long type of value stored in t
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_KMSXDA(long t, unsigned long a, unsigned long b)
+{
+    __ASM volatile("kmsxda %0, %1, %2" : "+r"(t) : "r"(a), "r"(b)); /* "+r": t is both the Rd input and the output */
+    return t; /* accumulator as left by the instruction */
+}
+/* ===== Inline Function End for 3.51.2. KMSXDA ===== */
+
+/* ===== Inline Function Start for 3.52. KMSR64 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_32B_MULT_64B_ADDSUB
+ * \brief KMSR64 (Signed Multiply and Saturating Subtract from 64-Bit Data)
+ * \details
+ * **Type**: DSP (64-bit Profile)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KMSR64 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the 32-bit signed elements in two registers and subtract the 64-bit multiplication
+ * results from the 64-bit signed data of a pair of registers (RV32) or a register (RV64). The result is
+ * saturated to the Q63 range and written back to the pair of registers (RV32) or the register (RV64).
+ *
+ * **RV32 Description**:\n
+ * This instruction multiplies the 32-bit signed data of Rs1 with that of Rs2. It
+ * subtracts the 64-bit multiplication result from the 64-bit signed data of an even/odd pair of registers
+ * specified by Rd(4,1) with unlimited precision. If the 64-bit subtraction result is beyond the Q63
+ * number range (-2^63 <= Q63 <= 2^63-1), it is saturated to the range and the OV bit is set to 1. The saturated
+ * result is written back to the even/odd pair of registers specified by Rd(4,1).
+ * Rx(4,1), i.e., d, determines the even/odd pair group of two registers. Specifically, the register pair
+ * includes register 2d and 2d+1.
+ * The odd `2d+1` register of the pair contains the high 32-bit of the result and the even `2d` register
+ * of the pair contains the low 32-bit of the result.
+ *
+ * **RV64 Description**:\n
+ * This instruction multiplies the 32-bit signed elements of Rs1 with that of Rs2. It
+ * subtracts the 64-bit multiplication results from the 64-bit signed data in Rd with unlimited
+ * precision. If the 64-bit subtraction result is beyond the Q63 number range (-2^63 <= Q63 <= 2^63-1), it is
+ * saturated to the range and the OV bit is set to 1. The saturated result is written back to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * RV32:
+ * t_L = CONCAT(Rd(4,1),1'b0); t_H = CONCAT(Rd(4,1),1'b1);
+ * result = R[t_H].R[t_L] - (Rs1 * Rs2);
+ * if (result > (2^63)-1) {
+ *   result = (2^63)-1; OV = 1;
+ * } else if (result < -2^63) {
+ *   result = -2^63; OV = 1;
+ * }
+ * R[t_H].R[t_L] = result;
+ * RV64:
+ * // `result` has unlimited precision
+ * result = Rd - (Rs1.W[0] * Rs2.W[0]) - (Rs1.W[1] * Rs2.W[1]);
+ * if (result > (2^63)-1) {
+ *   result = (2^63)-1; OV = 1;
+ * } else if (result < -2^63) {
+ *   result = -2^63; OV = 1;
+ * }
+ * Rd = result;
+ * ~~~
+ *
+ * \param [in]  t    long long type of value stored in t
+ * \param [in]  a    long type of value stored in a
+ * \param [in]  b    long type of value stored in b
+ * \return value stored in long long type
+ */
+__STATIC_FORCEINLINE long long __RV_KMSR64(long long t, long a, long b)
+{
+    __ASM volatile("kmsr64 %0, %1, %2" : "+r"(t) : "r"(a), "r"(b)); /* "+r": the 64-bit t is read and rewritten in place */
+    return t; /* 64-bit accumulator as left by the instruction */
+}
+/* ===== Inline Function End for 3.52. KMSR64 ===== */
+
+/* ===== Inline Function Start for 3.53. KSLLW ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_Q31_SAT_ALU
+ * \brief KSLLW (Saturating Shift Left Logical for Word)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KSLLW Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do logical left shift operation with saturation on a 32-bit word. The shift amount is a
+ * variable from a GPR.
+ *
+ * **Description**:\n
+ * The first word data in Rs1 is left-shifted logically. The shifted out bits are filled with
+ * zero and the shift amount is specified by the low-order 5-bits of the value in the Rs2 register. Any
+ * shifted value greater than 2^31-1 is saturated to 2^31-1. Any shifted value smaller than -2^31 is saturated
+ * to -2^31. And the saturated result is sign-extended and written to Rd. If any saturation is performed,
+ * set OV bit to 1.
+ *
+ * **Operations**:\n
+ * ~~~
+ * sa = Rs2[4:0];
+ * res[(31+sa):0] = Rs1.W[0] << sa;
+ * if (res > (2^31)-1) {
+ *   res = 0x7fffffff; OV = 1;
+ * } else if (res < -2^31) {
+ *   res = 0x80000000; OV = 1;
+ * }
+ * Rd[31:0] = res[31:0]; // RV32
+ * Rd[63:0] = SE(res[31:0]); // RV64
+ * ~~~
+ *
+ * \param [in]  a    long type of value stored in a
+ * \param [in]  b    unsigned int type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_KSLLW(long a, unsigned int b)
+{
+    register long rd; /* write-only destination operand (%0) */
+    __ASM volatile("ksllw %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b)); /* %1 = a (Rs1), %2 = b (Rs2, shift amount) */
+    return rd;
+}
+/* ===== Inline Function End for 3.53. KSLLW ===== */
+
+/* ===== Inline Function Start for 3.54. KSLLIW ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_Q31_SAT_ALU
+ * \brief KSLLIW (Saturating Shift Left Logical Immediate for Word)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KSLLIW Rd, Rs1, imm5u
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do logical left shift operation with saturation on a 32-bit word. The shift amount is an
+ * immediate value.
+ *
+ * **Description**:\n
+ * The first word data in Rs1 is left-shifted logically. The shifted out bits are filled with
+ * zero and the shift amount is specified by the imm5u constant. Any shifted value greater than 2^31-1 is
+ * saturated to 2^31-1. Any shifted value smaller than -2^31 is saturated to -2^31. And the saturated result is
+ * sign-extended and written to Rd. If any saturation is performed, set OV bit to 1.
+ *
+ * **Operations**:\n
+ * ~~~
+ * sa = imm5u;
+ * res[(31+sa):0] = Rs1.W[0] << sa;
+ * if (res > (2^31)-1) {
+ *   res = 0x7fffffff; OV = 1;
+ * } else if (res < -2^31) {
+ *   res = 0x80000000; OV = 1;
+ * }
+ * Rd[31:0] = res[31:0]; // RV32
+ * Rd[63:0] = SE(res[31:0]); // RV64
+ * ~~~
+ *
+ * \param [in]  a    long type of value stored in a
+ * \param [in]  b    unsigned int type of value stored in b
+ * \return value stored in long type
+ */
+#define __RV_KSLLIW(a, b)    \
+    ({    \
+        register long __kslliw_rs1 = (long)(a); /* capture `a` once, before any other temporary is in scope */    \
+        register long __kslliw_rd; /* macro-specific name: cannot shadow a caller's `result` */    \
+        __ASM volatile("kslliw %0, %1, %2" : "=r"(__kslliw_rd) : "r"(__kslliw_rs1), "K"(b)); /* b: constant, imm5u */    \
+        __kslliw_rd; /* value of the statement expression */    \
+    })
+/* ===== Inline Function End for 3.54. KSLLIW ===== */
+
+/* ===== Inline Function Start for 3.55. KSLL8 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_SHIFT
+ * \brief KSLL8 (SIMD 8-bit Saturating Shift Left Logical)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KSLL8 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 8-bit elements logical left shift operations with saturation simultaneously. The shift
+ * amount is a variable from a GPR.
+ *
+ * **Description**:\n
+ * The 8-bit data elements in Rs1 are left-shifted logically. The shifted out bits are filled
+ * with zero and the shift amount is specified by the low-order 3-bits of the value in the Rs2 register.
+ * Any shifted value greater than 2^7-1 is saturated to 2^7-1. Any shifted value smaller than -2^7 is
+ * saturated to -2^7. And the saturated results are written to Rd. If any saturation is performed, set OV
+ * bit to 1.
+ *
+ * **Operations**:\n
+ * ~~~
+ * sa = Rs2[2:0];
+ * if (sa != 0) {
+ *   res[(7+sa):0] = Rs1.B[x] << sa;
+ *   if (res > (2^7)-1) {
+ *     res = 0x7f; OV = 1;
+ *   } else if (res < -2^7) {
+ *     res = 0x80; OV = 1;
+ *   }
+ *   Rd.B[x] = res[7:0];
+ * } else {
+ *   Rd = Rs1;
+ * }
+ * for RV32: x=3...0,
+ * for RV64: x=7...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned int type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_KSLL8(unsigned long a, unsigned int b)
+{
+    register unsigned long rd; /* write-only destination operand (%0) */
+    __ASM volatile("ksll8 %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b)); /* %1 = a (Rs1), %2 = b (Rs2, shift amount) */
+    return rd;
+}
+/* ===== Inline Function End for 3.55. KSLL8 ===== */
+
+/* ===== Inline Function Start for 3.56. KSLLI8 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_SHIFT
+ * \brief KSLLI8 (SIMD 8-bit Saturating Shift Left Logical Immediate)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KSLLI8 Rd, Rs1, imm3u
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 8-bit elements logical left shift operations with saturation simultaneously. The shift
+ * amount is an immediate value.
+ *
+ * **Description**:\n
+ * The 8-bit data elements in Rs1 are left-shifted logically. The shifted out bits are filled
+ * with zero and the shift amount is specified by the imm3u constant. Any shifted value greater than
+ * 2^7-1 is saturated to 2^7-1. Any shifted value smaller than -2^7 is saturated to -2^7. And the saturated
+ * results are written to Rd. If any saturation is performed, set OV bit to 1.
+ *
+ * **Operations**:\n
+ * ~~~
+ * sa = imm3u[2:0];
+ * if (sa != 0) {
+ *   res[(7+sa):0] = Rs1.B[x] << sa;
+ *   if (res > (2^7)-1) {
+ *     res = 0x7f; OV = 1;
+ *   } else if (res < -2^7) {
+ *     res = 0x80; OV = 1;
+ *   }
+ *   Rd.B[x] = res[7:0];
+ * } else {
+ *   Rd = Rs1;
+ * }
+ * for RV32: x=3...0,
+ * for RV64: x=7...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned int type of value stored in b
+ * \return value stored in unsigned long type
+ */
+#define __RV_KSLLI8(a, b)    \
+    ({    \
+        register unsigned long __kslli8_rs1 = (unsigned long)(a); /* capture `a` once, before other temporaries */    \
+        register unsigned long __kslli8_rd; /* macro-specific name: cannot shadow a caller's `result` */    \
+        __ASM volatile("kslli8 %0, %1, %2" : "=r"(__kslli8_rd) : "r"(__kslli8_rs1), "K"(b)); /* b: constant, imm3u */    \
+        __kslli8_rd; /* value of the statement expression */    \
+    })
+/* ===== Inline Function End for 3.56. KSLLI8 ===== */
+
+/* ===== Inline Function Start for 3.57. KSLL16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_SHIFT
+ * \brief KSLL16 (SIMD 16-bit Saturating Shift Left Logical)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KSLL16 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 16-bit elements logical left shift operations with saturation simultaneously. The shift
+ * amount is a variable from a GPR.
+ *
+ * **Description**:\n
+ * The 16-bit data elements in Rs1 are left-shifted logically. The shifted out bits are filled
+ * with zero and the shift amount is specified by the low-order 4-bits of the value in the Rs2 register.
+ * Any shifted value greater than 2^15-1 is saturated to 2^15-1. Any shifted value smaller than -2^15 is
+ * saturated to -2^15. And the saturated results are written to Rd. If any saturation is performed, set OV
+ * bit to 1.
+ *
+ * **Operations**:\n
+ * ~~~
+ * sa = Rs2[3:0];
+ * if (sa != 0) {
+ *   res[(15+sa):0] = Rs1.H[x] << sa;
+ *   if (res > (2^15)-1) {
+ *     res = 0x7fff; OV = 1;
+ *   } else if (res < -2^15) {
+ *     res = 0x8000; OV = 1;
+ *   }
+ *   Rd.H[x] = res[15:0];
+ * } else {
+ *   Rd = Rs1;
+ * }
+ * for RV32: x=1...0,
+ * for RV64: x=3...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned int type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_KSLL16(unsigned long a, unsigned int b)
+{
+    register unsigned long rd; /* write-only destination operand (%0) */
+    __ASM volatile("ksll16 %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b)); /* %1 = a (Rs1), %2 = b (Rs2, shift amount) */
+    return rd;
+}
+/* ===== Inline Function End for 3.57. KSLL16 ===== */
+
+/* ===== Inline Function Start for 3.58. KSLLI16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_SHIFT
+ * \brief KSLLI16 (SIMD 16-bit Saturating Shift Left Logical Immediate)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KSLLI16 Rd, Rs1, imm4u
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 16-bit elements logical left shift operations with saturation simultaneously. The shift
+ * amount is an immediate value.
+ *
+ * **Description**:\n
+ * The 16-bit data elements in Rs1 are left-shifted logically. The shifted out bits are filled
+ * with zero and the shift amount is specified by the imm4u constant. Any shifted value greater than
+ * 2^15-1 is saturated to 2^15-1. Any shifted value smaller than -2^15 is saturated to -2^15. And the saturated
+ * results are written to Rd. If any saturation is performed, set OV bit to 1.
+ *
+ * **Operations**:\n
+ * ~~~
+ * sa = imm4u[3:0];
+ * if (sa != 0) {
+ *   res[(15+sa):0] = Rs1.H[x] << sa;
+ *   if (res > (2^15)-1) {
+ *     res = 0x7fff; OV = 1;
+ *   } else if (res < -2^15) {
+ *     res = 0x8000; OV = 1;
+ *   }
+ *   Rd.H[x] = res[15:0];
+ * } else {
+ *   Rd = Rs1;
+ * }
+ * for RV32: x=1...0,
+ * for RV64: x=3...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned int type of value stored in b
+ * \return value stored in unsigned long type
+ */
+#define __RV_KSLLI16(a, b)    \
+    ({    \
+        register unsigned long __kslli16_rs1 = (unsigned long)(a); /* capture `a` once, before other temporaries */    \
+        register unsigned long __kslli16_rd; /* macro-specific name: cannot shadow a caller's `result` */    \
+        __ASM volatile("kslli16 %0, %1, %2" : "=r"(__kslli16_rd) : "r"(__kslli16_rs1), "K"(b)); /* b: constant, imm4u */    \
+        __kslli16_rd; /* value of the statement expression */    \
+    })
+/* ===== Inline Function End for 3.58. KSLLI16 ===== */
+
+/* ===== Inline Function Start for 3.59.1. KSLRA8 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_SHIFT
+ * \brief KSLRA8 (SIMD 8-bit Shift Left Logical with Saturation or Shift Right Arithmetic)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KSLRA8 Rd, Rs1, Rs2
+ * KSLRA8.u Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 8-bit elements logical left (positive) or arithmetic right (negative) shift operation with
+ * Q7 saturation for the left shift. The `.u` form performs additional rounding up operations for the
+ * right shift.
+ *
+ * **Description**:\n
+ * The 8-bit data elements of Rs1 are left-shifted logically or right-shifted arithmetically
+ * based on the value of Rs2[3:0]. Rs2[3:0] is in the signed range of [-2^3, 2^3-1]. A positive Rs2[3:0] means
+ * logical left shift and a negative Rs2[3:0] means arithmetic right shift. The shift amount is the
+ * absolute value of Rs2[3:0]. However, the behavior of `Rs2[3:0]==-2^3 (0x8)` is defined to be
+ * equivalent to the behavior of `Rs2[3:0]==-(2^3-1) (0x9)`.
+ * The left-shifted results are saturated to the 8-bit signed integer range of [-2^7, 2^7-1]. For the `.u` form
+ * of the instruction, the right-shifted results are added a 1 to the most significant discarded bit
+ * position for rounding effect. After the shift, saturation, or rounding, the final results are written to
+ * Rd. If any saturation happens, this instruction sets the OV flag. The value of Rs2[31:4] will not affect
+ * this instruction.
+ *
+ * **Operations**:\n
+ * ~~~
+ * if (Rs2[3:0] < 0) {
+ *   sa = -Rs2[3:0];
+ *   sa = (sa == 8)? 7 : sa;
+ *   if (`.u` form) {
+ *     res[7:-1] = SE9(Rs1.B[x][7:sa-1]) + 1;
+ *     Rd.B[x] = res[7:0];
+ *   } else {
+ *     Rd.B[x] = SE8(Rs1.B[x][7:sa]);
+ *   }
+ * } else {
+ *   sa = Rs2[2:0];
+ *   res[(7+sa):0] = Rs1.B[x] <<(logic) sa;
+ *   if (res > (2^7)-1) {
+ *     res[7:0] = 0x7f; OV = 1;
+ *   } else if (res < -2^7) {
+ *     res[7:0] = 0x80; OV = 1;
+ *   }
+ *   Rd.B[x] = res[7:0];
+ * }
+ * for RV32: x=3...0,
+ * for RV64: x=7...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    int type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_KSLRA8(unsigned long a, int b)
+{
+    register unsigned long rd; /* write-only destination operand (%0) */
+    __ASM volatile("kslra8 %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b)); /* %1 = a (Rs1), %2 = b (Rs2, signed shift) */
+    return rd;
+}
+/* ===== Inline Function End for 3.59.1. KSLRA8 ===== */
+
+/* ===== Inline Function Start for 3.59.2. KSLRA8.u ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_SHIFT
+ * \brief KSLRA8.u (SIMD 8-bit Shift Left Logical with Saturation or Rounding Shift Right Arithmetic)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KSLRA8 Rd, Rs1, Rs2
+ * KSLRA8.u Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 8-bit elements logical left (positive) or arithmetic right (negative) shift operation with
+ * Q7 saturation for the left shift. The `.u` form performs additional rounding up operations for the
+ * right shift.
+ *
+ * **Description**:\n
+ * The 8-bit data elements of Rs1 are left-shifted logically or right-shifted arithmetically
+ * based on the value of Rs2[3:0]. Rs2[3:0] is in the signed range of [-2^3, 2^3-1]. A positive Rs2[3:0] means
+ * logical left shift and a negative Rs2[3:0] means arithmetic right shift. The shift amount is the
+ * absolute value of Rs2[3:0]. However, the behavior of `Rs2[3:0]==-2^3 (0x8)` is defined to be
+ * equivalent to the behavior of `Rs2[3:0]==-(2^3-1) (0x9)`.
+ * The left-shifted results are saturated to the 8-bit signed integer range of [-2^7, 2^7-1]. For the `.u` form
+ * of the instruction, the right-shifted results are added a 1 to the most significant discarded bit
+ * position for rounding effect. After the shift, saturation, or rounding, the final results are written to
+ * Rd. If any saturation happens, this instruction sets the OV flag. The value of Rs2[31:4] will not affect
+ * this instruction.
+ *
+ * **Operations**:\n
+ * ~~~
+ * if (Rs2[3:0] < 0) {
+ *   sa = -Rs2[3:0];
+ *   sa = (sa == 8)? 7 : sa;
+ *   if (`.u` form) {
+ *     res[7:-1] = SE9(Rs1.B[x][7:sa-1]) + 1;
+ *     Rd.B[x] = res[7:0];
+ *   } else {
+ *     Rd.B[x] = SE8(Rs1.B[x][7:sa]);
+ *   }
+ * } else {
+ *   sa = Rs2[2:0];
+ *   res[(7+sa):0] = Rs1.B[x] <<(logic) sa;
+ *   if (res > (2^7)-1) {
+ *     res[7:0] = 0x7f; OV = 1;
+ *   } else if (res < -2^7) {
+ *     res[7:0] = 0x80; OV = 1;
+ *   }
+ *   Rd.B[x] = res[7:0];
+ * }
+ * for RV32: x=3...0,
+ * for RV64: x=7...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    int type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_KSLRA8_U(unsigned long a, int b)
+{
+    register unsigned long rd; /* write-only destination operand (%0) */
+    __ASM volatile("kslra8.u %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b)); /* %1 = a (Rs1), %2 = b (Rs2, signed shift) */
+    return rd;
+}
+/* ===== Inline Function End for 3.59.2. KSLRA8.u ===== */
+
+/* ===== Inline Function Start for 3.60.1. KSLRA16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_SHIFT
+ * \brief KSLRA16 (SIMD 16-bit Shift Left Logical with Saturation or Shift Right Arithmetic)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KSLRA16 Rd, Rs1, Rs2
+ * KSLRA16.u Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 16-bit elements logical left (positive) or arithmetic right (negative) shift operation with
+ * Q15 saturation for the left shift. The `.u` form performs additional rounding up operations for the
+ * right shift.
+ *
+ * **Description**:\n
+ * The 16-bit data elements of Rs1 are left-shifted logically or right-shifted arithmetically
+ * based on the value of Rs2[4:0]. Rs2[4:0] is in the signed range of [-2^4, 2^4-1]. A positive Rs2[4:0] means
+ * logical left shift and a negative Rs2[4:0] means arithmetic right shift. The shift amount is the
+ * absolute value of Rs2[4:0]. However, the behavior of `Rs2[4:0]==-2^4 (0x10)` is defined to be
+ * equivalent to the behavior of `Rs2[4:0]==-(2^4-1) (0x11)`.
+ * The left-shifted results are saturated to the 16-bit signed integer range of [-2^15, 2^15-1]. For the `.u`
+ * form of the instruction, the right-shifted results are added a 1 to the most significant discarded bit
+ * position for rounding effect. After the shift, saturation, or rounding, the final results are written to
+ * Rd. If any saturation happens, this instruction sets the OV flag. The value of Rs2[31:5] will not affect
+ * this instruction.
+ *
+ * **Operations**:\n
+ * ~~~
+ * if (Rs2[4:0] < 0) {
+ *   sa = -Rs2[4:0];
+ *   sa = (sa == 16)? 15 : sa;
+ *   if (`.u` form) {
+ *     res[15:-1] = SE17(Rs1.H[x][15:sa-1]) + 1;
+ *     Rd.H[x] = res[15:0];
+ *   } else {
+ *     Rd.H[x] = SE16(Rs1.H[x][15:sa]);
+ *   }
+ * } else {
+ *   sa = Rs2[3:0];
+ *   res[(15+sa):0] = Rs1.H[x] <<(logic) sa;
+ *   if (res > (2^15)-1) {
+ *     res[15:0] = 0x7fff; OV = 1;
+ *   } else if (res < -2^15) {
+ *     res[15:0] = 0x8000; OV = 1;
+ *   }
+ *   Rd.H[x] = res[15:0];
+ * }
+ * for RV32: x=1...0,
+ * for RV64: x=3...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    int type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_KSLRA16(unsigned long a, int b)
+{
+    register unsigned long rd; /* write-only destination operand (%0) */
+    __ASM volatile("kslra16 %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b)); /* %1 = a (Rs1), %2 = b (Rs2, signed shift) */
+    return rd;
+}
+/* ===== Inline Function End for 3.60.1. KSLRA16 ===== */
+
+/* ===== Inline Function Start for 3.60.2. KSLRA16.u ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_SHIFT
+ * \brief KSLRA16.u (SIMD 16-bit Shift Left Logical with Saturation or Rounding Shift Right Arithmetic)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KSLRA16 Rd, Rs1, Rs2
+ * KSLRA16.u Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 16-bit elements logical left (positive) or arithmetic right (negative) shift operation with
+ * Q15 saturation for the left shift. The `.u` form performs additional rounding up operations for the
+ * right shift.
+ *
+ * **Description**:\n
+ * The 16-bit data elements of Rs1 are left-shifted logically or right-shifted arithmetically
+ * based on the value of Rs2[4:0]. Rs2[4:0] is in the signed range of [-2^4, 2^4-1]. A positive Rs2[4:0] means
+ * logical left shift and a negative Rs2[4:0] means arithmetic right shift. The shift amount is the
+ * absolute value of Rs2[4:0]. However, the behavior of `Rs2[4:0]==-2^4 (0x10)` is defined to be
+ * equivalent to the behavior of `Rs2[4:0]==-(2^4-1) (0x11)`.
+ * The left-shifted results are saturated to the 16-bit signed integer range of [-2^15, 2^15-1]. For the `.u`
+ * form of the instruction, the right-shifted results are added a 1 to the most significant discarded bit
+ * position for rounding effect. After the shift, saturation, or rounding, the final results are written to
+ * Rd. If any saturation happens, this instruction sets the OV flag. The value of Rs2[31:5] will not affect
+ * this instruction.
+ *
+ * **Operations**:\n
+ * ~~~
+ * if (Rs2[4:0] < 0) {
+ *   sa = -Rs2[4:0];
+ *   sa = (sa == 16)? 15 : sa;
+ *   if (`.u` form) {
+ *     res[15:-1] = SE17(Rs1.H[x][15:sa-1]) + 1;
+ *     Rd.H[x] = res[15:0];
+ *   } else {
+ *     Rd.H[x] = SE16(Rs1.H[x][15:sa]);
+ *   }
+ * } else {
+ *   sa = Rs2[3:0];
+ *   res[(15+sa):0] = Rs1.H[x] <<(logic) sa;
+ *   if (res > (2^15)-1) {
+ *     res[15:0] = 0x7fff; OV = 1;
+ *   } else if (res < -2^15) {
+ *     res[15:0] = 0x8000; OV = 1;
+ *   }
+ *   Rd.H[x] = res[15:0];
+ * }
+ * for RV32: x=1...0,
+ * for RV64: x=3...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    int type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_KSLRA16_U(unsigned long a, int b)
+{
+    register unsigned long rd; /* write-only destination operand (%0) */
+    __ASM volatile("kslra16.u %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b)); /* %1 = a (Rs1), %2 = b (Rs2, signed shift) */
+    return rd;
+}
+/* ===== Inline Function End for 3.60.2. KSLRA16.u ===== */
+
+/* ===== Inline Function Start for 3.61. KSLRAW ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_Q31_SAT_ALU
+ * \brief KSLRAW (Shift Left Logical with Q31 Saturation or Shift Right Arithmetic)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KSLRAW Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Perform a logical left (positive) or arithmetic right (negative) shift operation with Q31
+ * saturation for the left shift on a 32-bit data.
+ *
+ * **Description**:\n
+ * The lower 32-bit content of Rs1 is left-shifted logically or right-shifted arithmetically
+ * based on the value of Rs2[5:0]. Rs2[5:0] is in the signed range of [-2^5, 2^5-1]. A positive Rs2[5:0] means
+ * logical left shift and a negative Rs2[5:0] means arithmetic right shift. The shift amount is the
+ * absolute value of Rs2[5:0] clamped to the actual shift range of [0, 31].
+ * The left-shifted result is saturated to the 32-bit signed integer range of [-2^31, 2^31-1]. After the shift
+ * operation, the final result is bit-31 sign-extended and written to Rd. If any saturation happens, this
+ * instruction sets the OV flag. The value of Rs2[31:6] will not affect the operation of this instruction.
+ *
+ * **Operations**:\n
+ * ~~~
+ * if (Rs2[5:0] < 0) {
+ *   sa = -Rs2[5:0];
+ *   sa = (sa == 32)? 31 : sa;
+ *   res[31:0] = Rs1.W[0] >>(arith) sa;
+ * } else {
+ *   sa = Rs2[5:0];
+ *   tmp = Rs1.W[0] <<(logic) sa;
+ *   if (tmp > (2^31)-1) {
+ *     res[31:0] = (2^31)-1;
+ *     OV = 1;
+ *   } else if (tmp < -2^31) {
+ *     res[31:0] = -2^31;
+ *     OV = 1;
+ *   } else {
+ *     res[31:0] = tmp[31:0];
+ *   }
+ * }
+ * Rd = res[31:0]; // RV32
+ * Rd = SE64(res[31:0]); // RV64
+ * ~~~
+ *
+ * \param [in]  a    int type of value stored in a
+ * \param [in]  b    int type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_KSLRAW(int a, int b)
+{
+    register long rd; /* write-only destination operand (%0) */
+    __ASM volatile("kslraw %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b)); /* %1 = a (Rs1), %2 = b (Rs2, signed shift) */
+    return rd;
+}
+/* ===== Inline Function End for 3.61. KSLRAW ===== */
+
+/* ===== Inline Function Start for 3.62. KSLRAW.u ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_Q31_SAT_ALU
+ * \brief KSLRAW.u (Shift Left Logical with Q31 Saturation or Rounding Shift Right Arithmetic)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KSLRAW.u Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Perform a logical left (positive) or arithmetic right (negative) shift operation with Q31
+ * saturation for the left shift and a rounding up operation for the right shift on a 32-bit data.
+ *
+ * **Description**:\n
+ * The lower 32-bit content of Rs1 is left-shifted logically or right-shifted arithmetically
+ * based on the value of Rs2[5:0]. Rs2[5:0] is in the signed range of [-2^5, 2^5-1]. A positive Rs2[5:0] means
+ * logical left shift and a negative Rs2[5:0] means arithmetic right shift. The shift amount is the
+ * absolute value of Rs2[5:0] clamped to the actual shift range of [0, 31].
+ * The left-shifted result is saturated to the 32-bit signed integer range of [-2^31, 2^31-1]. The right-shifted
+ * result is added a 1 to the most significant discarded bit position for rounding effect. After the shift,
+ * saturation, or rounding, the final result is bit-31 sign-extended and written to Rd. If any saturation
+ * happens, this instruction sets the OV flag. The value of Rs2[31:6] will not affect the operation of this
+ * instruction.
+ *
+ * **Operations**:\n
+ * ~~~
+ * if (Rs2[5:0] < 0) {
+ *   sa = -Rs2[5:0];
+ *   sa = (sa == 32)? 31 : sa;
+ *   res[31:-1] = SE33(Rs1[31:(sa-1)]) + 1;
+ *   rst[31:0] = res[31:0];
+ * } else {
+ *   sa = Rs2[5:0];
+ *   tmp = Rs1.W[0] <<(logic) sa;
+ *   if (tmp > (2^31)-1) {
+ *     rst[31:0] = (2^31)-1;
+ *     OV = 1;
+ *   } else if (tmp < -2^31) {
+ *     rst[31:0] = -2^31;
+ *     OV = 1;
+ *   } else {
+ *     rst[31:0] = tmp[31:0];
+ *   }
+ * }
+ * Rd = rst[31:0]; // RV32
+ * Rd = SE64(rst[31:0]); // RV64
+ * ~~~
+ *
+ * \param [in]  a    int type of value stored in a
+ * \param [in]  b    int type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_KSLRAW_U(int a, int b)
+{
+    register long rd; /* write-only destination operand (%0) */
+    __ASM volatile("kslraw.u %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b)); /* %1 = a (Rs1), %2 = b (Rs2, signed shift) */
+    return rd;
+}
+/* ===== Inline Function End for 3.62. KSLRAW.u ===== */
+
+/* ===== Inline Function Start for 3.63. KSTAS16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB
+ * \brief KSTAS16 (SIMD 16-bit Signed Saturating Straight Addition & Subtraction)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KSTAS16 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 16-bit signed integer element saturating addition and 16-bit signed integer element
+ * saturating subtraction in a 32-bit chunk simultaneously. Operands are from corresponding
+ * positions in 32-bit chunks.
+ *
+ * **Description**:\n
+ * This instruction adds the 16-bit signed integer element in [31:16] of 32-bit chunks in
+ * Rs1 with the 16-bit signed integer element in [31:16] of 32-bit chunks in Rs2; at the same time, it
+ * subtracts the 16-bit signed integer element in [15:0] of 32-bit chunks in Rs2 from the 16-bit signed
+ * integer element in [15:0] of 32-bit chunks in Rs1. If any of the results are beyond the Q15 number
+ * range (-2^15 <= Q15 <= 2^15-1), they are saturated to the range and the OV bit is set to 1. The saturated
+ * results are written to [31:16] of 32-bit chunks in Rd for addition and [15:0] of 32-bit chunks in Rd for
+ * subtraction.
+ *
+ * **Operations**:\n
+ * ~~~
+ * res1 = Rs1.W[x][31:16] + Rs2.W[x][31:16];
+ * res2 = Rs1.W[x][15:0] - Rs2.W[x][15:0];
+ * for (res in [res1, res2]) {
+ *   if (res > (2^15)-1) {
+ *     res = (2^15)-1;
+ *     OV = 1;
+ *   } else if (res < -2^15) {
+ *     res = -2^15;
+ *     OV = 1;
+ *   }
+ * }
+ * Rd.W[x][31:16] = res1;
+ * Rd.W[x][15:0] = res2;
+ * for RV32, x=0
+ * for RV64, x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_KSTAS16(unsigned long a, unsigned long b)
+{
+    register unsigned long result; /* Rd: bound to asm output operand %0 */
+    __ASM volatile("kstas16 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); /* Rs1 = a (%1), Rs2 = b (%2) */
+    return result;
+}
+/* ===== Inline Function End for 3.63. KSTAS16 ===== */
+
+/* ===== Inline Function Start for 3.64. KSTSA16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB
+ * \brief KSTSA16 (SIMD 16-bit Signed Saturating Straight Subtraction & Addition)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KSTSA16 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 16-bit signed integer element saturating subtraction and 16-bit signed integer element
+ * saturating addition in a 32-bit chunk simultaneously. Operands are from corresponding positions in
+ * 32-bit chunks.
+ *
+ * **Description**:\n
+ * This instruction subtracts the 16-bit signed integer element in [31:16] of 32-bit chunks
+ * in Rs2 from the 16-bit signed integer element in [31:16] of 32-bit chunks in Rs1; at the same time, it
+ * adds the 16-bit signed integer element in [15:0] of 32-bit chunks in Rs2 with the 16-bit signed integer
+ * element in [15:0] of 32-bit chunks in Rs1. If any of the results are beyond the Q15 number range (-2^15
+ * <= Q15 <= 2^15-1), they are saturated to the range and the OV bit is set to 1. The saturated results are
+ * written to [31:16] of 32-bit chunks in Rd for subtraction and [15:0] of 32-bit chunks in Rd for
+ * addition.
+ *
+ * **Operations**:\n
+ * ~~~
+ * res1 = Rs1.W[x][31:16] - Rs2.W[x][31:16];
+ * res2 = Rs1.W[x][15:0] + Rs2.W[x][15:0];
+ * for (res in [res1, res2]) {
+ *   if (res > (2^15)-1) {
+ *     res = (2^15)-1;
+ *     OV = 1;
+ *   } else if (res < -2^15) {
+ *     res = -2^15;
+ *     OV = 1;
+ *   }
+ * }
+ * Rd.W[x][31:16] = res1;
+ * Rd.W[x][15:0] = res2;
+ * for RV32, x=0
+ * for RV64, x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_KSTSA16(unsigned long a, unsigned long b)
+{
+    register unsigned long result; /* Rd: bound to asm output operand %0 */
+    __ASM volatile("kstsa16 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); /* Rs1 = a (%1), Rs2 = b (%2) */
+    return result;
+}
+/* ===== Inline Function End for 3.64. KSTSA16 ===== */
+
+/* ===== Inline Function Start for 3.65. KSUB8 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_ADDSUB
+ * \brief KSUB8 (SIMD 8-bit Signed Saturating Subtraction)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KSUB8 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 8-bit signed elements saturating subtractions simultaneously.
+ *
+ * **Description**:\n
+ * This instruction subtracts the 8-bit signed integer elements in Rs2 from the 8-bit
+ * signed integer elements in Rs1. If any of the results are beyond the Q7 number range (-2^7 <= Q7 <=
+ * 2^7-1), they are saturated to the range and the OV bit is set to 1. The saturated results are written to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * res[x] = Rs1.B[x] - Rs2.B[x];
+ * if (res[x] > (2^7)-1) {
+ *   res[x] = (2^7)-1;
+ *   OV = 1;
+ * } else if (res[x] < -2^7) {
+ *   res[x] = -2^7;
+ *   OV = 1;
+ * }
+ * Rd.B[x] = res[x];
+ * for RV32: x=3...0,
+ * for RV64: x=7...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_KSUB8(unsigned long a, unsigned long b)
+{
+    register unsigned long result; /* Rd: bound to asm output operand %0 */
+    __ASM volatile("ksub8 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); /* Rs1 = a (%1, minuend bytes), Rs2 = b (%2, subtrahend bytes) */
+    return result;
+}
+/* ===== Inline Function End for 3.65. KSUB8 ===== */
+
+/* ===== Inline Function Start for 3.66. KSUB16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB
+ * \brief KSUB16 (SIMD 16-bit Signed Saturating Subtraction)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KSUB16 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 16-bit signed integer elements saturating subtractions simultaneously.
+ *
+ * **Description**:\n
+ * This instruction subtracts the 16-bit signed integer elements in Rs2 from the 16-bit
+ * signed integer elements in Rs1. If any of the results are beyond the Q15 number range (-2^15 <= Q15 <=
+ * 2^15-1), they are saturated to the range and the OV bit is set to 1. The saturated results are written to
+ * Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * res[x] = Rs1.H[x] - Rs2.H[x];
+ * if (res[x] > (2^15)-1) {
+ *   res[x] = (2^15)-1;
+ *   OV = 1;
+ * } else if (res[x] < -2^15) {
+ *   res[x] = -2^15;
+ *   OV = 1;
+ * }
+ * Rd.H[x] = res[x];
+ * for RV32: x=1...0,
+ * for RV64: x=3...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_KSUB16(unsigned long a, unsigned long b)
+{
+    register unsigned long result; /* Rd: bound to asm output operand %0 */
+    __ASM volatile("ksub16 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); /* Rs1 = a (%1, minuend halfwords), Rs2 = b (%2, subtrahend halfwords) */
+    return result;
+}
+/* ===== Inline Function End for 3.66. KSUB16 ===== */
+
+/* ===== Inline Function Start for 3.67. KSUB64 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_64B_ADDSUB
+ * \brief KSUB64 (64-bit Signed Saturating Subtraction)
+ * \details
+ * **Type**: DSP (64-bit Profile)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KSUB64 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Perform a 64-bit signed integer subtraction. The result is saturated to the Q63 range.
+ *
+ * **RV32 Description**:\n
+ * This instruction subtracts the 64-bit signed integer of an even/odd pair of
+ * registers specified by Rs2(4,1) from the 64-bit signed integer of an even/odd pair of registers
+ * specified by Rs1(4,1). If the 64-bit result is beyond the Q63 number range (-2^63 <= Q63 <= 2^63-1), it is
+ * saturated to the range and the OV bit is set to 1. The saturated result is then written to an even/odd
+ * pair of registers specified by Rd(4,1).
+ * Rx(4,1), i.e., d, determines the even/odd pair group of two registers. Specifically, the register pair
+ * includes register 2d and 2d+1.
+ * The odd `2d+1` register of the pair contains the high 32-bit of the operand and the even `2d`
+ * register of the pair contains the low 32-bit of the operand.
+ *
+ * **RV64 Description**:\n
+ * This instruction subtracts the 64-bit signed integer of Rs2 from the 64-bit signed
+ * integer of Rs1. If the 64-bit result is beyond the Q63 number range (-2^63 <= Q63 <= 2^63-1), it is saturated
+ * to the range and the OV bit is set to 1. The saturated result is then written to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * RV32:
+ * t_L = CONCAT(Rd(4,1),1'b0); t_H = CONCAT(Rd(4,1),1'b1);
+ * a_L = CONCAT(Rs1(4,1),1'b0); a_H = CONCAT(Rs1(4,1),1'b1);
+ * b_L = CONCAT(Rs2(4,1),1'b0); b_H = CONCAT(Rs2(4,1),1'b1);
+ * result = R[a_H].R[a_L] - R[b_H].R[b_L];
+ * if (result > (2^63)-1) {
+ *   result = (2^63)-1; OV = 1;
+ * } else if (result < -2^63) {
+ *   result = -2^63; OV = 1;
+ * }
+ * R[t_H].R[t_L] = result;
+ * RV64:
+ * result = Rs1 - Rs2;
+ * if (result > (2^63)-1) {
+ *   result = (2^63)-1; OV = 1;
+ * } else if (result < -2^63) {
+ *   result = -2^63; OV = 1;
+ * }
+ * Rd = result;
+ * ~~~
+ *
+ * \param [in]  a    long long type of value stored in a
+ * \param [in]  b    long long type of value stored in b
+ * \return value stored in long long type
+ */
+__STATIC_FORCEINLINE long long __RV_KSUB64(long long a, long long b)
+{
+    register long long result; /* Rd: bound to asm output operand %0 (64-bit value) */
+    __ASM volatile("ksub64 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); /* Rs1 = a (%1), Rs2 = b (%2). NOTE(review): on RV32 each 64-bit operand presumably needs the even/odd register pair described above - confirm the toolchain allocates it for "r" */
+    return result;
+}
+/* ===== Inline Function End for 3.67. KSUB64 ===== */
+
+/* ===== Inline Function Start for 3.68. KSUBH ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_Q15_SAT_ALU
+ * \brief KSUBH (Signed Subtraction with Q15 Saturation)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KSUBH Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Subtract the signed lower 32-bit content of two registers with Q15 saturation.
+ *
+ * **Description**:\n
+ * The signed lower 32-bit content of Rs2 is subtracted from the signed lower 32-bit
+ * content of Rs1. And the result is saturated to the 16-bit signed integer range of [-2^15, 2^15-1] and then
+ * sign-extended and written to Rd. If saturation happens, this instruction sets the OV flag.
+ *
+ * **Operations**:\n
+ * ~~~
+ * tmp = Rs1.W[0] - Rs2.W[0];
+ * if (tmp > (2^15)-1) {
+ *   res = (2^15)-1;
+ *   OV = 1;
+ * } else if (tmp < -2^15) {
+ *   res = -2^15;
+ *   OV = 1;
+ * } else {
+ *   res = tmp;
+ * }
+ * Rd = SE(res[15:0]);
+ * ~~~
+ *
+ * \param [in]  a    int type of value stored in a
+ * \param [in]  b    int type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_KSUBH(int a, int b)
+{
+    register long result; /* Rd: bound to asm output operand %0 */
+    __ASM volatile("ksubh %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); /* Rs1 = a (%1, minuend), Rs2 = b (%2, subtrahend) */
+    return result;
+}
+/* ===== Inline Function End for 3.68. KSUBH ===== */
+
+/* ===== Inline Function Start for 3.69. KSUBW ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_Q31_SAT_ALU
+ * \brief KSUBW (Signed Subtraction with Q31 Saturation)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KSUBW Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Subtract the signed lower 32-bit content of two registers with Q31 saturation.
+ *
+ * **Description**:\n
+ * The signed lower 32-bit content of Rs2 is subtracted from the signed lower 32-bit
+ * content of Rs1. And the result is saturated to the 32-bit signed integer range of [-2^31, 2^31-1] and then
+ * sign-extended and written to Rd. If saturation happens, this instruction sets the OV flag.
+ *
+ * **Operations**:\n
+ * ~~~
+ * tmp = Rs1.W[0] - Rs2.W[0];
+ * if (tmp > (2^31)-1) {
+ *   res = (2^31)-1;
+ *   OV = 1;
+ * } else if (tmp < -2^31) {
+ *   res = -2^31;
+ *   OV = 1;
+ * } else {
+ *   res = tmp;
+ * }
+ * Rd = res[31:0]; // RV32
+ * Rd = SE(res[31:0]); // RV64
+ * ~~~
+ *
+ * \param [in]  a    int type of value stored in a
+ * \param [in]  b    int type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_KSUBW(int a, int b)
+{
+    register long result; /* Rd: bound to asm output operand %0 */
+    __ASM volatile("ksubw %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); /* Rs1 = a (%1, minuend), Rs2 = b (%2, subtrahend) */
+    return result;
+}
+/* ===== Inline Function End for 3.69. KSUBW ===== */
+
+/* ===== Inline Function Start for 3.70.1. KWMMUL ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_MSW_32X32_MAC
+ * \brief KWMMUL (SIMD Saturating MSW Signed Multiply Word & Double)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KWMMUL Rd, Rs1, Rs2
+ * KWMMUL.u Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the signed 32-bit integer elements of two registers, shift the results left 1-bit,
+ * saturate, and write the most significant 32-bit results to a register. The `.u` form additionally
+ * rounds up the multiplication results from the most significant discarded bit.
+ *
+ * **Description**:\n
+ * This instruction multiplies the 32-bit elements of Rs1 with the 32-bit elements of Rs2. It then shifts
+ * the multiplication results one bit to the left and takes the most significant 32-bit results. If the
+ * shifted result is greater than 2^31-1, it is saturated to 2^31-1 and the OV flag is set to 1. The final element
+ * result is written to Rd. The 32-bit elements of Rs1 and Rs2 are treated as signed integers. The `.u`
+ * form of the instruction additionally rounds up the 64-bit multiplication results by adding a 1 to bit
+ * 30 before the shift and saturation operations.
+ *
+ * **Operations**:\n
+ * ~~~
+ * if ((0x80000000 != Rs1.W[x]) | (0x80000000 != Rs2.W[x])) {
+ *   Mres[x][63:0] = Rs1.W[x] * Rs2.W[x];
+ *   if (`.u` form) {
+ *     Round[x][33:0] = Mres[x][63:30] + 1;
+ *     Rd.W[x] = Round[x][32:1];
+ *   } else {
+ *     Rd.W[x] = Mres[x][62:31];
+ *   }
+ * } else {
+ *   Rd.W[x] = 0x7fffffff;
+ *   OV = 1;
+ * }
+ * for RV32: x=0
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  a    long type of value stored in a
+ * \param [in]  b    long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_KWMMUL(long a, long b)
+{
+    register long result; /* Rd: bound to asm output operand %0 */
+    __ASM volatile("kwmmul %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); /* non-rounding form; Rs1 = a (%1), Rs2 = b (%2) */
+    return result;
+}
+/* ===== Inline Function End for 3.70.1. KWMMUL ===== */
+
+/* ===== Inline Function Start for 3.70.2. KWMMUL.u ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_MSW_32X32_MAC
+ * \brief KWMMUL.u (SIMD Saturating MSW Signed Multiply Word & Double with Rounding)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KWMMUL Rd, Rs1, Rs2
+ * KWMMUL.u Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the signed 32-bit integer elements of two registers, shift the results left 1-bit,
+ * saturate, and write the most significant 32-bit results to a register. The `.u` form additionally
+ * rounds up the multiplication results from the most significant discarded bit.
+ *
+ * **Description**:\n
+ * This instruction multiplies the 32-bit elements of Rs1 with the 32-bit elements of Rs2. It then shifts
+ * the multiplication results one bit to the left and takes the most significant 32-bit results. If the
+ * shifted result is greater than 2^31-1, it is saturated to 2^31-1 and the OV flag is set to 1. The final element
+ * result is written to Rd. The 32-bit elements of Rs1 and Rs2 are treated as signed integers. The `.u`
+ * form of the instruction additionally rounds up the 64-bit multiplication results by adding a 1 to bit
+ * 30 before the shift and saturation operations.
+ *
+ * **Operations**:\n
+ * ~~~
+ * if ((0x80000000 != Rs1.W[x]) | (0x80000000 != Rs2.W[x])) {
+ *   Mres[x][63:0] = Rs1.W[x] * Rs2.W[x];
+ *   if (`.u` form) {
+ *     Round[x][33:0] = Mres[x][63:30] + 1;
+ *     Rd.W[x] = Round[x][32:1];
+ *   } else {
+ *     Rd.W[x] = Mres[x][62:31];
+ *   }
+ * } else {
+ *   Rd.W[x] = 0x7fffffff;
+ *   OV = 1;
+ * }
+ * for RV32: x=0
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  a    long type of value stored in a
+ * \param [in]  b    long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_KWMMUL_U(long a, long b)
+{
+    register long result; /* Rd: bound to asm output operand %0 */
+    __ASM volatile("kwmmul.u %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); /* rounding (`.u`) form; Rs1 = a (%1), Rs2 = b (%2) */
+    return result;
+}
+/* ===== Inline Function End for 3.70.2. KWMMUL.u ===== */
+
+/* ===== Inline Function Start for 3.71. MADDR32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_MISC
+ * \brief MADDR32 (Multiply and Add to 32-Bit Word)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * MADDR32 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the 32-bit contents of two registers and add the lower 32-bit multiplication result
+ * to the 32-bit content of a destination register. Write the final result back to the destination register.
+ *
+ * **Description**:\n
+ * This instruction multiplies the lower 32-bit content of Rs1 with that of Rs2. It adds the
+ * lower 32-bit multiplication result to the lower 32-bit content of Rd and writes the final result (RV32)
+ * or sign-extended result (RV64) back to Rd. The contents of Rs1 and Rs2 can be either signed or
+ * unsigned integers.
+ *
+ * **Operations**:\n
+ * ~~~
+ * RV32:
+ * Mresult = Rs1 * Rs2;
+ * Rd = Rd + Mresult.W[0];
+ * RV64:
+ * Mresult = Rs1.W[0] * Rs2.W[0];
+ * tres[31:0] = Rd.W[0] + Mresult.W[0];
+ * Rd = SE64(tres[31:0]);
+ * ~~~
+ *
+ * \param [in]  t    unsigned long type of value stored in t
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_MADDR32(unsigned long t, unsigned long a, unsigned long b)
+{
+    __ASM volatile("maddr32 %0, %1, %2" : "+r"(t) : "r"(a), "r"(b)); /* "+r": t is Rd, used as both the addend read by the instruction and the result it writes; Rs1 = a, Rs2 = b */
+    return t; /* t now holds the value the instruction wrote back */
+}
+/* ===== Inline Function End for 3.71. MADDR32 ===== */
+
+/* ===== Inline Function Start for 3.72. MAXW ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_32B_COMPUTATION
+ * \brief MAXW (32-bit Signed Word Maximum)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * MAXW Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Get the larger value from the 32-bit contents of two general registers.
+ *
+ * **Description**:\n
+ * This instruction compares two signed 32-bit integers stored in Rs1 and Rs2, picks the
+ * larger value as the result, and writes the result to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * if (Rs1.W[0] >= Rs2.W[0]) {
+ *   Rd = SE(Rs1.W[0]);
+ * } else {
+ *   Rd = SE(Rs2.W[0]);
+ * }
+ * ~~~
+ *
+ * \param [in]  a    int type of value stored in a
+ * \param [in]  b    int type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_MAXW(int a, int b)
+{
+    register long result; /* Rd: bound to asm output operand %0 */
+    __ASM volatile("maxw %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); /* Rs1 = a (%1), Rs2 = b (%2) */
+    return result;
+}
+/* ===== Inline Function End for 3.72. MAXW ===== */
+
+/* ===== Inline Function Start for 3.73. MINW ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_32B_COMPUTATION
+ * \brief MINW (32-bit Signed Word Minimum)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * MINW Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Get the smaller value from the 32-bit contents of two general registers.
+ *
+ * **Description**:\n
+ * This instruction compares two signed 32-bit integers stored in Rs1 and Rs2, picks the
+ * smaller value as the result, and writes the result to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * if (Rs1.W[0] >= Rs2.W[0]) { Rd = SE(Rs2.W[0]); } else { Rd = SE(Rs1.W[0]); }
+ * ~~~
+ *
+ * \param [in]  a    int type of value stored in a
+ * \param [in]  b    int type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_MINW(int a, int b)
+{
+    register long result; /* Rd: bound to asm output operand %0 */
+    __ASM volatile("minw %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); /* Rs1 = a (%1), Rs2 = b (%2) */
+    return result;
+}
+/* ===== Inline Function End for 3.73. MINW ===== */
+
+/* ===== Inline Function Start for 3.74. MSUBR32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_MISC
+ * \brief MSUBR32 (Multiply and Subtract from 32-Bit Word)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * MSUBR32 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the 32-bit contents of two registers and subtract the lower 32-bit multiplication
+ * result from the 32-bit content of a destination register. Write the final result back to the destination
+ * register.
+ *
+ * **Description**:\n
+ * This instruction multiplies the lower 32-bit content of Rs1 with that of Rs2, subtracts
+ * the lower 32-bit multiplication result from the lower 32-bit content of Rd, then writes the final
+ * result (RV32) or sign-extended result (RV64) back to Rd. The contents of Rs1 and Rs2 can be either
+ * signed or unsigned integers.
+ *
+ * **Operations**:\n
+ * ~~~
+ * RV32:
+ * Mresult = Rs1 * Rs2;
+ * Rd = Rd - Mresult.W[0];
+ * RV64:
+ * Mresult = Rs1.W[0] * Rs2.W[0];
+ * tres[31:0] = Rd.W[0] - Mresult.W[0];
+ * Rd = SE64(tres[31:0]);
+ * ~~~
+ *
+ * \param [in]  t    unsigned long type of value stored in t
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_MSUBR32(unsigned long t, unsigned long a, unsigned long b)
+{
+    __ASM volatile("msubr32 %0, %1, %2" : "+r"(t) : "r"(a), "r"(b)); /* "+r": t is Rd, used as both the minuend read by the instruction and the result it writes; Rs1 = a, Rs2 = b */
+    return t; /* t now holds the value the instruction wrote back */
+}
+/* ===== Inline Function End for 3.74. MSUBR32 ===== */
+
+/* ===== Inline Function Start for 3.75. MULR64 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_32B_COMPUTATION
+ * \brief MULR64 (Multiply Word Unsigned to 64-bit Data)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * MULR64 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the 32-bit unsigned integer contents of two registers and write the 64-bit result.
+ *
+ * **RV32 Description**:\n
+ * This instruction multiplies the 32-bit content of Rs1 with that of Rs2 and writes the 64-bit
+ * multiplication result to an even/odd pair of registers containing Rd. Rd(4,1) index d determines the
+ * even/odd pair group of the two registers. Specifically, the register pair includes register 2d and
+ * 2d+1.
+ * The odd `2d+1` register of the pair contains the high 32-bit of the result and the even `2d` register
+ * of the pair contains the low 32-bit of the result.
+ * The lower 32-bit contents of Rs1 and Rs2 are treated as unsigned integers.
+ *
+ * **RV64 Description**:\n
+ * This instruction multiplies the lower 32-bit content of Rs1 with that of Rs2 and writes the 64-bit
+ * multiplication result to Rd.
+ * The lower 32-bit contents of Rs1 and Rs2 are treated as unsigned integers.
+ *
+ * **Operations**:\n
+ * ~~~
+ * RV32:
+ * Mresult = CONCAT(1`b0,Rs1) u* CONCAT(1`b0,Rs2);
+ * R[Rd(4,1).1(0)][31:0] = Mresult[63:32];
+ * R[Rd(4,1).0(0)][31:0] = Mresult[31:0];
+ * RV64:
+ * Mresult = CONCAT(1`b0,Rs1.W[0]) u* CONCAT(1`b0,Rs2.W[0]);
+ * Rd = Mresult[63:0];
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long long type
+ */
+__STATIC_FORCEINLINE unsigned long long __RV_MULR64(unsigned long a, unsigned long b)
+{
+    register unsigned long long result; /* Rd: bound to asm output operand %0 (64-bit product) */
+    __ASM volatile("mulr64 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); /* Rs1 = a (%1), Rs2 = b (%2). NOTE(review): on RV32 the result presumably needs the even/odd register pair described above - confirm the toolchain allocates it for "=r" */
+    return result;
+}
+/* ===== Inline Function End for 3.75. MULR64 ===== */
+
+/* ===== Inline Function Start for 3.76. MULSR64 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_32B_COMPUTATION
+ * \brief MULSR64 (Multiply Word Signed to 64-bit Data)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * MULSR64 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the 32-bit signed integer contents of two registers and write the 64-bit result.
+ *
+ * **RV32 Description**:\n
+ * This instruction multiplies the lower 32-bit content of Rs1 with the lower 32-bit content of Rs2 and
+ * writes the 64-bit multiplication result to an even/odd pair of registers containing Rd. Rd(4,1) index d
+ * determines the even/odd pair group of the two registers. Specifically, the register pair includes
+ * register 2d and 2d+1.
+ * The odd `2d+1` register of the pair contains the high 32-bit of the result and the even `2d` register
+ * of the pair contains the low 32-bit of the result.
+ * The lower 32-bit contents of Rs1 and Rs2 are treated as signed integers.
+ *
+ * **RV64 Description**:\n
+ * This instruction multiplies the lower 32-bit content of Rs1 with the lower 32-bit content of Rs2 and
+ * writes the 64-bit multiplication result to Rd.
+ * The lower 32-bit contents of Rs1 and Rs2 are treated as signed integers.
+ *
+ * **Operations**:\n
+ * ~~~
+ * RV32:
+ * Mresult = Rs1 s* Rs2;
+ * R[Rd(4,1).1(0)][31:0] = Mresult[63:32];
+ * R[Rd(4,1).0(0)][31:0] = Mresult[31:0];
+ * RV64:
+ * Mresult = Rs1.W[0] s* Rs2.W[0];
+ * Rd = Mresult[63:0];
+ * ~~~
+ *
+ * \param [in]  a    long type of value stored in a
+ * \param [in]  b    long type of value stored in b
+ * \return value stored in long long type
+ */
+__STATIC_FORCEINLINE long long __RV_MULSR64(long a, long b)
+{
+    register long long result; /* Rd: bound to asm output operand %0 (64-bit product) */
+    __ASM volatile("mulsr64 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); /* Rs1 = a (%1), Rs2 = b (%2). NOTE(review): on RV32 the result presumably needs the even/odd register pair described above - confirm the toolchain allocates it for "=r" */
+    return result;
+}
+/* ===== Inline Function End for 3.76. MULSR64 ===== */
+
+/* ===== Inline Function Start for 3.77. PBSAD ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_PART_SIMD_MISC
+ * \brief PBSAD (Parallel Byte Sum of Absolute Difference)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * PBSAD Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Calculate the sum of absolute difference of unsigned 8-bit data elements.
+ *
+ * **Description**:\n
+ * This instruction subtracts the unsigned 8-bit elements of Rs2 from those of Rs1. Then
+ * it adds the absolute value of each difference together and writes the result to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * absdiff[x] = ABS(Rs1.B[x] - Rs2.B[x]);
+ * Rd = SUM(absdiff[x]);
+ * for RV32: x=3...0,
+ * for RV64: x=7...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_PBSAD(unsigned long a, unsigned long b)
+{
+    register unsigned long result; /* Rd: bound to asm output operand %0 */
+    __ASM volatile("pbsad %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); /* Rs1 = a (%1), Rs2 = b (%2) */
+    return result;
+}
+/* ===== Inline Function End for 3.77. PBSAD ===== */
+
+/* ===== Inline Function Start for 3.78. PBSADA ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_PART_SIMD_MISC
+ * \brief PBSADA (Parallel Byte Sum of Absolute Difference Accum)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * PBSADA Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Calculate the sum of absolute difference of four unsigned 8-bit data elements and
+ * accumulate it into a register.
+ *
+ * **Description**:\n
+ * This instruction subtracts the unsigned 8-bit elements of Rs2 from those of Rs1. It
+ * then adds the absolute value of each difference together along with the content of Rd and writes the
+ * accumulated result back to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * absdiff[x] = ABS(Rs1.B[x] - Rs2.B[x]);
+ * Rd = Rd + SUM(absdiff[x]);
+ * for RV32: x=3...0,
+ * for RV64: x=7...0
+ * ~~~
+ *
+ * \param [in]  t    unsigned long type of value stored in t
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_PBSADA(unsigned long t, unsigned long a, unsigned long b)
+{
+    __ASM volatile("pbsada %0, %1, %2" : "+r"(t) : "r"(a), "r"(b)); /* "+r": t is Rd, used as both the accumulator read by the instruction and the result it writes; Rs1 = a, Rs2 = b */
+    return t; /* t now holds the value the instruction wrote back */
+}
+/* ===== Inline Function End for 3.78. PBSADA ===== */
+
+/* ===== Inline Function Start for 3.79.1. PKBB16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_PACK
+ * \brief PKBB16 (Pack Two 16-bit Data from Both Bottom Half)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * PKBB16 Rd, Rs1, Rs2
+ * PKBT16 Rd, Rs1, Rs2
+ * PKTT16 Rd, Rs1, Rs2
+ * PKTB16 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Pack 16-bit data from 32-bit chunks in two registers.
+ * * PKBB16: bottom.bottom
+ * * PKBT16: bottom.top
+ * * PKTT16: top.top
+ * * PKTB16: top.bottom
+ *
+ * **Description**:\n
+ * (PKBB16) moves Rs1.W[x][15:0] to Rd.W[x][31:16] and moves Rs2.W[x] [15:0] to
+ * Rd.W[x] [15:0].
+ * (PKBT16) moves Rs1.W[x] [15:0] to Rd.W[x] [31:16] and moves Rs2.W[x] [31:16] to Rd.W[x] [15:0].
+ * (PKTT16) moves Rs1.W[x] [31:16] to Rd.W[x] [31:16] and moves Rs2.W[x] [31:16] to Rd.W[x] [15:0].
+ * (PKTB16) moves Rs1.W[x] [31:16] to Rd.W[x] [31:16] and moves Rs2.W[x] [15:0] to Rd.W[x] [15:0].
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.W[x][31:0] = CONCAT(Rs1.W[x][15:0], Rs2.W[x][15:0]); // PKBB16
+ * Rd.W[x][31:0] = CONCAT(Rs1.W[x][15:0], Rs2.W[x][31:16]); // PKBT16
+ * Rd.W[x][31:0] = CONCAT(Rs1.W[x][31:16], Rs2.W[x][15:0]); // PKTB16
+ * Rd.W[x][31:0] = CONCAT(Rs1.W[x][31:16], Rs2.W[x][31:16]); // PKTT16
+ * for RV32: x=0,
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_PKBB16(unsigned long a, unsigned long b)
+{
+    register unsigned long result; /* Rd: bound to asm output operand %0 */
+    __ASM volatile("pkbb16 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); /* Rs1 = a (%1), Rs2 = b (%2) */
+    return result;
+}
+/* ===== Inline Function End for 3.79.1. PKBB16 ===== */
+
+/* ===== Inline Function Start for 3.79.2. PKBT16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_PACK
+ * \brief PKBT16 (Pack Two 16-bit Data from Bottom and Top Half)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * PKBB16 Rd, Rs1, Rs2
+ * PKBT16 Rd, Rs1, Rs2
+ * PKTT16 Rd, Rs1, Rs2
+ * PKTB16 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Pack 16-bit data from 32-bit chunks in two registers.
+ * * PKBB16: bottom.bottom
+ * * PKBT16: bottom.top
+ * * PKTT16: top.top
+ * * PKTB16: top.bottom
+ *
+ * **Description**:\n
+ * (PKBB16) moves Rs1.W[x][15:0] to Rd.W[x][31:16] and moves Rs2.W[x] [15:0] to
+ * Rd.W[x] [15:0].
+ * (PKBT16) moves Rs1.W[x] [15:0] to Rd.W[x] [31:16] and moves Rs2.W[x] [31:16] to Rd.W[x] [15:0].
+ * (PKTT16) moves Rs1.W[x] [31:16] to Rd.W[x] [31:16] and moves Rs2.W[x] [31:16] to Rd.W[x] [15:0].
+ * (PKTB16) moves Rs1.W[x] [31:16] to Rd.W[x] [31:16] and moves Rs2.W[x] [15:0] to Rd.W[x] [15:0].
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.W[x][31:0] = CONCAT(Rs1.W[x][15:0], Rs2.W[x][15:0]); // PKBB16
+ * Rd.W[x][31:0] = CONCAT(Rs1.W[x][15:0], Rs2.W[x][31:16]); // PKBT16
+ * Rd.W[x][31:0] = CONCAT(Rs1.W[x][31:16], Rs2.W[x][15:0]); // PKTB16
+ * Rd.W[x][31:0] = CONCAT(Rs1.W[x][31:16], Rs2.W[x][31:16]); // PKTT16
+ * for RV32: x=0,
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_PKBT16(unsigned long a, unsigned long b)
+{
+    register unsigned long result; /* Rd: bound to asm output operand %0 */
+    __ASM volatile("pkbt16 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); /* Rs1 = a (%1), Rs2 = b (%2) */
+    return result;
+}
+/* ===== Inline Function End for 3.79.2. PKBT16 ===== */
+
+/* ===== Inline Function Start for 3.79.3. PKTT16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_PACK
+ * \brief PKTT16 (Pack Two 16-bit Data from Both Top Half)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * PKBB16 Rd, Rs1, Rs2
+ * PKBT16 Rd, Rs1, Rs2
+ * PKTT16 Rd, Rs1, Rs2
+ * PKTB16 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Pack 16-bit data from 32-bit chunks in two registers.
+ * * PKBB16: bottom.bottom
+ * * PKBT16: bottom.top
+ * * PKTT16: top.top
+ * * PKTB16: top.bottom
+ *
+ * **Description**:\n
+ * (PKBB16) moves Rs1.W[x][15:0] to Rd.W[x][31:16] and moves Rs2.W[x] [15:0] to
+ * Rd.W[x] [15:0].
+ * (PKBT16) moves Rs1.W[x] [15:0] to Rd.W[x] [31:16] and moves Rs2.W[x] [31:16] to Rd.W[x] [15:0].
+ * (PKTT16) moves Rs1.W[x] [31:16] to Rd.W[x] [31:16] and moves Rs2.W[x] [31:16] to Rd.W[x] [15:0].
+ * (PKTB16) moves Rs1.W[x] [31:16] to Rd.W[x] [31:16] and moves Rs2.W[x] [15:0] to Rd.W[x] [15:0].
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.W[x][31:0] = CONCAT(Rs1.W[x][15:0], Rs2.W[x][15:0]); // PKBB16
+ * Rd.W[x][31:0] = CONCAT(Rs1.W[x][15:0], Rs2.W[x][31:16]); // PKBT16
+ * Rd.W[x][31:0] = CONCAT(Rs1.W[x][31:16], Rs2.W[x][15:0]); // PKTB16
+ * Rd.W[x][31:0] = CONCAT(Rs1.W[x][31:16], Rs2.W[x][31:16]); // PKTT16
+ * for RV32: x=0,
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_PKTT16(unsigned long a, unsigned long b)
+{
+    register unsigned long result; /* Rd: bound to asm output operand %0 */
+    __ASM volatile("pktt16 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); /* Rs1 = a (%1), Rs2 = b (%2) */
+    return result;
+}
+/* ===== Inline Function End for 3.79.3. PKTT16 ===== */
+
+/* ===== Inline Function Start for 3.79.4. PKTB16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_PACK
+ * \brief PKTB16 (Pack Two 16-bit Data from Top and Bottom Half)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * PKBB16 Rd, Rs1, Rs2
+ * PKBT16 Rd, Rs1, Rs2
+ * PKTT16 Rd, Rs1, Rs2
+ * PKTB16 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Pack 16-bit data from 32-bit chunks in two registers.
+ * * PKBB16: bottom.bottom
+ * * PKBT16: bottom.top
+ * * PKTT16: top.top
+ * * PKTB16: top.bottom
+ *
+ * **Description**:\n
+ * (PKBB16) moves Rs1.W[x][15:0] to Rd.W[x][31:16] and moves Rs2.W[x] [15:0] to
+ * Rd.W[x] [15:0].
+ * (PKBT16) moves Rs1.W[x] [15:0] to Rd.W[x] [31:16] and moves Rs2.W[x] [31:16] to Rd.W[x] [15:0].
+ * (PKTT16) moves Rs1.W[x] [31:16] to Rd.W[x] [31:16] and moves Rs2.W[x] [31:16] to Rd.W[x] [15:0].
+ * (PKTB16) moves Rs1.W[x] [31:16] to Rd.W[x] [31:16] and moves Rs2.W[x] [15:0] to Rd.W[x] [15:0].
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.W[x][31:0] = CONCAT(Rs1.W[x][15:0], Rs2.W[x][15:0]); // PKBB16
+ * Rd.W[x][31:0] = CONCAT(Rs1.W[x][15:0], Rs2.W[x][31:16]); // PKBT16
+ * Rd.W[x][31:0] = CONCAT(Rs1.W[x][31:16], Rs2.W[x][15:0]); // PKTB16
+ * Rd.W[x][31:0] = CONCAT(Rs1.W[x][31:16], Rs2.W[x][31:16]); // PKTT16
+ * for RV32: x=0,
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_PKTB16(unsigned long a, unsigned long b)
+{
+    unsigned long rd; /* bound to output operand %0 */
+    __ASM volatile("pktb16 %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.79.4. PKTB16 ===== */
+
+/* ===== Inline Function Start for 3.80. RADD8 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_ADDSUB
+ * \brief RADD8 (SIMD 8-bit Signed Halving Addition)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * RADD8 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 8-bit signed integer element additions simultaneously. The element results are halved
+ * to avoid overflow or saturation.
+ *
+ * **Description**:\n
+ * This instruction adds the 8-bit signed integer elements in Rs1 with the 8-bit signed
+ * integer elements in Rs2. The results are first arithmetically right-shifted by 1 bit and then written to
+ * Rd.
+ *
+ * **Examples**:\n
+ * ~~~
+ * * Rs1 = 0x7F, Rs2 = 0x7F, Rd = 0x7F
+ * * Rs1 = 0x80, Rs2 = 0x80, Rd = 0x80
+ * * Rs1 = 0x40, Rs2 = 0x80, Rd = 0xE0
+ * ~~~
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.B[x] = (Rs1.B[x] + Rs2.B[x]) s>> 1; for RV32: x=3...0, for RV64: x=7...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_RADD8(unsigned long a, unsigned long b)
+{
+    unsigned long rd; /* bound to output operand %0 */
+    __ASM volatile("radd8 %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.80. RADD8 ===== */
+
+/* ===== Inline Function Start for 3.81. RADD16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB
+ * \brief RADD16 (SIMD 16-bit Signed Halving Addition)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * RADD16 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 16-bit signed integer element additions simultaneously. The results are halved to avoid
+ * overflow or saturation.
+ *
+ * **Description**:\n
+ * This instruction adds the 16-bit signed integer elements in Rs1 with the 16-bit signed
+ * integer elements in Rs2. The results are first arithmetically right-shifted by 1 bit and then written to
+ * Rd.
+ *
+ * **Examples**:\n
+ * ~~~
+ * * Rs1 = 0x7FFF, Rs2 = 0x7FFF, Rd = 0x7FFF
+ * * Rs1 = 0x8000, Rs2 = 0x8000, Rd = 0x8000
+ * * Rs1 = 0x4000, Rs2 = 0x8000, Rd = 0xE000
+ * ~~~
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.H[x] = (Rs1.H[x] + Rs2.H[x]) s>> 1; for RV32: x=1...0, for RV64: x=3...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_RADD16(unsigned long a, unsigned long b)
+{
+    unsigned long rd; /* bound to output operand %0 */
+    __ASM volatile("radd16 %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.81. RADD16 ===== */
+
+/* ===== Inline Function Start for 3.82. RADD64 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_64B_ADDSUB
+ * \brief RADD64 (64-bit Signed Halving Addition)
+ * \details
+ * **Type**: DSP (64-bit Profile)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * RADD64 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Add two 64-bit signed integers. The result is halved to avoid overflow or saturation.
+ *
+ * **RV32 Description**:\n
+ * This instruction adds the 64-bit signed integer of an even/odd pair of registers
+ * specified by Rs1(4,1) with the 64-bit signed integer of an even/odd pair of registers specified by
+ * Rs2(4,1). The 64-bit addition result is first arithmetically right-shifted by 1 bit and then written to an
+ * even/odd pair of registers specified by Rd(4,1).
+ * Rx(4,1), i.e., value d, determines the even/odd pair group of two registers. Specifically, the register
+ * pair includes register 2d and 2d+1.
+ * The odd `2d+1` register of the pair contains the high 32-bit of the result and the even `2d` register
+ * of the pair contains the low 32-bit of the result.
+ *
+ * **RV64 Description**:\n
+ * This instruction adds the 64-bit signed integer in Rs1 with the 64-bit signed
+ * integer in Rs2. The 64-bit addition result is first arithmetically right-shifted by 1 bit and then
+ * written to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * RV32:
+ * t_L = CONCAT(Rd(4,1),1'b0); t_H = CONCAT(Rd(4,1),1'b1);
+ * a_L = CONCAT(Rs1(4,1),1'b0); a_H = CONCAT(Rs1(4,1),1'b1);
+ * b_L = CONCAT(Rs2(4,1),1'b0); b_H = CONCAT(Rs2(4,1),1'b1);
+ * R[t_H].R[t_L] = (R[a_H].R[a_L] + R[b_H].R[b_L]) s>> 1;
+ * RV64:
+ * Rd = (Rs1 + Rs2) s>> 1;
+ * ~~~
+ *
+ * \param [in]  a    long long type of value stored in a
+ * \param [in]  b    long long type of value stored in b
+ * \return value stored in long long type
+ */
+__STATIC_FORCEINLINE long long __RV_RADD64(long long a, long long b)
+{
+    long long rd; /* bound to output operand %0 */
+    __ASM volatile("radd64 %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.82. RADD64 ===== */
+
+/* ===== Inline Function Start for 3.83. RADDW ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_32B_COMPUTATION
+ * \brief RADDW (32-bit Signed Halving Addition)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * RADDW Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Add 32-bit signed integers and the results are halved to avoid overflow or saturation.
+ *
+ * **Description**:\n
+ * This instruction adds the first 32-bit signed integer in Rs1 with the first 32-bit signed
+ * integer in Rs2. The result is first arithmetically right-shifted by 1 bit and then sign-extended and
+ * written to Rd.
+ *
+ * **Examples**:\n
+ * ~~~
+ * * Rs1 = 0x7FFFFFFF, Rs2 = 0x7FFFFFFF, Rd = 0x7FFFFFFF
+ * * Rs1 = 0x80000000, Rs2 = 0x80000000, Rd = 0x80000000
+ * * Rs1 = 0x40000000, Rs2 = 0x80000000, Rd = 0xE0000000
+ * ~~~
+ *
+ * **Operations**:\n
+ * ~~~
+ * RV32:
+ * Rd[31:0] = (Rs1[31:0] + Rs2[31:0]) s>> 1;
+ * RV64:
+ * resw[31:0] = (Rs1[31:0] + Rs2[31:0]) s>> 1;
+ * Rd[63:0] = SE(resw[31:0]);
+ * ~~~
+ *
+ * \param [in]  a    int type of value stored in a
+ * \param [in]  b    int type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_RADDW(int a, int b)
+{
+    long rd; /* bound to output operand %0 */
+    __ASM volatile("raddw %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.83. RADDW ===== */
+
+/* ===== Inline Function Start for 3.84. RCRAS16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB
+ * \brief RCRAS16 (SIMD 16-bit Signed Halving Cross Addition & Subtraction)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * RCRAS16 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 16-bit signed integer element addition and 16-bit signed integer element subtraction in
+ * a 32-bit chunk simultaneously. Operands are from crossed positions in 32-bit chunks. The results
+ * are halved to avoid overflow or saturation.
+ *
+ * **Description**:\n
+ * This instruction adds the 16-bit signed integer element in [31:16] of 32-bit chunks in
+ * Rs1 with the 16-bit signed integer element in [15:0] of 32-bit chunks in Rs2, and subtracts the 16-bit
+ * signed integer element in [31:16] of 32-bit chunks in Rs2 from the 16-bit signed integer element in
+ * [15:0] of 32-bit chunks in Rs1. The element results are first arithmetically right-shifted by 1 bit and
+ * then written to [31:16] of 32-bit chunks in Rd and [15:0] of 32-bit chunks in Rd.
+ *
+ * **Examples**:\n
+ * ~~~
+ * Please see `RADD16` and `RSUB16` instructions.
+ * ~~~
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.W[x][31:16] = (Rs1.W[x][31:16] + Rs2.W[x][15:0]) s>> 1;
+ * Rd.W[x][15:0] = (Rs1.W[x][15:0] - Rs2.W[x][31:16]) s>> 1;
+ * for RV32, x=0
+ * for RV64, x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_RCRAS16(unsigned long a, unsigned long b)
+{
+    unsigned long rd; /* bound to output operand %0 */
+    __ASM volatile("rcras16 %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.84. RCRAS16 ===== */
+
+/* ===== Inline Function Start for 3.85. RCRSA16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB
+ * \brief RCRSA16 (SIMD 16-bit Signed Halving Cross Subtraction & Addition)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * RCRSA16 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 16-bit signed integer element subtraction and 16-bit signed integer element addition in
+ * a 32-bit chunk simultaneously. Operands are from crossed positions in 32-bit chunks. The results
+ * are halved to avoid overflow or saturation.
+ *
+ * **Description**:\n
+ * This instruction subtracts the 16-bit signed integer element in [15:0] of 32-bit chunks
+ * in Rs2 from the 16-bit signed integer element in [31:16] of 32-bit chunks in Rs1, and adds the 16-bit
+ * signed element integer in [15:0] of 32-bit chunks in Rs1 with the 16-bit signed integer element in
+ * [31:16] of 32-bit chunks in Rs2. The two results are first arithmetically right-shifted by 1 bit and
+ * then written to [31:16] of 32-bit chunks in Rd and [15:0] of 32-bit chunks in Rd.
+ *
+ * **Examples**:\n
+ * ~~~
+ * Please see `RADD16` and `RSUB16` instructions.
+ * ~~~
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.W[x][31:16] = (Rs1.W[x][31:16] - Rs2.W[x][15:0]) s>> 1;
+ * Rd.W[x][15:0] = (Rs1.W[x][15:0] + Rs2.W[x][31:16]) s>> 1;
+ * for RV32, x=0
+ * for RV64, x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_RCRSA16(unsigned long a, unsigned long b)
+{
+    unsigned long rd; /* bound to output operand %0 */
+    __ASM volatile("rcrsa16 %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.85. RCRSA16 ===== */
+
+/* ===== Inline Function Start for 3.86. RDOV ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_OV_FLAG_SC
+ * \brief RDOV (Read OV flag)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * RDOV Rd  # pseudo mnemonic
+ * ~~~
+ *
+ * **Purpose**:\n
+ * This pseudo instruction is an alias to `CSRR Rd, ucode` instruction which maps to the real
+ * instruction of `CSRRS Rd, ucode, x0`.
+ *
+ *
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_RDOV(void)
+{
+    unsigned long rd; /* bound to output operand %0 */
+    __ASM volatile("rdov %0" : "=r"(rd));
+    return rd;
+}
+/* ===== Inline Function End for 3.86. RDOV ===== */
+
+/* ===== Inline Function Start for 3.87. RSTAS16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB
+ * \brief RSTAS16 (SIMD 16-bit Signed Halving Straight Addition & Subtraction)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * RSTAS16 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 16-bit signed integer element addition and 16-bit signed integer element subtraction in
+ * a 32-bit chunk simultaneously. Operands are from corresponding positions in 32-bit chunks. The
+ * results are halved to avoid overflow or saturation.
+ *
+ * **Description**:\n
+ * This instruction adds the 16-bit signed integer element in [31:16] of 32-bit chunks in
+ * Rs1 with the 16-bit signed integer element in [31:16] of 32-bit chunks in Rs2, and subtracts the 16-bit
+ * signed integer element in [15:0] of 32-bit chunks in Rs2 from the 16-bit signed integer element in
+ * [15:0] of 32-bit chunks in Rs1. The element results are first arithmetically right-shifted by 1 bit and
+ * then written to [31:16] of 32-bit chunks in Rd and [15:0] of 32-bit chunks in Rd.
+ *
+ * **Examples**:\n
+ * ~~~
+ * Please see `RADD16` and `RSUB16` instructions.
+ * ~~~
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.W[x][31:16] = (Rs1.W[x][31:16] + Rs2.W[x][31:16]) s>> 1;
+ * Rd.W[x][15:0] = (Rs1.W[x][15:0] - Rs2.W[x][15:0]) s>> 1;
+ * for RV32, x=0
+ * for RV64, x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_RSTAS16(unsigned long a, unsigned long b)
+{
+    unsigned long rd; /* bound to output operand %0 */
+    __ASM volatile("rstas16 %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.87. RSTAS16 ===== */
+
+/* ===== Inline Function Start for 3.88. RSTSA16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB
+ * \brief RSTSA16 (SIMD 16-bit Signed Halving Straight Subtraction & Addition)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * RSTSA16 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 16-bit signed integer element subtraction and 16-bit signed integer element addition in
+ * a 32-bit chunk simultaneously. Operands are from corresponding positions in 32-bit chunks. The
+ * results are halved to avoid overflow or saturation.
+ *
+ * **Description**:\n
+ * This instruction subtracts the 16-bit signed integer element in [31:16] of 32-bit chunks
+ * in Rs2 from the 16-bit signed integer element in [31:16] of 32-bit chunks in Rs1, and adds the 16-bit
+ * signed element integer in [15:0] of 32-bit chunks in Rs1 with the 16-bit signed integer element in
+ * [15:0] of 32-bit chunks in Rs2. The two results are first arithmetically right-shifted by 1 bit and then
+ * written to [31:16] of 32-bit chunks in Rd and [15:0] of 32-bit chunks in Rd.
+ *
+ * **Examples**:\n
+ * ~~~
+ * Please see `RADD16` and `RSUB16` instructions.
+ * ~~~
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.W[x][31:16] = (Rs1.W[x][31:16] - Rs2.W[x][31:16]) s>> 1;
+ * Rd.W[x][15:0] = (Rs1.W[x][15:0] + Rs2.W[x][15:0]) s>> 1;
+ * for RV32, x=0
+ * for RV64, x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_RSTSA16(unsigned long a, unsigned long b)
+{
+    unsigned long rd; /* bound to output operand %0 */
+    __ASM volatile("rstsa16 %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.88. RSTSA16 ===== */
+
+/* ===== Inline Function Start for 3.89. RSUB8 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_ADDSUB
+ * \brief RSUB8 (SIMD 8-bit Signed Halving Subtraction)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * RSUB8 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 8-bit signed integer element subtractions simultaneously. The results are halved to
+ * avoid overflow or saturation.
+ *
+ * **Description**:\n
+ * This instruction subtracts the 8-bit signed integer elements in Rs2 from the 8-bit
+ * signed integer elements in Rs1. The results are first arithmetically right-shifted by 1 bit and then
+ * written to Rd.
+ *
+ * **Examples**:\n
+ * ~~~
+ * * Rs1 = 0x7F, Rs2 = 0x80, Rd = 0x7F
+ * * Rs1 = 0x80, Rs2 = 0x7F, Rd = 0x80
+ * * Rs1 = 0x80, Rs2 = 0x40, Rd = 0xA0
+ * ~~~
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.B[x] = (Rs1.B[x] - Rs2.B[x]) s>> 1;
+ * for RV32: x=3...0,
+ * for RV64: x=7...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_RSUB8(unsigned long a, unsigned long b)
+{
+    unsigned long rd; /* bound to output operand %0 */
+    __ASM volatile("rsub8 %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.89. RSUB8 ===== */
+
+/* ===== Inline Function Start for 3.90. RSUB16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB
+ * \brief RSUB16 (SIMD 16-bit Signed Halving Subtraction)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * RSUB16 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 16-bit signed integer element subtractions simultaneously. The results are halved to
+ * avoid overflow or saturation.
+ *
+ * **Description**:\n
+ * This instruction subtracts the 16-bit signed integer elements in Rs2 from the 16-bit
+ * signed integer elements in Rs1. The results are first arithmetically right-shifted by 1 bit and then
+ * written to Rd.
+ *
+ * **Examples**:\n
+ * ~~~
+ * * Rs1 = 0x7FFF, Rs2 = 0x8000, Rd = 0x7FFF
+ * * Rs1 = 0x8000, Rs2 = 0x7FFF, Rd = 0x8000
+ * * Rs1 = 0x8000, Rs2 = 0x4000, Rd = 0xA000
+ * ~~~
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.H[x] = (Rs1.H[x] - Rs2.H[x]) s>> 1;
+ * for RV32: x=1...0,
+ * for RV64: x=3...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_RSUB16(unsigned long a, unsigned long b)
+{
+    unsigned long rd; /* bound to output operand %0 */
+    __ASM volatile("rsub16 %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.90. RSUB16 ===== */
+
+/* ===== Inline Function Start for 3.91. RSUB64 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_64B_ADDSUB
+ * \brief RSUB64 (64-bit Signed Halving Subtraction)
+ * \details
+ * **Type**: DSP (64-bit Profile)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * RSUB64 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Perform a 64-bit signed integer subtraction. The result is halved to avoid overflow or
+ * saturation.
+ *
+ * **RV32 Description**:\n
+ * This instruction subtracts the 64-bit signed integer of an even/odd pair of
+ * registers specified by Rs2(4,1) from the 64-bit signed integer of an even/odd pair of registers
+ * specified by Rs1(4,1). The subtraction result is first arithmetically right-shifted by 1 bit and then
+ * written to an even/odd pair of registers specified by Rd(4,1).
+ * Rx(4,1), i.e., value d, determines the even/odd pair group of two registers. Specifically, the register
+ * pair includes register 2d and 2d+1.
+ * The odd `2d+1` register of the pair contains the high 32-bit of the result and the even `2d` register
+ * of the pair contains the low 32-bit of the result.
+ *
+ * **RV64 Description**:\n
+ * This instruction subtracts the 64-bit signed integer in Rs2 from the 64-bit signed
+ * integer in Rs1. The 64-bit subtraction result is first arithmetically right-shifted by 1 bit and then
+ * written to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * RV32:
+ * t_L = CONCAT(Rd(4,1),1'b0); t_H = CONCAT(Rd(4,1),1'b1);
+ * a_L = CONCAT(Rs1(4,1),1'b0); a_H = CONCAT(Rs1(4,1),1'b1);
+ * b_L = CONCAT(Rs2(4,1),1'b0); b_H = CONCAT(Rs2(4,1),1'b1);
+ * R[t_H].R[t_L] = (R[a_H].R[a_L] - R[b_H].R[b_L]) s>> 1;
+ * RV64:
+ * Rd = (Rs1 - Rs2) s>> 1;
+ * ~~~
+ *
+ * \param [in]  a    long long type of value stored in a
+ * \param [in]  b    long long type of value stored in b
+ * \return value stored in long long type
+ */
+__STATIC_FORCEINLINE long long __RV_RSUB64(long long a, long long b)
+{
+    long long rd; /* bound to output operand %0 */
+    __ASM volatile("rsub64 %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.91. RSUB64 ===== */
+
+/* ===== Inline Function Start for 3.92. RSUBW ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_32B_COMPUTATION
+ * \brief RSUBW (32-bit Signed Halving Subtraction)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * RSUBW Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Subtract 32-bit signed integers and the result is halved to avoid overflow or saturation.
+ *
+ * **Description**:\n
+ * This instruction subtracts the first 32-bit signed integer in Rs2 from the first 32-bit
+ * signed integer in Rs1. The result is first arithmetically right-shifted by 1 bit and then sign-extended
+ * and written to Rd.
+ *
+ * **Examples**:\n
+ * ~~~
+ * * Rs1 = 0x7FFFFFFF, Rs2 = 0x80000000, Rd = 0x7FFFFFFF
+ * * Rs1 = 0x80000000, Rs2 = 0x7FFFFFFF, Rd = 0x80000000
+ * * Rs1 = 0x80000000, Rs2 = 0x40000000, Rd = 0xA0000000
+ * ~~~
+ *
+ * **Operations**:\n
+ * ~~~
+ * RV32:
+ * Rd[31:0] = (Rs1[31:0] - Rs2[31:0]) s>> 1;
+ * RV64:
+ * resw[31:0] = (Rs1[31:0] - Rs2[31:0]) s>> 1;
+ * Rd[63:0] = SE(resw[31:0]);
+ * ~~~
+ *
+ * \param [in]  a    int type of value stored in a
+ * \param [in]  b    int type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_RSUBW(int a, int b)
+{
+    long rd; /* bound to output operand %0 */
+    __ASM volatile("rsubw %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.92. RSUBW ===== */
+
+/* ===== Inline Function Start for 3.93. SCLIP8 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_MISC
+ * \brief SCLIP8 (SIMD 8-bit Signed Clip Value)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SCLIP8 Rd, Rs1, imm3u[2:0]
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Limit the 8-bit signed integer elements of a register into a signed range simultaneously.
+ *
+ * **Description**:\n
+ * This instruction limits the 8-bit signed integer elements stored in Rs1 into a signed
+ * integer range between 2^imm3u-1 and -2^imm3u, and writes the limited results to Rd. For example, if
+ * imm3u is 3, the 8-bit input values should be saturated between 7 and -8. If saturation is performed,
+ * set OV bit to 1.
+ *
+ * **Operations**:\n
+ * ~~~
+ * src = Rs1.B[x];
+ * if (src > (2^imm3u)-1) {
+ *   src = (2^imm3u)-1;
+ *   OV = 1;
+ * } else if (src < -2^imm3u) {
+ *   src = -2^imm3u;
+ *   OV = 1;
+ * }
+ * Rd.B[x] = src
+ * for RV32: x=3...0,
+ * for RV64: x=7...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned int type of value stored in b
+ * \return value stored in unsigned long type
+ */
+#define __RV_SCLIP8(a, b)    \
+    ({    \
+        register unsigned long __a = (unsigned long)(a); /* evaluate (a) first so a caller variable named like a local here is not shadowed */    \
+        register unsigned long __rd; /* output operand %0; (b) must be an immediate accepted by the "K" constraint */    \
+        __ASM volatile("sclip8 %0, %1, %2" : "=r"(__rd) : "r"(__a), "K"(b));    \
+        __rd;    \
+    })
+/* ===== Inline Function End for 3.93. SCLIP8 ===== */
+
+/* ===== Inline Function Start for 3.94. SCLIP16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_MISC
+ * \brief SCLIP16 (SIMD 16-bit Signed Clip Value)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SCLIP16 Rd, Rs1, imm4u[3:0]
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Limit the 16-bit signed integer elements of a register into a signed range simultaneously.
+ *
+ * **Description**:\n
+ * This instruction limits the 16-bit signed integer elements stored in Rs1 into a signed
+ * integer range between 2^imm4u-1 and -2^imm4u, and writes the limited results to Rd. For example, if
+ * imm4u is 3, the 16-bit input values should be saturated between 7 and -8. If saturation is performed,
+ * set OV bit to 1.
+ *
+ * **Operations**:\n
+ * ~~~
+ * src = Rs1.H[x];
+ * if (src > (2^imm4u)-1) {
+ *   src = (2^imm4u)-1;
+ *   OV = 1;
+ * } else if (src < -2^imm4u) {
+ *   src = -2^imm4u;
+ *   OV = 1;
+ * }
+ * Rd.H[x] = src
+ * for RV32: x=1...0,
+ * for RV64: x=3...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned int type of value stored in b
+ * \return value stored in unsigned long type
+ */
+#define __RV_SCLIP16(a, b)    \
+    ({    \
+        register unsigned long __a = (unsigned long)(a); /* evaluate (a) first so a caller variable named like a local here is not shadowed */    \
+        register unsigned long __rd; /* output operand %0; (b) must be an immediate accepted by the "K" constraint */    \
+        __ASM volatile("sclip16 %0, %1, %2" : "=r"(__rd) : "r"(__a), "K"(b));    \
+        __rd;    \
+    })
+/* ===== Inline Function End for 3.94. SCLIP16 ===== */
+
+/* ===== Inline Function Start for 3.95. SCLIP32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_PART_SIMD_MISC
+ * \brief SCLIP32 (SIMD 32-bit Signed Clip Value)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SCLIP32 Rd, Rs1, imm5u[4:0]
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Limit the 32-bit signed integer elements of a register into a signed range simultaneously.
+ *
+ * **Description**:\n
+ * This instruction limits the 32-bit signed integer elements stored in Rs1 into a signed
+ * integer range between 2^imm5u-1 and -2^imm5u, and writes the limited results to Rd. For example, if
+ * imm5u is 3, the 32-bit input values should be saturated between 7 and -8. If saturation is performed,
+ * set OV bit to 1.
+ *
+ * **Operations**:\n
+ * ~~~
+ * src = Rs1.W[x];
+ * if (src > (2^imm5u)-1) {
+ *   src = (2^imm5u)-1;
+ *   OV = 1;
+ * } else if (src < -2^imm5u) {
+ *   src = -2^imm5u;
+ *   OV = 1;
+ * }
+ * Rd.W[x] = src
+ * for RV32: x=0,
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  a    long type of value stored in a
+ * \param [in]  b    unsigned int type of value stored in b
+ * \return value stored in long type
+ */
+#define __RV_SCLIP32(a, b)    \
+    ({    \
+        register long __a = (long)(a); /* evaluate (a) first so a caller variable named like a local here is not shadowed */    \
+        register long __rd; /* output operand %0; (b) must be an immediate accepted by the "K" constraint */    \
+        __ASM volatile("sclip32 %0, %1, %2" : "=r"(__rd) : "r"(__a), "K"(b));    \
+        __rd;    \
+    })
+/* ===== Inline Function End for 3.95. SCLIP32 ===== */
+
+/* ===== Inline Function Start for 3.96. SCMPLE8 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_CMP
+ * \brief SCMPLE8 (SIMD 8-bit Signed Compare Less Than & Equal)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SCMPLE8 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 8-bit signed integer elements less than & equal comparisons simultaneously.
+ *
+ * **Description**:\n
+ * This instruction compares the 8-bit signed integer elements in Rs1 with the 8-bit
+ * signed integer elements in Rs2 to see if the one in Rs1 is less than or equal to the one in Rs2. If it is
+ * true, the result is 0xFF; otherwise, the result is 0x0. The element comparison results are written to
+ * Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.B[x] = (Rs1.B[x] {le} Rs2.B[x])? 0xff : 0x0;
+ * for RV32: x=3...0,
+ * for RV64: x=7...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_SCMPLE8(unsigned long a, unsigned long b)
+{
+    unsigned long rd; /* bound to output operand %0 */
+    __ASM volatile("scmple8 %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.96. SCMPLE8 ===== */
+
+/* ===== Inline Function Start for 3.97. SCMPLE16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_CMP
+ * \brief SCMPLE16 (SIMD 16-bit Signed Compare Less Than & Equal)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SCMPLE16 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 16-bit signed integer elements less than & equal comparisons simultaneously.
+ *
+ * **Description**:\n
+ * This instruction compares the 16-bit signed integer elements in Rs1 with the 16-bit
+ * signed integer elements in Rs2 to see if the one in Rs1 is less than or equal to the one in Rs2. If it is
+ * true, the result is 0xFFFF; otherwise, the result is 0x0. The element comparison results are written
+ * to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.H[x] = (Rs1.H[x] {le} Rs2.H[x])? 0xffff : 0x0;
+ * for RV32: x=1...0,
+ * for RV64: x=3...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_SCMPLE16(unsigned long a, unsigned long b)
+{
+    unsigned long rd; /* bound to output operand %0 */
+    __ASM volatile("scmple16 %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.97. SCMPLE16 ===== */
+
+/* ===== Inline Function Start for 3.98. SCMPLT8 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_CMP
+ * \brief SCMPLT8 (SIMD 8-bit Signed Compare Less Than)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SCMPLT8 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 8-bit signed integer elements less than comparisons simultaneously.
+ *
+ * **Description**:\n
+ * This instruction compares the 8-bit signed integer elements in Rs1 with the 8-bit
+ * signed integer elements in Rs2 to see if the one in Rs1 is less than the one in Rs2. If it is true, the
+ * result is 0xFF; otherwise, the result is 0x0. The element comparison results are written to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.B[x] = (Rs1.B[x] < Rs2.B[x])? 0xff : 0x0;
+ * for RV32: x=3...0,
+ * for RV64: x=7...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_SCMPLT8(unsigned long a, unsigned long b)
+{
+    unsigned long rd; /* bound to output operand %0 */
+    __ASM volatile("scmplt8 %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.98. SCMPLT8 ===== */
+
+/* ===== Inline Function Start for 3.99. SCMPLT16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_CMP
+ * \brief SCMPLT16 (SIMD 16-bit Signed Compare Less Than)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SCMPLT16 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 16-bit signed integer elements less than comparisons simultaneously.
+ *
+ * **Description**:\n
+ * This instruction compares the 16-bit signed integer elements in Rs1 with the 16-bit
+ * signed integer elements in Rs2 to see if the one in Rs1 is less than the one in Rs2. If it is true, the
+ * result is 0xFFFF; otherwise, the result is 0x0. The element comparison results are written to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.H[x] = (Rs1.H[x] < Rs2.H[x])? 0xffff : 0x0;
+ * for RV32: x=1...0,
+ * for RV64: x=3...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_SCMPLT16(unsigned long a, unsigned long b)
+{
+    unsigned long rd; /* bound to output operand %0 */
+    __ASM volatile("scmplt16 %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.99. SCMPLT16 ===== */
+
+/* ===== Inline Function Start for 3.100. SLL8 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_SHIFT
+ * \brief SLL8 (SIMD 8-bit Shift Left Logical)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SLL8 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 8-bit elements logical left shift operations simultaneously. The shift amount is a
+ * variable from a GPR.
+ *
+ * **Description**:\n
+ * The 8-bit elements in Rs1 are left-shifted logically, and the results are written to Rd.
+ * The vacated low-order bits are filled with zero and the shift amount is specified by the low-order 3 bits of
+ * the value in the Rs2 register.
+ *
+ * **Operations**:\n
+ * ~~~
+ * sa = Rs2[2:0];
+ * Rd.B[x] = Rs1.B[x] << sa;
+ * for RV32: x=3...0,
+ * for RV64: x=7...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned int type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_SLL8(unsigned long a, unsigned int b)
+{
+    unsigned long rd; /* shifted 8-bit lanes */
+    __ASM volatile("sll8 %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b)); /* shift amount is taken from b[2:0] */
+    return rd;
+}
+/* ===== Inline Function End for 3.100. SLL8 ===== */
+
+/* ===== Inline Function Start for 3.101. SLLI8 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_SHIFT
+ * \brief SLLI8 (SIMD 8-bit Shift Left Logical Immediate)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SLLI8 Rd, Rs1, imm3u
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 8-bit elements logical left shift operations simultaneously. The shift amount is an
+ * immediate value.
+ *
+ * **Description**:\n
+ * The 8-bit elements in Rs1 are left-shifted logically, and the results are written to Rd.
+ * The vacated low-order bits are filled with zero and the shift amount is specified by the imm3u constant.
+ *
+ * **Operations**:\n
+ * ~~~
+ * sa = imm3u[2:0];
+ * Rd.B[x] = Rs1.B[x] << sa;
+ * for RV32: x=3...0,
+ * for RV64: x=7...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    shift amount; must be a compile-time constant immediate (imm3u, 0..7)
+ * \return value stored in unsigned long type
+ */
+#define __RV_SLLI8(a, b)    \
+    ({    \
+        unsigned long __rv_slli8_rs1 = (unsigned long)(a); /* evaluate a once, before the result local is in scope */    \
+        unsigned long __rv_slli8_rd; /* macro-prefixed name: an argument called result or __a is no longer captured */    \
+        __ASM volatile("slli8 %0, %1, %2" : "=r"(__rv_slli8_rd) : "r"(__rv_slli8_rs1), "K"(b)); /* b must be a constant immediate */    \
+        __rv_slli8_rd;    \
+    })
+/* ===== Inline Function End for 3.101. SLLI8 ===== */
+
+/* ===== Inline Function Start for 3.102. SLL16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_SHIFT
+ * \brief SLL16 (SIMD 16-bit Shift Left Logical)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SLL16 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 16-bit elements logical left shift operations simultaneously. The shift amount is a
+ * variable from a GPR.
+ *
+ * **Description**:\n
+ * The 16-bit elements in Rs1 are left-shifted logically, and the results are written to Rd.
+ * The vacated low-order bits are filled with zero and the shift amount is specified by the low-order 4 bits of
+ * the value in the Rs2 register.
+ *
+ * **Operations**:\n
+ * ~~~
+ * sa = Rs2[3:0];
+ * Rd.H[x] = Rs1.H[x] << sa;
+ * for RV32: x=1...0,
+ * for RV64: x=3...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned int type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_SLL16(unsigned long a, unsigned int b)
+{
+    unsigned long rd; /* shifted 16-bit lanes */
+    __ASM volatile("sll16 %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b)); /* shift amount is taken from b[3:0] */
+    return rd;
+}
+/* ===== Inline Function End for 3.102. SLL16 ===== */
+
+/* ===== Inline Function Start for 3.103. SLLI16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_SHIFT
+ * \brief SLLI16 (SIMD 16-bit Shift Left Logical Immediate)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SLLI16 Rd, Rs1, imm4[3:0]
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 16-bit element logical left shift operations simultaneously. The shift amount is an
+ * immediate value.
+ *
+ * **Description**:\n
+ * The 16-bit elements in Rs1 are left-shifted logically. The vacated low-order bits are filled with
+ * zero and the shift amount is specified by the imm4[3:0] constant. The results are written to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * sa = imm4[3:0];
+ * Rd.H[x] = Rs1.H[x] << sa;
+ * for RV32: x=1...0,
+ * for RV64: x=3...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    shift amount; must be a compile-time constant immediate (imm4, 0..15)
+ * \return value stored in unsigned long type
+ */
+#define __RV_SLLI16(a, b)    \
+    ({    \
+        unsigned long __rv_slli16_rs1 = (unsigned long)(a); /* evaluate a once, before the result local is in scope */    \
+        unsigned long __rv_slli16_rd; /* macro-prefixed name: an argument called result or __a is no longer captured */    \
+        __ASM volatile("slli16 %0, %1, %2" : "=r"(__rv_slli16_rd) : "r"(__rv_slli16_rs1), "K"(b)); /* b must be a constant immediate */    \
+        __rv_slli16_rd;    \
+    })
+/* ===== Inline Function End for 3.103. SLLI16 ===== */
+
+/* ===== Inline Function Start for 3.104. SMAL ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_64B_ADDSUB
+ * \brief SMAL (Signed Multiply Halfs & Add 64-bit)
+ * \details
+ * **Type**: Partial-SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SMAL Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the signed bottom 16-bit content of the 32-bit elements of a register with the top
+ * 16-bit content of the same 32-bit elements of the same register, and add the results with a 64-bit
+ * value of an even/odd pair of registers (RV32) or a register (RV64). The addition result is written back
+ * to another even/odd pair of registers (RV32) or a register (RV64).
+ *
+ * **RV32 Description**:\n
+ * This instruction multiplies the bottom 16-bit content of the lower 32-bit of Rs2 with the top 16-bit
+ * content of the lower 32-bit of Rs2 and adds the result with the 64-bit value of an even/odd pair of
+ * registers specified by Rs1(4,1). The 64-bit addition result is written back to an even/odd pair of
+ * registers specified by Rd(4,1). The 16-bit values of Rs2, and the 64-bit value of the Rs1(4,1) register-
+ * pair are treated as signed integers.
+ * Rx(4,1), i.e., d, determines the even/odd pair group of the two registers. Specifically, the register pair
+ * includes register 2d and 2d+1.
+ * The odd `2d+1` register of the pair contains the high 32-bit of the operand and the even `2d`
+ * register of the pair contains the low 32-bit of the operand.
+ *
+ * **RV64 Description**:\n
+ * This instruction multiplies the bottom 16-bit content of the 32-bit elements of Rs2 with the top 16-bit
+ * content of the same 32-bit elements of Rs2 and adds the results with the 64-bit value of Rs1. The 64-
+ * bit addition result is written back to Rd. The 16-bit values of Rs2, and the 64-bit value of Rs1 are
+ * treated as signed integers.
+ *
+ * **Operations**:\n
+ * ~~~
+ * RV32:
+ * Mres[31:0] = Rs2.H[1] * Rs2.H[0];
+ * Idx0 = CONCAT(Rs1(4,1),1'b0); Idx1 = CONCAT(Rs1(4,1),1'b1);
+ * Idx2 = CONCAT(Rd(4,1),1'b0); Idx3 = CONCAT(Rd(4,1),1'b1);
+ * R[Idx3].R[Idx2] = R[Idx1].R[Idx0] + SE64(Mres[31:0]);
+ * RV64:
+ * Mres[0][31:0] = Rs2.W[0].H[1] * Rs2.W[0].H[0];
+ * Mres[1][31:0] = Rs2.W[1].H[1] * Rs2.W[1].H[0];
+ * Rd = Rs1 + SE64(Mres[1][31:0]) + SE64(Mres[0][31:0]);
+ * ~~~
+ *
+ * \param [in]  a    long long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long long type
+ */
+__STATIC_FORCEINLINE long long __RV_SMAL(long long a, unsigned long b)
+{
+    long long rd; /* 64-bit sum produced by the instruction */
+    __ASM volatile("smal %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b)); /* %1 = 64-bit addend a, %2 = halfword source b */
+    return rd;
+}
+/* ===== Inline Function End for 3.104. SMAL ===== */
+
+/* ===== Inline Function Start for 3.105.1. SMALBB ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_64B_ADDSUB
+ * \brief SMALBB (Signed Multiply Bottom Halfs & Add 64-bit)
+ * \details
+ * **Type**: DSP (64-bit Profile)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SMALBB Rd, Rs1, Rs2
+ * SMALBT Rd, Rs1, Rs2
+ * SMALTT Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the signed 16-bit content of the 32-bit elements of a register with the 16-bit
+ * content of the corresponding 32-bit elements of another register and add the results with a 64-bit
+ * value of an even/odd pair of registers (RV32) or a register (RV64). The addition result is written back
+ * to the register-pair (RV32) or the register (RV64).
+ * * SMALBB: rt pair + bottom*bottom (all 32-bit elements)
+ * * SMALBT: rt pair + bottom*top (all 32-bit elements)
+ * * SMALTT: rt pair + top*top (all 32-bit elements)
+ *
+ * **RV32 Description**:\n
+ * For the `SMALBB` instruction, it multiplies the bottom 16-bit content of Rs1 with the bottom 16-bit
+ * content of Rs2.
+ * For the `SMALBT` instruction, it multiplies the bottom 16-bit content of Rs1 with the top 16-bit
+ * content of Rs2.
+ * For the `SMALTT` instruction, it multiplies the top 16-bit content of Rs1 with the top 16-bit content
+ * of Rs2.
+ * The multiplication result is added with the 64-bit value of an even/odd pair of registers specified by
+ * Rd(4,1). The 64-bit addition result is written back to the register-pair. The 16-bit values of Rs1 and
+ * Rs2, and the 64-bit value of the register-pair are treated as signed integers.
+ * Rd(4,1), i.e., d, determines the even/odd pair group of the two registers. Specifically, the register pair
+ * includes register 2d and 2d+1.
+ * The odd `2d+1` register of the pair contains the high 32-bit of the operand and the even `2d`
+ * register of the pair contains the low 32-bit of the operand.
+ *
+ * **RV64 Description**:\n
+ * For the `SMALBB` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1
+ * with the bottom 16-bit content of the 32-bit elements of Rs2.
+ * For the `SMALBT` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1
+ * with the top 16-bit content of the 32-bit elements of Rs2.
+ * For the `SMALTT` instruction, it multiplies the top 16-bit content of the 32-bit elements of Rs1 with
+ * the top 16-bit content of the 32-bit elements of Rs2.
+ * The multiplication results are added with the 64-bit value of Rd. The 64-bit addition result is written
+ * back to Rd. The 16-bit values of Rs1 and Rs2, and the 64-bit value of Rd are treated as signed
+ * integers.
+ *
+ * **Operations**:\n
+ * ~~~
+ * RV32:
+ * Mres[31:0] = Rs1.H[0] * Rs2.H[0]; // SMALBB
+ * Mres[31:0] = Rs1.H[0] * Rs2.H[1]; // SMALBT
+ * Mres[31:0] = Rs1.H[1] * Rs2.H[1]; // SMALTT
+ * Idx0 = CONCAT(Rd(4,1),1'b0); Idx1 = CONCAT(Rd(4,1),1'b1);
+ * R[Idx1].R[Idx0] = R[Idx1].R[Idx0] + SE64(Mres[31:0]);
+ * RV64:
+ * // SMALBB
+ * Mres[0][31:0] = Rs1.W[0].H[0] * Rs2.W[0].H[0];
+ * Mres[1][31:0] = Rs1.W[1].H[0] * Rs2.W[1].H[0];
+ * // SMALBT
+ * Mres[0][31:0] = Rs1.W[0].H[0] * Rs2.W[0].H[1];
+ * Mres[1][31:0] = Rs1.W[1].H[0] * Rs2.W[1].H[1];
+ * // SMALTT
+ * Mres[0][31:0] = Rs1.W[0].H[1] * Rs2.W[0].H[1];
+ * Mres[1][31:0] = Rs1.W[1].H[1] * Rs2.W[1].H[1];
+ * Rd = Rd + SE64(Mres[0][31:0]) + SE64(Mres[1][31:0]);
+ * ~~~
+ *
+ * \param [in]  t    long long type of value stored in t
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long long type
+ */
+__STATIC_FORCEINLINE long long __RV_SMALBB(long long t, unsigned long a, unsigned long b)
+{
+    __ASM volatile("smalbb %0, %1, %2" : "+r"(t) : "r"(a), "r"(b)); /* "+r": t is read as the 64-bit accumulator and overwritten with the result */
+    return t; /* updated accumulator */
+}
+/* ===== Inline Function End for 3.105.1. SMALBB ===== */
+
+/* ===== Inline Function Start for 3.105.2. SMALBT ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_64B_ADDSUB
+ * \brief SMALBT (Signed Multiply Bottom Half & Top Half & Add 64-bit)
+ * \details
+ * **Type**: DSP (64-bit Profile)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SMALBB Rd, Rs1, Rs2
+ * SMALBT Rd, Rs1, Rs2
+ * SMALTT Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the signed 16-bit content of the 32-bit elements of a register with the 16-bit
+ * content of the corresponding 32-bit elements of another register and add the results with a 64-bit
+ * value of an even/odd pair of registers (RV32) or a register (RV64). The addition result is written back
+ * to the register-pair (RV32) or the register (RV64).
+ * * SMALBB: rt pair + bottom*bottom (all 32-bit elements)
+ * * SMALBT: rt pair + bottom*top (all 32-bit elements)
+ * * SMALTT: rt pair + top*top (all 32-bit elements)
+ *
+ * **RV32 Description**:\n
+ * For the `SMALBB` instruction, it multiplies the bottom 16-bit content of Rs1 with the bottom 16-bit
+ * content of Rs2.
+ * For the `SMALBT` instruction, it multiplies the bottom 16-bit content of Rs1 with the top 16-bit
+ * content of Rs2.
+ * For the `SMALTT` instruction, it multiplies the top 16-bit content of Rs1 with the top 16-bit content
+ * of Rs2.
+ * The multiplication result is added with the 64-bit value of an even/odd pair of registers specified by
+ * Rd(4,1). The 64-bit addition result is written back to the register-pair. The 16-bit values of Rs1 and
+ * Rs2, and the 64-bit value of the register-pair are treated as signed integers.
+ * Rd(4,1), i.e., d, determines the even/odd pair group of the two registers. Specifically, the register pair
+ * includes register 2d and 2d+1.
+ * The odd `2d+1` register of the pair contains the high 32-bit of the operand and the even `2d`
+ * register of the pair contains the low 32-bit of the operand.
+ *
+ * **RV64 Description**:\n
+ * For the `SMALBB` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1
+ * with the bottom 16-bit content of the 32-bit elements of Rs2.
+ * For the `SMALBT` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1
+ * with the top 16-bit content of the 32-bit elements of Rs2.
+ * For the `SMALTT` instruction, it multiplies the top 16-bit content of the 32-bit elements of Rs1 with
+ * the top 16-bit content of the 32-bit elements of Rs2.
+ * The multiplication results are added with the 64-bit value of Rd. The 64-bit addition result is written
+ * back to Rd. The 16-bit values of Rs1 and Rs2, and the 64-bit value of Rd are treated as signed
+ * integers.
+ *
+ * **Operations**:\n
+ * ~~~
+ * RV32:
+ * Mres[31:0] = Rs1.H[0] * Rs2.H[0]; // SMALBB
+ * Mres[31:0] = Rs1.H[0] * Rs2.H[1]; // SMALBT
+ * Mres[31:0] = Rs1.H[1] * Rs2.H[1]; // SMALTT
+ * Idx0 = CONCAT(Rd(4,1),1'b0); Idx1 = CONCAT(Rd(4,1),1'b1);
+ * R[Idx1].R[Idx0] = R[Idx1].R[Idx0] + SE64(Mres[31:0]);
+ * RV64:
+ * // SMALBB
+ * Mres[0][31:0] = Rs1.W[0].H[0] * Rs2.W[0].H[0];
+ * Mres[1][31:0] = Rs1.W[1].H[0] * Rs2.W[1].H[0];
+ * // SMALBT
+ * Mres[0][31:0] = Rs1.W[0].H[0] * Rs2.W[0].H[1];
+ * Mres[1][31:0] = Rs1.W[1].H[0] * Rs2.W[1].H[1];
+ * // SMALTT
+ * Mres[0][31:0] = Rs1.W[0].H[1] * Rs2.W[0].H[1];
+ * Mres[1][31:0] = Rs1.W[1].H[1] * Rs2.W[1].H[1];
+ * Rd = Rd + SE64(Mres[0][31:0]) + SE64(Mres[1][31:0]);
+ * ~~~
+ *
+ * \param [in]  t    long long type of value stored in t
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long long type
+ */
+__STATIC_FORCEINLINE long long __RV_SMALBT(long long t, unsigned long a, unsigned long b)
+{
+    __ASM volatile("smalbt %0, %1, %2" : "+r"(t) : "r"(a), "r"(b)); /* "+r": t is read as the 64-bit accumulator and overwritten with the result */
+    return t; /* updated accumulator */
+}
+/* ===== Inline Function End for 3.105.2. SMALBT ===== */
+
+/* ===== Inline Function Start for 3.105.3. SMALTT ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_64B_ADDSUB
+ * \brief SMALTT (Signed Multiply Top Halfs & Add 64-bit)
+ * \details
+ * **Type**: DSP (64-bit Profile)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SMALBB Rd, Rs1, Rs2
+ * SMALBT Rd, Rs1, Rs2
+ * SMALTT Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the signed 16-bit content of the 32-bit elements of a register with the 16-bit
+ * content of the corresponding 32-bit elements of another register and add the results with a 64-bit
+ * value of an even/odd pair of registers (RV32) or a register (RV64). The addition result is written back
+ * to the register-pair (RV32) or the register (RV64).
+ * * SMALBB: rt pair + bottom*bottom (all 32-bit elements)
+ * * SMALBT: rt pair + bottom*top (all 32-bit elements)
+ * * SMALTT: rt pair + top*top (all 32-bit elements)
+ *
+ * **RV32 Description**:\n
+ * For the `SMALBB` instruction, it multiplies the bottom 16-bit content of Rs1 with the bottom 16-bit
+ * content of Rs2.
+ * For the `SMALBT` instruction, it multiplies the bottom 16-bit content of Rs1 with the top 16-bit
+ * content of Rs2.
+ * For the `SMALTT` instruction, it multiplies the top 16-bit content of Rs1 with the top 16-bit content
+ * of Rs2.
+ * The multiplication result is added with the 64-bit value of an even/odd pair of registers specified by
+ * Rd(4,1). The 64-bit addition result is written back to the register-pair. The 16-bit values of Rs1 and
+ * Rs2, and the 64-bit value of the register-pair are treated as signed integers.
+ * Rd(4,1), i.e., d, determines the even/odd pair group of the two registers. Specifically, the register pair
+ * includes register 2d and 2d+1.
+ * The odd `2d+1` register of the pair contains the high 32-bit of the operand and the even `2d`
+ * register of the pair contains the low 32-bit of the operand.
+ *
+ * **RV64 Description**:\n
+ * For the `SMALBB` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1
+ * with the bottom 16-bit content of the 32-bit elements of Rs2.
+ * For the `SMALBT` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1
+ * with the top 16-bit content of the 32-bit elements of Rs2.
+ * For the `SMALTT` instruction, it multiplies the top 16-bit content of the 32-bit elements of Rs1 with
+ * the top 16-bit content of the 32-bit elements of Rs2.
+ * The multiplication results are added with the 64-bit value of Rd. The 64-bit addition result is written
+ * back to Rd. The 16-bit values of Rs1 and Rs2, and the 64-bit value of Rd are treated as signed
+ * integers.
+ *
+ * **Operations**:\n
+ * ~~~
+ * RV32:
+ * Mres[31:0] = Rs1.H[0] * Rs2.H[0]; // SMALBB
+ * Mres[31:0] = Rs1.H[0] * Rs2.H[1]; // SMALBT
+ * Mres[31:0] = Rs1.H[1] * Rs2.H[1]; // SMALTT
+ * Idx0 = CONCAT(Rd(4,1),1'b0); Idx1 = CONCAT(Rd(4,1),1'b1);
+ * R[Idx1].R[Idx0] = R[Idx1].R[Idx0] + SE64(Mres[31:0]);
+ * RV64:
+ * // SMALBB
+ * Mres[0][31:0] = Rs1.W[0].H[0] * Rs2.W[0].H[0];
+ * Mres[1][31:0] = Rs1.W[1].H[0] * Rs2.W[1].H[0];
+ * // SMALBT
+ * Mres[0][31:0] = Rs1.W[0].H[0] * Rs2.W[0].H[1];
+ * Mres[1][31:0] = Rs1.W[1].H[0] * Rs2.W[1].H[1];
+ * // SMALTT
+ * Mres[0][31:0] = Rs1.W[0].H[1] * Rs2.W[0].H[1];
+ * Mres[1][31:0] = Rs1.W[1].H[1] * Rs2.W[1].H[1];
+ * Rd = Rd + SE64(Mres[0][31:0]) + SE64(Mres[1][31:0]);
+ * ~~~
+ *
+ * \param [in]  t    long long type of value stored in t
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long long type
+ */
+__STATIC_FORCEINLINE long long __RV_SMALTT(long long t, unsigned long a, unsigned long b)
+{
+    __ASM volatile("smaltt %0, %1, %2" : "+r"(t) : "r"(a), "r"(b)); /* "+r": t is read as the 64-bit accumulator and overwritten with the result */
+    return t; /* updated accumulator */
+}
+/* ===== Inline Function End for 3.105.3. SMALTT ===== */
+
+/* ===== Inline Function Start for 3.106.1. SMALDA ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_64B_ADDSUB
+ * \brief SMALDA (Signed Multiply Two Halfs and Two Adds 64-bit)
+ * \details
+ * **Type**: DSP (64-bit Profile)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SMALDA Rd, Rs1, Rs2
+ * SMALXDA Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do two signed 16-bit multiplications from the 32-bit elements of two registers, and then
+ * add the two 32-bit results and the 64-bit value of an even/odd pair of registers together.
+ * * SMALDA: rt pair + top*top + bottom*bottom (all 32-bit elements)
+ * * SMALXDA: rt pair + top*bottom + bottom*top (all 32-bit elements)
+ *
+ * **RV32 Description**:\n
+ * For the `SMALDA` instruction, it multiplies the bottom 16-bit content of Rs1 with the bottom 16-bit
+ * content of Rs2 and then adds the result to the result of multiplying the top 16-bit content of Rs1 with
+ * the top 16-bit content of Rs2 with unlimited precision.
+ * For the `SMALXDA` instruction, it multiplies the top 16-bit content of Rs1 with the bottom 16-bit
+ * content of Rs2 and then adds the result to the result of multiplying the bottom 16-bit content of Rs1
+ * with the top 16-bit content of Rs2 with unlimited precision.
+ * The result is added to the 64-bit value of an even/odd pair of registers specified by Rd(4,1). The 64-
+ * bit addition result is written back to the register-pair. The 16-bit values of Rs1 and Rs2, and the 64-
+ * bit value of the register-pair are treated as signed integers.
+ * Rd(4,1), i.e., d, determines the even/odd pair group of the two registers. Specifically, the register pair
+ * includes register 2d and 2d+1.
+ * The odd `2d+1` register of the pair contains the high 32-bit of the operand and the even `2d`
+ * register of the pair contains the low 32-bit of the operand.
+ *
+ * **RV64 Description**:\n
+ * For the `SMALDA` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1
+ * with the bottom 16-bit content of the 32-bit elements of Rs2 and then adds the result to the result of
+ * multiplying the top 16-bit content of the 32-bit elements of Rs1 with the top 16-bit content of the 32-
+ * bit elements of Rs2 with unlimited precision.
+ * For the `SMALXDA` instruction, it multiplies the top 16-bit content of the 32-bit elements of Rs1
+ * with the bottom 16-bit content of the 32-bit elements of Rs2 and then adds the result to the result of
+ * multiplying the bottom 16-bit content of the 32-bit elements of Rs1 with the top 16-bit content of the
+ * 32-bit elements of Rs2 with unlimited precision.
+ * The results are added to the 64-bit value of Rd. The 64-bit addition result is written back to Rd. The
+ * 16-bit values of Rs1 and Rs2, and the 64-bit value of Rd are treated as signed integers.
+ *
+ * **Operations**:\n
+ * ~~~
+ * RV32:
+ * // SMALDA
+ * Mres0[31:0] = (Rs1.H[0] * Rs2.H[0]);
+ * Mres1[31:0] = (Rs1.H[1] * Rs2.H[1]);
+ * // SMALXDA
+ * Mres0[31:0] = (Rs1.H[0] * Rs2.H[1]);
+ * Mres1[31:0] = (Rs1.H[1] * Rs2.H[0]);
+ * Idx0 = CONCAT(Rd(4,1),1'b0); Idx1 = CONCAT(Rd(4,1),1'b1);
+ * R[Idx1].R[Idx0] = R[Idx1].R[Idx0] + SE64(Mres0[31:0]) + SE64(Mres1[31:0]);
+ * RV64:
+ * // SMALDA
+ * Mres0[0][31:0] = (Rs1.W[0].H[0] * Rs2.W[0].H[0]);
+ * Mres1[0][31:0] = (Rs1.W[0].H[1] * Rs2.W[0].H[1]);
+ * Mres0[1][31:0] = (Rs1.W[1].H[0] * Rs2.W[1].H[0]);
+ * Mres1[1][31:0] = (Rs1.W[1].H[1] * Rs2.W[1].H[1]);
+ * // SMALXDA
+ * Mres0[0][31:0] = (Rs1.W[0].H[0] * Rs2.W[0].H[1]);
+ * Mres1[0][31:0] = (Rs1.W[0].H[1] * Rs2.W[0].H[0]);
+ * Mres0[1][31:0] = (Rs1.W[1].H[0] * Rs2.W[1].H[1]);
+ * Mres1[1][31:0] = (Rs1.W[1].H[1] * Rs2.W[1].H[0]);
+ * Rd = Rd + SE64(Mres0[0][31:0]) + SE64(Mres1[0][31:0]) + SE64(Mres0[1][31:0]) +
+ * SE64(Mres1[1][31:0]);
+ * ~~~
+ *
+ * \param [in]  t    long long type of value stored in t
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long long type
+ */
+__STATIC_FORCEINLINE long long __RV_SMALDA(long long t, unsigned long a, unsigned long b)
+{
+    __ASM volatile("smalda %0, %1, %2" : "+r"(t) : "r"(a), "r"(b)); /* "+r": t is read as the 64-bit accumulator and overwritten with the result */
+    return t; /* updated accumulator */
+}
+/* ===== Inline Function End for 3.106.1. SMALDA ===== */
+
+/* ===== Inline Function Start for 3.106.2. SMALXDA ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_64B_ADDSUB
+ * \brief SMALXDA (Signed Crossed Multiply Two Halfs and Two Adds 64-bit)
+ * \details
+ * **Type**: DSP (64-bit Profile)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SMALDA Rd, Rs1, Rs2
+ * SMALXDA Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do two signed 16-bit multiplications from the 32-bit elements of two registers, and then
+ * add the two 32-bit results and the 64-bit value of an even/odd pair of registers together.
+ * * SMALDA: rt pair + top*top + bottom*bottom (all 32-bit elements)
+ * * SMALXDA: rt pair + top*bottom + bottom*top (all 32-bit elements)
+ *
+ * **RV32 Description**:\n
+ * For the `SMALDA` instruction, it multiplies the bottom 16-bit content of Rs1 with the bottom 16-bit
+ * content of Rs2 and then adds the result to the result of multiplying the top 16-bit content of Rs1 with
+ * the top 16-bit content of Rs2 with unlimited precision.
+ * For the `SMALXDA` instruction, it multiplies the top 16-bit content of Rs1 with the bottom 16-bit
+ * content of Rs2 and then adds the result to the result of multiplying the bottom 16-bit content of Rs1
+ * with the top 16-bit content of Rs2 with unlimited precision.
+ * The result is added to the 64-bit value of an even/odd pair of registers specified by Rd(4,1). The 64-
+ * bit addition result is written back to the register-pair. The 16-bit values of Rs1 and Rs2, and the 64-
+ * bit value of the register-pair are treated as signed integers.
+ * Rd(4,1), i.e., d, determines the even/odd pair group of the two registers. Specifically, the register pair
+ * includes register 2d and 2d+1.
+ * The odd `2d+1` register of the pair contains the high 32-bit of the operand and the even `2d`
+ * register of the pair contains the low 32-bit of the operand.
+ *
+ * **RV64 Description**:\n
+ * For the `SMALDA` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1
+ * with the bottom 16-bit content of the 32-bit elements of Rs2 and then adds the result to the result of
+ * multiplying the top 16-bit content of the 32-bit elements of Rs1 with the top 16-bit content of the 32-
+ * bit elements of Rs2 with unlimited precision.
+ * For the `SMALXDA` instruction, it multiplies the top 16-bit content of the 32-bit elements of Rs1
+ * with the bottom 16-bit content of the 32-bit elements of Rs2 and then adds the result to the result of
+ * multiplying the bottom 16-bit content of the 32-bit elements of Rs1 with the top 16-bit content of the
+ * 32-bit elements of Rs2 with unlimited precision.
+ * The results are added to the 64-bit value of Rd. The 64-bit addition result is written back to Rd. The
+ * 16-bit values of Rs1 and Rs2, and the 64-bit value of Rd are treated as signed integers.
+ *
+ * **Operations**:\n
+ * ~~~
+ * RV32:
+ * // SMALDA
+ * Mres0[31:0] = (Rs1.H[0] * Rs2.H[0]);
+ * Mres1[31:0] = (Rs1.H[1] * Rs2.H[1]);
+ * // SMALXDA
+ * Mres0[31:0] = (Rs1.H[0] * Rs2.H[1]);
+ * Mres1[31:0] = (Rs1.H[1] * Rs2.H[0]);
+ * Idx0 = CONCAT(Rd(4,1),1'b0); Idx1 = CONCAT(Rd(4,1),1'b1);
+ * R[Idx1].R[Idx0] = R[Idx1].R[Idx0] + SE64(Mres0[31:0]) + SE64(Mres1[31:0]);
+ * RV64:
+ * // SMALDA
+ * Mres0[0][31:0] = (Rs1.W[0].H[0] * Rs2.W[0].H[0]);
+ * Mres1[0][31:0] = (Rs1.W[0].H[1] * Rs2.W[0].H[1]);
+ * Mres0[1][31:0] = (Rs1.W[1].H[0] * Rs2.W[1].H[0]);
+ * Mres1[1][31:0] = (Rs1.W[1].H[1] * Rs2.W[1].H[1]);
+ * // SMALXDA
+ * Mres0[0][31:0] = (Rs1.W[0].H[0] * Rs2.W[0].H[1]);
+ * Mres1[0][31:0] = (Rs1.W[0].H[1] * Rs2.W[0].H[0]);
+ * Mres0[1][31:0] = (Rs1.W[1].H[0] * Rs2.W[1].H[1]);
+ * Mres1[1][31:0] = (Rs1.W[1].H[1] * Rs2.W[1].H[0]);
+ * Rd = Rd + SE64(Mres0[0][31:0]) + SE64(Mres1[0][31:0]) + SE64(Mres0[1][31:0]) +
+ * SE64(Mres1[1][31:0]);
+ * ~~~
+ *
+ * \param [in]  t    long long type of value stored in t
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long long type
+ */
+__STATIC_FORCEINLINE long long __RV_SMALXDA(long long t, unsigned long a, unsigned long b)
+{
+    __ASM volatile("smalxda %0, %1, %2" : "+r"(t) : "r"(a), "r"(b)); /* "+r": t is read as the 64-bit accumulator and overwritten with the result */
+    return t; /* updated accumulator */
+}
+/* ===== Inline Function End for 3.106.2. SMALXDA ===== */
+
+/* ===== Inline Function Start for 3.107.1. SMALDS ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_64B_ADDSUB
+ * \brief SMALDS (Signed Multiply Two Halfs & Subtract & Add 64-bit)
+ * \details
+ * **Type**: DSP (64-bit Profile)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SMALDS Rd, Rs1, Rs2
+ * SMALDRS Rd, Rs1, Rs2
+ * SMALXDS Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do two signed 16-bit multiplications from the 32-bit elements of two registers; and then
+ * perform a subtraction operation between the two 32-bit results. Then add the subtraction result to
+ * the 64-bit value of an even/odd pair of registers (RV32) or a register (RV64). The addition result is
+ * written back to the register-pair.
+ * * SMALDS: rt pair + (top*top - bottom*bottom) (all 32-bit elements)
+ * * SMALDRS: rt pair + (bottom*bottom - top*top) (all 32-bit elements)
+ * * SMALXDS: rt pair + (top*bottom - bottom*top) (all 32-bit elements)
+ *
+ * **RV32 Description**:\n
+ * For the `SMALDS` instruction, it multiplies the bottom 16-bit content of Rs1 with the bottom 16-bit
+ * content of Rs2 and then subtracts the result from the result of multiplying the top 16-bit content of
+ * Rs1 with the top 16-bit content of Rs2.
+ * For the `SMALDRS` instruction, it multiplies the top 16-bit content of Rs1 with the top 16-bit content
+ * of Rs2 and then subtracts the result from the result of multiplying the bottom 16-bit content of Rs1
+ * with the bottom 16-bit content of Rs2.
+ * For the `SMALXDS` instruction, it multiplies the bottom 16-bit content of Rs1 with the top 16-bit
+ * content of Rs2 and then subtracts the result from the result of multiplying the top 16-bit content of
+ * Rs1 with the bottom 16-bit content of Rs2.
+ * The subtraction result is then added to the 64-bit value of an even/odd pair of registers specified by
+ * Rd(4,1). The 64-bit addition result is written back to the register-pair. The 16-bit values of Rs1 and
+ * Rs2, and the 64-bit value of the register-pair are treated as signed integers.
+ * Rd(4,1), i.e., d, determines the even/odd pair group of the two registers. Specifically, the register pair
+ * includes register 2d and 2d+1.
+ * The odd `2d+1` register of the pair contains the high 32-bit of the operand and the even `2d`
+ * register of the pair contains the low 32-bit of the operand.
+ *
+ * **RV64 Description**:\n
+ * For the `SMALDS` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1
+ * with the bottom 16-bit content of the 32-bit elements of Rs2 and then subtracts the result from the
+ * result of multiplying the top 16-bit content of the 32-bit elements of Rs1 with the top 16-bit content
+ * of the 32-bit elements of Rs2.
+ * For the `SMALDRS` instruction, it multiplies the top 16-bit content of the 32-bit elements of Rs1 with
+ * the top 16-bit content of the 32-bit elements of Rs2 and then subtracts the result from the result of
+ * multiplying the bottom 16-bit content of the 32-bit elements of Rs1 with the bottom 16-bit content of
+ * the 32-bit elements of Rs2.
+ * For the `SMALXDS` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1
+ * with the top 16-bit content of the 32-bit elements of Rs2 and then subtracts the result from the
+ * result of multiplying the top 16-bit content of the 32-bit elements of Rs1 with the bottom 16-bit
+ * content of the 32-bit elements of Rs2.
+ * The subtraction results are then added to the 64-bit value of Rd. The 64-bit addition result is written
+ * back to Rd. The 16-bit values of Rs1 and Rs2, and the 64-bit value of Rd are treated as signed
+ * integers.
+ *
+ * **Operations**:\n
+ * ~~~
+ * RV32:
+ * Mres[31:0] = (Rs1.H[1] * Rs2.H[1]) - (Rs1.H[0] * Rs2.H[0]); // SMALDS
+ * Mres[31:0] = (Rs1.H[0] * Rs2.H[0]) - (Rs1.H[1] * Rs2.H[1]); // SMALDRS
+ * Mres[31:0] = (Rs1.H[1] * Rs2.H[0]) - (Rs1.H[0] * Rs2.H[1]); // SMALXDS
+ * Idx0 = CONCAT(Rd(4,1),1'b0); Idx1 = CONCAT(Rd(4,1),1'b1);
+ * R[Idx1].R[Idx0] = R[Idx1].R[Idx0] + SE64(Mres[31:0]);
+ * RV64:
+ * // SMALDS
+ * Mres[0][31:0] = (Rs1.W[0].H[1] * Rs2.W[0].H[1]) - (Rs1.W[0].H[0] * Rs2.W[0].H[0]);
+ * Mres[1][31:0] = (Rs1.W[1].H[1] * Rs2.W[1].H[1]) - (Rs1.W[1].H[0] * Rs2.W[1].H[0]);
+ * // SMALDRS
+ * Mres[0][31:0] = (Rs1.W[0].H[0] * Rs2.W[0].H[0]) - (Rs1.W[0].H[1] * Rs2.W[0].H[1]);
+ * Mres[1][31:0] = (Rs1.W[1].H[0] * Rs2.W[1].H[0]) - (Rs1.W[1].H[1] * Rs2.W[1].H[1]);
+ * // SMALXDS
+ * Mres[0][31:0] = (Rs1.W[0].H[1] * Rs2.W[0].H[0]) - (Rs1.W[0].H[0] * Rs2.W[0].H[1]);
+ * Mres[1][31:0] = (Rs1.W[1].H[1] * Rs2.W[1].H[0]) - (Rs1.W[1].H[0] * Rs2.W[1].H[1]);
+ * Rd = Rd + SE64(Mres[0][31:0]) + SE64(Mres[1][31:0]);
+ * ~~~
+ *
+ * \param [in]  t    long long type of value stored in t
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long long type
+ */
+__STATIC_FORCEINLINE long long __RV_SMALDS(long long t, unsigned long a, unsigned long b)
+{
+    __ASM volatile("smalds %0, %1, %2" : "+r"(t) : "r"(a), "r"(b)); /* %0 = t (Rd), %1 = a (Rs1), %2 = b (Rs2); "+r" makes t both input and output */
+    return t; /* t as rewritten by the instruction */
+}
+/* ===== Inline Function End for 3.107.1. SMALDS ===== */
+
+/* ===== Inline Function Start for 3.107.2. SMALDRS ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_64B_ADDSUB
+ * \brief SMALDRS (Signed Multiply Two Halfs & Reverse Subtract & Add 64-bit)
+ * \details
+ * **Type**: DSP (64-bit Profile)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SMALDS Rd, Rs1, Rs2
+ * SMALDRS Rd, Rs1, Rs2
+ * SMALXDS Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do two signed 16-bit multiplications from the 32-bit elements of two registers; and then
+ * perform a subtraction operation between the two 32-bit results. Then add the subtraction result to
+ * the 64-bit value of an even/odd pair of registers (RV32) or a register (RV64). The addition result is
+ * written back to the register-pair.
+ * * SMALDS: rt pair + (top*top - bottom*bottom) (all 32-bit elements)
+ * * SMALDRS: rt pair + (bottom*bottom - top*top) (all 32-bit elements)
+ * * SMALXDS: rt pair + (top*bottom - bottom*top) (all 32-bit elements)
+ *
+ * **RV32 Description**:\n
+ * For the `SMALDS` instruction, it multiplies the bottom 16-bit content of Rs1 with the bottom 16-bit
+ * content of Rs2 and then subtracts the result from the result of multiplying the top 16-bit content of
+ * Rs1 with the top 16-bit content of Rs2.
+ * For the `SMALDRS` instruction, it multiplies the top 16-bit content of Rs1 with the top 16-bit content
+ * of Rs2 and then subtracts the result from the result of multiplying the bottom 16-bit content of Rs1
+ * with the bottom 16-bit content of Rs2.
+ * For the `SMALXDS` instruction, it multiplies the bottom 16-bit content of Rs1 with the top 16-bit
+ * content of Rs2 and then subtracts the result from the result of multiplying the top 16-bit content of
+ * Rs1 with the bottom 16-bit content of Rs2.
+ * The subtraction result is then added to the 64-bit value of an even/odd pair of registers specified by
+ * Rd(4,1). The 64-bit addition result is written back to the register-pair. The 16-bit values of Rs1 and
+ * Rs2, and the 64-bit value of the register-pair are treated as signed integers.
+ * Rd(4,1), i.e., d, determines the even/odd pair group of the two registers. Specifically, the register pair
+ * includes register 2d and 2d+1.
+ * The odd `2d+1` register of the pair contains the high 32-bit of the operand and the even `2d`
+ * register of the pair contains the low 32-bit of the operand.
+ *
+ * **RV64 Description**:\n
+ * For the `SMALDS` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1
+ * with the bottom 16-bit content of the 32-bit elements of Rs2 and then subtracts the result from the
+ * result of multiplying the top 16-bit content of the 32-bit elements of Rs1 with the top 16-bit content
+ * of the 32-bit elements of Rs2.
+ * For the `SMALDRS` instruction, it multiplies the top 16-bit content of the 32-bit elements of Rs1 with
+ * the top 16-bit content of the 32-bit elements of Rs2 and then subtracts the result from the result of
+ * multiplying the bottom 16-bit content of the 32-bit elements of Rs1 with the bottom 16-bit content of
+ * the 32-bit elements of Rs2.
+ * For the `SMALXDS` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1
+ * with the top 16-bit content of the 32-bit elements of Rs2 and then subtracts the result from the
+ * result of multiplying the top 16-bit content of the 32-bit elements of Rs1 with the bottom 16-bit
+ * content of the 32-bit elements of Rs2.
+ * The subtraction results are then added to the 64-bit value of Rd. The 64-bit addition result is written
+ * back to Rd. The 16-bit values of Rs1 and Rs2, and the 64-bit value of Rd are treated as signed
+ * integers.
+ *
+ * **Operations**:\n
+ * ~~~
+ * * RV32:
+ * Mres[31:0] = (Rs1.H[1] * Rs2.H[1]) - (Rs1.H[0] * Rs2.H[0]); // SMALDS
+ * Mres[31:0] = (Rs1.H[0] * Rs2.H[0]) - (Rs1.H[1] * Rs2.H[1]); // SMALDRS
+ * Mres[31:0] = (Rs1.H[1] * Rs2.H[0]) - (Rs1.H[0] * Rs2.H[1]); // SMALXDS
+ * Idx0 = CONCAT(Rd(4,1),1'b0); Idx1 = CONCAT(Rd(4,1),1'b1);
+ * R[Idx1].R[Idx0] = R[Idx1].R[Idx0] + SE64(Mres[31:0]);
+ * * RV64:
+ * // SMALDS
+ * Mres[0][31:0] = (Rs1.W[0].H[1] * Rs2.W[0].H[1]) - (Rs1.W[0].H[0] * Rs2.W[0].H[0]);
+ * Mres[1][31:0] = (Rs1.W[1].H[1] * Rs2.W[1].H[1]) - (Rs1.W[1].H[0] * Rs2.W[1].H[0]);
+ * // SMALDRS
+ * Mres[0][31:0] = (Rs1.W[0].H[0] * Rs2.W[0].H[0]) - (Rs1.W[0].H[1] * Rs2.W[0].H[1]);
+ * Mres[1][31:0] = (Rs1.W[1].H[0] * Rs2.W[1].H[0]) - (Rs1.W[1].H[1] * Rs2.W[1].H[1]);
+ * // SMALXDS
+ * Mres[0][31:0] = (Rs1.W[0].H[1] * Rs2.W[0].H[0]) - (Rs1.W[0].H[0] * Rs2.W[0].H[1]);
+ * Mres[1][31:0] = (Rs1.W[1].H[1] * Rs2.W[1].H[0]) - (Rs1.W[1].H[0] * Rs2.W[1].H[1]);
+ * Rd = Rd + SE64(Mres[0][31:0]) + SE64(Mres[1][31:0]);
+ * ~~~
+ *
+ * \param [in]  t    long long type of value stored in t
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long long type
+ */
+__STATIC_FORCEINLINE long long __RV_SMALDRS(long long t, unsigned long a, unsigned long b)
+{
+    __ASM volatile("smaldrs %0, %1, %2" : "+r"(t) : "r"(a), "r"(b)); /* %0 = t (Rd), %1 = a (Rs1), %2 = b (Rs2); "+r" makes t both input and output */
+    return t; /* t as rewritten by the instruction */
+}
+/* ===== Inline Function End for 3.107.2. SMALDRS ===== */
+
+/* ===== Inline Function Start for 3.107.3. SMALXDS ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_64B_ADDSUB
+ * \brief SMALXDS (Signed Crossed Multiply Two Halfs & Subtract & Add 64-bit)
+ * \details
+ * **Type**: DSP (64-bit Profile)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SMALDS Rd, Rs1, Rs2
+ * SMALDRS Rd, Rs1, Rs2
+ * SMALXDS Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do two signed 16-bit multiplications from the 32-bit elements of two registers; and then
+ * perform a subtraction operation between the two 32-bit results. Then add the subtraction result to
+ * the 64-bit value of an even/odd pair of registers (RV32) or a register (RV64). The addition result is
+ * written back to the register-pair.
+ * * SMALDS: rt pair + (top*top - bottom*bottom) (all 32-bit elements)
+ * * SMALDRS: rt pair + (bottom*bottom - top*top) (all 32-bit elements)
+ * * SMALXDS: rt pair + (top*bottom - bottom*top) (all 32-bit elements)
+ *
+ * **RV32 Description**:\n
+ * For the `SMALDS` instruction, it multiplies the bottom 16-bit content of Rs1 with the bottom 16-bit
+ * content of Rs2 and then subtracts the result from the result of multiplying the top 16-bit content of
+ * Rs1 with the top 16-bit content of Rs2.
+ * For the `SMALDRS` instruction, it multiplies the top 16-bit content of Rs1 with the top 16-bit content
+ * of Rs2 and then subtracts the result from the result of multiplying the bottom 16-bit content of Rs1
+ * with the bottom 16-bit content of Rs2.
+ * For the `SMALXDS` instruction, it multiplies the bottom 16-bit content of Rs1 with the top 16-bit
+ * content of Rs2 and then subtracts the result from the result of multiplying the top 16-bit content of
+ * Rs1 with the bottom 16-bit content of Rs2.
+ * The subtraction result is then added to the 64-bit value of an even/odd pair of registers specified by
+ * Rd(4,1). The 64-bit addition result is written back to the register-pair. The 16-bit values of Rs1 and
+ * Rs2, and the 64-bit value of the register-pair are treated as signed integers.
+ * Rd(4,1), i.e., d, determines the even/odd pair group of the two registers. Specifically, the register pair
+ * includes register 2d and 2d+1.
+ * The odd `2d+1` register of the pair contains the high 32-bit of the operand and the even `2d`
+ * register of the pair contains the low 32-bit of the operand.
+ *
+ * **RV64 Description**:\n
+ * For the `SMALDS` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1
+ * with the bottom 16-bit content of the 32-bit elements of Rs2 and then subtracts the result from the
+ * result of multiplying the top 16-bit content of the 32-bit elements of Rs1 with the top 16-bit content
+ * of the 32-bit elements of Rs2.
+ * For the `SMALDRS` instruction, it multiplies the top 16-bit content of the 32-bit elements of Rs1 with
+ * the top 16-bit content of the 32-bit elements of Rs2 and then subtracts the result from the result of
+ * multiplying the bottom 16-bit content of the 32-bit elements of Rs1 with the bottom 16-bit content of
+ * the 32-bit elements of Rs2.
+ * For the `SMALXDS` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1
+ * with the top 16-bit content of the 32-bit elements of Rs2 and then subtracts the result from the
+ * result of multiplying the top 16-bit content of the 32-bit elements of Rs1 with the bottom 16-bit
+ * content of the 32-bit elements of Rs2.
+ * The subtraction results are then added to the 64-bit value of Rd. The 64-bit addition result is written
+ * back to Rd. The 16-bit values of Rs1 and Rs2, and the 64-bit value of Rd are treated as signed
+ * integers.
+ *
+ * **Operations**:\n
+ * ~~~
+ * * RV32:
+ * Mres[31:0] = (Rs1.H[1] * Rs2.H[1]) - (Rs1.H[0] * Rs2.H[0]); // SMALDS
+ * Mres[31:0] = (Rs1.H[0] * Rs2.H[0]) - (Rs1.H[1] * Rs2.H[1]); // SMALDRS
+ * Mres[31:0] = (Rs1.H[1] * Rs2.H[0]) - (Rs1.H[0] * Rs2.H[1]); // SMALXDS
+ * Idx0 = CONCAT(Rd(4,1),1'b0); Idx1 = CONCAT(Rd(4,1),1'b1);
+ * R[Idx1].R[Idx0] = R[Idx1].R[Idx0] + SE64(Mres[31:0]);
+ * * RV64:
+ * // SMALDS
+ * Mres[0][31:0] = (Rs1.W[0].H[1] * Rs2.W[0].H[1]) - (Rs1.W[0].H[0] * Rs2.W[0].H[0]);
+ * Mres[1][31:0] = (Rs1.W[1].H[1] * Rs2.W[1].H[1]) - (Rs1.W[1].H[0] * Rs2.W[1].H[0]);
+ * // SMALDRS
+ * Mres[0][31:0] = (Rs1.W[0].H[0] * Rs2.W[0].H[0]) - (Rs1.W[0].H[1] * Rs2.W[0].H[1]);
+ * Mres[1][31:0] = (Rs1.W[1].H[0] * Rs2.W[1].H[0]) - (Rs1.W[1].H[1] * Rs2.W[1].H[1]);
+ * // SMALXDS
+ * Mres[0][31:0] = (Rs1.W[0].H[1] * Rs2.W[0].H[0]) - (Rs1.W[0].H[0] * Rs2.W[0].H[1]);
+ * Mres[1][31:0] = (Rs1.W[1].H[1] * Rs2.W[1].H[0]) - (Rs1.W[1].H[0] * Rs2.W[1].H[1]);
+ * Rd = Rd + SE64(Mres[0][31:0]) + SE64(Mres[1][31:0]);
+ * ~~~
+ *
+ * \param [in]  t    long long type of value stored in t
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long long type
+ */
+__STATIC_FORCEINLINE long long __RV_SMALXDS(long long t, unsigned long a, unsigned long b)
+{
+    __ASM volatile("smalxds %0, %1, %2" : "+r"(t) : "r"(a), "r"(b)); /* %0 = t (Rd), %1 = a (Rs1), %2 = b (Rs2); "+r" makes t both input and output */
+    return t; /* t as rewritten by the instruction */
+}
+/* ===== Inline Function End for 3.107.3. SMALXDS ===== */
+
+/* ===== Inline Function Start for 3.108. SMAR64 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_32B_MULT_64B_ADDSUB
+ * \brief SMAR64 (Signed Multiply and Add to 64-Bit Data)
+ * \details
+ * **Type**: DSP (64-bit Profile)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SMAR64 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the 32-bit signed elements in two registers and add the 64-bit multiplication
+ * result to the 64-bit signed data of a pair of registers (RV32) or a register (RV64). The result is written
+ * back to the pair of registers (RV32) or a register (RV64).
+ *
+ * **RV32 Description**:\n
+ * This instruction multiplies the 32-bit signed data of Rs1 with that of Rs2. It adds
+ * the 64-bit multiplication result to the 64-bit signed data of an even/odd pair of registers specified by
+ * Rd(4,1). The addition result is written back to the even/odd pair of registers specified by Rd(4,1).
+ * Rd(4,1), i.e., d, determines the even/odd pair group of two registers. Specifically, the register pair
+ * includes register 2d and 2d+1.
+ * The odd `2d+1` register of the pair contains the high 32-bit of the result and the even `2d` register
+ * of the pair contains the low 32-bit of the result.
+ *
+ * **RV64 Description**:\n
+ * This instruction multiplies the 32-bit signed elements of Rs1 with that of Rs2. It
+ * adds the 64-bit multiplication results to the 64-bit signed data of Rd. The addition result is written
+ * back to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * * RV32:
+ * t_L = CONCAT(Rd(4,1),1'b0); t_H = CONCAT(Rd(4,1),1'b1);
+ * R[t_H].R[t_L] = R[t_H].R[t_L] + (Rs1 * Rs2);
+ * * RV64:
+ * Rd = Rd + (Rs1.W[0] * Rs2.W[0]) + (Rs1.W[1] * Rs2.W[1]);
+ * ~~~
+ *
+ * \param [in]  t    long long type of value stored in t
+ * \param [in]  a    long type of value stored in a
+ * \param [in]  b    long type of value stored in b
+ * \return value stored in long long type
+ */
+__STATIC_FORCEINLINE long long __RV_SMAR64(long long t, long a, long b)
+{
+    __ASM volatile("smar64 %0, %1, %2" : "+r"(t) : "r"(a), "r"(b)); /* %0 = t (Rd), %1 = a (Rs1), %2 = b (Rs2); "+r" makes t both input and output */
+    return t; /* t as rewritten by the instruction */
+}
+/* ===== Inline Function End for 3.108. SMAR64 ===== */
+
+/* ===== Inline Function Start for 3.109. SMAQA ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_8B_MULT_32B_ADD
+ * \brief SMAQA (Signed Multiply Four Bytes with 32-bit Adds)
+ * \details
+ * **Type**: Partial-SIMD (Reduction)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SMAQA Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do four signed 8-bit multiplications from 32-bit chunks of two registers; and then adds
+ * the four 16-bit results and the content of corresponding 32-bit chunks of a third register together.
+ *
+ * **Description**:\n
+ * This instruction multiplies the four signed 8-bit elements of 32-bit chunks of Rs1 with the four
+ * signed 8-bit elements of 32-bit chunks of Rs2 and then adds the four results together with the signed
+ * content of the corresponding 32-bit chunks of Rd. The final results are written back to the
+ * corresponding 32-bit chunks in Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * res[x] = Rd.W[x] +
+ *    (Rs1.W[x].B[3] s* Rs2.W[x].B[3]) + (Rs1.W[x].B[2] s* Rs2.W[x].B[2]) +
+ *    (Rs1.W[x].B[1] s* Rs2.W[x].B[1]) + (Rs1.W[x].B[0] s* Rs2.W[x].B[0]);
+ * Rd.W[x] = res[x];
+ * for RV32: x=0,
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  t    long type of value stored in t
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_SMAQA(long t, unsigned long a, unsigned long b)
+{
+    __ASM volatile("smaqa %0, %1, %2" : "+r"(t) : "r"(a), "r"(b)); /* %0 = t (Rd), %1 = a (Rs1), %2 = b (Rs2); "+r" makes t both input and output */
+    return t; /* t as rewritten by the instruction */
+}
+/* ===== Inline Function End for 3.109. SMAQA ===== */
+
+/* ===== Inline Function Start for 3.110. SMAQA.SU ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_8B_MULT_32B_ADD
+ * \brief SMAQA.SU (Signed and Unsigned Multiply Four Bytes with 32-bit Adds)
+ * \details
+ * **Type**: Partial-SIMD (Reduction)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SMAQA.SU Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do four `signed x unsigned` 8-bit multiplications from 32-bit chunks of two registers; and
+ * then adds the four 16-bit results and the content of corresponding 32-bit chunks of a third register
+ * together.
+ *
+ * **Description**:\n
+ * This instruction multiplies the four signed 8-bit elements of 32-bit chunks of Rs1 with the four
+ * unsigned 8-bit elements of 32-bit chunks of Rs2 and then adds the four results together with the
+ * signed content of the corresponding 32-bit chunks of Rd. The final results are written back to the
+ * corresponding 32-bit chunks in Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * res[x] = Rd.W[x] +
+ *    (Rs1.W[x].B[3] su* Rs2.W[x].B[3]) + (Rs1.W[x].B[2] su* Rs2.W[x].B[2]) +
+ *    (Rs1.W[x].B[1] su* Rs2.W[x].B[1]) + (Rs1.W[x].B[0] su* Rs2.W[x].B[0]);
+ * Rd.W[x] = res[x];
+ * for RV32: x=0,
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  t    long type of value stored in t
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_SMAQA_SU(long t, unsigned long a, unsigned long b)
+{
+    __ASM volatile("smaqa.su %0, %1, %2" : "+r"(t) : "r"(a), "r"(b)); /* %0 = t (Rd), %1 = a (Rs1), %2 = b (Rs2); "+r" makes t both input and output */
+    return t; /* t as rewritten by the instruction */
+}
+/* ===== Inline Function End for 3.110. SMAQA.SU ===== */
+
+/* ===== Inline Function Start for 3.111. SMAX8 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_MISC
+ * \brief SMAX8 (SIMD 8-bit Signed Maximum)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SMAX8 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 8-bit signed integer elements finding maximum operations simultaneously.
+ *
+ * **Description**:\n
+ * This instruction compares the 8-bit signed integer elements in Rs1 with the 8-bit
+ * signed integer elements in Rs2 and selects the number that is greater than the other one. The
+ * selected results are written to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.B[x] = (Rs1.B[x] > Rs2.B[x])? Rs1.B[x] : Rs2.B[x];
+ * for RV32: x=3...0,
+ * for RV64: x=7...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_SMAX8(unsigned long a, unsigned long b)
+{
+    register unsigned long result; /* left uninitialized on purpose: "=r" below makes it a write-only output */
+    __ASM volatile("smax8 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); /* %0 = result (Rd), %1 = a (Rs1), %2 = b (Rs2) */
+    return result; /* value the instruction placed in Rd */
+}
+/* ===== Inline Function End for 3.111. SMAX8 ===== */
+
+/* ===== Inline Function Start for 3.112. SMAX16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_MISC
+ * \brief SMAX16 (SIMD 16-bit Signed Maximum)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SMAX16 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 16-bit signed integer elements finding maximum operations simultaneously.
+ *
+ * **Description**:\n
+ * This instruction compares the 16-bit signed integer elements in Rs1 with the 16-bit
+ * signed integer elements in Rs2 and selects the number that is greater than the other one. The
+ * selected results are written to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.H[x] = (Rs1.H[x] > Rs2.H[x])? Rs1.H[x] : Rs2.H[x];
+ * for RV32: x=1...0,
+ * for RV64: x=3...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_SMAX16(unsigned long a, unsigned long b)
+{
+    register unsigned long result; /* left uninitialized on purpose: "=r" below makes it a write-only output */
+    __ASM volatile("smax16 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); /* %0 = result (Rd), %1 = a (Rs1), %2 = b (Rs2) */
+    return result; /* value the instruction placed in Rd */
+}
+/* ===== Inline Function End for 3.112. SMAX16 ===== */
+
+/* ===== Inline Function Start for 3.113.1. SMBB16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_32B_ADDSUB
+ * \brief SMBB16 (SIMD Signed Multiply Bottom Half & Bottom Half)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SMBB16 Rd, Rs1, Rs2
+ * SMBT16 Rd, Rs1, Rs2
+ * SMTT16 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the signed 16-bit content of the 32-bit elements of a register with the signed
+ * 16-bit content of the 32-bit elements of another register and write the result to a third register.
+ * * SMBB16: W[x].bottom * W[x].bottom
+ * * SMBT16: W[x].bottom * W[x].top
+ * * SMTT16: W[x].top * W[x].top
+ *
+ * **Description**:\n
+ * For the `SMBB16` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1
+ * with the bottom 16-bit content of the 32-bit elements of Rs2.
+ * For the `SMBT16` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1
+ * with the top 16-bit content of the 32-bit elements of Rs2.
+ * For the `SMTT16` instruction, it multiplies the top 16-bit content of the 32-bit elements of Rs1 with
+ * the top 16-bit content of the 32-bit elements of Rs2.
+ * The multiplication results are written to Rd. The 16-bit contents of Rs1 and Rs2 are treated as signed
+ * integers.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.W[x] = Rs1.W[x].H[0] * Rs2.W[x].H[0]; // SMBB16
+ * Rd.W[x] = Rs1.W[x].H[0] * Rs2.W[x].H[1]; // SMBT16
+ * Rd.W[x] = Rs1.W[x].H[1] * Rs2.W[x].H[1]; // SMTT16
+ * for RV32: x=0,
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_SMBB16(unsigned long a, unsigned long b)
+{
+    register long result; /* left uninitialized on purpose: "=r" below makes it a write-only output */
+    __ASM volatile("smbb16 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); /* %0 = result (Rd), %1 = a (Rs1), %2 = b (Rs2) */
+    return result; /* value the instruction placed in Rd */
+}
+/* ===== Inline Function End for 3.113.1. SMBB16 ===== */
+
+/* ===== Inline Function Start for 3.113.2. SMBT16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_32B_ADDSUB
+ * \brief SMBT16 (SIMD Signed Multiply Bottom Half & Top Half)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SMBB16 Rd, Rs1, Rs2
+ * SMBT16 Rd, Rs1, Rs2
+ * SMTT16 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the signed 16-bit content of the 32-bit elements of a register with the signed
+ * 16-bit content of the 32-bit elements of another register and write the result to a third register.
+ * * SMBB16: W[x].bottom * W[x].bottom
+ * * SMBT16: W[x].bottom * W[x].top
+ * * SMTT16: W[x].top * W[x].top
+ *
+ * **Description**:\n
+ * For the `SMBB16` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1
+ * with the bottom 16-bit content of the 32-bit elements of Rs2.
+ * For the `SMBT16` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1
+ * with the top 16-bit content of the 32-bit elements of Rs2.
+ * For the `SMTT16` instruction, it multiplies the top 16-bit content of the 32-bit elements of Rs1 with
+ * the top 16-bit content of the 32-bit elements of Rs2.
+ * The multiplication results are written to Rd. The 16-bit contents of Rs1 and Rs2 are treated as signed
+ * integers.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.W[x] = Rs1.W[x].H[0] * Rs2.W[x].H[0]; // SMBB16
+ * Rd.W[x] = Rs1.W[x].H[0] * Rs2.W[x].H[1]; // SMBT16
+ * Rd.W[x] = Rs1.W[x].H[1] * Rs2.W[x].H[1]; // SMTT16
+ * for RV32: x=0,
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_SMBT16(unsigned long a, unsigned long b)
+{
+    register long result; /* left uninitialized on purpose: "=r" below makes it a write-only output */
+    __ASM volatile("smbt16 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); /* %0 = result (Rd), %1 = a (Rs1), %2 = b (Rs2) */
+    return result; /* value the instruction placed in Rd */
+}
+/* ===== Inline Function End for 3.113.2. SMBT16 ===== */
+
+/* ===== Inline Function Start for 3.113.3. SMTT16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_32B_ADDSUB
+ * \brief SMTT16 (SIMD Signed Multiply Top Half & Top Half)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SMBB16 Rd, Rs1, Rs2
+ * SMBT16 Rd, Rs1, Rs2
+ * SMTT16 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the signed 16-bit content of the 32-bit elements of a register with the signed
+ * 16-bit content of the 32-bit elements of another register and write the result to a third register.
+ * * SMBB16: W[x].bottom * W[x].bottom
+ * * SMBT16: W[x].bottom * W[x].top
+ * * SMTT16: W[x].top * W[x].top
+ *
+ * **Description**:\n
+ * For the `SMBB16` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1
+ * with the bottom 16-bit content of the 32-bit elements of Rs2.
+ * For the `SMBT16` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1
+ * with the top 16-bit content of the 32-bit elements of Rs2.
+ * For the `SMTT16` instruction, it multiplies the top 16-bit content of the 32-bit elements of Rs1 with
+ * the top 16-bit content of the 32-bit elements of Rs2.
+ * The multiplication results are written to Rd. The 16-bit contents of Rs1 and Rs2 are treated as signed
+ * integers.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.W[x] = Rs1.W[x].H[0] * Rs2.W[x].H[0]; // SMBB16
+ * Rd.W[x] = Rs1.W[x].H[0] * Rs2.W[x].H[1]; // SMBT16
+ * Rd.W[x] = Rs1.W[x].H[1] * Rs2.W[x].H[1]; // SMTT16
+ * for RV32: x=0,
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_SMTT16(unsigned long a, unsigned long b)
+{
+    register long result; /* left uninitialized on purpose: "=r" below makes it a write-only output */
+    __ASM volatile("smtt16 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); /* %0 = result (Rd), %1 = a (Rs1), %2 = b (Rs2) */
+    return result; /* value the instruction placed in Rd */
+}
+/* ===== Inline Function End for 3.113.3. SMTT16 ===== */
+
+/* ===== Inline Function Start for 3.114.1. SMDS ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_32B_ADDSUB
+ * \brief SMDS (SIMD Signed Multiply Two Halfs and Subtract)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SMDS Rd, Rs1, Rs2
+ * SMDRS Rd, Rs1, Rs2
+ * SMXDS Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do two signed 16-bit multiplications from the 32-bit elements of two registers; and then
+ * perform a subtraction operation between the two 32-bit results.
+ * * SMDS: top*top - bottom*bottom (per 32-bit element)
+ * * SMDRS: bottom*bottom - top*top (per 32-bit element)
+ * * SMXDS: top*bottom - bottom*top (per 32-bit element)
+ *
+ * **Description**:\n
+ * For the `SMDS` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1 with
+ * the bottom 16-bit content of the 32-bit elements of Rs2 and then subtracts the result from the result
+ * of multiplying the top 16-bit content of the 32-bit elements of Rs1 with the top 16-bit content of the
+ * 32-bit elements of Rs2.
+ * For the `SMDRS` instruction, it multiplies the top 16-bit content of the 32-bit elements of Rs1 with
+ * the top 16-bit content of the 32-bit elements of Rs2 and then subtracts the result from the result of
+ * multiplying the bottom 16-bit content of the 32-bit elements of Rs1 with the bottom 16-bit content of
+ * the 32-bit elements of Rs2.
+ * For the `SMXDS` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1
+ * with the top 16-bit content of the 32-bit elements of Rs2 and then subtracts the result from the
+ * result of multiplying the top 16-bit content of the 32-bit elements of Rs1 with the bottom 16-bit
+ * content of the 32-bit elements of Rs2.
+ * The subtraction result is written to the corresponding 32-bit element of Rd. The 16-bit contents of
+ * multiplication are treated as signed integers.
+ *
+ * **Operations**:\n
+ * ~~~
+ * * SMDS:
+ * Rd.W[x] = (Rs1.W[x].H[1] * Rs2.W[x].H[1]) - (Rs1.W[x].H[0] * Rs2.W[x].H[0]);
+ * * SMDRS:
+ * Rd.W[x] = (Rs1.W[x].H[0] * Rs2.W[x].H[0]) - (Rs1.W[x].H[1] * Rs2.W[x].H[1]);
+ * * SMXDS:
+ * Rd.W[x] = (Rs1.W[x].H[1] * Rs2.W[x].H[0]) - (Rs1.W[x].H[0] * Rs2.W[x].H[1]);
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_SMDS(unsigned long a, unsigned long b)
+{
+    register long result; /* left uninitialized on purpose: "=r" below makes it a write-only output */
+    __ASM volatile("smds %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); /* %0 = result (Rd), %1 = a (Rs1), %2 = b (Rs2) */
+    return result; /* value the instruction placed in Rd */
+}
+/* ===== Inline Function End for 3.114.1. SMDS ===== */
+
+/* ===== Inline Function Start for 3.114.2. SMDRS ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_32B_ADDSUB
+ * \brief SMDRS (SIMD Signed Multiply Two Halfs and Reverse Subtract)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SMDS Rd, Rs1, Rs2
+ * SMDRS Rd, Rs1, Rs2
+ * SMXDS Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do two signed 16-bit multiplications from the 32-bit elements of two registers; and then
+ * perform a subtraction operation between the two 32-bit results.
+ * * SMDS: top*top - bottom*bottom (per 32-bit element)
+ * * SMDRS: bottom*bottom - top*top (per 32-bit element)
+ * * SMXDS: top*bottom - bottom*top (per 32-bit element)
+ *
+ * **Description**:\n
+ * For the `SMDS` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1 with
+ * the bottom 16-bit content of the 32-bit elements of Rs2 and then subtracts the result from the result
+ * of multiplying the top 16-bit content of the 32-bit elements of Rs1 with the top 16-bit content of the
+ * 32-bit elements of Rs2.
+ * For the `SMDRS` instruction, it multiplies the top 16-bit content of the 32-bit elements of Rs1 with
+ * the top 16-bit content of the 32-bit elements of Rs2 and then subtracts the result from the result of
+ * multiplying the bottom 16-bit content of the 32-bit elements of Rs1 with the bottom 16-bit content of
+ * the 32-bit elements of Rs2.
+ * For the `SMXDS` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1
+ * with the top 16-bit content of the 32-bit elements of Rs2 and then subtracts the result from the
+ * result of multiplying the top 16-bit content of the 32-bit elements of Rs1 with the bottom 16-bit
+ * content of the 32-bit elements of Rs2.
+ * The subtraction result is written to the corresponding 32-bit element of Rd. The 16-bit contents of
+ * multiplication are treated as signed integers.
+ *
+ * **Operations**:\n
+ * ~~~
+ * * SMDS:
+ * Rd.W[x] = (Rs1.W[x].H[1] * Rs2.W[x].H[1]) - (Rs1.W[x].H[0] * Rs2.W[x].H[0]);
+ * * SMDRS:
+ * Rd.W[x] = (Rs1.W[x].H[0] * Rs2.W[x].H[0]) - (Rs1.W[x].H[1] * Rs2.W[x].H[1]);
+ * * SMXDS:
+ * Rd.W[x] = (Rs1.W[x].H[1] * Rs2.W[x].H[0]) - (Rs1.W[x].H[0] * Rs2.W[x].H[1]);
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_SMDRS(unsigned long a, unsigned long b)
+{
+    register long result; /* left uninitialized on purpose: "=r" below makes it a write-only output */
+    __ASM volatile("smdrs %0, %1, %2" : "=r"(result) : "r"(a), "r"(b)); /* %0 = result (Rd), %1 = a (Rs1), %2 = b (Rs2) */
+    return result; /* value the instruction placed in Rd */
+}
+/* ===== Inline Function End for 3.114.2. SMDRS ===== */
+
+/* ===== Inline Function Start for 3.114.3. SMXDS ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_32B_ADDSUB
+ * \brief SMXDS (SIMD Signed Crossed Multiply Two Halfs and Subtract)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SMDS Rd, Rs1, Rs2
+ * SMDRS Rd, Rs1, Rs2
+ * SMXDS Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do two signed 16-bit multiplications from the 32-bit elements of two registers; and then
+ * perform a subtraction operation between the two 32-bit results.
+ * * SMDS: top*top - bottom*bottom (per 32-bit element)
+ * * SMDRS: bottom*bottom - top*top (per 32-bit element)
+ * * SMXDS: top*bottom - bottom*top (per 32-bit element)
+ *
+ * **Description**:\n
+ * For the `SMDS` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1 with
+ * the bottom 16-bit content of the 32-bit elements of Rs2 and then subtracts the result from the result
+ * of multiplying the top 16-bit content of the 32-bit elements of Rs1 with the top 16-bit content of the
+ * 32-bit elements of Rs2.
+ * For the `SMDRS` instruction, it multiplies the top 16-bit content of the 32-bit elements of Rs1 with
+ * the top 16-bit content of the 32-bit elements of Rs2 and then subtracts the result from the result of
+ * multiplying the bottom 16-bit content of the 32-bit elements of Rs1 with the bottom 16-bit content of
+ * the 32-bit elements of Rs2.
+ * For the `SMXDS` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1
+ * with the top 16-bit content of the 32-bit elements of Rs2 and then subtracts the result from the
+ * result of multiplying the top 16-bit content of the 32-bit elements of Rs1 with the bottom 16-bit
+ * content of the 32-bit elements of Rs2.
+ * The subtraction result is written to the corresponding 32-bit element of Rd. The 16-bit contents of
+ * multiplication are treated as signed integers.
+ *
+ * **Operations**:\n
+ * ~~~
+ * * SMDS:
+ * Rd.W[x] = (Rs1.W[x].H[1] * Rs2.W[x].H[1]) - (Rs1.W[x].H[0] * Rs2.W[x].H[0]);
+ * * SMDRS:
+ * Rd.W[x] = (Rs1.W[x].H[0] * Rs2.W[x].H[0]) - (Rs1.W[x].H[1] * Rs2.W[x].H[1]);
+ * * SMXDS:
+ * Rd.W[x] = (Rs1.W[x].H[1] * Rs2.W[x].H[0]) - (Rs1.W[x].H[0] * Rs2.W[x].H[1]);
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_SMXDS(unsigned long a, unsigned long b)
+{
+    long rd; /* per 32-bit element: a.H[1] * b.H[0] - a.H[0] * b.H[1] */
+    __ASM volatile("smxds %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.114.3. SMXDS ===== */
+
+/* ===== Inline Function Start for 3.115. SMIN8 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_MISC
+ * \brief SMIN8 (SIMD 8-bit Signed Minimum)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SMIN8 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 8-bit signed integer elements finding minimum operations simultaneously.
+ *
+ * **Description**:\n
+ * This instruction compares the 8-bit signed integer elements in Rs1 with the 8-bit
+ * signed integer elements in Rs2 and selects, from each pair, the number that is less than the other. The selected
+ * results are written to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.B[x] = (Rs1.B[x] < Rs2.B[x])? Rs1.B[x] : Rs2.B[x];
+ * for RV32: x=3...0,
+ * for RV64: x=7...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_SMIN8(unsigned long a, unsigned long b)
+{
+    unsigned long rd; /* each 8-bit lane: signed minimum of the a and b lanes */
+    __ASM volatile("smin8 %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.115. SMIN8 ===== */
+
+/* ===== Inline Function Start for 3.116. SMIN16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_MISC
+ * \brief SMIN16 (SIMD 16-bit Signed Minimum)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SMIN16 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 16-bit signed integer elements finding minimum operations simultaneously.
+ *
+ * **Description**:\n
+ * This instruction compares the 16-bit signed integer elements in Rs1 with the 16-bit
+ * signed integer elements in Rs2 and selects, from each pair, the number that is less than the other. The selected
+ * results are written to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.H[x] = (Rs1.H[x] < Rs2.H[x])? Rs1.H[x] : Rs2.H[x];
+ * for RV32: x=1...0,
+ * for RV64: x=3...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_SMIN16(unsigned long a, unsigned long b)
+{
+    unsigned long rd; /* each 16-bit lane: signed minimum of the a and b lanes */
+    __ASM volatile("smin16 %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.116. SMIN16 ===== */
+
+/* ===== Inline Function Start for 3.117.1. SMMUL ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_MSW_32X32_MAC
+ * \brief SMMUL (SIMD MSW Signed Multiply Word)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SMMUL Rd, Rs1, Rs2
+ * SMMUL.u Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the 32-bit signed integer elements of two registers and write the most significant
+ * 32-bit results to the corresponding 32-bit elements of a register. The `.u` form performs an
+ * additional rounding up operation on the multiplication results before taking the most significant
+ * 32-bit part of the results.
+ *
+ * **Description**:\n
+ * This instruction multiplies the 32-bit elements of Rs1 with the 32-bit elements of Rs2 and writes the
+ * most significant 32-bit multiplication results to the corresponding 32-bit elements of Rd. The 32-bit
+ * elements of Rs1 and Rs2 are treated as signed integers. The `.u` form of the instruction rounds up
+ * the most significant 32-bit of the 64-bit multiplication results by adding a 1 to bit 31 of the results.
+ * * For `smmul/RV32` instruction, it is an alias to `mulh/RV32` instruction.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Mres[x][63:0] = Rs1.W[x] * Rs2.W[x];
+ * if (`.u` form) {
+ *   Round[x][32:0] = Mres[x][63:31] + 1;
+ *   Rd.W[x] = Round[x][32:1];
+ * } else {
+ *   Rd.W[x] = Mres[x][63:32];
+ * }
+ * for RV32: x=0
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  a    long type of value stored in a
+ * \param [in]  b    long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_SMMUL(long a, long b)
+{
+    long rd; /* per 32-bit element: bits [63:32] of the signed product a * b */
+    __ASM volatile("smmul %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.117.1. SMMUL ===== */
+
+/* ===== Inline Function Start for 3.117.2. SMMUL.u ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_MSW_32X32_MAC
+ * \brief SMMUL.u (SIMD MSW Signed Multiply Word with Rounding)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SMMUL Rd, Rs1, Rs2
+ * SMMUL.u Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the 32-bit signed integer elements of two registers and write the most significant
+ * 32-bit results to the corresponding 32-bit elements of a register. The `.u` form performs an
+ * additional rounding up operation on the multiplication results before taking the most significant
+ * 32-bit part of the results.
+ *
+ * **Description**:\n
+ * This instruction multiplies the 32-bit elements of Rs1 with the 32-bit elements of Rs2 and writes the
+ * most significant 32-bit multiplication results to the corresponding 32-bit elements of Rd. The 32-bit
+ * elements of Rs1 and Rs2 are treated as signed integers. The `.u` form of the instruction rounds up
+ * the most significant 32-bit of the 64-bit multiplication results by adding a 1 to bit 31 of the results.
+ * * For `smmul/RV32` instruction, it is an alias to `mulh/RV32` instruction.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Mres[x][63:0] = Rs1.W[x] * Rs2.W[x];
+ * if (`.u` form) {
+ *   Round[x][32:0] = Mres[x][63:31] + 1;
+ *   Rd.W[x] = Round[x][32:1];
+ * } else {
+ *   Rd.W[x] = Mres[x][63:32];
+ * }
+ * for RV32: x=0
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  a    long type of value stored in a
+ * \param [in]  b    long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_SMMUL_U(long a, long b)
+{
+    long rd; /* per 32-bit element: bits [63:32] of a * b after adding 1 at bit 31 */
+    __ASM volatile("smmul.u %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.117.2. SMMUL.u ===== */
+
+/* ===== Inline Function Start for 3.118.1. SMMWB ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_MSW_32X16_MAC
+ * \brief SMMWB (SIMD MSW Signed Multiply Word and Bottom Half)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SMMWB Rd, Rs1, Rs2
+ * SMMWB.u Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the signed 32-bit integer elements of one register and the bottom 16-bit of the
+ * corresponding 32-bit elements of another register, and write the most significant 32-bit results to
+ * the corresponding 32-bit elements of a register. The `.u` form rounds up the results from the most
+ * significant discarded bit.
+ *
+ * **Description**:\n
+ * This instruction multiplies the signed 32-bit elements of Rs1 with the signed bottom 16-bit content
+ * of the corresponding 32-bit elements of Rs2 and writes the most significant 32-bit multiplication
+ * results to the corresponding 32-bit elements of Rd. The `.u` form of the instruction rounds up the
+ * most significant 32-bit of the 48-bit multiplication results by adding a 1 to bit 15 of the results.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Mres[x][47:0] = Rs1.W[x] * Rs2.W[x].H[0];
+ * if (`.u` form) {
+ *   Round[x][32:0] = Mres[x][47:15] + 1;
+ *   Rd.W[x] = Round[x][32:1];
+ * } else {
+ *   Rd.W[x] = Mres[x][47:16];
+ * }
+ * for RV32: x=0
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  a    long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_SMMWB(long a, unsigned long b)
+{
+    long rd; /* per 32-bit element: bits [47:16] of a.W[x] * b.W[x].H[0] */
+    __ASM volatile("smmwb %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.118.1. SMMWB ===== */
+
+/* ===== Inline Function Start for 3.118.2. SMMWB.u ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_MSW_32X16_MAC
+ * \brief SMMWB.u (SIMD MSW Signed Multiply Word and Bottom Half with Rounding)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SMMWB Rd, Rs1, Rs2
+ * SMMWB.u Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the signed 32-bit integer elements of one register and the bottom 16-bit of the
+ * corresponding 32-bit elements of another register, and write the most significant 32-bit results to
+ * the corresponding 32-bit elements of a register. The `.u` form rounds up the results from the most
+ * significant discarded bit.
+ *
+ * **Description**:\n
+ * This instruction multiplies the signed 32-bit elements of Rs1 with the signed bottom 16-bit content
+ * of the corresponding 32-bit elements of Rs2 and writes the most significant 32-bit multiplication
+ * results to the corresponding 32-bit elements of Rd. The `.u` form of the instruction rounds up the
+ * most significant 32-bit of the 48-bit multiplication results by adding a 1 to bit 15 of the results.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Mres[x][47:0] = Rs1.W[x] * Rs2.W[x].H[0];
+ * if (`.u` form) {
+ *   Round[x][32:0] = Mres[x][47:15] + 1;
+ *   Rd.W[x] = Round[x][32:1];
+ * } else {
+ *   Rd.W[x] = Mres[x][47:16];
+ * }
+ * for RV32: x=0
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  a    long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_SMMWB_U(long a, unsigned long b)
+{
+    long rd; /* per 32-bit element: a.W[x] * b.W[x].H[0], +1 at bit 15, bits [47:16] kept */
+    __ASM volatile("smmwb.u %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.118.2. SMMWB.u ===== */
+
+/* ===== Inline Function Start for 3.119.1. SMMWT ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_MSW_32X16_MAC
+ * \brief SMMWT (SIMD MSW Signed Multiply Word and Top Half)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SMMWT Rd, Rs1, Rs2
+ * SMMWT.u Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the signed 32-bit integer elements of one register and the top 16-bit of the
+ * corresponding 32-bit elements of another register, and write the most significant 32-bit results to
+ * the corresponding 32-bit elements of a register. The `.u` form rounds up the results from the most
+ * significant discarded bit.
+ *
+ * **Description**:\n
+ * This instruction multiplies the signed 32-bit elements of Rs1 with the top signed 16-bit content of
+ * the corresponding 32-bit elements of Rs2 and writes the most significant 32-bit multiplication
+ * results to the corresponding 32-bit elements of Rd. The `.u` form of the instruction rounds up the
+ * most significant 32-bit of the 48-bit multiplication results by adding a 1 to bit 15 of the results.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Mres[x][47:0] = Rs1.W[x] * Rs2.W[x].H[1];
+ * if (`.u` form) {
+ *   Round[x][32:0] = Mres[x][47:15] + 1;
+ *   Rd.W[x] = Round[x][32:1];
+ * } else {
+ *   Rd.W[x] = Mres[x][47:16];
+ * }
+ * for RV32: x=0
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  a    long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_SMMWT(long a, unsigned long b)
+{
+    long rd; /* per 32-bit element: bits [47:16] of a.W[x] * b.W[x].H[1] */
+    __ASM volatile("smmwt %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.119.1. SMMWT ===== */
+
+/* ===== Inline Function Start for 3.119.2. SMMWT.u ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_MSW_32X16_MAC
+ * \brief SMMWT.u (SIMD MSW Signed Multiply Word and Top Half with Rounding)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SMMWT Rd, Rs1, Rs2
+ * SMMWT.u Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the signed 32-bit integer elements of one register and the top 16-bit of the
+ * corresponding 32-bit elements of another register, and write the most significant 32-bit results to
+ * the corresponding 32-bit elements of a register. The `.u` form rounds up the results from the most
+ * significant discarded bit.
+ *
+ * **Description**:\n
+ * This instruction multiplies the signed 32-bit elements of Rs1 with the top signed 16-bit content of
+ * the corresponding 32-bit elements of Rs2 and writes the most significant 32-bit multiplication
+ * results to the corresponding 32-bit elements of Rd. The `.u` form of the instruction rounds up the
+ * most significant 32-bit of the 48-bit multiplication results by adding a 1 to bit 15 of the results.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Mres[x][47:0] = Rs1.W[x] * Rs2.W[x].H[1];
+ * if (`.u` form) {
+ *   Round[x][32:0] = Mres[x][47:15] + 1;
+ *   Rd.W[x] = Round[x][32:1];
+ * } else {
+ *   Rd.W[x] = Mres[x][47:16];
+ * }
+ * for RV32: x=0
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  a    long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_SMMWT_U(long a, unsigned long b)
+{
+    long rd; /* per 32-bit element: a.W[x] * b.W[x].H[1], +1 at bit 15, bits [47:16] kept */
+    __ASM volatile("smmwt.u %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.119.2. SMMWT.u ===== */
+
+/* ===== Inline Function Start for 3.120.1. SMSLDA ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_64B_ADDSUB
+ * \brief SMSLDA (Signed Multiply Two Halfs & Add & Subtract 64-bit)
+ * \details
+ * **Type**: DSP (64-bit Profile)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SMSLDA Rd, Rs1, Rs2
+ * SMSLXDA Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do two signed 16-bit multiplications from the 32-bit elements of two registers; and then
+ * subtract the two 32-bit results from the 64-bit value of an even/odd pair of registers (RV32) or a
+ * register (RV64). The subtraction result is written back to the register-pair.
+ * * SMSLDA: rd pair - top*top - bottom*bottom (all 32-bit elements)
+ * * SMSLXDA: rd pair - top*bottom - bottom*top (all 32-bit elements)
+ *
+ * **RV32 Description**:\n
+ * For the `SMSLDA` instruction, it multiplies the bottom 16-bit content of Rs1 with the bottom 16-bit
+ * content of Rs2 and multiplies the top 16-bit content of Rs1 with the top 16-bit content of Rs2.
+ * For the `SMSLXDA` instruction, it multiplies the top 16-bit content of Rs1 with the bottom 16-bit
+ * content of Rs2 and multiplies the bottom 16-bit content of Rs1 with the top 16-bit content of Rs2.
+ * The two multiplication results are subtracted from the 64-bit value of an even/odd pair of registers
+ * specified by Rd(4,1). The 64-bit subtraction result is written back to the register-pair. The 16-bit
+ * values of Rs1 and Rs2, and the 64-bit value of the register-pair are treated as signed integers.
+ * Rd(4,1), i.e., d, determines the even/odd pair group of the two registers. Specifically, the register pair
+ * includes register 2d and 2d+1.
+ * The odd `2d+1` register of the pair contains the high 32-bit of the result and the even `2d` register
+ * of the pair contains the low 32-bit of the result.
+ *
+ * **RV64 Description**:\n
+ * For the `SMSLDA` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1
+ * with the bottom 16-bit content of the 32-bit elements of Rs2 and multiplies the top 16-bit content of
+ * the 32-bit elements of Rs1 with the top 16-bit content of the 32-bit elements of Rs2.
+ * For the `SMSLXDA` instruction, it multiplies the top 16-bit content of the 32-bit elements of Rs1 with
+ * the bottom 16-bit content of the 32-bit elements of Rs2 and multiplies the bottom 16-bit content of
+ * the 32-bit elements of Rs1 with the top 16-bit content of the 32-bit elements of Rs2.
+ * The four multiplication results are subtracted from the 64-bit value of Rd. The 64-bit subtraction
+ * result is written back to Rd. The 16-bit values of Rs1 and Rs2, and the 64-bit value of Rd are treated
+ * as signed integers.
+ *
+ * **Operations**:\n
+ * ~~~
+ * * RV32:
+ * // SMSLDA
+ * Mres0[31:0] = (Rs1.H[0] * Rs2.H[0]);
+ * Mres1[31:0] = (Rs1.H[1] * Rs2.H[1]);
+ * // SMSLXDA
+ * Mres0[31:0] = (Rs1.H[0] * Rs2.H[1]);
+ * Mres1[31:0] = (Rs1.H[1] * Rs2.H[0]);
+ * Idx0 = CONCAT(Rd(4,1),1'b0); Idx1 = CONCAT(Rd(4,1),1'b1);
+ * R[Idx1].R[Idx0] = R[Idx1].R[Idx0] - SE64(Mres0[31:0]) - SE64(Mres1[31:0]);
+ * * RV64:
+ * // SMSLDA
+ * Mres0[0][31:0] = (Rs1.W[0].H[0] * Rs2.W[0].H[0]);
+ * Mres1[0][31:0] = (Rs1.W[0].H[1] * Rs2.W[0].H[1]);
+ * Mres0[1][31:0] = (Rs1.W[1].H[0] * Rs2.W[1].H[0]);
+ * Mres1[1][31:0] = (Rs1.W[1].H[1] * Rs2.W[1].H[1]);
+ * // SMSLXDA
+ * Mres0[0][31:0] = (Rs1.W[0].H[0] * Rs2.W[0].H[1]);
+ * Mres1[0][31:0] = (Rs1.W[0].H[1] * Rs2.W[0].H[0]);
+ * Mres0[1][31:0] = (Rs1.W[1].H[0] * Rs2.W[1].H[1]);
+ * Mres1[1][31:0] = (Rs1.W[1].H[1] * Rs2.W[1].H[0]);
+ * Rd = Rd - SE64(Mres0[0][31:0]) - SE64(Mres1[0][31:0]) - SE64(Mres0[1][31:0]) -
+ * SE64(Mres1[1][31:0]);
+ * ~~~
+ *
+ * \param [in]  t    long long type of value stored in t
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long long type
+ */
+__STATIC_FORCEINLINE long long __RV_SMSLDA(long long t, unsigned long a, unsigned long b)
+{ /* Wraps smslda: t is the 64-bit value the 16x16 products of a and b are subtracted from. */
+    __ASM volatile("smslda %0, %1, %2" : "+r"(t) : "r"(a), "r"(b));
+    return t; /* "+r" makes t a read-write operand, so it now holds the instruction's result */
+}
+/* ===== Inline Function End for 3.120.1. SMSLDA ===== */
+
+/* ===== Inline Function Start for 3.120.2. SMSLXDA ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIGNED_16B_MULT_64B_ADDSUB
+ * \brief SMSLXDA (Signed Crossed Multiply Two Halfs & Add & Subtract 64-bit)
+ * \details
+ * **Type**: DSP (64-bit Profile)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SMSLDA Rd, Rs1, Rs2
+ * SMSLXDA Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do two signed 16-bit multiplications from the 32-bit elements of two registers; and then
+ * subtract the two 32-bit results from the 64-bit value of an even/odd pair of registers (RV32) or a
+ * register (RV64). The subtraction result is written back to the register-pair.
+ * * SMSLDA: rd pair - top*top - bottom*bottom (all 32-bit elements)
+ * * SMSLXDA: rd pair - top*bottom - bottom*top (all 32-bit elements)
+ *
+ * **RV32 Description**:\n
+ * For the `SMSLDA` instruction, it multiplies the bottom 16-bit content of Rs1 with the bottom 16-bit
+ * content of Rs2 and multiplies the top 16-bit content of Rs1 with the top 16-bit content of Rs2.
+ * For the `SMSLXDA` instruction, it multiplies the top 16-bit content of Rs1 with the bottom 16-bit
+ * content of Rs2 and multiplies the bottom 16-bit content of Rs1 with the top 16-bit content of Rs2.
+ * The two multiplication results are subtracted from the 64-bit value of an even/odd pair of registers
+ * specified by Rd(4,1). The 64-bit subtraction result is written back to the register-pair. The 16-bit
+ * values of Rs1 and Rs2, and the 64-bit value of the register-pair are treated as signed integers.
+ * Rd(4,1), i.e., d, determines the even/odd pair group of the two registers. Specifically, the register pair
+ * includes register 2d and 2d+1.
+ * The odd `2d+1` register of the pair contains the high 32-bit of the result and the even `2d` register
+ * of the pair contains the low 32-bit of the result.
+ *
+ * **RV64 Description**:\n
+ * For the `SMSLDA` instruction, it multiplies the bottom 16-bit content of the 32-bit elements of Rs1
+ * with the bottom 16-bit content of the 32-bit elements of Rs2 and multiplies the top 16-bit content of
+ * the 32-bit elements of Rs1 with the top 16-bit content of the 32-bit elements of Rs2.
+ * For the `SMSLXDA` instruction, it multiplies the top 16-bit content of the 32-bit elements of Rs1 with
+ * the bottom 16-bit content of the 32-bit elements of Rs2 and multiplies the bottom 16-bit content of
+ * the 32-bit elements of Rs1 with the top 16-bit content of the 32-bit elements of Rs2.
+ * The four multiplication results are subtracted from the 64-bit value of Rd. The 64-bit subtraction
+ * result is written back to Rd. The 16-bit values of Rs1 and Rs2, and the 64-bit value of Rd are treated
+ * as signed integers.
+ *
+ * **Operations**:\n
+ * ~~~
+ * * RV32:
+ * // SMSLDA
+ * Mres0[31:0] = (Rs1.H[0] * Rs2.H[0]);
+ * Mres1[31:0] = (Rs1.H[1] * Rs2.H[1]);
+ * // SMSLXDA
+ * Mres0[31:0] = (Rs1.H[0] * Rs2.H[1]);
+ * Mres1[31:0] = (Rs1.H[1] * Rs2.H[0]);
+ * Idx0 = CONCAT(Rd(4,1),1'b0); Idx1 = CONCAT(Rd(4,1),1'b1);
+ * R[Idx1].R[Idx0] = R[Idx1].R[Idx0] - SE64(Mres0[31:0]) - SE64(Mres1[31:0]);
+ * * RV64:
+ * // SMSLDA
+ * Mres0[0][31:0] = (Rs1.W[0].H[0] * Rs2.W[0].H[0]);
+ * Mres1[0][31:0] = (Rs1.W[0].H[1] * Rs2.W[0].H[1]);
+ * Mres0[1][31:0] = (Rs1.W[1].H[0] * Rs2.W[1].H[0]);
+ * Mres1[1][31:0] = (Rs1.W[1].H[1] * Rs2.W[1].H[1]);
+ * // SMSLXDA
+ * Mres0[0][31:0] = (Rs1.W[0].H[0] * Rs2.W[0].H[1]);
+ * Mres1[0][31:0] = (Rs1.W[0].H[1] * Rs2.W[0].H[0]);
+ * Mres0[1][31:0] = (Rs1.W[1].H[0] * Rs2.W[1].H[1]);
+ * Mres1[1][31:0] = (Rs1.W[1].H[1] * Rs2.W[1].H[0]);
+ * Rd = Rd - SE64(Mres0[0][31:0]) - SE64(Mres1[0][31:0]) - SE64(Mres0[1][31:0]) -
+ * SE64(Mres1[1][31:0]);
+ * ~~~
+ *
+ * \param [in]  t    long long type of value stored in t
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long long type
+ */
+__STATIC_FORCEINLINE long long __RV_SMSLXDA(long long t, unsigned long a, unsigned long b)
+{ /* Wraps smslxda: t is the 64-bit value the crossed 16x16 products of a and b are subtracted from. */
+    __ASM volatile("smslxda %0, %1, %2" : "+r"(t) : "r"(a), "r"(b));
+    return t; /* "+r" makes t a read-write operand, so it now holds the instruction's result */
+}
+/* ===== Inline Function End for 3.120.2. SMSLXDA ===== */
+
+/* ===== Inline Function Start for 3.121. SMSR64 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_32B_MULT_64B_ADDSUB
+ * \brief SMSR64 (Signed Multiply and Subtract from 64-Bit Data)
+ * \details
+ * **Type**: DSP (64-bit Profile)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SMSR64 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the 32-bit signed elements in two registers and subtract the 64-bit multiplication
+ * results from the 64-bit signed data of a pair of registers (RV32) or a register (RV64). The result is
+ * written back to the pair of registers (RV32) or a register (RV64).
+ *
+ * **RV32 Description**:\n
+ * This instruction multiplies the 32-bit signed data of Rs1 with that of Rs2. It
+ * subtracts the 64-bit multiplication result from the 64-bit signed data of an even/odd pair of registers
+ * specified by Rd(4,1). The subtraction result is written back to the even/odd pair of registers
+ * specified by Rd(4,1).
+ * Rd(4,1), i.e., d, determines the even/odd pair group of two registers. Specifically, the register pair
+ * includes register 2d and 2d+1.
+ * The odd `2d+1` register of the pair contains the high 32-bit of the result and the even `2d` register
+ * of the pair contains the low 32-bit of the result.
+ *
+ * **RV64 Description**:\n
+ * This instruction multiplies the 32-bit signed elements of Rs1 with that of Rs2. It
+ * subtracts the 64-bit multiplication results from the 64-bit signed data of Rd. The subtraction result is
+ * written back to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * * RV32:
+ * t_L = CONCAT(Rd(4,1),1'b0); t_H = CONCAT(Rd(4,1),1'b1);
+ * R[t_H].R[t_L] = R[t_H].R[t_L] - (Rs1 * Rs2);
+ * * RV64:
+ * Rd = Rd - (Rs1.W[0] * Rs2.W[0]) - (Rs1.W[1] * Rs2.W[1]);
+ * ~~~
+ *
+ * \param [in]  t    long long type of value stored in t
+ * \param [in]  a    long type of value stored in a
+ * \param [in]  b    long type of value stored in b
+ * \return value stored in long long type
+ */
+__STATIC_FORCEINLINE long long __RV_SMSR64(long long t, long a, long b)
+{ /* Wraps smsr64: t is the 64-bit value the signed 32x32 product(s) of a and b are subtracted from. */
+    __ASM volatile("smsr64 %0, %1, %2" : "+r"(t) : "r"(a), "r"(b));
+    return t; /* "+r" makes t a read-write operand, so it now holds the instruction's result */
+}
+/* ===== Inline Function End for 3.121. SMSR64 ===== */
+
+/* ===== Inline Function Start for 3.122.1. SMUL8 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_MULTIPLY
+ * \brief SMUL8 (SIMD Signed 8-bit Multiply)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SMUL8 Rd, Rs1, Rs2
+ * SMULX8 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do signed 8-bit multiplications and generate four 16-bit results simultaneously.
+ *
+ * **RV32 Description**:\n
+ * For the `SMUL8` instruction, multiply the 8-bit data elements of Rs1 with the
+ * corresponding 8-bit data elements of Rs2.
+ * For the `SMULX8` instruction, multiply the first and second 8-bit data elements of Rs1 with the
+ * second and first 8-bit data elements of Rs2. At the same time, multiply the third and fourth 8-bit data
+ * elements of Rs1 with the fourth and third 8-bit data elements of Rs2.
+ * The four 16-bit results are then written into an even/odd pair of registers specified by Rd(4,1).
+ * Rd(4,1), i.e., d, determines the even/odd pair group of two registers. Specifically, the register pair
+ * includes register 2d and 2d+1.
+ * The odd `2d+1` register of the pair contains the two 16-bit results calculated from the top part of
+ * Rs1 and the even `2d` register of the pair contains the two 16-bit results calculated from the bottom
+ * part of Rs1.
+ *
+ * **RV64 Description**:\n
+ * For the `SMUL8` instruction, multiply the 8-bit data elements of Rs1 with the
+ * corresponding 8-bit data elements of Rs2.
+ * For the `SMULX8` instruction, multiply the first and second 8-bit data elements of Rs1 with the
+ * second and first 8-bit data elements of Rs2. At the same time, multiply the third and fourth 8-bit data
+ * elements of Rs1 with the fourth and third 8-bit data elements of Rs2.
+ * The four 16-bit results are then written into Rd. The Rd.W[1] contains the two 16-bit results
+ * calculated from the top part of Rs1 and the Rd.W[0] contains the two 16-bit results calculated from
+ * the bottom part of Rs1.
+ *
+ * **Operations**:\n
+ * ~~~
+ * * RV32:
+ * if (is `SMUL8`) {
+ *   op1t[x/2] = Rs1.B[x+1]; op2t[x/2] = Rs2.B[x+1]; // top
+ *   op1b[x/2] = Rs1.B[x]; op2b[x/2] = Rs2.B[x]; // bottom
+ * } else if (is `SMULX8`) {
+ *   op1t[x/2] = Rs1.B[x+1]; op2t[x/2] = Rs2.B[x]; // Rs1 top
+ *   op1b[x/2] = Rs1.B[x]; op2b[x/2] = Rs2.B[x+1]; // Rs1 bottom
+ * }
+ * rest[x/2] = op1t[x/2] s* op2t[x/2];
+ * resb[x/2] = op1b[x/2] s* op2b[x/2];
+ * t_L = CONCAT(Rd(4,1),1'b0); t_H = CONCAT(Rd(4,1),1'b1);
+ * R[t_H].H[1] = rest[1]; R[t_H].H[0] = resb[1];
+ * R[t_L].H[1] = rest[0]; R[t_L].H[0] = resb[0];
+ * x = 0 and 2
+ * * RV64:
+ * if (is `SMUL8`) {
+ *   op1t[x/2] = Rs1.B[x+1]; op2t[x/2] = Rs2.B[x+1]; // top
+ *   op1b[x/2] = Rs1.B[x]; op2b[x/2] = Rs2.B[x]; // bottom
+ * } else if (is `SMULX8`) {
+ *   op1t[x/2] = Rs1.B[x+1]; op2t[x/2] = Rs2.B[x]; // Rs1 top
+ *   op1b[x/2] = Rs1.B[x]; op2b[x/2] = Rs2.B[x+1]; // Rs1 bottom
+ * }
+ * rest[x/2] = op1t[x/2] s* op2t[x/2];
+ * resb[x/2] = op1b[x/2] s* op2b[x/2];
+ * t_L = CONCAT(Rd(4,1),1'b0); t_H = CONCAT(Rd(4,1),1'b1);
+ * Rd.W[1].H[1] = rest[1]; Rd.W[1].H[0] = resb[1];
+ * Rd.W[0].H[1] = rest[0]; Rd.W[0].H[0] = resb[0];
+ * x = 0 and 2
+ * ~~~
+ *
+ * \param [in]  a    unsigned int type of value stored in a
+ * \param [in]  b    unsigned int type of value stored in b
+ * \return value stored in unsigned long long type
+ */
+__STATIC_FORCEINLINE unsigned long long __RV_SMUL8(unsigned int a, unsigned int b)
+{
+    unsigned long long rd; /* four signed 16-bit products a.B[x] * b.B[x], x = 3..0 */
+    __ASM volatile("smul8 %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.122.1. SMUL8 ===== */
+
+/* ===== Inline Function Start for 3.122.2. SMULX8 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_MULTIPLY
+ * \brief SMULX8 (SIMD Signed Crossed 8-bit Multiply)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SMUL8 Rd, Rs1, Rs2
+ * SMULX8 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do signed 8-bit multiplications and generate four 16-bit results simultaneously.
+ *
+ * **RV32 Description**:\n
+ * For the `SMUL8` instruction, multiply the 8-bit data elements of Rs1 with the
+ * corresponding 8-bit data elements of Rs2.
+ * For the `SMULX8` instruction, multiply the first and second 8-bit data elements of Rs1 with the
+ * second and first 8-bit data elements of Rs2. At the same time, multiply the third and fourth 8-bit data
+ * elements of Rs1 with the fourth and third 8-bit data elements of Rs2.
+ * The four 16-bit results are then written into an even/odd pair of registers specified by Rd(4,1).
+ * Rd(4,1), i.e., d, determines the even/odd pair group of two registers. Specifically, the register pair
+ * includes register 2d and 2d+1.
+ * The odd `2d+1` register of the pair contains the two 16-bit results calculated from the top part of
+ * Rs1 and the even `2d` register of the pair contains the two 16-bit results calculated from the bottom
+ * part of Rs1.
+ *
+ * **RV64 Description**:\n
+ * For the `SMUL8` instruction, multiply the 8-bit data elements of Rs1 with the
+ * corresponding 8-bit data elements of Rs2.
+ * For the `SMULX8` instruction, multiply the first and second 8-bit data elements of Rs1 with the
+ * second and first 8-bit data elements of Rs2. At the same time, multiply the third and fourth 8-bit data
+ * elements of Rs1 with the fourth and third 8-bit data elements of Rs2.
+ * The four 16-bit results are then written into Rd. The Rd.W[1] contains the two 16-bit results
+ * calculated from the top part of Rs1 and the Rd.W[0] contains the two 16-bit results calculated from
+ * the bottom part of Rs1.
+ *
+ * **Operations**:\n
+ * ~~~
+ * * RV32:
+ * if (is `SMUL8`) {
+ *   op1t[x/2] = Rs1.B[x+1]; op2t[x/2] = Rs2.B[x+1]; // top
+ *   op1b[x/2] = Rs1.B[x]; op2b[x/2] = Rs2.B[x]; // bottom
+ * } else if (is `SMULX8`) {
+ *   op1t[x/2] = Rs1.B[x+1]; op2t[x/2] = Rs2.B[x]; // Rs1 top
+ *   op1b[x/2] = Rs1.B[x]; op2b[x/2] = Rs2.B[x+1]; // Rs1 bottom
+ * }
+ * rest[x/2] = op1t[x/2] s* op2t[x/2];
+ * resb[x/2] = op1b[x/2] s* op2b[x/2];
+ * t_L = CONCAT(Rd(4,1),1'b0); t_H = CONCAT(Rd(4,1),1'b1);
+ * R[t_H].H[1] = rest[1]; R[t_H].H[0] = resb[1];
+ * R[t_L].H[1] = rest[0]; R[t_L].H[0] = resb[0];
+ * x = 0 and 2
+ * * RV64:
+ * if (is `SMUL8`) {
+ *   op1t[x/2] = Rs1.B[x+1]; op2t[x/2] = Rs2.B[x+1]; // top
+ *   op1b[x/2] = Rs1.B[x]; op2b[x/2] = Rs2.B[x]; // bottom
+ * } else if (is `SMULX8`) {
+ *   op1t[x/2] = Rs1.B[x+1]; op2t[x/2] = Rs2.B[x]; // Rs1 top
+ *   op1b[x/2] = Rs1.B[x]; op2b[x/2] = Rs2.B[x+1]; // Rs1 bottom
+ * }
+ * rest[x/2] = op1t[x/2] s* op2t[x/2];
+ * resb[x/2] = op1b[x/2] s* op2b[x/2];
+ * t_L = CONCAT(Rd(4,1),1'b0); t_H = CONCAT(Rd(4,1),1'b1);
+ * Rd.W[1].H[1] = rest[1]; Rd.W[1].H[0] = resb[1];
+ * Rd.W[0].H[1] = rest[0]; Rd.W[0].H[0] = resb[0];
+ * x = 0 and 2
+ * ~~~
+ *
+ * \param [in]  a    unsigned int type of value stored in a
+ * \param [in]  b    unsigned int type of value stored in b
+ * \return value stored in unsigned long long type
+ */
+__STATIC_FORCEINLINE unsigned long long __RV_SMULX8(unsigned int a, unsigned int b)
+{
+    unsigned long long rd; /* four signed 16-bit products of crossed byte pairs (B[x+1] with B[x]) */
+    __ASM volatile("smulx8 %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.122.2. SMULX8 ===== */
+
+/* ===== Inline Function Start for 3.123.1. SMUL16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_MULTIPLY
+ * \brief SMUL16 (SIMD Signed 16-bit Multiply)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SMUL16 Rd, Rs1, Rs2
+ * SMULX16 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do signed 16-bit multiplications and generate two 32-bit results simultaneously.
+ *
+ * **RV32 Description**:\n
+ * For the `SMUL16` instruction, multiply the top 16-bit Q15 content of Rs1 with
+ * the top 16-bit Q15 content of Rs2. At the same time, multiply the bottom 16-bit Q15 content of Rs1
+ * with the bottom 16-bit Q15 content of Rs2.
+ * For the `SMULX16` instruction, multiply the top 16-bit Q15 content of Rs1 with the bottom 16-bit
+ * Q15 content of Rs2. At the same time, multiply the bottom 16-bit Q15 content of Rs1 with the top 16-
+ * bit Q15 content of Rs2.
+ * The two Q30 results are then written into an even/odd pair of registers specified by Rd(4,1). Rd(4,1),
+ * i.e., d, determines the even/odd pair group of two registers. Specifically, the register pair includes
+ * register 2d and 2d+1.
+ * The odd `2d+1` register of the pair contains the 32-bit result calculated from the top part of Rs1 and
+ * the even `2d` register of the pair contains the 32-bit result calculated from the bottom part of Rs1.
+ *
+ * **RV64 Description**:\n
+ * For the `SMUL16` instruction, multiply the top 16-bit Q15 content of the lower
+ * 32-bit word in Rs1 with the top 16-bit Q15 content of the lower 32-bit word in Rs2. At the same time,
+ * multiply the bottom 16-bit Q15 content of the lower 32-bit word in Rs1 with the bottom 16-bit Q15
+ * content of the lower 32-bit word in Rs2.
+ * For the `SMULX16` instruction, multiply the top 16-bit Q15 content of the lower 32-bit word in Rs1
+ * with the bottom 16-bit Q15 content of the lower 32-bit word in Rs2. At the same time, multiply the
+ * bottom 16-bit Q15 content of the lower 32-bit word in Rs1 with the top 16-bit Q15 content of the
+ * lower 32-bit word in Rs2.
+ * The two 32-bit Q30 results are then written into Rd. The result calculated from the top 16-bit of the
+ * lower 32-bit word in Rs1 is written to Rd.W[1]. And the result calculated from the bottom 16-bit of
+ * the lower 32-bit word in Rs1 is written to Rd.W[0].
+ *
+ * **Operations**:\n
+ * ~~~
+ * * RV32:
+ * if (is `SMUL16`) {
+ *   op1t = Rs1.H[1]; op2t = Rs2.H[1]; // top
+ *   op1b = Rs1.H[0]; op2b = Rs2.H[0]; // bottom
+ * } else if (is `SMULX16`) {
+ *   op1t = Rs1.H[1]; op2t = Rs2.H[0]; // Rs1 top
+ *   op1b = Rs1.H[0]; op2b = Rs2.H[1]; // Rs1 bottom
+ * }
+ * for ((aop,bop,res) in [(op1t,op2t,rest), (op1b,op2b,resb)]) {
+ *   res = aop s* bop;
+ * }
+ * t_L = CONCAT(Rd(4,1),1'b0); t_H = CONCAT(Rd(4,1),1'b1);
+ * R[t_H] = rest;
+ * R[t_L] = resb;
+ * * RV64:
+ * if (is `SMUL16`) {
+ *   op1t = Rs1.H[1]; op2t = Rs2.H[1]; // top
+ *   op1b = Rs1.H[0]; op2b = Rs2.H[0]; // bottom
+ * } else if (is `SMULX16`) {
+ *   op1t = Rs1.H[1]; op2t = Rs2.H[0]; // Rs1 top
+ *   op1b = Rs1.H[0]; op2b = Rs2.H[1]; // Rs1 bottom
+ * }
+ * for ((aop,bop,res) in [(op1t,op2t,rest), (op1b,op2b,resb)]) {
+ *   res = aop s* bop;
+ * }
+ * Rd.W[1] = rest;
+ * Rd.W[0] = resb;
+ * ~~~
+ *
+ * \param [in]  a    unsigned int type of value stored in a
+ * \param [in]  b    unsigned int type of value stored in b
+ * \return value stored in unsigned long long type
+ */
+__STATIC_FORCEINLINE unsigned long long __RV_SMUL16(unsigned int a, unsigned int b)
+{
+    unsigned long long products; /* receives both 32-bit products from smul16 */
+    __ASM volatile("smul16 %0, %1, %2" : "=r"(products) : "r"(a), "r"(b));
+    return products;
+}
+/* ===== Inline Function End for 3.123.1. SMUL16 ===== */
+
+/* ===== Inline Function Start for 3.123.2. SMULX16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_MULTIPLY
+ * \brief SMULX16 (SIMD Signed Crossed 16-bit Multiply)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SMUL16 Rd, Rs1, Rs2
+ * SMULX16 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do signed 16-bit multiplications and generate two 32-bit results simultaneously.
+ *
+ * **RV32 Description**:\n
+ * For the `SMUL16` instruction, multiply the top 16-bit Q15 content of Rs1 with
+ * the top 16-bit Q15 content of Rs2. At the same time, multiply the bottom 16-bit Q15 content of Rs1
+ * with the bottom 16-bit Q15 content of Rs2.
+ * For the `SMULX16` instruction, multiply the top 16-bit Q15 content of Rs1 with the bottom 16-bit
+ * Q15 content of Rs2. At the same time, multiply the bottom 16-bit Q15 content of Rs1 with the top 16-
+ * bit Q15 content of Rs2.
+ * The two Q30 results are then written into an even/odd pair of registers specified by Rd(4,1). Rd(4,1),
+ * i.e., d, determines the even/odd pair group of two registers. Specifically, the register pair includes
+ * register 2d and 2d+1.
+ * The odd `2d+1` register of the pair contains the 32-bit result calculated from the top part of Rs1 and
+ * the even `2d` register of the pair contains the 32-bit result calculated from the bottom part of Rs1.
+ *
+ * **RV64 Description**:\n
+ * For the `SMUL16` instruction, multiply the top 16-bit Q15 content of the lower
+ * 32-bit word in Rs1 with the top 16-bit Q15 content of the lower 32-bit word in Rs2. At the same time,
+ * multiply the bottom 16-bit Q15 content of the lower 32-bit word in Rs1 with the bottom 16-bit Q15
+ * content of the lower 32-bit word in Rs2.
+ * For the `SMULX16` instruction, multiply the top 16-bit Q15 content of the lower 32-bit word in Rs1
+ * with the bottom 16-bit Q15 content of the lower 32-bit word in Rs2. At the same time, multiply the
+ * bottom 16-bit Q15 content of the lower 32-bit word in Rs1 with the top 16-bit Q15 content of the
+ * lower 32-bit word in Rs2.
+ * The two 32-bit Q30 results are then written into Rd. The result calculated from the top 16-bit of the
+ * lower 32-bit word in Rs1 is written to Rd.W[1]. And the result calculated from the bottom 16-bit of
+ * the lower 32-bit word in Rs1 is written to Rd.W[0].
+ *
+ * **Operations**:\n
+ * ~~~
+ * * RV32:
+ * if (is `SMUL16`) {
+ *   op1t = Rs1.H[1]; op2t = Rs2.H[1]; // top
+ *   op1b = Rs1.H[0]; op2b = Rs2.H[0]; // bottom
+ * } else if (is `SMULX16`) {
+ *   op1t = Rs1.H[1]; op2t = Rs2.H[0]; // Rs1 top
+ *   op1b = Rs1.H[0]; op2b = Rs2.H[1]; // Rs1 bottom
+ * }
+ * for ((aop,bop,res) in [(op1t,op2t,rest), (op1b,op2b,resb)]) {
+ *   res = aop s* bop;
+ * }
+ * t_L = CONCAT(Rd(4,1),1'b0); t_H = CONCAT(Rd(4,1),1'b1);
+ * R[t_H] = rest;
+ * R[t_L] = resb;
+ * * RV64:
+ * if (is `SMUL16`) {
+ *   op1t = Rs1.H[1]; op2t = Rs2.H[1]; // top
+ *   op1b = Rs1.H[0]; op2b = Rs2.H[0]; // bottom
+ * } else if (is `SMULX16`) {
+ *   op1t = Rs1.H[1]; op2t = Rs2.H[0]; // Rs1 top
+ *   op1b = Rs1.H[0]; op2b = Rs2.H[1]; // Rs1 bottom
+ * }
+ * for ((aop,bop,res) in [(op1t,op2t,rest), (op1b,op2b,resb)]) {
+ *   res = aop s* bop;
+ * }
+ * Rd.W[1] = rest;
+ * Rd.W[0] = resb;
+ * ~~~
+ *
+ * \param [in]  a    unsigned int type of value stored in a
+ * \param [in]  b    unsigned int type of value stored in b
+ * \return value stored in unsigned long long type
+ */
+__STATIC_FORCEINLINE unsigned long long __RV_SMULX16(unsigned int a, unsigned int b)
+{
+    unsigned long long products; /* receives both crossed 32-bit products from smulx16 */
+    __ASM volatile("smulx16 %0, %1, %2" : "=r"(products) : "r"(a), "r"(b));
+    return products;
+}
+/* ===== Inline Function End for 3.123.2. SMULX16 ===== */
+
+/* ===== Inline Function Start for 3.124. SRA.u ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_MISC
+ * \brief SRA.u (Rounding Shift Right Arithmetic)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SRA.u Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Perform an arithmetic right shift operation with rounding. The shift amount is a variable
+ * from a GPR.
+ *
+ * **Description**:\n
+ * This instruction right-shifts the content of Rs1 arithmetically. The shifted out bits are
+ * filled with the sign-bit and the shift amount is specified by the low-order 5-bits (RV32) or 6-bits
+ * (RV64) of the Rs2 register. For the rounding operation, a value of 1 is added to the most significant
+ * discarded bit of the data to calculate the final result. And the result is written to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * * RV32:
+ * sa = Rs2[4:0];
+ * if (sa > 0) {
+ *   res[31:-1] = SE33(Rs1[31:(sa-1)]) + 1;
+ *   Rd = res[31:0];
+ * } else {
+ *   Rd = Rs1;
+ * }
+ * * RV64:
+ * sa = Rs2[5:0];
+ * if (sa > 0) {
+ *   res[63:-1] = SE65(Rs1[63:(sa-1)]) + 1;
+ *   Rd = res[63:0];
+ * } else {
+ *   Rd = Rs1;
+ * }
+ * ~~~
+ *
+ * \param [in]  a    long type of value stored in a
+ * \param [in]  b    unsigned int type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_SRA_U(long a, unsigned int b)
+{
+    long rounded; /* a shifted right arithmetically by b, with rounding (sra.u) */
+    __ASM volatile("sra.u %0, %1, %2" : "=r"(rounded) : "r"(a), "r"(b));
+    return rounded;
+}
+/* ===== Inline Function End for 3.124. SRA.u ===== */
+
+/* ===== Inline Function Start for 3.125. SRAI.u ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_MISC
+ * \brief SRAI.u (Rounding Shift Right Arithmetic Immediate)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SRAI.u Rd, Rs1, imm6u[4:0] (RV32)
+ * SRAI.u Rd, Rs1, imm6u[5:0] (RV64)
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Perform an arithmetic right shift operation with rounding. The shift amount is an
+ * immediate value.
+ *
+ * **Description**:\n
+ * This instruction right-shifts the content of Rs1 arithmetically. The shifted out bits are
+ * filled with the sign-bit and the shift amount is specified by the imm6u[4:0] (RV32) or imm6u[5:0]
+ * (RV64) constant. For the rounding operation, a value of 1 is added to the most significant discarded
+ * bit of the data to calculate the final result. And the result is written to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * * RV32:
+ * sa = imm6u[4:0];
+ * if (sa > 0) {
+ *   res[31:-1] = SE33(Rs1[31:(sa-1)]) + 1;
+ *   Rd = res[31:0];
+ * } else {
+ *   Rd = Rs1;
+ * }
+ * * RV64:
+ * sa = imm6u[5:0];
+ * if (sa > 0) {
+ *   res[63:-1] = SE65(Rs1[63:(sa-1)]) + 1;
+ *   Rd = res[63:0];
+ * } else {
+ *   Rd = Rs1;
+ * }
+ * ~~~
+ *
+ * \param [in]  a    long type of value stored in a
+ * \param [in]  b    unsigned int type of value stored in b
+ * \return value stored in long type
+ */
+#define __RV_SRAI_U(a, b)    \
+    ({ /* (b) is bound with the "K" immediate constraint: it must be a constant */    \
+        long __rv_src = (long)(a); /* read (a) before the output local is in scope */    \
+        long __rv_dst;    \
+        __ASM volatile("srai.u %0, %1, %2" : "=r"(__rv_dst) : "r"(__rv_src), "K"(b));    \
+        __rv_dst;    \
+    })
+/* ===== Inline Function End for 3.125. SRAI.u ===== */
+
+/* ===== Inline Function Start for 3.126.1. SRA8 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_SHIFT
+ * \brief SRA8 (SIMD 8-bit Shift Right Arithmetic)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SRA8 Rd, Rs1, Rs2
+ * SRA8.u Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 8-bit element arithmetic right shift operations simultaneously. The shift amount is a
+ * variable from a GPR. The `.u` form performs additional rounding up operations on the shifted
+ * results.
+ *
+ * **Description**:\n
+ * The 8-bit data elements in Rs1 are right-shifted arithmetically, that is, the shifted out
+ * bits are filled with the sign-bit of the data elements. The shift amount is specified by the low-order
+ * 3-bits of the value in the Rs2 register. For the rounding operation of the `.u` form, a value of 1 is
+ * added to the most significant discarded bit of each 8-bit data element to calculate the final results.
+ * And the results are written to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * sa = Rs2[2:0];
+ * if (sa > 0) {
+ *   if (`.u` form) { // SRA8.u
+ *     res[7:-1] = SE9(Rs1.B[x][7:sa-1]) + 1;
+ *     Rd.B[x] = res[7:0];
+ *   } else { // SRA8
+ *     Rd.B[x] = SE8(Rs1.B[x][7:sa]);
+ *   }
+ * } else {
+ *   Rd = Rs1;
+ * }
+ * for RV32: x=3...0,
+ * for RV64: x=7...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned int type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_SRA8(unsigned long a, unsigned int b)
+{
+    unsigned long shifted; /* every 8-bit element of a, shifted right arithmetically */
+    __ASM volatile("sra8 %0, %1, %2" : "=r"(shifted) : "r"(a), "r"(b));
+    return shifted;
+}
+/* ===== Inline Function End for 3.126.1. SRA8 ===== */
+
+/* ===== Inline Function Start for 3.126.2. SRA8.u ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_SHIFT
+ * \brief SRA8.u (SIMD 8-bit Rounding Shift Right Arithmetic)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SRA8 Rd, Rs1, Rs2
+ * SRA8.u Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 8-bit element arithmetic right shift operations simultaneously. The shift amount is a
+ * variable from a GPR. The `.u` form performs additional rounding up operations on the shifted
+ * results.
+ *
+ * **Description**:\n
+ * The 8-bit data elements in Rs1 are right-shifted arithmetically, that is, the shifted out
+ * bits are filled with the sign-bit of the data elements. The shift amount is specified by the low-order
+ * 3-bits of the value in the Rs2 register. For the rounding operation of the `.u` form, a value of 1 is
+ * added to the most significant discarded bit of each 8-bit data element to calculate the final results.
+ * And the results are written to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * sa = Rs2[2:0];
+ * if (sa > 0) {
+ *   if (`.u` form) { // SRA8.u
+ *     res[7:-1] = SE9(Rs1.B[x][7:sa-1]) + 1;
+ *     Rd.B[x] = res[7:0];
+ *   } else { // SRA8
+ *     Rd.B[x] = SE8(Rs1.B[x][7:sa]);
+ *   }
+ * } else {
+ *   Rd = Rs1;
+ * }
+ * for RV32: x=3...0,
+ * for RV64: x=7...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned int type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_SRA8_U(unsigned long a, unsigned int b)
+{
+    unsigned long rounded; /* every 8-bit element of a, arithmetic shift with rounding */
+    __ASM volatile("sra8.u %0, %1, %2" : "=r"(rounded) : "r"(a), "r"(b));
+    return rounded;
+}
+/* ===== Inline Function End for 3.126.2. SRA8.u ===== */
+
+/* ===== Inline Function Start for 3.127.1. SRAI8 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_SHIFT
+ * \brief SRAI8 (SIMD 8-bit Shift Right Arithmetic Immediate)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SRAI8 Rd, Rs1, imm3u
+ * SRAI8.u Rd, Rs1, imm3u
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 8-bit element arithmetic right shift operations simultaneously. The shift amount is an
+ * immediate value. The `.u` form performs additional rounding up operations on the shifted results.
+ *
+ * **Description**:\n
+ * The 8-bit data elements in Rs1 are right-shifted arithmetically, that is, the shifted out
+ * bits are filled with the sign-bit of the data elements. The shift amount is specified by the imm3u
+ * constant. For the rounding operation of the `.u` form, a value of 1 is added to the most significant
+ * discarded bit of each 8-bit data element to calculate the final results. And the results are written to
+ * Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * sa = imm3u[2:0];
+ * if (sa > 0) {
+ *   if (`.u` form) { // SRAI8.u
+ *     res[7:-1] = SE9(Rs1.B[x][7:sa-1]) + 1;
+ *     Rd.B[x] = res[7:0];
+ *   } else { // SRAI8
+ *     Rd.B[x] = SE8(Rs1.B[x][7:sa]);
+ *   }
+ * } else {
+ *   Rd = Rs1;
+ * }
+ * for RV32: x=3...0,
+ * for RV64: x=7...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned int type of value stored in b
+ * \return value stored in unsigned long type
+ */
+#define __RV_SRAI8(a, b)    \
+    ({ /* (b) is bound with the "K" immediate constraint: it must be a constant */    \
+        unsigned long __rv_src = (unsigned long)(a); /* read (a) before the output local is in scope */    \
+        unsigned long __rv_dst;    \
+        __ASM volatile("srai8 %0, %1, %2" : "=r"(__rv_dst) : "r"(__rv_src), "K"(b));    \
+        __rv_dst;    \
+    })
+/* ===== Inline Function End for 3.127.1. SRAI8 ===== */
+
+/* ===== Inline Function Start for 3.127.2. SRAI8.u ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_SHIFT
+ * \brief SRAI8.u (SIMD 8-bit Rounding Shift Right Arithmetic Immediate)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SRAI8 Rd, Rs1, imm3u
+ * SRAI8.u Rd, Rs1, imm3u
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 8-bit element arithmetic right shift operations simultaneously. The shift amount is an
+ * immediate value. The `.u` form performs additional rounding up operations on the shifted results.
+ *
+ * **Description**:\n
+ * The 8-bit data elements in Rs1 are right-shifted arithmetically, that is, the shifted out
+ * bits are filled with the sign-bit of the data elements. The shift amount is specified by the imm3u
+ * constant. For the rounding operation of the `.u` form, a value of 1 is added to the most significant
+ * discarded bit of each 8-bit data element to calculate the final results. And the results are written to
+ * Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * sa = imm3u[2:0];
+ * if (sa > 0) {
+ *   if (`.u` form) { // SRAI8.u
+ *     res[7:-1] = SE9(Rs1.B[x][7:sa-1]) + 1;
+ *     Rd.B[x] = res[7:0];
+ *   } else { // SRAI8
+ *     Rd.B[x] = SE8(Rs1.B[x][7:sa]);
+ *   }
+ * } else {
+ *   Rd = Rs1;
+ * }
+ * for RV32: x=3...0,
+ * for RV64: x=7...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned int type of value stored in b
+ * \return value stored in unsigned long type
+ */
+#define __RV_SRAI8_U(a, b)    \
+    ({ /* (b) is bound with the "K" immediate constraint: it must be a constant */    \
+        unsigned long __rv_src = (unsigned long)(a); /* read (a) before the output local is in scope */    \
+        unsigned long __rv_dst;    \
+        __ASM volatile("srai8.u %0, %1, %2" : "=r"(__rv_dst) : "r"(__rv_src), "K"(b));    \
+        __rv_dst;    \
+    })
+/* ===== Inline Function End for 3.127.2. SRAI8.u ===== */
+
+/* ===== Inline Function Start for 3.128.1. SRA16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_SHIFT
+ * \brief SRA16 (SIMD 16-bit Shift Right Arithmetic)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SRA16 Rd, Rs1, Rs2
+ * SRA16.u Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 16-bit element arithmetic right shift operations simultaneously. The shift amount is a
+ * variable from a GPR. The `.u` form performs additional rounding up operations on the shifted
+ * results.
+ *
+ * **Description**:\n
+ * The 16-bit data elements in Rs1 are right-shifted arithmetically, that is, the shifted out
+ * bits are filled with the sign-bit of the data elements. The shift amount is specified by the low-order
+ * 4-bits of the value in the Rs2 register. For the rounding operation of the `.u` form, a value of 1 is
+ * added to the most significant discarded bit of each 16-bit data element to calculate the final results.
+ * And the results are written to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * sa = Rs2[3:0];
+ * if (sa != 0) {
+ *   if (`.u` form) { // SRA16.u
+ *     res[15:-1] = SE17(Rs1.H[x][15:sa-1]) + 1;
+ *     Rd.H[x] = res[15:0];
+ *   } else { // SRA16
+ *     Rd.H[x] = SE16(Rs1.H[x][15:sa])
+ *   }
+ * } else {
+ *   Rd = Rs1;
+ * }
+ * for RV32: x=1...0,
+ * for RV64: x=3...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_SRA16(unsigned long a, unsigned long b)
+{
+    unsigned long shifted; /* every 16-bit element of a, shifted right arithmetically */
+    __ASM volatile("sra16 %0, %1, %2" : "=r"(shifted) : "r"(a), "r"(b));
+    return shifted;
+}
+/* ===== Inline Function End for 3.128.1. SRA16 ===== */
+
+/* ===== Inline Function Start for 3.128.2. SRA16.u ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_SHIFT
+ * \brief SRA16.u (SIMD 16-bit Rounding Shift Right Arithmetic)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SRA16 Rd, Rs1, Rs2
+ * SRA16.u Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 16-bit element arithmetic right shift operations simultaneously. The shift amount is a
+ * variable from a GPR. The `.u` form performs additional rounding up operations on the shifted
+ * results.
+ *
+ * **Description**:\n
+ * The 16-bit data elements in Rs1 are right-shifted arithmetically, that is, the shifted out
+ * bits are filled with the sign-bit of the data elements. The shift amount is specified by the low-order
+ * 4-bits of the value in the Rs2 register. For the rounding operation of the `.u` form, a value of 1 is
+ * added to the most significant discarded bit of each 16-bit data element to calculate the final results.
+ * And the results are written to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * sa = Rs2[3:0];
+ * if (sa != 0) {
+ *   if (`.u` form) { // SRA16.u
+ *     res[15:-1] = SE17(Rs1.H[x][15:sa-1]) + 1;
+ *     Rd.H[x] = res[15:0];
+ *   } else { // SRA16
+ *     Rd.H[x] = SE16(Rs1.H[x][15:sa])
+ *   }
+ * } else {
+ *   Rd = Rs1;
+ * }
+ * for RV32: x=1...0,
+ * for RV64: x=3...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_SRA16_U(unsigned long a, unsigned long b)
+{
+    unsigned long rounded; /* every 16-bit element of a, arithmetic shift with rounding */
+    __ASM volatile("sra16.u %0, %1, %2" : "=r"(rounded) : "r"(a), "r"(b));
+    return rounded;
+}
+/* ===== Inline Function End for 3.128.2. SRA16.u ===== */
+
+/* ===== Inline Function Start for 3.129.1. SRAI16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_SHIFT
+ * \brief SRAI16 (SIMD 16-bit Shift Right Arithmetic Immediate)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SRAI16 Rd, Rs1, imm4u
+ * SRAI16.u Rd, Rs1, imm4u
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 16-bit elements arithmetic right shift operations simultaneously. The shift amount is
+ * an immediate value. The `.u` form performs additional rounding up operations on the shifted
+ * results.
+ *
+ * **Description**:\n
+ * The 16-bit data elements in Rs1 are right-shifted arithmetically, that is, the shifted out
+ * bits are filled with the sign-bit of the 16-bit data elements. The shift amount is specified by the
+ * imm4u constant. For the rounding operation of the `.u` form, a value of 1 is added to the most
+ * significant discarded bit of each 16-bit data to calculate the final results. And the results are written
+ * to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * sa = imm4u[3:0];
+ * if (sa > 0) {
+ *   if (`.u` form) { // SRAI16.u
+ *     res[15:-1] = SE17(Rs1.H[x][15:sa-1]) + 1;
+ *     Rd.H[x] = res[15:0];
+ *   } else { // SRAI16
+ *     Rd.H[x] = SE16(Rs1.H[x][15:sa]);
+ *   }
+ * } else {
+ *   Rd = Rs1;
+ * }
+ * for RV32: x=1...0,
+ * for RV64: x=3...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+#define __RV_SRAI16(a, b)    \
+    ({ /* (b) is bound with the "K" immediate constraint: it must be a constant */    \
+        unsigned long __rv_src = (unsigned long)(a); /* read (a) before the output local is in scope */    \
+        unsigned long __rv_dst;    \
+        __ASM volatile("srai16 %0, %1, %2" : "=r"(__rv_dst) : "r"(__rv_src), "K"(b));    \
+        __rv_dst;    \
+    })
+/* ===== Inline Function End for 3.129.1. SRAI16 ===== */
+
+/* ===== Inline Function Start for 3.129.2. SRAI16.u ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_SHIFT
+ * \brief SRAI16.u (SIMD 16-bit Rounding Shift Right Arithmetic Immediate)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SRAI16 Rd, Rs1, imm4u
+ * SRAI16.u Rd, Rs1, imm4u
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 16-bit elements arithmetic right shift operations simultaneously. The shift amount is
+ * an immediate value. The `.u` form performs additional rounding up operations on the shifted
+ * results.
+ *
+ * **Description**:\n
+ * The 16-bit data elements in Rs1 are right-shifted arithmetically, that is, the shifted out
+ * bits are filled with the sign-bit of the 16-bit data elements. The shift amount is specified by the
+ * imm4u constant. For the rounding operation of the `.u` form, a value of 1 is added to the most
+ * significant discarded bit of each 16-bit data to calculate the final results. And the results are written
+ * to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * sa = imm4u[3:0];
+ * if (sa > 0) {
+ *   if (`.u` form) { // SRAI16.u
+ *     res[15:-1] = SE17(Rs1.H[x][15:sa-1]) + 1;
+ *     Rd.H[x] = res[15:0];
+ *   } else { // SRAI16
+ *     Rd.H[x] = SE16(Rs1.H[x][15:sa]);
+ *   }
+ * } else {
+ *   Rd = Rs1;
+ * }
+ * for RV32: x=1...0,
+ * for RV64: x=3...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+#define __RV_SRAI16_U(a, b)    \
+    ({ /* (b) is bound with the "K" immediate constraint: it must be a constant */    \
+        unsigned long __rv_src = (unsigned long)(a); /* read (a) before the output local is in scope */    \
+        unsigned long __rv_dst;    \
+        __ASM volatile("srai16.u %0, %1, %2" : "=r"(__rv_dst) : "r"(__rv_src), "K"(b));    \
+        __rv_dst;    \
+    })
+/* ===== Inline Function End for 3.129.2. SRAI16.u ===== */
+
+/* ===== Inline Function Start for 3.130.1. SRL8 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_SHIFT
+ * \brief SRL8 (SIMD 8-bit Shift Right Logical)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SRL8 Rd, Rs1, Rs2
+ * SRL8.u Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 8-bit elements logical right shift operations simultaneously. The shift amount is a
+ * variable from a GPR. The `.u` form performs additional rounding up operations on the shifted
+ * results.
+ *
+ * **Description**:\n
+ * The 8-bit data elements in Rs1 are right-shifted logically, that is, the shifted out bits are
+ * filled with zero. The shift amount is specified by the low-order 3-bits of the value in the Rs2 register.
+ * For the rounding operation of the `.u` form, a value of 1 is added to the most significant discarded
+ * bit of each 8-bit data element to calculate the final results. And the results are written to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * sa = Rs2[2:0];
+ * if (sa > 0) {
+ *   if (`.u` form) { // SRL8.u
+ *     res[8:0] = ZE9(Rs1.B[x][7:sa-1]) + 1;
+ *     Rd.B[x] = res[8:1];
+ *   } else { // SRL8
+ *     Rd.B[x] = ZE8(Rs1.B[x][7:sa]);
+ *   }
+ * } else {
+ *   Rd = Rs1;
+ * }
+ * for RV32: x=3...0,
+ * for RV64: x=7...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned int type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_SRL8(unsigned long a, unsigned int b)
+{
+    unsigned long shifted; /* every 8-bit element of a, shifted right logically */
+    __ASM volatile("srl8 %0, %1, %2" : "=r"(shifted) : "r"(a), "r"(b));
+    return shifted;
+}
+/* ===== Inline Function End for 3.130.1. SRL8 ===== */
+
+/* ===== Inline Function Start for 3.130.2. SRL8.u ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_SHIFT
+ * \brief SRL8.u (SIMD 8-bit Rounding Shift Right Logical)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SRL8 Rd, Rs1, Rs2
+ * SRL8.u Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 8-bit elements logical right shift operations simultaneously. The shift amount is a
+ * variable from a GPR. The `.u` form performs additional rounding up operations on the shifted
+ * results.
+ *
+ * **Description**:\n
+ * The 8-bit data elements in Rs1 are right-shifted logically, that is, the shifted out bits are
+ * filled with zero. The shift amount is specified by the low-order 3-bits of the value in the Rs2 register.
+ * For the rounding operation of the `.u` form, a value of 1 is added to the most significant discarded
+ * bit of each 8-bit data element to calculate the final results. And the results are written to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * sa = Rs2[2:0];
+ * if (sa > 0) {
+ *   if (`.u` form) { // SRL8.u
+ *     res[8:0] = ZE9(Rs1.B[x][7:sa-1]) + 1;
+ *     Rd.B[x] = res[8:1];
+ *   } else { // SRL8
+ *     Rd.B[x] = ZE8(Rs1.B[x][7:sa]);
+ *   }
+ * } else {
+ *   Rd = Rs1;
+ * }
+ * for RV32: x=3...0,
+ * for RV64: x=7...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned int type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_SRL8_U(unsigned long a, unsigned int b)
+{
+    unsigned long rounded; /* every 8-bit element of a, logical shift with rounding */
+    __ASM volatile("srl8.u %0, %1, %2" : "=r"(rounded) : "r"(a), "r"(b));
+    return rounded;
+}
+/* ===== Inline Function End for 3.130.2. SRL8.u ===== */
+
+/* ===== Inline Function Start for 3.131.1. SRLI8 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_SHIFT
+ * \brief SRLI8 (SIMD 8-bit Shift Right Logical Immediate)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SRLI8 Rd, Rs1, imm3u
+ * SRLI8.u Rd, Rs1, imm3u
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 8-bit elements logical right shift operations simultaneously. The shift amount is an
+ * immediate value. The `.u` form performs additional rounding up operations on the shifted results.
+ *
+ * **Description**:\n
+ * The 8-bit data elements in Rs1 are right-shifted logically, that is, the shifted out bits are
+ * filled with zero. The shift amount is specified by the imm3u constant. For the rounding operation of
+ * the `.u` form, a value of 1 is added to the most significant discarded bit of each 8-bit data element to
+ * calculate the final results. And the results are written to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * sa = imm3u[2:0];
+ * if (sa > 0) {
+ *   if (`.u` form) { // SRLI8.u
+ *     res[8:0] = ZE9(Rs1.B[x][7:sa-1]) + 1;
+ *     Rd.B[x] = res[8:1];
+ *   } else { // SRLI8
+ *     Rd.B[x] = ZE8(Rs1.B[x][7:sa]);
+ *   }
+ * } else {
+ *   Rd = Rs1;
+ * }
+ * for RV32: x=3...0,
+ * for RV64: x=7...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned int type of value stored in b
+ * \return value stored in unsigned long type
+ */
+#define __RV_SRLI8(a, b)    \
+    ({ /* (b) is bound with the "K" immediate constraint: it must be a constant */    \
+        unsigned long __rv_src = (unsigned long)(a); /* read (a) before the output local is in scope */    \
+        unsigned long __rv_dst;    \
+        __ASM volatile("srli8 %0, %1, %2" : "=r"(__rv_dst) : "r"(__rv_src), "K"(b));    \
+        __rv_dst;    \
+    })
+/* ===== Inline Function End for 3.131.1. SRLI8 ===== */
+
+/* ===== Inline Function Start for 3.131.2. SRLI8.u ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_SHIFT
+ * \brief SRLI8.u (SIMD 8-bit Rounding Shift Right Logical Immediate)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SRLI8 Rd, Rs1, imm3u
+ * SRLI8.u Rd, Rs1, imm3u
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 8-bit elements logical right shift operations simultaneously. The shift amount is an
+ * immediate value. The `.u` form performs additional rounding up operations on the shifted results.
+ *
+ * **Description**:\n
+ * The 8-bit data elements in Rs1 are right-shifted logically, that is, the shifted out bits are
+ * filled with zero. The shift amount is specified by the imm3u constant. For the rounding operation of
+ * the `.u` form, a value of 1 is added to the most significant discarded bit of each 8-bit data element to
+ * calculate the final results. And the results are written to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * sa = imm3u[2:0];
+ * if (sa > 0) {
+ *   if (`.u` form) { // SRLI8.u
+ *     res[8:0] = ZE9(Rs1.B[x][7:sa-1]) + 1;
+ *     Rd.B[x] = res[8:1];
+ *   } else { // SRLI8
+ *     Rd.B[x] = ZE8(Rs1.B[x][7:sa]);
+ *   }
+ * } else {
+ *   Rd = Rs1;
+ * }
+ * for RV32: x=3...0,
+ * for RV64: x=7...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned int type of value stored in b
+ * \return value stored in unsigned long type
+ */
+#define __RV_SRLI8_U(a, b)    \
+    ({ /* (b) is bound with the "K" immediate constraint: it must be a constant */    \
+        unsigned long __rv_src = (unsigned long)(a); /* read (a) before the output local is in scope */    \
+        unsigned long __rv_dst;    \
+        __ASM volatile("srli8.u %0, %1, %2" : "=r"(__rv_dst) : "r"(__rv_src), "K"(b));    \
+        __rv_dst;    \
+    })
+/* ===== Inline Function End for 3.131.2. SRLI8.u ===== */
+
+/* ===== Inline Function Start for 3.132.1. SRL16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_SHIFT
+ * \brief SRL16 (SIMD 16-bit Shift Right Logical)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SRL16 Rt, Ra, Rb
+ *  SRL16.u Rt, Ra, Rb
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 16-bit elements logical right shift operations simultaneously. The shift amount is a variable from a GPR. The `.u` form performs additional rounding up operations on the shifted results.
+ *
+ * **Description**:\n
+ * The 16-bit data elements in Rs1 are right-shifted logically, that is, the shifted out bits
+ * are filled with zero. The shift amount is specified by the low-order 4-bits of the value in the Rs2
+ * register. For the rounding operation of the `.u` form, a value of 1 is added to the most significant
+ * discarded bit of each 16-bit data element to calculate the final results. And the results are written to
+ * Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * sa = Rs2[3:0];
+ * if (sa > 0) {
+ *   if (`.u` form) { // SRL16.u
+ *     res[16:0] = ZE17(Rs1.H[x][15:sa-1]) + 1;
+ *     Rd.H[x] = res[16:1];
+ *   } else { // SRL16
+ *     Rd.H[x] = ZE16(Rs1.H[x][15:sa]);
+ *   }
+ * } else {
+ *   Rd = Rs1;
+ * }
+ * for RV32: x=1...0,
+ * for RV64: x=3...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned int type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_SRL16(unsigned long a, unsigned int b)
+{
+    unsigned long rd; /* bound to output operand %0 */
+    __ASM volatile("srl16 %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.132.1. SRL16 ===== */
+
+/* ===== Inline Function Start for 3.132.2. SRL16.u ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_SHIFT
+ * \brief SRL16.u (SIMD 16-bit Rounding Shift Right Logical)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SRL16 Rt, Ra, Rb
+ *  SRL16.u Rt, Ra, Rb
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 16-bit elements logical right shift operations simultaneously. The shift amount is a variable from a GPR. The `.u` form performs additional rounding up operations on the shifted results.
+ *
+ * **Description**:\n
+ * The 16-bit data elements in Rs1 are right-shifted logically, that is, the shifted out bits
+ * are filled with zero. The shift amount is specified by the low-order 4-bits of the value in the Rs2
+ * register. For the rounding operation of the `.u` form, a value of 1 is added to the most significant
+ * discarded bit of each 16-bit data element to calculate the final results. And the results are written to
+ * Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * sa = Rs2[3:0];
+ * if (sa > 0) {
+ *   if (`.u` form) { // SRL16.u
+ *     res[16:0] = ZE17(Rs1.H[x][15:sa-1]) + 1;
+ *     Rd.H[x] = res[16:1];
+ *   } else { // SRL16
+ *     Rd.H[x] = ZE16(Rs1.H[x][15:sa]);
+ *   }
+ * } else {
+ *   Rd = Rs1;
+ * }
+ * for RV32: x=1...0,
+ * for RV64: x=3...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned int type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_SRL16_U(unsigned long a, unsigned int b)
+{
+    unsigned long rd; /* bound to output operand %0 */
+    __ASM volatile("srl16.u %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.132.2. SRL16.u ===== */
+
+/* ===== Inline Function Start for 3.133.1. SRLI16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_SHIFT
+ * \brief SRLI16 (SIMD 16-bit Shift Right Logical Immediate)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SRLI16 Rt, Ra, imm4u
+ * SRLI16.u Rt, Ra, imm4u
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 16-bit elements logical right shift operations simultaneously. The shift amount is an
+ * immediate value. The `.u` form performs additional rounding up operations on the shifted results.
+ *
+ * **Description**:\n
+ * The 16-bit data elements in Rs1 are right-shifted logically, that is, the shifted out bits
+ * are filled with zero. The shift amount is specified by the imm4u constant. For the rounding
+ * operation of the `.u` form, a value of 1 is added to the most significant discarded bit of each 16-bit
+ * data element to calculate the final results. And the results are written to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * sa = imm4u;
+ * if (sa > 0) {
+ *   if (`.u` form) { // SRLI16.u
+ *     res[16:0] = ZE17(Rs1.H[x][15:sa-1]) + 1;
+ *     Rd.H[x] = res[16:1];
+ *   } else { // SRLI16
+ *     Rd.H[x] = ZE16(Rs1.H[x][15:sa]);
+ *   }
+ * } else {
+ *   Rd = Rs1;
+ * }
+ * for RV32: x=1...0,
+ * for RV64: x=3...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    immediate shift amount (imm4u); must be a compile-time constant
+ * \return value stored in unsigned long type
+ */
+#define __RV_SRLI16(a, b)    \
+    ({  /* (a) is evaluated first so a caller variable named like a macro local is not shadowed */  \
+        unsigned long __srli16_rs1 = (unsigned long)(a);    \
+        unsigned long __srli16_rd;    \
+        __ASM volatile("srli16 %0, %1, %2" : "=r"(__srli16_rd) : "r"(__srli16_rs1), "K"(b));    \
+        __srli16_rd;    \
+    })
+/* ===== Inline Function End for 3.133.1. SRLI16 ===== */
+
+/* ===== Inline Function Start for 3.133.2. SRLI16.u ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_SHIFT
+ * \brief SRLI16.u (SIMD 16-bit Rounding Shift Right Logical Immediate)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SRLI16 Rt, Ra, imm4u
+ * SRLI16.u Rt, Ra, imm4u
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 16-bit elements logical right shift operations simultaneously. The shift amount is an
+ * immediate value. The `.u` form performs additional rounding up operations on the shifted results.
+ *
+ * **Description**:\n
+ * The 16-bit data elements in Rs1 are right-shifted logically, that is, the shifted out bits
+ * are filled with zero. The shift amount is specified by the imm4u constant. For the rounding
+ * operation of the `.u` form, a value of 1 is added to the most significant discarded bit of each 16-bit
+ * data element to calculate the final results. And the results are written to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * sa = imm4u;
+ * if (sa > 0) {
+ *   if (`.u` form) { // SRLI16.u
+ *     res[16:0] = ZE17(Rs1.H[x][15:sa-1]) + 1;
+ *     Rd.H[x] = res[16:1];
+ *   } else { // SRLI16
+ *     Rd.H[x] = ZE16(Rs1.H[x][15:sa]);
+ *   }
+ * } else {
+ *   Rd = Rs1;
+ * }
+ * for RV32: x=1...0,
+ * for RV64: x=3...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    immediate shift amount (imm4u); must be a compile-time constant
+ * \return value stored in unsigned long type
+ */
+#define __RV_SRLI16_U(a, b)    \
+    ({  /* (a) is evaluated first so a caller variable named like a macro local is not shadowed */  \
+        unsigned long __srli16u_rs1 = (unsigned long)(a);    \
+        unsigned long __srli16u_rd;    \
+        __ASM volatile("srli16.u %0, %1, %2" : "=r"(__srli16u_rd) : "r"(__srli16u_rs1), "K"(b));    \
+        __srli16u_rd;    \
+    })
+/* ===== Inline Function End for 3.133.2. SRLI16.u ===== */
+
+/* ===== Inline Function Start for 3.134. STAS16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB
+ * \brief STAS16 (SIMD 16-bit Straight Addition & Subtraction)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * STAS16 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 16-bit integer element addition and 16-bit integer element subtraction in a 32-bit
+ * chunk simultaneously. Operands are from corresponding positions in 32-bit chunks.
+ *
+ * **Description**:\n
+ * This instruction adds the 16-bit integer element in [31:16] of 32-bit chunks in Rs1 with
+ * the 16-bit integer element in [31:16] of 32-bit chunks in Rs2, and writes the result to [31:16] of 32-bit
+ * chunks in Rd; at the same time, it subtracts the 16-bit integer element in [15:0] of 32-bit chunks in
+ * Rs2 from the 16-bit integer element in [15:0] of 32-bit chunks in Rs1, and writes the result to [15:0] of
+ * 32-bit chunks in Rd.
+ *
+ * **Note**:\n
+ * This instruction can be used for either signed or unsigned operations.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.W[x][31:16] = Rs1.W[x][31:16] + Rs2.W[x][31:16];
+ * Rd.W[x][15:0] = Rs1.W[x][15:0] - Rs2.W[x][15:0];
+ * for RV32, x=0
+ * for RV64, x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_STAS16(unsigned long a, unsigned long b)
+{
+    unsigned long rd; /* bound to output operand %0 */
+    __ASM volatile("stas16 %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.134. STAS16 ===== */
+
+/* ===== Inline Function Start for 3.135. STSA16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB
+ * \brief STSA16 (SIMD 16-bit Straight Subtraction & Addition)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * STSA16 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 16-bit integer element subtraction and 16-bit integer element addition in a 32-bit
+ * chunk simultaneously. Operands are from corresponding positions in 32-bit chunks.
+ *
+ * **Description**:\n
+ * This instruction subtracts the 16-bit integer element in [31:16] of 32-bit chunks in Rs2
+ * from the 16-bit integer element in [31:16] of 32-bit chunks in Rs1, and writes the result to [31:16] of
+ * 32-bit chunks in Rd; at the same time, it adds the 16-bit integer element in [15:0] of 32-bit chunks in
+ * Rs2 with the 16-bit integer element in [15:0] of 32-bit chunks in Rs1, and writes the result to [15:0] of
+ * 32-bit chunks in Rd.
+ *
+ * **Note**:\n
+ * This instruction can be used for either signed or unsigned operations.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.W[x][31:16] = Rs1.W[x][31:16] - Rs2.W[x][31:16];
+ * Rd.W[x][15:0] = Rs1.W[x][15:0] + Rs2.W[x][15:0];
+ * for RV32, x=0
+ * for RV64, x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_STSA16(unsigned long a, unsigned long b)
+{
+    unsigned long rd; /* bound to output operand %0 */
+    __ASM volatile("stsa16 %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.135. STSA16 ===== */
+
+/* ===== Inline Function Start for 3.136. SUB8 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_ADDSUB
+ * \brief SUB8 (SIMD 8-bit Subtraction)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SUB8 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 8-bit integer element subtractions simultaneously.
+ *
+ * **Description**:\n
+ * This instruction subtracts the 8-bit integer elements in Rs2 from the 8-bit integer
+ * elements in Rs1, and then writes the result to Rd.
+ *
+ * **Note**:\n
+ * This instruction can be used for either signed or unsigned subtraction.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.B[x] = Rs1.B[x] - Rs2.B[x];
+ * for RV32: x=3...0,
+ * for RV64: x=7...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_SUB8(unsigned long a, unsigned long b)
+{
+    unsigned long rd; /* bound to output operand %0 */
+    __ASM volatile("sub8 %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.136. SUB8 ===== */
+
+/* ===== Inline Function Start for 3.137. SUB16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB
+ * \brief SUB16 (SIMD 16-bit Subtraction)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SUB16 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 16-bit integer element subtractions simultaneously.
+ *
+ * **Description**:\n
+ * This instruction subtracts the 16-bit integer elements in Rs2 from the 16-bit integer
+ * elements in Rs1, and then writes the result to Rd.
+ *
+ * **Note**:\n
+ * This instruction can be used for either signed or unsigned subtraction.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.H[x] = Rs1.H[x] - Rs2.H[x];
+ * for RV32: x=1...0,
+ * for RV64: x=3...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_SUB16(unsigned long a, unsigned long b)
+{
+    unsigned long rd; /* bound to output operand %0 */
+    __ASM volatile("sub16 %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.137. SUB16 ===== */
+
+/* ===== Inline Function Start for 3.138. SUB64 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_64B_ADDSUB
+ * \brief SUB64 (64-bit Subtraction)
+ * \details
+ * **Type**: DSP (64-bit Profile)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SUB64 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Perform a 64-bit signed or unsigned integer subtraction.
+ *
+ * **RV32 Description**:\n
+ * This instruction subtracts the 64-bit integer of an even/odd pair of registers
+ * specified by Rs2(4,1) from the 64-bit integer of an even/odd pair of registers specified by Rs1(4,1),
+ * and then writes the 64-bit result to an even/odd pair of registers specified by Rd(4,1).
+ * Rx(4,1), i.e., d, determines the even/odd pair group of two registers. Specifically, the register pair
+ * includes register 2d and 2d+1.
+ * The odd `2d+1` register of the pair contains the high 32-bit of the operand and the even `2d`
+ * register of the pair contains the low 32-bit of the operand.
+ *
+ * **RV64 Description**:\n
+ * This instruction subtracts the 64-bit integer of Rs2 from the 64-bit integer of Rs1,
+ * and then writes the 64-bit result to Rd.
+ *
+ * **Note**:\n
+ * This instruction can be used for either signed or unsigned subtraction.
+ *
+ * **Operations**:\n
+ * ~~~
+ * * RV32:
+ * t_L = CONCAT(Rd(4,1),1'b0); t_H = CONCAT(Rd(4,1),1'b1);
+ * a_L = CONCAT(Rs1(4,1),1'b0); a_H = CONCAT(Rs1(4,1),1'b1);
+ * b_L = CONCAT(Rs2(4,1),1'b0); b_H = CONCAT(Rs2(4,1),1'b1);
+ * R[t_H].R[t_L] = R[a_H].R[a_L] - R[b_H].R[b_L];
+ * * RV64:
+ * Rd = Rs1 - Rs2;
+ * ~~~
+ *
+ * \param [in]  a    unsigned long long type of value stored in a
+ * \param [in]  b    unsigned long long type of value stored in b
+ * \return value stored in unsigned long long type
+ */
+__STATIC_FORCEINLINE unsigned long long __RV_SUB64(unsigned long long a, unsigned long long b)
+{
+    unsigned long long rd; /* 64-bit value bound to output operand %0 */
+    __ASM volatile("sub64 %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.138. SUB64 ===== */
+
+/* ===== Inline Function Start for 3.139.1. SUNPKD810 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_UNPACK
+ * \brief SUNPKD810 (Signed Unpacking Bytes 1 & 0)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SUNPKD8xy Rd, Rs1
+ * xy = {10, 20, 30, 31, 32}
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Unpack byte *x* and byte *y* of 32-bit chunks in a register into two 16-bit signed halfwords
+ * of 32-bit chunks in a register.
+ *
+ * **Description**:\n
+ * For the `SUNPKD8xy` instruction, it unpacks byte *x* and byte *y* of 32-bit chunks in Rs1 into
+ * two 16-bit signed halfwords and writes the results to the top part and the bottom part of 32-bit
+ * chunks in Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.W[m].H[1] = SE16(Rs1.W[m].B[x])
+ * Rd.W[m].H[0] = SE16(Rs1.W[m].B[y])
+ * // SUNPKD810, x=1,y=0
+ * // SUNPKD820, x=2,y=0
+ * // SUNPKD830, x=3,y=0
+ * // SUNPKD831, x=3,y=1
+ * // SUNPKD832, x=3,y=2
+ * for RV32: m=0,
+ * for RV64: m=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_SUNPKD810(unsigned long a)
+{
+    unsigned long rd; /* bound to output operand %0 */
+    __ASM volatile("sunpkd810 %0, %1" : "=r"(rd) : "r"(a));
+    return rd;
+}
+/* ===== Inline Function End for 3.139.1. SUNPKD810 ===== */
+
+/* ===== Inline Function Start for 3.139.2. SUNPKD820 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_UNPACK
+ * \brief SUNPKD820 (Signed Unpacking Bytes 2 & 0)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SUNPKD8xy Rd, Rs1
+ * xy = {10, 20, 30, 31, 32}
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Unpack byte *x* and byte *y* of 32-bit chunks in a register into two 16-bit signed halfwords
+ * of 32-bit chunks in a register.
+ *
+ * **Description**:\n
+ * For the `SUNPKD8xy` instruction, it unpacks byte *x* and byte *y* of 32-bit chunks in Rs1 into
+ * two 16-bit signed halfwords and writes the results to the top part and the bottom part of 32-bit
+ * chunks in Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.W[m].H[1] = SE16(Rs1.W[m].B[x])
+ * Rd.W[m].H[0] = SE16(Rs1.W[m].B[y])
+ * // SUNPKD810, x=1,y=0
+ * // SUNPKD820, x=2,y=0
+ * // SUNPKD830, x=3,y=0
+ * // SUNPKD831, x=3,y=1
+ * // SUNPKD832, x=3,y=2
+ * for RV32: m=0,
+ * for RV64: m=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_SUNPKD820(unsigned long a)
+{
+    unsigned long rd; /* bound to output operand %0 */
+    __ASM volatile("sunpkd820 %0, %1" : "=r"(rd) : "r"(a));
+    return rd;
+}
+/* ===== Inline Function End for 3.139.2. SUNPKD820 ===== */
+
+/* ===== Inline Function Start for 3.139.3. SUNPKD830 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_UNPACK
+ * \brief SUNPKD830 (Signed Unpacking Bytes 3 & 0)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SUNPKD8xy Rd, Rs1
+ * xy = {10, 20, 30, 31, 32}
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Unpack byte *x* and byte *y* of 32-bit chunks in a register into two 16-bit signed halfwords
+ * of 32-bit chunks in a register.
+ *
+ * **Description**:\n
+ * For the `SUNPKD8xy` instruction, it unpacks byte *x* and byte *y* of 32-bit chunks in Rs1 into
+ * two 16-bit signed halfwords and writes the results to the top part and the bottom part of 32-bit
+ * chunks in Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.W[m].H[1] = SE16(Rs1.W[m].B[x])
+ * Rd.W[m].H[0] = SE16(Rs1.W[m].B[y])
+ * // SUNPKD810, x=1,y=0
+ * // SUNPKD820, x=2,y=0
+ * // SUNPKD830, x=3,y=0
+ * // SUNPKD831, x=3,y=1
+ * // SUNPKD832, x=3,y=2
+ * for RV32: m=0,
+ * for RV64: m=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_SUNPKD830(unsigned long a)
+{
+    unsigned long rd; /* bound to output operand %0 */
+    __ASM volatile("sunpkd830 %0, %1" : "=r"(rd) : "r"(a));
+    return rd;
+}
+/* ===== Inline Function End for 3.139.3. SUNPKD830 ===== */
+
+/* ===== Inline Function Start for 3.139.4. SUNPKD831 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_UNPACK
+ * \brief SUNPKD831 (Signed Unpacking Bytes 3 & 1)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SUNPKD8xy Rd, Rs1
+ * xy = {10, 20, 30, 31, 32}
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Unpack byte *x* and byte *y* of 32-bit chunks in a register into two 16-bit signed halfwords
+ * of 32-bit chunks in a register.
+ *
+ * **Description**:\n
+ * For the `SUNPKD8xy` instruction, it unpacks byte *x* and byte *y* of 32-bit chunks in Rs1 into
+ * two 16-bit signed halfwords and writes the results to the top part and the bottom part of 32-bit
+ * chunks in Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.W[m].H[1] = SE16(Rs1.W[m].B[x])
+ * Rd.W[m].H[0] = SE16(Rs1.W[m].B[y])
+ * // SUNPKD810, x=1,y=0
+ * // SUNPKD820, x=2,y=0
+ * // SUNPKD830, x=3,y=0
+ * // SUNPKD831, x=3,y=1
+ * // SUNPKD832, x=3,y=2
+ * for RV32: m=0,
+ * for RV64: m=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_SUNPKD831(unsigned long a)
+{
+    unsigned long rd; /* bound to output operand %0 */
+    __ASM volatile("sunpkd831 %0, %1" : "=r"(rd) : "r"(a));
+    return rd;
+}
+/* ===== Inline Function End for 3.139.4. SUNPKD831 ===== */
+
+/* ===== Inline Function Start for 3.139.5. SUNPKD832 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_UNPACK
+ * \brief SUNPKD832 (Signed Unpacking Bytes 3 & 2)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SUNPKD8xy Rd, Rs1
+ * xy = {10, 20, 30, 31, 32}
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Unpack byte *x* and byte *y* of 32-bit chunks in a register into two 16-bit signed halfwords
+ * of 32-bit chunks in a register.
+ *
+ * **Description**:\n
+ * For the `SUNPKD8xy` instruction, it unpacks byte *x* and byte *y* of 32-bit chunks in Rs1 into
+ * two 16-bit signed halfwords and writes the results to the top part and the bottom part of 32-bit
+ * chunks in Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.W[m].H[1] = SE16(Rs1.W[m].B[x])
+ * Rd.W[m].H[0] = SE16(Rs1.W[m].B[y])
+ * // SUNPKD810, x=1,y=0
+ * // SUNPKD820, x=2,y=0
+ * // SUNPKD830, x=3,y=0
+ * // SUNPKD831, x=3,y=1
+ * // SUNPKD832, x=3,y=2
+ * for RV32: m=0,
+ * for RV64: m=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_SUNPKD832(unsigned long a)
+{
+    unsigned long rd; /* bound to output operand %0 */
+    __ASM volatile("sunpkd832 %0, %1" : "=r"(rd) : "r"(a));
+    return rd;
+}
+/* ===== Inline Function End for 3.139.5. SUNPKD832 ===== */
+
+/* ===== Inline Function Start for 3.140. SWAP8 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_MISC
+ * \brief SWAP8 (Swap Byte within Halfword)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SWAP8 Rd, Rs1
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Swap the bytes within each halfword of a register.
+ *
+ * **Description**:\n
+ * This instruction swaps the bytes within each halfword of Rs1 and writes the result to
+ * Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.H[x] = CONCAT(Rs1.H[x][7:0],Rs1.H[x][15:8]);
+ * for RV32: x=1...0,
+ * for RV64: x=3...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_SWAP8(unsigned long a)
+{
+    unsigned long rd; /* bound to output operand %0 */
+    __ASM volatile("swap8 %0, %1" : "=r"(rd) : "r"(a));
+    return rd;
+}
+/* ===== Inline Function End for 3.140. SWAP8 ===== */
+
+/* ===== Inline Function Start for 3.141. SWAP16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_MISC
+ * \brief SWAP16 (Swap Halfword within Word)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SWAP16 Rd, Rs1
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Swap the 16-bit halfwords within each word of a register.
+ *
+ * **Description**:\n
+ * This instruction swaps the 16-bit halfwords within each word of Rs1 and writes the
+ * result to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.W[x] = CONCAT(Rs1.W[x][15:0],Rs1.W[x][31:16]);
+ * for RV32: x=0,
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_SWAP16(unsigned long a)
+{
+    unsigned long rd; /* bound to output operand %0 */
+    __ASM volatile("swap16 %0, %1" : "=r"(rd) : "r"(a));
+    return rd;
+}
+/* ===== Inline Function End for 3.141. SWAP16 ===== */
+
+/* ===== Inline Function Start for 3.142. UCLIP8 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_MISC
+ * \brief UCLIP8 (SIMD 8-bit Unsigned Clip Value)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * UCLIP8 Rt, Ra, imm3u
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Limit the 8-bit signed elements of a register into an unsigned range simultaneously.
+ *
+ * **Description**:\n
+ * This instruction limits the 8-bit signed elements stored in Rs1 into an unsigned integer
+ * range between 2^imm3u-1 and 0, and writes the limited results to Rd. For example, if imm3u is 3, the 8-
+ * bit input values should be saturated between 7 and 0. If saturation is performed, set OV bit to 1.
+ *
+ * **Operations**:\n
+ * ~~~
+ * src = Rs1.B[x];
+ * if (src > (2^imm3u)-1) {
+ *   src = (2^imm3u)-1;
+ *   OV = 1;
+ * } else if (src < 0) {
+ *   src = 0;
+ *   OV = 1;
+ * }
+ * Rd.B[x] = src;
+ * for RV32: x=3...0,
+ * for RV64: x=7...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    immediate clip exponent (imm3u); must be a compile-time constant
+ * \return value stored in unsigned long type
+ */
+#define __RV_UCLIP8(a, b)    \
+    ({  /* (a) is evaluated first so a caller variable named like a macro local is not shadowed */  \
+        unsigned long __uclip8_rs1 = (unsigned long)(a);    \
+        unsigned long __uclip8_rd;    \
+        __ASM volatile("uclip8 %0, %1, %2" : "=r"(__uclip8_rd) : "r"(__uclip8_rs1), "K"(b));    \
+        __uclip8_rd;    \
+    })
+/* ===== Inline Function End for 3.142. UCLIP8 ===== */
+
+/* ===== Inline Function Start for 3.143. UCLIP16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_MISC
+ * \brief UCLIP16 (SIMD 16-bit Unsigned Clip Value)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * UCLIP16 Rt, Ra, imm4u
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Limit the 16-bit signed elements of a register into an unsigned range simultaneously.
+ *
+ * **Description**:\n
+ * This instruction limits the 16-bit signed elements stored in Rs1 into an unsigned
+ * integer range between 2^imm4u-1 and 0, and writes the limited results to Rd. For example, if imm4u is
+ * 3, the 16-bit input values should be saturated between 7 and 0. If saturation is performed, set OV bit
+ * to 1.
+ *
+ * **Operations**:\n
+ * ~~~
+ * src = Rs1.H[x];
+ * if (src > (2^imm4u)-1) {
+ *   src = (2^imm4u)-1;
+ *   OV = 1;
+ * } else if (src < 0) {
+ *   src = 0;
+ *   OV = 1;
+ * }
+ * Rd.H[x] = src;
+ * for RV32: x=1...0,
+ * for RV64: x=3...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    immediate clip exponent (imm4u); must be a compile-time constant
+ * \return value stored in unsigned long type
+ */
+#define __RV_UCLIP16(a, b)    \
+    ({  /* (a) is evaluated first so a caller variable named like a macro local is not shadowed */  \
+        unsigned long __uclip16_rs1 = (unsigned long)(a);    \
+        unsigned long __uclip16_rd;    \
+        __ASM volatile("uclip16 %0, %1, %2" : "=r"(__uclip16_rd) : "r"(__uclip16_rs1), "K"(b));    \
+        __uclip16_rd;    \
+    })
+/* ===== Inline Function End for 3.143. UCLIP16 ===== */
+
+/* ===== Inline Function Start for 3.144. UCLIP32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_PART_SIMD_MISC
+ * \brief UCLIP32 (SIMD 32-bit Unsigned Clip Value)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * UCLIP32 Rd, Rs1, imm5u[4:0]
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Limit the 32-bit signed integer elements of a register into an unsigned range
+ * simultaneously.
+ *
+ * **Description**:\n
+ * This instruction limits the 32-bit signed integer elements stored in Rs1 into an
+ * unsigned integer range between 2^imm5u-1 and 0, and writes the limited results to Rd. For example, if
+ * imm5u is 3, the 32-bit input values should be saturated between 7 and 0. If saturation is performed,
+ * set OV bit to 1.
+ *
+ * **Operations**:\n
+ * ~~~
+ * src = Rs1.W[x];
+ * if (src > (2^imm5u)-1) {
+ *   src = (2^imm5u)-1;
+ *   OV = 1;
+ * } else if (src < 0) {
+ *   src = 0;
+ *   OV = 1;
+ * }
+ * Rd.W[x] = src
+ * for RV32: x=0,
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    immediate clip exponent (imm5u); must be a compile-time constant
+ * \return value stored in unsigned long type
+ */
+#define __RV_UCLIP32(a, b)    \
+    ({  /* (a) is evaluated first so a caller variable named like a macro local is not shadowed */  \
+        unsigned long __uclip32_rs1 = (unsigned long)(a);    \
+        unsigned long __uclip32_rd;    \
+        __ASM volatile("uclip32 %0, %1, %2" : "=r"(__uclip32_rd) : "r"(__uclip32_rs1), "K"(b));    \
+        __uclip32_rd;    \
+    })
+/* ===== Inline Function End for 3.144. UCLIP32 ===== */
+
+/* ===== Inline Function Start for 3.145. UCMPLE8 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_CMP
+ * \brief UCMPLE8 (SIMD 8-bit Unsigned Compare Less Than & Equal)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * UCMPLE8 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 8-bit unsigned integer elements less than & equal comparisons simultaneously.
+ *
+ * **Description**:\n
+ * This instruction compares the 8-bit unsigned integer elements in Rs1 with the 8-bit
+ * unsigned integer elements in Rs2 to see if the one in Rs1 is less than or equal to the one in Rs2. If it
+ * is true, the result is 0xFF; otherwise, the result is 0x0. The element comparison results are written to
+ * Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.B[x] = (Rs1.B[x] <=u Rs2.B[x])? 0xff : 0x0;
+ * for RV32: x=3...0,
+ * for RV64: x=7...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_UCMPLE8(unsigned long a, unsigned long b)
+{
+    unsigned long rd; /* bound to output operand %0 */
+    __ASM volatile("ucmple8 %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.145. UCMPLE8 ===== */
+
+/* ===== Inline Function Start for 3.146. UCMPLE16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_CMP
+ * \brief UCMPLE16 (SIMD 16-bit Unsigned Compare Less Than & Equal)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * UCMPLE16 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 16-bit unsigned integer elements less than & equal comparisons simultaneously.
+ *
+ * **Description**:\n
+ * This instruction compares the 16-bit unsigned integer elements in Rs1 with the 16-bit
+ * unsigned integer elements in Rs2 to see if the one in Rs1 is less than or equal to the one in Rs2. If it
+ * is true, the result is 0xFFFF; otherwise, the result is 0x0. The element comparison results are
+ * written to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.H[x] = (Rs1.H[x] <=u Rs2.H[x])? 0xffff : 0x0;
+ * for RV32: x=1...0,
+ * for RV64: x=3...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_UCMPLE16(unsigned long a, unsigned long b)
+{
+    unsigned long rd; /* bound to output operand %0 */
+    __ASM volatile("ucmple16 %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.146. UCMPLE16 ===== */
+
+/* ===== Inline Function Start for 3.147. UCMPLT8 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_CMP
+ * \brief UCMPLT8 (SIMD 8-bit Unsigned Compare Less Than)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * UCMPLT8 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 8-bit unsigned integer elements less than comparisons simultaneously.
+ *
+ * **Description**:\n
+ * This instruction compares the 8-bit unsigned integer elements in Rs1 with the 8-bit
+ * unsigned integer elements in Rs2 to see if the one in Rs1 is less than the one in Rs2. If it is true, the
+ * result is 0xFF; otherwise, the result is 0x0. The element comparison results are written to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.B[x] = (Rs1.B[x] <u Rs2.B[x])? 0xff : 0x0;
+ * for RV32: x=3...0,
+ * for RV64: x=7...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_UCMPLT8(unsigned long a, unsigned long b)
+{
+    unsigned long rd; /* destination operand (%0): packed compare results */
+    __ASM volatile("ucmplt8 %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.147. UCMPLT8 ===== */
+
+/* ===== Inline Function Start for 3.148. UCMPLT16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_CMP
+ * \brief UCMPLT16 (SIMD 16-bit Unsigned Compare Less Than)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * UCMPLT16 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 16-bit unsigned integer elements less than comparisons simultaneously.
+ *
+ * **Description**:\n
+ * This instruction compares the 16-bit unsigned integer elements in Rs1 with the 16-bit
+ * unsigned integer elements in Rs2 to see if the one in Rs1 is less than the one in Rs2. If it is true, the
+ * result is 0xFFFF; otherwise, the result is 0x0. The element comparison results are written to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.H[x] = (Rs1.H[x] <u Rs2.H[x])? 0xffff : 0x0;
+ * for RV32: x=1...0,
+ * for RV64: x=3...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_UCMPLT16(unsigned long a, unsigned long b)
+{
+    unsigned long rd; /* destination operand (%0): packed compare results */
+    __ASM volatile("ucmplt16 %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.148. UCMPLT16 ===== */
+
+/* ===== Inline Function Start for 3.149. UKADD8 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_ADDSUB
+ * \brief UKADD8 (SIMD 8-bit Unsigned Saturating Addition)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * UKADD8 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 8-bit unsigned integer element saturating additions simultaneously.
+ *
+ * **Description**:\n
+ * This instruction adds the 8-bit unsigned integer elements in Rs1 with the 8-bit
+ * unsigned integer elements in Rs2. If any of the results are beyond the 8-bit unsigned number range
+ * (0 <= RES <= 2^8-1), they are saturated to the range and the OV bit is set to 1. The saturated results are
+ * written to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * res[x] = Rs1.B[x] + Rs2.B[x];
+ * if (res[x] > (2^8)-1) {
+ *   res[x] = (2^8)-1;
+ *   OV = 1;
+ * }
+ * Rd.B[x] = res[x];
+ * for RV32: x=3...0,
+ * for RV64: x=7...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_UKADD8(unsigned long a, unsigned long b)
+{
+    unsigned long rd; /* destination operand (%0): packed saturated sums */
+    __ASM volatile("ukadd8 %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.149. UKADD8 ===== */
+
+/* ===== Inline Function Start for 3.150. UKADD16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB
+ * \brief UKADD16 (SIMD 16-bit Unsigned Saturating Addition)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * UKADD16 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 16-bit unsigned integer element saturating additions simultaneously.
+ *
+ * **Description**:\n
+ * This instruction adds the 16-bit unsigned integer elements in Rs1 with the 16-bit
+ * unsigned integer elements in Rs2. If any of the results are beyond the 16-bit unsigned number
+ * range (0 <= RES <= 2^16-1), they are saturated to the range and the OV bit is set to 1. The saturated
+ * results are written to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * res[x] = Rs1.H[x] + Rs2.H[x];
+ * if (res[x] > (2^16)-1) {
+ *   res[x] = (2^16)-1;
+ *   OV = 1;
+ * }
+ * Rd.H[x] = res[x];
+ * for RV32: x=1...0,
+ * for RV64: x=3...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_UKADD16(unsigned long a, unsigned long b)
+{
+    unsigned long rd; /* destination operand (%0): packed saturated sums */
+    __ASM volatile("ukadd16 %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.150. UKADD16 ===== */
+
+/* ===== Inline Function Start for 3.151. UKADD64 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_64B_ADDSUB
+ * \brief UKADD64 (64-bit Unsigned Saturating Addition)
+ * \details
+ * **Type**: DSP (64-bit Profile)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * UKADD64 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Add two 64-bit unsigned integers. The result is saturated to the U64 range.
+ *
+ * **RV32 Description**:\n
+ * This instruction adds the 64-bit unsigned integer of an even/odd pair of registers
+ * specified by Rs1(4,1) with the 64-bit unsigned integer of an even/odd pair of registers specified by
+ * Rs2(4,1). If the 64-bit result is beyond the U64 number range (0 <= U64 <= 2^64-1), it is saturated to the
+ * range and the OV bit is set to 1. The saturated result is written to an even/odd pair of registers
+ * specified by Rd(4,1).
+ * Rx(4,1), i.e., d, determines the even/odd pair group of two registers. Specifically, the register pair
+ * includes register 2d and 2d+1.
+ * The odd `2d+1` register of the pair contains the high 32-bit of the result and the even `2d` register
+ * of the pair contains the low 32-bit of the result.
+ *
+ * **RV64 Description**:\n
+ * This instruction adds the 64-bit unsigned integer in Rs1 with the 64-bit unsigned
+ * integer in Rs2. If the 64-bit result is beyond the U64 number range (0 <= U64 <= 2^64-1), it is saturated to
+ * the range and the OV bit is set to 1. The saturated result is written to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * * RV32:
+ * t_L = CONCAT(Rd(4,1),1'b0); t_H = CONCAT(Rd(4,1),1'b1);
+ * a_L = CONCAT(Rs1(4,1),1'b0); a_H = CONCAT(Rs1(4,1),1'b1);
+ * b_L = CONCAT(Rs2(4,1),1'b0); b_H = CONCAT(Rs2(4,1),1'b1);
+ * result = R[a_H].R[a_L] + R[b_H].R[b_L];
+ * if (result > (2^64)-1) {
+ *   result = (2^64)-1; OV = 1;
+ * }
+ * R[t_H].R[t_L] = result;
+ * * RV64:
+ * result = Rs1 + Rs2;
+ * if (result > (2^64)-1) {
+ *   result = (2^64)-1; OV = 1;
+ * }
+ * Rd = result;
+ * ~~~
+ *
+ * \param [in]  a    unsigned long long type of value stored in a
+ * \param [in]  b    unsigned long long type of value stored in b
+ * \return value stored in unsigned long long type
+ */
+__STATIC_FORCEINLINE unsigned long long __RV_UKADD64(unsigned long long a, unsigned long long b)
+{
+    unsigned long long rd; /* destination operand (%0): 64-bit saturated sum */
+    __ASM volatile("ukadd64 %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.151. UKADD64 ===== */
+
+/* ===== Inline Function Start for 3.152. UKADDH ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_Q15_SAT_ALU
+ * \brief UKADDH (Unsigned Addition with U16 Saturation)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * UKADDH Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Add the unsigned lower 32-bit content of two registers with U16 saturation.
+ *
+ * **Description**:\n
+ * The unsigned lower 32-bit content of Rs1 is added with the unsigned lower 32-bit
+ * content of Rs2. And the result is saturated to the 16-bit unsigned integer range of [0, 2^16-1] and then
+ * sign-extended and written to Rd. If saturation happens, this instruction sets the OV flag.
+ *
+ * **Operations**:\n
+ * ~~~
+ * tmp = Rs1.W[0] + Rs2.W[0];
+ * if (tmp > (2^16)-1) {
+ *   tmp = (2^16)-1;
+ *   OV = 1;
+ * }
+ * Rd = SE(tmp[15:0]);
+ * ~~~
+ *
+ * \param [in]  a    unsigned int type of value stored in a
+ * \param [in]  b    unsigned int type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_UKADDH(unsigned int a, unsigned int b)
+{
+    unsigned long rd; /* destination operand (%0): sum saturated to U16 */
+    __ASM volatile("ukaddh %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.152. UKADDH ===== */
+
+/* ===== Inline Function Start for 3.153. UKADDW ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_Q31_SAT_ALU
+ * \brief UKADDW (Unsigned Addition with U32 Saturation)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * UKADDW Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Add the unsigned lower 32-bit content of two registers with U32 saturation.
+ *
+ * **Description**:\n
+ * The unsigned lower 32-bit content of Rs1 is added with the unsigned lower 32-bit
+ * content of Rs2. And the result is saturated to the 32-bit unsigned integer range of [0, 2^32-1] and then
+ * sign-extended and written to Rd. If saturation happens, this instruction sets the OV flag.
+ *
+ * **Operations**:\n
+ * ~~~
+ * tmp = Rs1.W[0] + Rs2.W[0];
+ * if (tmp > (2^32)-1) {
+ *   tmp[31:0] = (2^32)-1;
+ *   OV = 1;
+ * }
+ * Rd = tmp[31:0]; // RV32
+ * Rd = SE(tmp[31:0]); // RV64
+ * ~~~
+ *
+ * \param [in]  a    unsigned int type of value stored in a
+ * \param [in]  b    unsigned int type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_UKADDW(unsigned int a, unsigned int b)
+{
+    unsigned long rd; /* destination operand (%0): sum saturated to U32 */
+    __ASM volatile("ukaddw %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.153. UKADDW ===== */
+
+/* ===== Inline Function Start for 3.154. UKCRAS16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB
+ * \brief UKCRAS16 (SIMD 16-bit Unsigned Saturating Cross Addition & Subtraction)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * UKCRAS16 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do one 16-bit unsigned integer element saturating addition and one 16-bit unsigned
+ * integer element saturating subtraction in a 32-bit chunk simultaneously. Operands are from crossed
+ * positions in 32-bit chunks.
+ *
+ * **Description**:\n
+ * This instruction adds the 16-bit unsigned integer element in [31:16] of 32-bit chunks in
+ * Rs1 with the 16-bit unsigned integer element in [15:0] of 32-bit chunks in Rs2; at the same time, it
+ * subtracts the 16-bit unsigned integer element in [31:16] of 32-bit chunks in Rs2 from the 16-bit
+ * unsigned integer element in [15:0] of 32-bit chunks in Rs1. If any of the results are beyond the 16-bit
+ * unsigned number range (0 <= RES <= 2^16-1), they are saturated to the range and the OV bit is set to 1.
+ * The saturated results are written to [31:16] of 32-bit chunks in Rd for addition and [15:0] of 32-bit
+ * chunks in Rd for subtraction.
+ *
+ * **Operations**:\n
+ * ~~~
+ * res1 = Rs1.W[x][31:16] + Rs2.W[x][15:0];
+ * res2 = Rs1.W[x][15:0] - Rs2.W[x][31:16];
+ * if (res1 > (2^16)-1) {
+ *   res1 = (2^16)-1;
+ *   OV = 1;
+ * }
+ * if (res2 < 0) {
+ *   res2 = 0;
+ *   OV = 1;
+ * }
+ * Rd.W[x][31:16] = res1;
+ * Rd.W[x][15:0] = res2;
+ * for RV32, x=0
+ * for RV64, x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_UKCRAS16(unsigned long a, unsigned long b)
+{
+    unsigned long rd; /* destination operand (%0): packed add/sub results */
+    __ASM volatile("ukcras16 %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.154. UKCRAS16 ===== */
+
+/* ===== Inline Function Start for 3.155. UKCRSA16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB
+ * \brief UKCRSA16 (SIMD 16-bit Unsigned Saturating Cross Subtraction & Addition)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * UKCRSA16 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do one 16-bit unsigned integer element saturating subtraction and one 16-bit unsigned
+ * integer element saturating addition in a 32-bit chunk simultaneously. Operands are from crossed
+ * positions in 32-bit chunks.
+ *
+ * **Description**:\n
+ * This instruction subtracts the 16-bit unsigned integer element in [15:0] of 32-bit
+ * chunks in Rs2 from the 16-bit unsigned integer element in [31:16] of 32-bit chunks in Rs1; at the
+ * same time, it adds the 16-bit unsigned integer element in [31:16] of 32-bit chunks in Rs2 with the 16-
+ * bit unsigned integer element in [15:0] of 32-bit chunks in Rs1. If any of the results are beyond the
+ * 16-bit unsigned number range (0 <= RES <= 2^16-1), they are saturated to the range and the OV bit is set
+ * to 1. The saturated results are written to [31:16] of 32-bit chunks in Rd for subtraction and [15:0] of
+ * 32-bit chunks in Rd for addition.
+ *
+ * **Operations**:\n
+ * ~~~
+ * res1 = Rs1.W[x][31:16] - Rs2.W[x][15:0];
+ * res2 = Rs1.W[x][15:0] + Rs2.W[x][31:16];
+ * if (res1 < 0) {
+ *   res1 = 0; OV = 1;
+ * }
+ * if (res2 > (2^16)-1) {
+ *   res2 = (2^16)-1;
+ *   OV = 1;
+ * }
+ * Rd.W[x][31:16] = res1;
+ * Rd.W[x][15:0] = res2;
+ * for RV32, x=0
+ * for RV64, x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_UKCRSA16(unsigned long a, unsigned long b)
+{
+    unsigned long rd; /* destination operand (%0): packed sub/add results */
+    __ASM volatile("ukcrsa16 %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.155. UKCRSA16 ===== */
+
+/* ===== Inline Function Start for 3.156. UKMAR64 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_32B_MULT_64B_ADDSUB
+ * \brief UKMAR64 (Unsigned Multiply and Saturating Add to 64-Bit Data)
+ * \details
+ * **Type**: DSP (64-bit Profile)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * UKMAR64 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the 32-bit unsigned elements in two registers and add the 64-bit multiplication
+ * results to the 64-bit unsigned data of a pair of registers (RV32) or a register (RV64). The result is
+ * saturated to the U64 range and written back to the pair of registers (RV32) or the register (RV64).
+ *
+ * **RV32 Description**:\n
+ * This instruction multiplies the 32-bit unsigned data of Rs1 with that of Rs2. It
+ * adds the 64-bit multiplication result to the 64-bit unsigned data of an even/odd pair of registers
+ * specified by Rd(4,1) with unlimited precision. If the 64-bit addition result is beyond the U64 number
+ * range (0 <= U64 <= 2^64-1), it is saturated to the range and the OV bit is set to 1. The saturated result is
+ * written back to the even/odd pair of registers specified by Rd(4,1).
+ * Rx(4,1), i.e., d, determines the even/odd pair group of two registers. Specifically, the register pair
+ * includes register 2d and 2d+1.
+ * The odd `2d+1` register of the pair contains the high 32-bit of the result and the even `2d` register
+ * of the pair contains the low 32-bit of the result.
+ *
+ * **RV64 Description**:\n
+ * This instruction multiplies the 32-bit unsigned elements of Rs1 with that of Rs2.
+ * It adds the 64-bit multiplication results to the 64-bit unsigned data in Rd with unlimited precision. If
+ * the 64-bit addition result is beyond the U64 number range (0 <= U64 <= 2^64-1), it is saturated to the
+ * range and the OV bit is set to 1. The saturated result is written back to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * * RV32:
+ * t_L = CONCAT(Rd(4,1),1'b0); t_H = CONCAT(Rd(4,1),1'b1);
+ * result = R[t_H].R[t_L] + (Rs1 u* Rs2);
+ * if (result > (2^64)-1) {
+ *   result = (2^64)-1; OV = 1;
+ * }
+ * R[t_H].R[t_L] = result;
+ * * RV64:
+ * // `result` has unlimited precision
+ * result = Rd + (Rs1.W[0] u* Rs2.W[0]) + (Rs1.W[1] u* Rs2.W[1]);
+ * if (result > (2^64)-1) {
+ *   result = (2^64)-1; OV = 1;
+ * }
+ * Rd = result;
+ * ~~~
+ *
+ * \param [in]  t    unsigned long long type of value stored in t
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long long type
+ */
+__STATIC_FORCEINLINE unsigned long long __RV_UKMAR64(unsigned long long t, unsigned long a, unsigned long b)
+{ /* t is a read-write ("+r") operand: accumulator in, accumulated result out */
+    __ASM volatile("ukmar64 %0, %1, %2" : "+r"(t) : "r"(a), "r"(b));
+    return t; /* value of t after the instruction has updated it */
+}
+/* ===== Inline Function End for 3.156. UKMAR64 ===== */
+
+/* ===== Inline Function Start for 3.157. UKMSR64 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_32B_MULT_64B_ADDSUB
+ * \brief UKMSR64 (Unsigned Multiply and Saturating Subtract from 64-Bit Data)
+ * \details
+ * **Type**: DSP (64-bit Profile)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * UKMSR64 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the 32-bit unsigned elements in two registers and subtract the 64-bit
+ * multiplication results from the 64-bit unsigned data of a pair of registers (RV32) or a register (RV64).
+ * The result is saturated to the U64 range and written back to the pair of registers (RV32) or a register
+ * (RV64).
+ *
+ * **RV32 Description**:\n
+ * This instruction multiplies the 32-bit unsigned data of Rs1 with that of Rs2. It
+ * subtracts the 64-bit multiplication result from the 64-bit unsigned data of an even/odd pair of
+ * registers specified by Rd(4,1) with unlimited precision. If the 64-bit subtraction result is beyond the
+ * U64 number range (0 <= U64 <= 2^64-1), it is saturated to the range and the OV bit is set to 1. The
+ * saturated result is written back to the even/odd pair of registers specified by Rd(4,1).
+ * Rx(4,1), i.e., d, determines the even/odd pair group of two registers. Specifically, the register pair
+ * includes register 2d and 2d+1.
+ * The odd `2d+1` register of the pair contains the high 32-bit of the result and the even `2d` register
+ * of the pair contains the low 32-bit of the result.
+ *
+ * **RV64 Description**:\n
+ * This instruction multiplies the 32-bit unsigned elements of Rs1 with that of Rs2.
+ * It subtracts the 64-bit multiplication results from the 64-bit unsigned data of Rd with unlimited
+ * precision. If the 64-bit subtraction result is beyond the U64 number range (0 <= U64 <= 2^64-1), it is
+ * saturated to the range and the OV bit is set to 1. The saturated result is written back to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * * RV32:
+ * t_L = CONCAT(Rd(4,1),1'b0); t_H = CONCAT(Rd(4,1),1'b1);
+ * result = R[t_H].R[t_L] - (Rs1 u* Rs2);
+ * if (result < 0) {
+ *   result = 0; OV = 1;
+ * }
+ * R[t_H].R[t_L] = result;
+ * * RV64:
+ * // `result` has unlimited precision
+ * result = Rd - (Rs1.W[0] u* Rs2.W[0]) - (Rs1.W[1] u* Rs2.W[1]);
+ * if (result < 0) {
+ *   result = 0; OV = 1;
+ * }
+ * Rd = result;
+ * ~~~
+ *
+ * \param [in]  t    unsigned long long type of value stored in t
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long long type
+ */
+__STATIC_FORCEINLINE unsigned long long __RV_UKMSR64(unsigned long long t, unsigned long a, unsigned long b)
+{ /* t is a read-write ("+r") operand: minuend in, subtraction result out */
+    __ASM volatile("ukmsr64 %0, %1, %2" : "+r"(t) : "r"(a), "r"(b));
+    return t; /* value of t after the instruction has updated it */
+}
+/* ===== Inline Function End for 3.157. UKMSR64 ===== */
+
+/* ===== Inline Function Start for 3.158. UKSTAS16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB
+ * \brief UKSTAS16 (SIMD 16-bit Unsigned Saturating Straight Addition & Subtraction)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * UKSTAS16 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do one 16-bit unsigned integer element saturating addition and one 16-bit unsigned
+ * integer element saturating subtraction in a 32-bit chunk simultaneously. Operands are from
+ * corresponding positions in 32-bit chunks.
+ *
+ * **Description**:\n
+ * This instruction adds the 16-bit unsigned integer element in [31:16] of 32-bit chunks in
+ * Rs1 with the 16-bit unsigned integer element in [31:16] of 32-bit chunks in Rs2; at the same time, it
+ * subtracts the 16-bit unsigned integer element in [15:0] of 32-bit chunks in Rs2 from the 16-bit
+ * unsigned integer element in [15:0] of 32-bit chunks in Rs1. If any of the results are beyond the 16-bit
+ * unsigned number range (0 <= RES <= 2^16-1), they are saturated to the range and the OV bit is set to 1.
+ * The saturated results are written to [31:16] of 32-bit chunks in Rd for addition and [15:0] of 32-bit
+ * chunks in Rd for subtraction.
+ *
+ * **Operations**:\n
+ * ~~~
+ * res1 = Rs1.W[x][31:16] + Rs2.W[x][31:16];
+ * res2 = Rs1.W[x][15:0] - Rs2.W[x][15:0];
+ * if (res1 > (2^16)-1) {
+ *   res1 = (2^16)-1;
+ *   OV = 1;
+ * }
+ * if (res2 < 0) {
+ *   res2 = 0;
+ *   OV = 1;
+ * }
+ * Rd.W[x][31:16] = res1;
+ * Rd.W[x][15:0] = res2;
+ * for RV32, x=0
+ * for RV64, x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_UKSTAS16(unsigned long a, unsigned long b)
+{
+    unsigned long rd; /* destination operand (%0): packed add/sub results */
+    __ASM volatile("ukstas16 %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.158. UKSTAS16 ===== */
+
+/* ===== Inline Function Start for 3.159. UKSTSA16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB
+ * \brief UKSTSA16 (SIMD 16-bit Unsigned Saturating Straight Subtraction & Addition)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * UKSTSA16 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do one 16-bit unsigned integer element saturating subtraction and one 16-bit unsigned
+ * integer element saturating addition in a 32-bit chunk simultaneously. Operands are from
+ * corresponding positions in 32-bit chunks.
+ *
+ * **Description**:\n
+ * This instruction subtracts the 16-bit unsigned integer element in [31:16] of 32-bit
+ * chunks in Rs2 from the 16-bit unsigned integer element in [31:16] of 32-bit chunks in Rs1; at the
+ * same time, it adds the 16-bit unsigned integer element in [15:0] of 32-bit chunks in Rs2 with the 16-
+ * bit unsigned integer element in [15:0] of 32-bit chunks in Rs1. If any of the results are beyond the
+ * 16-bit unsigned number range (0 <= RES <= 2^16-1), they are saturated to the range and the OV bit is set
+ * to 1. The saturated results are written to [31:16] of 32-bit chunks in Rd for subtraction and [15:0] of
+ * 32-bit chunks in Rd for addition.
+ *
+ * **Operations**:\n
+ * ~~~
+ * res1 = Rs1.W[x][31:16] - Rs2.W[x][31:16];
+ * res2 = Rs1.W[x][15:0] + Rs2.W[x][15:0];
+ * if (res1 < 0) {
+ *   res1 = 0; OV = 1;
+ * }
+ * if (res2 > (2^16)-1) {
+ *   res2 = (2^16)-1;
+ *   OV = 1;
+ * }
+ * Rd.W[x][31:16] = res1;
+ * Rd.W[x][15:0] = res2;
+ * for RV32, x=0
+ * for RV64, x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_UKSTSA16(unsigned long a, unsigned long b)
+{
+    unsigned long rd; /* destination operand (%0): packed sub/add results */
+    __ASM volatile("ukstsa16 %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.159. UKSTSA16 ===== */
+
+/* ===== Inline Function Start for 3.160. UKSUB8 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_ADDSUB
+ * \brief UKSUB8 (SIMD 8-bit Unsigned Saturating Subtraction)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * UKSUB8 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 8-bit unsigned integer elements saturating subtractions simultaneously.
+ *
+ * **Description**:\n
+ * This instruction subtracts the 8-bit unsigned integer elements in Rs2 from the 8-bit
+ * unsigned integer elements in Rs1. If any of the results are beyond the 8-bit unsigned number range
+ * (0 <= RES <= 2^8-1), they are saturated to the range and the OV bit is set to 1. The saturated results are
+ * written to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * res[x] = Rs1.B[x] - Rs2.B[x];
+ * if (res[x] < 0) {
+ *   res[x] = 0;
+ *   OV = 1;
+ * }
+ * Rd.B[x] = res[x];
+ * for RV32: x=3...0,
+ * for RV64: x=7...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_UKSUB8(unsigned long a, unsigned long b)
+{
+    unsigned long rd; /* destination operand (%0): packed saturated differences */
+    __ASM volatile("uksub8 %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.160. UKSUB8 ===== */
+
+/* ===== Inline Function Start for 3.161. UKSUB16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB
+ * \brief UKSUB16 (SIMD 16-bit Unsigned Saturating Subtraction)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * UKSUB16 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 16-bit unsigned integer elements saturating subtractions simultaneously.
+ *
+ * **Description**:\n
+ * This instruction subtracts the 16-bit unsigned integer elements in Rs2 from the 16-bit
+ * unsigned integer elements in Rs1. If any of the results are beyond the 16-bit unsigned number
+ * range (0 <= RES <= 2^16-1), they are saturated to the range and the OV bit is set to 1. The saturated
+ * results are written to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * res[x] = Rs1.H[x] - Rs2.H[x];
+ * if (res[x] < 0) {
+ *   res[x] = 0;
+ *   OV = 1;
+ * }
+ * Rd.H[x] = res[x];
+ * for RV32: x=1...0,
+ * for RV64: x=3...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_UKSUB16(unsigned long a, unsigned long b)
+{
+    unsigned long rd; /* destination operand (%0): packed saturated differences */
+    __ASM volatile("uksub16 %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.161. UKSUB16 ===== */
+
+/* ===== Inline Function Start for 3.162. UKSUB64 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_64B_ADDSUB
+ * \brief UKSUB64 (64-bit Unsigned Saturating Subtraction)
+ * \details
+ * **Type**: DSP (64-bit Profile)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * UKSUB64 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Perform a 64-bit unsigned integer subtraction. The result is saturated to the U64 range.
+ *
+ * **RV32 Description**:\n
+ * This instruction subtracts the 64-bit unsigned integer of an even/odd pair of
+ * registers specified by Rs2(4,1) from the 64-bit unsigned integer of an even/odd pair of registers
+ * specified by Rs1(4,1). If the 64-bit result is beyond the U64 number range (0 <= U64 <= 2^64-1), it is
+ * saturated to the range and the OV bit is set to 1. The saturated result is then written to an even/odd
+ * pair of registers specified by Rd(4,1).
+ * Rx(4,1), i.e., d, determines the even/odd pair group of two registers. Specifically, the register pair
+ * includes register 2d and 2d+1.
+ * The odd `2d+1` register of the pair contains the high 32-bit of the operand and the even `2d`
+ * register of the pair contains the low 32-bit of the operand.
+ *
+ * **RV64 Description**:\n
+ * This instruction subtracts the 64-bit unsigned integer of Rs2 from the 64-bit
+ * unsigned integer of Rs1. If the 64-bit result is beyond the U64 number range (0 <=
+ * U64 <= 2^64-1), it is saturated to the range and the OV bit is set to 1. The saturated result is then written
+ * to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * * RV32:
+ * t_L = CONCAT(Rd(4,1),1'b0); t_H = CONCAT(Rd(4,1),1'b1);
+ * a_L = CONCAT(Rs1(4,1),1'b0); a_H = CONCAT(Rs1(4,1),1'b1);
+ * b_L = CONCAT(Rs2(4,1),1'b0); b_H = CONCAT(Rs2(4,1),1'b1);
+ * result = R[a_H].R[a_L] - R[b_H].R[b_L];
+ * if (result < 0) {
+ *   result = 0; OV = 1;
+ * }
+ * R[t_H].R[t_L] = result;
+ * * RV64
+ * result = Rs1 - Rs2;
+ * if (result < 0) {
+ *   result = 0; OV = 1;
+ * }
+ * Rd = result;
+ * ~~~
+ *
+ * \param [in]  a    unsigned long long type of value stored in a
+ * \param [in]  b    unsigned long long type of value stored in b
+ * \return value stored in unsigned long long type
+ */
+__STATIC_FORCEINLINE unsigned long long __RV_UKSUB64(unsigned long long a, unsigned long long b)
+{
+    unsigned long long rd; /* destination operand (%0): 64-bit saturated difference */
+    __ASM volatile("uksub64 %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.162. UKSUB64 ===== */
+
+/* ===== Inline Function Start for 3.163. UKSUBH ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_Q15_SAT_ALU
+ * \brief UKSUBH (Unsigned Subtraction with U16 Saturation)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * UKSUBH Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Subtract the unsigned lower 32-bit content of two registers with U16 saturation.
+ *
+ * **Description**:\n
+ * The unsigned lower 32-bit content of Rs2 is subtracted from the unsigned lower 32-bit
+ * content of Rs1. And the result is saturated to the 16-bit unsigned integer range of [0, 2^16-1] and then
+ * sign-extended and written to Rd. If saturation happens, this instruction sets the OV flag.
+ *
+ * **Operations**:\n
+ * ~~~
+ * tmp = Rs1.W[0] - Rs2.W[0];
+ * if (tmp > (2^16)-1) {
+ *   tmp = (2^16)-1;
+ *   OV = 1;
+ * }
+ * else if (tmp < 0) {
+ *   tmp = 0;
+ *   OV = 1;
+ * }
+ * Rd = SE(tmp[15:0]);
+ * ~~~
+ *
+ * \param [in]  a    unsigned int type of value stored in a
+ * \param [in]  b    unsigned int type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_UKSUBH(unsigned int a, unsigned int b)
+{
+    unsigned long rd; /* destination operand (%0): difference saturated to U16 */
+    __ASM volatile("uksubh %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.163. UKSUBH ===== */
+
+/* ===== Inline Function Start for 3.164. UKSUBW ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_Q31_SAT_ALU
+ * \brief UKSUBW (Unsigned Subtraction with U32 Saturation)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * UKSUBW Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Subtract the unsigned lower 32-bit content of two registers with unsigned 32-bit
+ * saturation.
+ *
+ * **Description**:\n
+ * The unsigned lower 32-bit content of Rs2 is subtracted from the unsigned lower 32-bit
+ * content of Rs1. And the result is saturated to the 32-bit unsigned integer range of [0, 2^32-1] and then
+ * sign-extended and written to Rd. If saturation happens, this instruction sets the OV flag.
+ *
+ * **Operations**:\n
+ * ~~~
+ * tmp = Rs1.W[0] - Rs2.W[0];
+ * if (tmp < 0) {
+ *   tmp[31:0] = 0;
+ *   OV = 1;
+ * }
+ * Rd = tmp[31:0]; // RV32
+ * Rd = SE(tmp[31:0]); // RV64
+ * ~~~
+ *
+ * \param [in]  a    unsigned int type of value stored in a
+ * \param [in]  b    unsigned int type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_UKSUBW(unsigned int a, unsigned int b)
+{
+    unsigned long rd; /* destination operand (%0): difference saturated to U32 */
+    __ASM volatile("uksubw %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.164. UKSUBW ===== */
+
+/* ===== Inline Function Start for 3.165. UMAR64 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_32B_MULT_64B_ADDSUB
+ * \brief UMAR64 (Unsigned Multiply and Add to 64-Bit Data)
+ * \details
+ * **Type**: DSP (64-bit Profile)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * UMAR64 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the 32-bit unsigned elements in two registers and add the 64-bit multiplication
+ * results to the 64-bit unsigned data of a pair of registers (RV32) or a register (RV64). The result is
+ * written back to the pair of registers (RV32) or a register (RV64).
+ *
+ * **RV32 Description**:\n
+ * This instruction multiplies the 32-bit unsigned data of Rs1 with that of Rs2. It
+ * adds the 64-bit multiplication result to the 64-bit unsigned data of an even/odd pair of registers
+ * specified by Rd(4,1). The addition result is written back to the even/odd pair of registers specified by
+ * Rd(4,1).
+ * Rd(4,1), i.e., d, determines the even/odd pair group of two registers. Specifically, the register pair
+ * includes register 2d and 2d+1.
+ * The odd `2d+1` register of the pair contains the high 32-bit of the result and the even `2d` register
+ * of the pair contains the low 32-bit of the result.
+ *
+ * **RV64 Description**:\n
+ * This instruction multiplies the 32-bit unsigned elements of Rs1 with that of Rs2.
+ * It adds the 64-bit multiplication results to the 64-bit unsigned data of Rd. The addition result is
+ * written back to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * * RV32:
+ * t_L = CONCAT(Rd(4,1),1'b0); t_H = CONCAT(Rd(4,1),1'b1);
+ * R[t_H].R[t_L] = R[t_H].R[t_L] + (Rs1 * Rs2);
+ * * RV64:
+ * Rd = Rd + (Rs1.W[0] u* Rs2.W[0]) + (Rs1.W[1] u* Rs2.W[1]);
+ * ~~~
+ *
+ * \param [in]  t    unsigned long long type of value stored in t
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long long type
+ */
+__STATIC_FORCEINLINE unsigned long long __RV_UMAR64(unsigned long long t, unsigned long a, unsigned long b)
+{
+    __ASM volatile("umar64 %0, %1, %2" : "+r"(t) : "r"(a), "r"(b)); /* "+r": t is read as the 64-bit accumulator and overwritten with the sum */
+    return t; /* accumulator after adding the unsigned 32-bit product(s) of a and b */
+}
+/* ===== Inline Function End for 3.165. UMAR64 ===== */
+
+/* ===== Inline Function Start for 3.166. UMAQA ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_8B_MULT_32B_ADD
+ * \brief UMAQA (Unsigned Multiply Four Bytes with 32-bit Adds)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * UMAQA Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do four unsigned 8-bit multiplications from 32-bit chunks of two registers; and then adds
+ * the four 16-bit results and the content of corresponding 32-bit chunks of a third register together.
+ *
+ * **Description**:\n
+ * This instruction multiplies the four unsigned 8-bit elements of 32-bit chunks of Rs1 with the four
+ * unsigned 8-bit elements of 32-bit chunks of Rs2 and then adds the four results together with the
+ * unsigned content of the corresponding 32-bit chunks of Rd. The final results are written back to the
+ * corresponding 32-bit chunks in Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * res[x] = Rd.W[x] + (Rs1.W[x].B[3] u* Rs2.W[x].B[3]) +
+ *          (Rs1.W[x].B[2] u* Rs2.W[x].B[2]) + (Rs1.W[x].B[1] u* Rs2.W[x].B[1]) +
+ *          (Rs1.W[x].B[0] u* Rs2.W[x].B[0]);
+ * Rd.W[x] = res[x];
+ * for RV32: x=0,
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  t    unsigned long type of value stored in t
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_UMAQA(unsigned long t, unsigned long a, unsigned long b)
+{
+    __ASM volatile("umaqa %0, %1, %2" : "+r"(t) : "r"(a), "r"(b)); /* "+r": t supplies the 32-bit addend(s) and receives the result */
+    return t; /* t plus the four unsigned byte products of a and b, per 32-bit chunk */
+}
+/* ===== Inline Function End for 3.166. UMAQA ===== */
+
+/* ===== Inline Function Start for 3.167. UMAX8 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_MISC
+ * \brief UMAX8 (SIMD 8-bit Unsigned Maximum)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * UMAX8 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 8-bit unsigned integer elements finding maximum operations simultaneously.
+ *
+ * **Description**:\n
+ * This instruction compares the 8-bit unsigned integer elements in Rs1 with the 8-bit
+ * unsigned integer elements in Rs2 and selects, for each pair of elements, the one that is greater. The
+ * selected results are written to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.B[x] = (Rs1.B[x] >u Rs2.B[x])? Rs1.B[x] : Rs2.B[x];
+ * for RV32: x=3...0,
+ * for RV64: x=7...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_UMAX8(unsigned long a, unsigned long b)
+{
+    unsigned long rd; /* Rd: per-byte unsigned maximum of a and b */
+    __ASM volatile("umax8 %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.167. UMAX8 ===== */
+
+/* ===== Inline Function Start for 3.168. UMAX16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_MISC
+ * \brief UMAX16 (SIMD 16-bit Unsigned Maximum)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * UMAX16 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 16-bit unsigned integer elements finding maximum operations simultaneously.
+ *
+ * **Description**:\n
+ * This instruction compares the 16-bit unsigned integer elements in Rs1 with the 16-bit
+ * unsigned integer elements in Rs2 and selects, for each pair of elements, the one that is greater. The
+ * selected results are written to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.H[x] = (Rs1.H[x] >u Rs2.H[x])? Rs1.H[x] : Rs2.H[x];
+ * for RV32: x=1...0,
+ * for RV64: x=3...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_UMAX16(unsigned long a, unsigned long b)
+{
+    unsigned long rd; /* Rd: per-halfword unsigned maximum of a and b */
+    __ASM volatile("umax16 %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.168. UMAX16 ===== */
+
+/* ===== Inline Function Start for 3.169. UMIN8 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_MISC
+ * \brief UMIN8 (SIMD 8-bit Unsigned Minimum)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * UMIN8 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 8-bit unsigned integer elements finding minimum operations simultaneously.
+ *
+ * **Description**:\n
+ * This instruction compares the 8-bit unsigned integer elements in Rs1 with the 8-bit
+ * unsigned integer elements in Rs2 and selects, for each pair of elements, the one that is smaller. The
+ * selected results are written to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.B[x] = (Rs1.B[x] <u Rs2.B[x])? Rs1.B[x] : Rs2.B[x];
+ * for RV32: x=3...0,
+ * for RV64: x=7...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_UMIN8(unsigned long a, unsigned long b)
+{
+    unsigned long rd; /* Rd: per-byte unsigned minimum of a and b */
+    __ASM volatile("umin8 %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.169. UMIN8 ===== */
+
+/* ===== Inline Function Start for 3.170. UMIN16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_MISC
+ * \brief UMIN16 (SIMD 16-bit Unsigned Minimum)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * UMIN16 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 16-bit unsigned integer elements finding minimum operations simultaneously.
+ *
+ * **Description**:\n
+ * This instruction compares the 16-bit unsigned integer elements in Rs1 with the 16-bit
+ * unsigned integer elements in Rs2 and selects, for each pair of elements, the one that is smaller. The
+ * selected results are written to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.H[x] = (Rs1.H[x] <u Rs2.H[x])? Rs1.H[x] : Rs2.H[x];
+ * for RV32: x=1...0,
+ * for RV64: x=3...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_UMIN16(unsigned long a, unsigned long b)
+{
+    unsigned long rd; /* Rd: per-halfword unsigned minimum of a and b */
+    __ASM volatile("umin16 %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.170. UMIN16 ===== */
+
+/* ===== Inline Function Start for 3.171. UMSR64 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_32B_MULT_64B_ADDSUB
+ * \brief UMSR64 (Unsigned Multiply and Subtract from 64-Bit Data)
+ * \details
+ * **Type**: DSP (64-bit Profile)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * UMSR64 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the 32-bit unsigned elements in two registers and subtract the 64-bit
+ * multiplication results from the 64-bit unsigned data of a pair of registers (RV32) or a register (RV64).
+ * The result is written back to the pair of registers (RV32) or a register (RV64).
+ *
+ * **RV32 Description**:\n
+ * This instruction multiplies the 32-bit unsigned data of Rs1 with that of Rs2. It
+ * subtracts the 64-bit multiplication result from the 64-bit unsigned data of an even/odd pair of
+ * registers specified by Rd(4,1). The subtraction result is written back to the even/odd pair of registers
+ * specified by Rd(4,1).
+ * Rd(4,1), i.e., d, determines the even/odd pair group of two registers. Specifically, the register pair
+ * includes register 2d and 2d+1.
+ * The odd `2d+1` register of the pair contains the high 32-bit of the result and the even `2d` register
+ * of the pair contains the low 32-bit of the result.
+ *
+ * **RV64 Description**:\n
+ * This instruction multiplies the 32-bit unsigned elements of Rs1 with that of Rs2.
+ * It subtracts the 64-bit multiplication results from the 64-bit unsigned data of Rd. The subtraction
+ * result is written back to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * * RV32:
+ * t_L = CONCAT(Rd(4,1),1'b0); t_H = CONCAT(Rd(4,1),1'b1);
+ * R[t_H].R[t_L] = R[t_H].R[t_L] - (Rs1 * Rs2);
+ * * RV64:
+ * Rd = Rd - (Rs1.W[0] u* Rs2.W[0]) - (Rs1.W[1] u* Rs2.W[1]);
+ * ~~~
+ *
+ * \param [in]  t    unsigned long long type of value stored in t
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long long type
+ */
+__STATIC_FORCEINLINE unsigned long long __RV_UMSR64(unsigned long long t, unsigned long a, unsigned long b)
+{
+    __ASM volatile("umsr64 %0, %1, %2" : "+r"(t) : "r"(a), "r"(b)); /* "+r": t is read as the 64-bit minuend and overwritten with the difference */
+    return t; /* accumulator after subtracting the unsigned 32-bit product(s) of a and b */
+}
+/* ===== Inline Function End for 3.171. UMSR64 ===== */
+
+/* ===== Inline Function Start for 3.172.1. UMUL8 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_MULTIPLY
+ * \brief UMUL8 (SIMD Unsigned 8-bit Multiply)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * UMUL8 Rd, Rs1, Rs2
+ * UMULX8 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do unsigned 8-bit multiplications and generate four 16-bit results simultaneously.
+ *
+ * **RV32 Description**:\n
+ * For the `UMUL8` instruction, multiply the unsigned 8-bit data elements of Rs1
+ * with the corresponding unsigned 8-bit data elements of Rs2.
+ * For the `UMULX8` instruction, multiply the first and second unsigned 8-bit data elements of Rs1
+ * with the second and first unsigned 8-bit data elements of Rs2. At the same time, multiply the third
+ * and fourth unsigned 8-bit data elements of Rs1 with the fourth and third unsigned 8-bit data
+ * elements of Rs2.
+ * The four 16-bit results are then written into an even/odd pair of registers specified by Rd(4,1).
+ * Rd(4,1), i.e., d, determines the even/odd pair group of two registers. Specifically, the register pair
+ * includes register 2d and 2d+1.
+ * The odd `2d+1` register of the pair contains the two 16-bit results calculated from the top part of
+ * Rs1 and the even `2d` register of the pair contains the two 16-bit results calculated from the bottom
+ * part of Rs1.
+ *
+ * **RV64 Description**:\n
+ * For the `UMUL8` instruction, multiply the unsigned 8-bit data elements of Rs1
+ * with the corresponding unsigned 8-bit data elements of Rs2.
+ * For the `UMULX8` instruction, multiply the first and second unsigned 8-bit data elements of Rs1
+ * with the second and first unsigned 8-bit data elements of Rs2. At the same time, multiply the third
+ * and fourth unsigned 8-bit data elements of Rs1 with the fourth and third unsigned 8-bit data
+ * elements of Rs2.
+ * The four 16-bit results are then written into Rd. The Rd.W[1] contains the two 16-bit results
+ * calculated from the top part of Rs1 and the Rd.W[0] contains the two 16-bit results calculated from
+ * the bottom part of Rs1.
+ *
+ * **Operations**:\n
+ * ~~~
+ * * RV32:
+ * if (is `UMUL8`) {
+ *   op1t[x/2] = Rs1.B[x+1]; op2t[x/2] = Rs2.B[x+1]; // top
+ *   op1b[x/2] = Rs1.B[x]; op2b[x/2] = Rs2.B[x]; // bottom
+ * } else if (is `UMULX8`) {
+ *   op1t[x/2] = Rs1.B[x+1]; op2t[x/2] = Rs2.B[x]; // Rs1 top
+ *   op1b[x/2] = Rs1.B[x]; op2b[x/2] = Rs2.B[x+1]; // Rs1 bottom
+ * }
+ * rest[x/2] = op1t[x/2] u* op2t[x/2];
+ * resb[x/2] = op1b[x/2] u* op2b[x/2];
+ * t_L = CONCAT(Rd(4,1),1'b0); t_H = CONCAT(Rd(4,1),1'b1);
+ * R[t_H].H[1] = rest[1]; R[t_H].H[0] = resb[1];
+ * R[t_L].H[1] = rest[0]; R[t_L].H[0] = resb[0];
+ * x = 0 and 2
+ * * RV64:
+ * if (is `UMUL8`) {
+ *     op1t[x/2] = Rs1.B[x+1]; op2t[x/2] = Rs2.B[x+1]; // top
+ *     op1b[x/2] = Rs1.B[x]; op2b[x/2] = Rs2.B[x]; // bottom
+ * } else if (is `UMULX8`) {
+ *     op1t[x/2] = Rs1.B[x+1]; op2t[x/2] = Rs2.B[x]; // Rs1 top
+ *     op1b[x/2] = Rs1.B[x]; op2b[x/2] = Rs2.B[x+1]; // Rs1 bottom
+ * }
+ * rest[x/2] = op1t[x/2] u* op2t[x/2];
+ * resb[x/2] = op1b[x/2] u* op2b[x/2];
+ * t_L = CONCAT(Rd(4,1),1'b0); t_H = CONCAT(Rd(4,1),1'b1);
+ * Rd.W[1].H[1] = rest[1]; Rd.W[1].H[0] = resb[1];
+ * Rd.W[0].H[1] = rest[0]; Rd.W[0].H[0] = resb[0]; x = 0 and 2
+ * ~~~
+ *
+ * \param [in]  a    unsigned int type of value stored in a
+ * \param [in]  b    unsigned int type of value stored in b
+ * \return value stored in unsigned long long type
+ */
+__STATIC_FORCEINLINE unsigned long long __RV_UMUL8(unsigned int a, unsigned int b)
+{
+    unsigned long long rd; /* Rd: four 16-bit products of corresponding unsigned bytes of a and b */
+    __ASM volatile("umul8 %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.172.1. UMUL8 ===== */
+
+/* ===== Inline Function Start for 3.172.2. UMULX8 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_MULTIPLY
+ * \brief UMULX8 (SIMD Unsigned Crossed 8-bit Multiply)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * UMUL8 Rd, Rs1, Rs2
+ * UMULX8 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do unsigned 8-bit multiplications and generate four 16-bit results simultaneously.
+ *
+ * **RV32 Description**:\n
+ * For the `UMUL8` instruction, multiply the unsigned 8-bit data elements of Rs1
+ * with the corresponding unsigned 8-bit data elements of Rs2.
+ * For the `UMULX8` instruction, multiply the first and second unsigned 8-bit data elements of Rs1
+ * with the second and first unsigned 8-bit data elements of Rs2. At the same time, multiply the third
+ * and fourth unsigned 8-bit data elements of Rs1 with the fourth and third unsigned 8-bit data
+ * elements of Rs2.
+ * The four 16-bit results are then written into an even/odd pair of registers specified by Rd(4,1).
+ * Rd(4,1), i.e., d, determines the even/odd pair group of two registers. Specifically, the register pair
+ * includes register 2d and 2d+1.
+ * The odd `2d+1` register of the pair contains the two 16-bit results calculated from the top part of
+ * Rs1 and the even `2d` register of the pair contains the two 16-bit results calculated from the bottom
+ * part of Rs1.
+ *
+ * **RV64 Description**:\n
+ * For the `UMUL8` instruction, multiply the unsigned 8-bit data elements of Rs1
+ * with the corresponding unsigned 8-bit data elements of Rs2.
+ * For the `UMULX8` instruction, multiply the first and second unsigned 8-bit data elements of Rs1
+ * with the second and first unsigned 8-bit data elements of Rs2. At the same time, multiply the third
+ * and fourth unsigned 8-bit data elements of Rs1 with the fourth and third unsigned 8-bit data
+ * elements of Rs2.
+ * The four 16-bit results are then written into Rd. The Rd.W[1] contains the two 16-bit results
+ * calculated from the top part of Rs1 and the Rd.W[0] contains the two 16-bit results calculated from
+ * the bottom part of Rs1.
+ *
+ * **Operations**:\n
+ * ~~~
+ * * RV32:
+ * if (is `UMUL8`) {
+ *   op1t[x/2] = Rs1.B[x+1]; op2t[x/2] = Rs2.B[x+1]; // top
+ *   op1b[x/2] = Rs1.B[x]; op2b[x/2] = Rs2.B[x]; // bottom
+ * } else if (is `UMULX8`) {
+ *   op1t[x/2] = Rs1.B[x+1]; op2t[x/2] = Rs2.B[x]; // Rs1 top
+ *   op1b[x/2] = Rs1.B[x]; op2b[x/2] = Rs2.B[x+1]; // Rs1 bottom
+ * }
+ * rest[x/2] = op1t[x/2] u* op2t[x/2];
+ * resb[x/2] = op1b[x/2] u* op2b[x/2];
+ * t_L = CONCAT(Rd(4,1),1'b0); t_H = CONCAT(Rd(4,1),1'b1);
+ * R[t_H].H[1] = rest[1]; R[t_H].H[0] = resb[1];
+ * R[t_L].H[1] = rest[0]; R[t_L].H[0] = resb[0];
+ * x = 0 and 2
+ * * RV64:
+ * if (is `UMUL8`) {
+ *     op1t[x/2] = Rs1.B[x+1]; op2t[x/2] = Rs2.B[x+1]; // top
+ *     op1b[x/2] = Rs1.B[x]; op2b[x/2] = Rs2.B[x]; // bottom
+ * } else if (is `UMULX8`) {
+ *     op1t[x/2] = Rs1.B[x+1]; op2t[x/2] = Rs2.B[x]; // Rs1 top
+ *     op1b[x/2] = Rs1.B[x]; op2b[x/2] = Rs2.B[x+1]; // Rs1 bottom
+ * }
+ * rest[x/2] = op1t[x/2] u* op2t[x/2];
+ * resb[x/2] = op1b[x/2] u* op2b[x/2];
+ * t_L = CONCAT(Rd(4,1),1'b0); t_H = CONCAT(Rd(4,1),1'b1);
+ * Rd.W[1].H[1] = rest[1]; Rd.W[1].H[0] = resb[1];
+ * Rd.W[0].H[1] = rest[0]; Rd.W[0].H[0] = resb[0]; x = 0 and 2
+ * ~~~
+ *
+ * \param [in]  a    unsigned int type of value stored in a
+ * \param [in]  b    unsigned int type of value stored in b
+ * \return value stored in unsigned long long type
+ */
+__STATIC_FORCEINLINE unsigned long long __RV_UMULX8(unsigned int a, unsigned int b)
+{
+    unsigned long long rd; /* Rd: four 16-bit products of crossed unsigned bytes of a and b */
+    __ASM volatile("umulx8 %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.172.2. UMULX8 ===== */
+
+/* ===== Inline Function Start for 3.173.1. UMUL16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_MULTIPLY
+ * \brief UMUL16 (SIMD Unsigned 16-bit Multiply)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * UMUL16 Rd, Rs1, Rs2
+ * UMULX16 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do unsigned 16-bit multiplications and generate two 32-bit results simultaneously.
+ *
+ * **RV32 Description**:\n
+ * For the `UMUL16` instruction, multiply the top 16-bit U16 content of Rs1 with
+ * the top 16-bit U16 content of Rs2. At the same time, multiply the bottom 16-bit U16 content of Rs1
+ * with the bottom 16-bit U16 content of Rs2.
+ * For the `UMULX16` instruction, multiply the top 16-bit U16 content of Rs1 with the bottom 16-bit
+ * U16 content of Rs2. At the same time, multiply the bottom 16-bit U16 content of Rs1 with the top 16-
+ * bit U16 content of Rs2.
+ * The two U32 results are then written into an even/odd pair of registers specified by Rd(4,1). Rd(4,1),
+ * i.e., d, determines the even/odd pair group of two registers. Specifically, the register pair includes
+ * register 2d and 2d+1.
+ * The odd `2d+1` register of the pair contains the 32-bit result calculated from the top part of Rs1 and
+ * the even `2d` register of the pair contains the 32-bit result calculated from the bottom part of Rs1.
+ *
+ * **RV64 Description**:\n
+ * For the `UMUL16` instruction, multiply the top 16-bit U16 content of the lower
+ * 32-bit word in Rs1 with the top 16-bit U16 content of the lower 32-bit word in Rs2. At the same time,
+ * multiply the bottom 16-bit U16 content of the lower 32-bit word in Rs1 with the bottom 16-bit U16
+ * content of the lower 32-bit word in Rs2.
+ * For the `UMULX16` instruction, multiply the top 16-bit U16 content of the lower 32-bit word in Rs1
+ * with the bottom 16-bit U16 content of the lower 32-bit word in Rs2. At the same time, multiply the
+ * bottom 16-bit U16 content of the lower 32-bit word in Rs1 with the top 16-bit U16 content of the
+ * lower 32-bit word in Rs2.
+ * The two 32-bit U32 results are then written into Rd. The result calculated from the top 16-bit of the
+ * lower 32-bit word in Rs1 is written to Rd.W[1]. And the result calculated from the bottom 16-bit of
+ * the lower 32-bit word in Rs1 is written to Rd.W[0].
+ *
+ * **Operations**:\n
+ * ~~~
+ * * RV32:
+ * if (is `UMUL16`) {
+ *   op1t = Rs1.H[1]; op2t = Rs2.H[1]; // top
+ *   op1b = Rs1.H[0]; op2b = Rs2.H[0]; // bottom
+ * } else if (is `UMULX16`) {
+ *   op1t = Rs1.H[1]; op2t = Rs2.H[0]; // Rs1 top
+ *   op1b = Rs1.H[0]; op2b = Rs2.H[1]; // Rs1 bottom
+ * }
+ * for ((aop,bop,res) in [(op1t,op2t,rest), (op1b,op2b,resb)]) {
+ *   res = aop u* bop;
+ * }
+ * t_L = CONCAT(Rd(4,1),1'b0); t_H = CONCAT(Rd(4,1),1'b1);
+ * R[t_H] = rest;
+ * R[t_L] = resb;
+ * * RV64:
+ * if (is `UMUL16`) {
+ *   op1t = Rs1.H[1]; op2t = Rs2.H[1]; // top
+ *   op1b = Rs1.H[0]; op2b = Rs2.H[0]; // bottom
+ * } else if (is `UMULX16`) {
+ *   op1t = Rs1.H[1]; op2t = Rs2.H[0]; // Rs1 top
+ *   op1b = Rs1.H[0]; op2b = Rs2.H[1]; // Rs1 bottom
+ * }
+ * for ((aop,bop,res) in [(op1t,op2t,rest), (op1b,op2b,resb)]) {
+ *   res = aop u* bop;
+ * }
+ * Rd.W[1] = rest;
+ * Rd.W[0] = resb;
+ * ~~~
+ *
+ * \param [in]  a    unsigned int type of value stored in a
+ * \param [in]  b    unsigned int type of value stored in b
+ * \return value stored in unsigned long long type
+ */
+__STATIC_FORCEINLINE unsigned long long __RV_UMUL16(unsigned int a, unsigned int b)
+{
+    unsigned long long rd; /* Rd: two 32-bit products, top x top and bottom x bottom halfwords */
+    __ASM volatile("umul16 %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.173.1. UMUL16 ===== */
+
+/* ===== Inline Function Start for 3.173.2. UMULX16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_MULTIPLY
+ * \brief UMULX16 (SIMD Unsigned Crossed 16-bit Multiply)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * UMUL16 Rd, Rs1, Rs2
+ * UMULX16 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do unsigned 16-bit multiplications and generate two 32-bit results simultaneously.
+ *
+ * **RV32 Description**:\n
+ * For the `UMUL16` instruction, multiply the top 16-bit U16 content of Rs1 with
+ * the top 16-bit U16 content of Rs2. At the same time, multiply the bottom 16-bit U16 content of Rs1
+ * with the bottom 16-bit U16 content of Rs2.
+ * For the `UMULX16` instruction, multiply the top 16-bit U16 content of Rs1 with the bottom 16-bit
+ * U16 content of Rs2. At the same time, multiply the bottom 16-bit U16 content of Rs1 with the top 16-
+ * bit U16 content of Rs2.
+ * The two U32 results are then written into an even/odd pair of registers specified by Rd(4,1). Rd(4,1),
+ * i.e., d, determines the even/odd pair group of two registers. Specifically, the register pair includes
+ * register 2d and 2d+1.
+ * The odd `2d+1` register of the pair contains the 32-bit result calculated from the top part of Rs1 and
+ * the even `2d` register of the pair contains the 32-bit result calculated from the bottom part of Rs1.
+ *
+ * **RV64 Description**:\n
+ * For the `UMUL16` instruction, multiply the top 16-bit U16 content of the lower
+ * 32-bit word in Rs1 with the top 16-bit U16 content of the lower 32-bit word in Rs2. At the same time,
+ * multiply the bottom 16-bit U16 content of the lower 32-bit word in Rs1 with the bottom 16-bit U16
+ * content of the lower 32-bit word in Rs2.
+ * For the `UMULX16` instruction, multiply the top 16-bit U16 content of the lower 32-bit word in Rs1
+ * with the bottom 16-bit U16 content of the lower 32-bit word in Rs2. At the same time, multiply the
+ * bottom 16-bit U16 content of the lower 32-bit word in Rs1 with the top 16-bit U16 content of the
+ * lower 32-bit word in Rs2.
+ * The two 32-bit U32 results are then written into Rd. The result calculated from the top 16-bit of the
+ * lower 32-bit word in Rs1 is written to Rd.W[1]. And the result calculated from the bottom 16-bit of
+ * the lower 32-bit word in Rs1 is written to Rd.W[0].
+ *
+ * **Operations**:\n
+ * ~~~
+ * * RV32:
+ * if (is `UMUL16`) {
+ *   op1t = Rs1.H[1]; op2t = Rs2.H[1]; // top
+ *   op1b = Rs1.H[0]; op2b = Rs2.H[0]; // bottom
+ * } else if (is `UMULX16`) {
+ *   op1t = Rs1.H[1]; op2t = Rs2.H[0]; // Rs1 top
+ *   op1b = Rs1.H[0]; op2b = Rs2.H[1]; // Rs1 bottom
+ * }
+ * for ((aop,bop,res) in [(op1t,op2t,rest), (op1b,op2b,resb)]) {
+ *   res = aop u* bop;
+ * }
+ * t_L = CONCAT(Rd(4,1),1'b0); t_H = CONCAT(Rd(4,1),1'b1);
+ * R[t_H] = rest;
+ * R[t_L] = resb;
+ * * RV64:
+ * if (is `UMUL16`) {
+ *   op1t = Rs1.H[1]; op2t = Rs2.H[1]; // top
+ *   op1b = Rs1.H[0]; op2b = Rs2.H[0]; // bottom
+ * } else if (is `UMULX16`) {
+ *   op1t = Rs1.H[1]; op2t = Rs2.H[0]; // Rs1 top
+ *   op1b = Rs1.H[0]; op2b = Rs2.H[1]; // Rs1 bottom
+ * }
+ * for ((aop,bop,res) in [(op1t,op2t,rest), (op1b,op2b,resb)]) {
+ *   res = aop u* bop;
+ * }
+ * Rd.W[1] = rest;
+ * Rd.W[0] = resb;
+ * ~~~
+ *
+ * \param [in]  a    unsigned int type of value stored in a
+ * \param [in]  b    unsigned int type of value stored in b
+ * \return value stored in unsigned long long type
+ */
+__STATIC_FORCEINLINE unsigned long long __RV_UMULX16(unsigned int a, unsigned int b)
+{
+    unsigned long long rd; /* Rd: two 32-bit products, top x bottom and bottom x top halfwords */
+    __ASM volatile("umulx16 %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.173.2. UMULX16 ===== */
+
+/* ===== Inline Function Start for 3.174. URADD8 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_ADDSUB
+ * \brief URADD8 (SIMD 8-bit Unsigned Halving Addition)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * URADD8 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 8-bit unsigned integer element additions simultaneously. The results are halved to
+ * avoid overflow or saturation.
+ *
+ * **Description**:\n
+ * This instruction adds the 8-bit unsigned integer elements in Rs1 with the 8-bit
+ * unsigned integer elements in Rs2. The results are first logically right-shifted by 1 bit and then
+ * written to Rd.
+ *
+ * **Examples**:\n
+ * ~~~
+ * * Rs1 = 0x7F, Rs2 = 0x7F, Rd = 0x7F
+ * * Rs1 = 0x80, Rs2 = 0x80, Rd = 0x80
+ * * Rs1 = 0x40, Rs2 = 0x80, Rd = 0x60
+ * ~~~
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.B[x] = (Rs1.B[x] + Rs2.B[x]) u>> 1;
+ * for RV32: x=3...0,
+ * for RV64: x=7...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_URADD8(unsigned long a, unsigned long b)
+{
+    unsigned long rd; /* Rd: per-byte unsigned sum of a and b, logically shifted right by 1 */
+    __ASM volatile("uradd8 %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.174. URADD8 ===== */
+
+/* ===== Inline Function Start for 3.175. URADD16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB
+ * \brief URADD16 (SIMD 16-bit Unsigned Halving Addition)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * URADD16 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 16-bit unsigned integer element additions simultaneously. The results are halved to
+ * avoid overflow or saturation.
+ *
+ * **Description**:\n
+ * This instruction adds the 16-bit unsigned integer elements in Rs1 with the 16-bit
+ * unsigned integer elements in Rs2. The results are first logically right-shifted by 1 bit and then
+ * written to Rd.
+ *
+ * **Examples**:\n
+ * ~~~
+ * * Rs1 = 0x7FFF, Rs2 = 0x7FFF, Rd = 0x7FFF
+ * * Rs1 = 0x8000, Rs2 = 0x8000, Rd = 0x8000
+ * * Rs1 = 0x4000, Rs2 = 0x8000, Rd = 0x6000
+ * ~~~
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.H[x] = (Rs1.H[x] + Rs2.H[x]) u>> 1;
+ * for RV32: x=1...0,
+ * for RV64: x=3...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_URADD16(unsigned long a, unsigned long b)
+{
+    unsigned long rd; /* Rd: per-halfword unsigned sum of a and b, logically shifted right by 1 */
+    __ASM volatile("uradd16 %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.175. URADD16 ===== */
+
+/* ===== Inline Function Start for 3.176. URADD64 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_64B_ADDSUB
+ * \brief URADD64 (64-bit Unsigned Halving Addition)
+ * \details
+ * **Type**: DSP (64-bit Profile)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * URADD64 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Add two 64-bit unsigned integers. The result is halved to avoid overflow or saturation.
+ *
+ * **RV32 Description**:\n
+ * This instruction adds the 64-bit unsigned integer of an even/odd pair of registers
+ * specified by Rs1(4,1) with the 64-bit unsigned integer of an even/odd pair of registers specified by
+ * Rs2(4,1). The 64-bit addition result is first logically right-shifted by 1 bit and then written to an
+ * even/odd pair of registers specified by Rd(4,1).
+ * Rx(4,1), i.e., d, determines the even/odd pair group of two registers. Specifically, the register pair
+ * includes register 2d and 2d+1.
+ * The odd `2d+1` register of the pair contains the high 32-bit of the result and the even `2d` register
+ * of the pair contains the low 32-bit of the result.
+ *
+ * **RV64 Description**:\n
+ * This instruction adds the 64-bit unsigned integer in Rs1 with the 64-bit unsigned
+ * integer Rs2. The 64-bit addition result is first logically right-shifted by 1 bit and then written to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * * RV32:
+ * t_L = CONCAT(Rd(4,1),1'b0); t_H = CONCAT(Rd(4,1),1'b1);
+ * a_L = CONCAT(Rs1(4,1),1'b0); a_H = CONCAT(Rs1(4,1),1'b1);
+ * b_L = CONCAT(Rs2(4,1),1'b0); b_H = CONCAT(Rs2(4,1),1'b1);
+ * R[t_H].R[t_L] = (R[a_H].R[a_L] + R[b_H].R[b_L]) u>> 1;
+ * * RV64:
+ * Rd = (Rs1 + Rs2) u>> 1;
+ * ~~~
+ *
+ * \param [in]  a    unsigned long long type of value stored in a
+ * \param [in]  b    unsigned long long type of value stored in b
+ * \return value stored in unsigned long long type
+ */
+__STATIC_FORCEINLINE unsigned long long __RV_URADD64(unsigned long long a, unsigned long long b)
+{
+    unsigned long long rd; /* Rd: 64-bit unsigned sum of a and b, logically shifted right by 1 */
+    __ASM volatile("uradd64 %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.176. URADD64 ===== */
+
+/* ===== Inline Function Start for 3.177. URADDW ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_32B_COMPUTATION
+ * \brief URADDW (32-bit Unsigned Halving Addition)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * URADDW Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Add 32-bit unsigned integers and the results are halved to avoid overflow or saturation.
+ *
+ * **Description**:\n
+ * This instruction adds the first 32-bit unsigned integer in Rs1 with the first 32-bit
+ * unsigned integer in Rs2. The result is first logically right-shifted by 1 bit and then sign-extended and
+ * written to Rd.
+ *
+ * **Examples**:\n
+ * ~~~
+ * * Rs1 = 0x7FFFFFFF, Rs2 = 0x7FFFFFFF, Rd = 0x7FFFFFFF
+ * * Rs1 = 0x80000000, Rs2 = 0x80000000, Rd = 0x80000000
+ * * Rs1 = 0x40000000, Rs2 = 0x80000000, Rd = 0x60000000
+ * ~~~
+ *
+ * **Operations**:\n
+ * ~~~
+ * * RV32:
+ * Rd[31:0] = (Rs1[31:0] + Rs2[31:0]) u>> 1;
+ * * RV64:
+ * resw[31:0] = (Rs1[31:0] + Rs2[31:0]) u>> 1;
+ * Rd[63:0] = SE(resw[31:0]);
+ * ~~~
+ *
+ * \param [in]  a    unsigned int type of value stored in a
+ * \param [in]  b    unsigned int type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_URADDW(unsigned int a, unsigned int b)
+{
+    unsigned long rd; /* Rd: low-32-bit unsigned sum of a and b shifted right by 1 (sign-extended on RV64) */
+    __ASM volatile("uraddw %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.177. URADDW ===== */
+
+/* ===== Inline Function Start for 3.178. URCRAS16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB
+ * \brief URCRAS16 (SIMD 16-bit Unsigned Halving Cross Addition & Subtraction)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * URCRAS16 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 16-bit unsigned integer element addition and 16-bit unsigned integer element
+ * subtraction in a 32-bit chunk simultaneously. Operands are from crossed positions in 32-bit chunks.
+ * The results are halved to avoid overflow or saturation.
+ *
+ * **Description**:\n
+ * This instruction adds the 16-bit unsigned integer in [31:16] of 32-bit chunks in Rs1
+ * with the 16-bit unsigned integer in [15:0] of 32-bit chunks in Rs2, and subtracts the 16-bit unsigned
+ * integer in [31:16] of 32-bit chunks in Rs2 from the 16-bit unsigned integer in [15:0] of 32-bit chunks
+ * in Rs1. The element results are first logically right-shifted by 1 bit and then written to [31:16] of 32-
+ * bit chunks in Rd and [15:0] of 32-bit chunks in Rd.
+ *
+ * **Examples**:\n
+ * ~~~
+ * Please see `URADD16` and `URSUB16` instructions.
+ * ~~~
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.W[x][31:16] = (Rs1.W[x][31:16] + Rs2.W[x][15:0]) u>> 1;
+ * Rd.W[x][15:0] = (Rs1.W[x][15:0] - Rs2.W[x][31:16]) u>> 1;
+ * for RV32, x=0
+ * for RV64, x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_URCRAS16(unsigned long a, unsigned long b)
+{
+    register unsigned long rd; /* Rd (%0): packed halved cross add/sub results; a is Rs1, b is Rs2 */
+    __ASM volatile("urcras16 %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.178. URCRAS16 ===== */
+
+/* ===== Inline Function Start for 3.179. URCRSA16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB
+ * \brief URCRSA16 (SIMD 16-bit Unsigned Halving Cross Subtraction & Addition)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * URCRSA16 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 16-bit unsigned integer element subtraction and 16-bit unsigned integer element
+ * addition in a 32-bit chunk simultaneously. Operands are from crossed positions in 32-bit chunks.
+ * The results are halved to avoid overflow or saturation.
+ *
+ * **Description**:\n
+ * This instruction subtracts the 16-bit unsigned integer in [15:0] of 32-bit chunks in Rs2
+ * from the 16-bit unsigned integer in [31:16] of 32-bit chunks in Rs1, and adds the 16-bit unsigned
+ * integer in [15:0] of 32-bit chunks in Rs1 with the 16-bit unsigned integer in [31:16] of 32-bit chunks
+ * in Rs2. The two results are first logically right-shifted by 1 bit and then written to [31:16] of 32-bit
+ * chunks in Rd and [15:0] of 32-bit chunks in Rd.
+ *
+ * **Examples**:\n
+ * ~~~
+ * Please see `URADD16` and `URSUB16` instructions.
+ * ~~~
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.W[x][31:16] = (Rs1.W[x][31:16] - Rs2.W[x][15:0]) u>> 1;
+ * Rd.W[x][15:0] = (Rs1.W[x][15:0] + Rs2.W[x][31:16]) u>> 1;
+ * for RV32, x=0
+ * for RV64, x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_URCRSA16(unsigned long a, unsigned long b)
+{
+    register unsigned long rd; /* Rd (%0): packed halved cross sub/add results; a is Rs1, b is Rs2 */
+    __ASM volatile("urcrsa16 %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.179. URCRSA16 ===== */
+
+/* ===== Inline Function Start for 3.180. URSTAS16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB
+ * \brief URSTAS16 (SIMD 16-bit Unsigned Halving Straight Addition & Subtraction)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * URSTAS16 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 16-bit unsigned integer element addition and 16-bit unsigned integer element
+ * subtraction in a 32-bit chunk simultaneously. Operands are from corresponding positions in 32-bit
+ * chunks. The results are halved to avoid overflow or saturation.
+ *
+ * **Description**:\n
+ * This instruction adds the 16-bit unsigned integer in [31:16] of 32-bit chunks in Rs1
+ * with the 16-bit unsigned integer in [31:16] of 32-bit chunks in Rs2, and subtracts the 16-bit unsigned
+ * integer in [15:0] of 32-bit chunks in Rs2 from the 16-bit unsigned integer in [15:0] of 32-bit chunks
+ * in Rs1. The element results are first logically right-shifted by 1 bit and then written to [31:16] of 32-
+ * bit chunks in Rd and [15:0] of 32-bit chunks in Rd.
+ *
+ * **Examples**:\n
+ * ~~~
+ * Please see `URADD16` and `URSUB16` instructions.
+ * ~~~
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.W[x][31:16] = (Rs1.W[x][31:16] + Rs2.W[x][31:16]) u>> 1;
+ * Rd.W[x][15:0] = (Rs1.W[x][15:0] - Rs2.W[x][15:0]) u>> 1;
+ * for RV32, x=0
+ * for RV64, x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_URSTAS16(unsigned long a, unsigned long b)
+{
+    register unsigned long rd; /* Rd (%0): packed halved straight add/sub results; a is Rs1, b is Rs2 */
+    __ASM volatile("urstas16 %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.180. URSTAS16 ===== */
+
+/* ===== Inline Function Start for 3.181. URSTSA16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB
+ * \brief URSTSA16 (SIMD 16-bit Unsigned Halving Straight Subtraction & Addition)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * URSTSA16 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 16-bit unsigned integer element subtraction and 16-bit unsigned integer element
+ * addition in a 32-bit chunk simultaneously. Operands are from corresponding positions in 32-bit
+ * chunks. The results are halved to avoid overflow or saturation.
+ *
+ * **Description**:\n
+ * This instruction subtracts the 16-bit unsigned integer in [31:16] of 32-bit chunks in Rs2
+ * from the 16-bit unsigned integer in [31:16] of 32-bit chunks in Rs1, and adds the 16-bit unsigned
+ * integer in [15:0] of 32-bit chunks in Rs1 with the 16-bit unsigned integer in [15:0] of 32-bit chunks in
+ * Rs2. The two results are first logically right-shifted by 1 bit and then written to [31:16] of 32-bit
+ * chunks in Rd and [15:0] of 32-bit chunks in Rd.
+ *
+ * **Examples**:\n
+ * ~~~
+ * Please see `URADD16` and `URSUB16` instructions.
+ * ~~~
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.W[x][31:16] = (Rs1.W[x][31:16] - Rs2.W[x][31:16]) u>> 1;
+ * Rd.W[x][15:0] = (Rs1.W[x][15:0] + Rs2.W[x][15:0]) u>> 1;
+ * for RV32, x=0
+ * for RV64, x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_URSTSA16(unsigned long a, unsigned long b)
+{
+    register unsigned long rd; /* Rd (%0): packed halved straight sub/add results; a is Rs1, b is Rs2 */
+    __ASM volatile("urstsa16 %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.181. URSTSA16 ===== */
+
+/* ===== Inline Function Start for 3.182. URSUB8 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_ADDSUB
+ * \brief URSUB8 (SIMD 8-bit Unsigned Halving Subtraction)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * URSUB8 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 8-bit unsigned integer element subtractions simultaneously. The results are halved to
+ * avoid overflow or saturation.
+ *
+ * **Description**:\n
+ * This instruction subtracts the 8-bit unsigned integer elements in Rs2 from the 8-bit
+ * unsigned integer elements in Rs1. The results are first logically right-shifted by 1 bit and then
+ * written to Rd.
+ *
+ * **Examples**:\n
+ * ~~~
+ * * Ra = 0x7F, Rb = 0x80 Rt = 0xFF
+ * * Ra = 0x80, Rb = 0x7F Rt = 0x00
+ * * Ra = 0x80, Rb = 0x40 Rt = 0x20
+ * ~~~
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.B[x] = (Rs1.B[x] - Rs2.B[x]) u>> 1;
+ * for RV32: x=3...0,
+ * for RV64: x=7...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_URSUB8(unsigned long a, unsigned long b)
+{
+    register unsigned long rd; /* Rd (%0): per-byte halved differences, a (Rs1) minus b (Rs2) */
+    __ASM volatile("ursub8 %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.182. URSUB8 ===== */
+
+/* ===== Inline Function Start for 3.183. URSUB16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_16B_ADDSUB
+ * \brief URSUB16 (SIMD 16-bit Unsigned Halving Subtraction)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * URSUB16 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 16-bit unsigned integer element subtractions simultaneously. The results are halved to
+ * avoid overflow or saturation.
+ *
+ * **Description**:\n
+ * This instruction subtracts the 16-bit unsigned integer elements in Rs2 from the 16-bit
+ * unsigned integer elements in Rs1. The results are first logically right-shifted by 1 bit and then
+ * written to Rd.
+ *
+ * **Examples**:\n
+ * ~~~
+ * * Ra = 0x7FFF, Rb = 0x8000 Rt = 0xFFFF
+ * * Ra = 0x8000, Rb = 0x7FFF Rt = 0x0000
+ * * Ra = 0x8000, Rb = 0x4000 Rt = 0x2000
+ * ~~~
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.H[x] = (Rs1.H[x] - Rs2.H[x]) u>> 1;
+ * for RV32: x=1...0,
+ * for RV64: x=3...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_URSUB16(unsigned long a, unsigned long b)
+{
+    register unsigned long rd; /* Rd (%0): per-halfword halved differences, a (Rs1) minus b (Rs2) */
+    __ASM volatile("ursub16 %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.183. URSUB16 ===== */
+
+/* ===== Inline Function Start for 3.184. URSUB64 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_64B_ADDSUB
+ * \brief URSUB64 (64-bit Unsigned Halving Subtraction)
+ * \details
+ * **Type**: DSP (64-bit Profile)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * URSUB64 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Perform a 64-bit unsigned integer subtraction. The result is halved to avoid overflow or
+ * saturation.
+ *
+ * **RV32 Description**:\n
+ * This instruction subtracts the 64-bit unsigned integer of an even/odd pair of
+ * registers specified by Rs2(4,1) from the 64-bit unsigned integer of an even/odd pair of registers
+ * specified by Rs1(4,1). The subtraction result is first logically right-shifted by 1 bit and then written
+ * to an even/odd pair of registers specified by Rd(4,1).
+ * Rx(4,1), i.e., d, determines the even/odd pair group of two registers. Specifically, the register pair
+ * includes register 2d and 2d+1.
+ * The odd `2d+1` register of the pair contains the high 32-bit of the result and the even `2d` register
+ * of the pair contains the low 32-bit of the result.
+ *
+ * **RV64 Description**:\n
+ * This instruction subtracts the 64-bit unsigned integer in Rs2 from the 64-bit
+ * unsigned integer in Rs1. The subtraction result is first logically right-shifted by 1 bit and then
+ * written to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * * RV32:
+ * t_L = CONCAT(Rt(4,1),1'b0); t_H = CONCAT(Rt(4,1),1'b1);
+ * a_L = CONCAT(Ra(4,1),1'b0); a_H = CONCAT(Ra(4,1),1'b1);
+ * b_L = CONCAT(Rb(4,1),1'b0); b_H = CONCAT(Rb(4,1),1'b1);
+ * R[t_H].R[t_L] = (R[a_H].R[a_L] - R[b_H].R[b_L]) u>> 1;
+ * * RV64:
+ * Rd = (Rs1 - Rs2) u>> 1;
+ * ~~~
+ *
+ * \param [in]  a    unsigned long long type of value stored in a
+ * \param [in]  b    unsigned long long type of value stored in b
+ * \return value stored in unsigned long long type
+ */
+__STATIC_FORCEINLINE unsigned long long __RV_URSUB64(unsigned long long a, unsigned long long b)
+{
+    register unsigned long long rd; /* Rd (%0): halved 64-bit difference, a (Rs1) minus b (Rs2) */
+    __ASM volatile("ursub64 %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.184. URSUB64 ===== */
+
+/* ===== Inline Function Start for 3.185. URSUBW ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_32B_COMPUTATION
+ * \brief URSUBW (32-bit Unsigned Halving Subtraction)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * URSUBW Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Subtract 32-bit unsigned integers and the result is halved to avoid overflow or saturation.
+ *
+ * **Description**:\n
+ * This instruction subtracts the first 32-bit unsigned integer in Rs2 from the first 32-bit
+ * unsigned integer in Rs1. The result is first logically right-shifted by 1 bit and then sign-extended and
+ * written to Rd.
+ *
+ * **Examples**:\n
+ * ~~~
+ * * Ra = 0x7FFFFFFF, Rb = 0x80000000 Rt = 0xFFFFFFFF
+ * * Ra = 0x80000000, Rb = 0x7FFFFFFF Rt = 0x00000000
+ * * Ra = 0x80000000, Rb = 0x40000000 Rt = 0x20000000
+ * ~~~
+ *
+ * **Operations**:\n
+ * ~~~
+ * * RV32:
+ * Rd[31:0] = (Rs1[31:0] - Rs2[31:0]) u>> 1;
+ * * RV64:
+ * resw[31:0] = (Rs1[31:0] - Rs2[31:0]) u>> 1;
+ * Rd[63:0] = SE(resw[31:0]);
+ * ~~~
+ *
+ * \param [in]  a    unsigned int type of value stored in a
+ * \param [in]  b    unsigned int type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_URSUBW(unsigned int a, unsigned int b)
+{
+    register unsigned long rd; /* Rd (%0): halved 32-bit difference, a (Rs1) minus b (Rs2) */
+    __ASM volatile("ursubw %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.185. URSUBW ===== */
+
+/* ===== Inline Function Start for 3.186. WEXTI ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_MISC
+ * \brief WEXTI (Extract Word from 64-bit Immediate)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * WEXTI Rd, Rs1, #LSBloc
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Extract a 32-bit word from a 64-bit value stored in an even/odd pair of registers (RV32) or
+ * a register (RV64) starting from a specified immediate LSB bit position.
+ *
+ * **RV32 Description**:\n
+ * This instruction extracts a 32-bit word from a 64-bit value of an even/odd pair of registers specified
+ * by Rs1(4,1) starting from a specified immediate LSB bit position, #LSBloc. The extracted word is
+ * written to Rd.
+ * Rs1(4,1), i.e., d, determines the even/odd pair group of the two registers. Specifically, the register
+ * pair includes register 2d and 2d+1.
+ * The odd `2d+1` register of the pair contains the high 32-bit of the 64-bit value and the even `2d`
+ * register of the pair contains the low 32-bit of the 64-bit value.
+ *
+ * **RV64 Description**:\n
+ * This instruction extracts a 32-bit word from a 64-bit value in Rs1 starting from a specified
+ * immediate LSB bit position, #LSBloc. The extracted word is sign-extended and written to lower 32-
+ * bit of Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * * RV32:
+ * Idx0 = CONCAT(Rs1(4,1),1'b0); Idx1 = CONCAT(Rs1(4,1),1'b1);
+ * src[63:0] = Concat(R[Idx1], R[Idx0]);
+ * Rd = src[31+LSBloc:LSBloc];
+ * * RV64:
+ * ExtractW = Rs1[31+LSBloc:LSBloc];
+ * Rd = SE(ExtractW)
+ * ~~~
+ *
+ * \param [in]  a    long long type of value stored in a
+ * \param [in]  b    immediate LSB bit position (#LSBloc); must be a compile-time constant
+ * \return value stored in unsigned long type
+ */
+#define __RV_WEXTI(a, b)    \
+    ({    \
+        register long long __a = (long long)(a); /* expand (a) before __result exists, so a caller variable named "result" is not shadowed */    \
+        register unsigned long __result;         /* destination operand (%0) */    \
+        __ASM volatile("wexti %0, %1, %2" : "=r"(__result) : "r"(__a), "K"(b)); /* "K": b must be a constant immediate */    \
+        __result;    \
+    })
+/* ===== Inline Function End for 3.186. WEXTI ===== */
+
+/* ===== Inline Function Start for 3.187. WEXT ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_NON_SIMD_MISC
+ * \brief WEXT (Extract Word from 64-bit)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * WEXT Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Extract a 32-bit word from a 64-bit value stored in an even/odd pair of registers (RV32) or
+ * a register (RV64) starting from a specified LSB bit position in a register.
+ *
+ * **RV32 Description**:\n
+ * This instruction extracts a 32-bit word from a 64-bit value of an even/odd pair of registers specified
+ * by Rs1(4,1) starting from a specified LSB bit position, specified in Rs2[4:0]. The extracted word is
+ * written to Rd.
+ * Rs1(4,1), i.e., d, determines the even/odd pair group of the two registers. Specifically, the register
+ * pair includes register 2d and 2d+1.
+ * The odd `2d+1` register of the pair contains the high 32-bit of the 64-bit value and the even `2d`
+ * register of the pair contains the low 32-bit of the 64-bit value.
+ *
+ * **Operations**:\n
+ * ~~~
+ * * RV32:
+ * Idx0 = CONCAT(Rs1(4,1),1'b0); Idx1 = CONCAT(Rs1(4,1),1'b1);
+ * src[63:0] = Concat(R[Idx1], R[Idx0]);
+ * LSBloc = Rs2[4:0];
+ * Rd = src[31+LSBloc:LSBloc];
+ * * RV64:
+ * LSBloc = Rs2[4:0];
+ * ExtractW = Rs1[31+LSBloc:LSBloc];
+ * Rd = SE(ExtractW)
+ * ~~~
+ *
+ * \param [in]  a    long long type of value stored in a
+ * \param [in]  b    unsigned int type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_WEXT(long long a, unsigned int b)
+{
+    register unsigned long rd; /* Rd (%0): extracted word; a is the 64-bit source (Rs1), b the LSB position (Rs2) */
+    __ASM volatile("wext %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 3.187. WEXT ===== */
+
+/* ===== Inline Function Start for 3.188.1. ZUNPKD810 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_UNPACK
+ * \brief ZUNPKD810 (Unsigned Unpacking Bytes 1 & 0)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * ZUNPKD8xy Rd, Rs1
+ * xy = {10, 20, 30, 31, 32}
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Unpack byte x and byte y of 32-bit chunks in a register into two 16-bit unsigned
+ * halfwords of 32-bit chunks in a register.
+ *
+ * **Description**:\n
+ * For the `ZUNPKD8(x)(y)` instruction, it unpacks byte *x* and byte *y* of 32-bit chunks in Rs1 into
+ * two 16-bit unsigned halfwords and writes the results to the top part and the bottom part of 32-bit
+ * chunks in Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.W[m].H[1] = ZE16(Rs1.W[m].B[x])
+ * Rd.W[m].H[0] = ZE16(Rs1.W[m].B[y])
+ * // ZUNPKD810, x=1,y=0
+ * // ZUNPKD820, x=2,y=0
+ * // ZUNPKD830, x=3,y=0
+ * // ZUNPKD831, x=3,y=1
+ * // ZUNPKD832, x=3,y=2
+ * for RV32: m=0,
+ * for RV64: m=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_ZUNPKD810(unsigned long a)
+{
+    register unsigned long rd; /* Rd (%0): bytes 1 and 0 of a (Rs1) zero-extended to halfwords */
+    __ASM volatile("zunpkd810 %0, %1" : "=r"(rd) : "r"(a));
+    return rd;
+}
+/* ===== Inline Function End for 3.188.1. ZUNPKD810 ===== */
+
+/* ===== Inline Function Start for 3.188.2. ZUNPKD820 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_UNPACK
+ * \brief ZUNPKD820 (Unsigned Unpacking Bytes 2 & 0)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * ZUNPKD8xy Rd, Rs1
+ * xy = {10, 20, 30, 31, 32}
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Unpack byte x and byte y of 32-bit chunks in a register into two 16-bit unsigned
+ * halfwords of 32-bit chunks in a register.
+ *
+ * **Description**:\n
+ * For the `ZUNPKD8(x)(y)` instruction, it unpacks byte *x* and byte *y* of 32-bit chunks in Rs1 into
+ * two 16-bit unsigned halfwords and writes the results to the top part and the bottom part of 32-bit
+ * chunks in Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.W[m].H[1] = ZE16(Rs1.W[m].B[x])
+ * Rd.W[m].H[0] = ZE16(Rs1.W[m].B[y])
+ * // ZUNPKD810, x=1,y=0
+ * // ZUNPKD820, x=2,y=0
+ * // ZUNPKD830, x=3,y=0
+ * // ZUNPKD831, x=3,y=1
+ * // ZUNPKD832, x=3,y=2
+ * for RV32: m=0,
+ * for RV64: m=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_ZUNPKD820(unsigned long a)
+{
+    register unsigned long rd; /* Rd (%0): bytes 2 and 0 of a (Rs1) zero-extended to halfwords */
+    __ASM volatile("zunpkd820 %0, %1" : "=r"(rd) : "r"(a));
+    return rd;
+}
+/* ===== Inline Function End for 3.188.2. ZUNPKD820 ===== */
+
+/* ===== Inline Function Start for 3.188.3. ZUNPKD830 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_UNPACK
+ * \brief ZUNPKD830 (Unsigned Unpacking Bytes 3 & 0)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * ZUNPKD8xy Rd, Rs1
+ * xy = {10, 20, 30, 31, 32}
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Unpack byte x and byte y of 32-bit chunks in a register into two 16-bit unsigned
+ * halfwords of 32-bit chunks in a register.
+ *
+ * **Description**:\n
+ * For the `ZUNPKD8(x)(y)` instruction, it unpacks byte *x* and byte *y* of 32-bit chunks in Rs1 into
+ * two 16-bit unsigned halfwords and writes the results to the top part and the bottom part of 32-bit
+ * chunks in Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.W[m].H[1] = ZE16(Rs1.W[m].B[x])
+ * Rd.W[m].H[0] = ZE16(Rs1.W[m].B[y])
+ * // ZUNPKD810, x=1,y=0
+ * // ZUNPKD820, x=2,y=0
+ * // ZUNPKD830, x=3,y=0
+ * // ZUNPKD831, x=3,y=1
+ * // ZUNPKD832, x=3,y=2
+ * for RV32: m=0,
+ * for RV64: m=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_ZUNPKD830(unsigned long a)
+{
+    register unsigned long rd; /* Rd (%0): bytes 3 and 0 of a (Rs1) zero-extended to halfwords */
+    __ASM volatile("zunpkd830 %0, %1" : "=r"(rd) : "r"(a));
+    return rd;
+}
+/* ===== Inline Function End for 3.188.3. ZUNPKD830 ===== */
+
+/* ===== Inline Function Start for 3.188.4. ZUNPKD831 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_UNPACK
+ * \brief ZUNPKD831 (Unsigned Unpacking Bytes 3 & 1)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * ZUNPKD8xy Rd, Rs1
+ * xy = {10, 20, 30, 31, 32}
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Unpack byte x and byte y of 32-bit chunks in a register into two 16-bit unsigned
+ * halfwords of 32-bit chunks in a register.
+ *
+ * **Description**:\n
+ * For the `ZUNPKD8(x)(y)` instruction, it unpacks byte *x* and byte *y* of 32-bit chunks in Rs1 into
+ * two 16-bit unsigned halfwords and writes the results to the top part and the bottom part of 32-bit
+ * chunks in Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.W[m].H[1] = ZE16(Rs1.W[m].B[x])
+ * Rd.W[m].H[0] = ZE16(Rs1.W[m].B[y])
+ * // ZUNPKD810, x=1,y=0
+ * // ZUNPKD820, x=2,y=0
+ * // ZUNPKD830, x=3,y=0
+ * // ZUNPKD831, x=3,y=1
+ * // ZUNPKD832, x=3,y=2
+ * for RV32: m=0,
+ * for RV64: m=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_ZUNPKD831(unsigned long a)
+{
+    register unsigned long rd; /* Rd (%0): bytes 3 and 1 of a (Rs1) zero-extended to halfwords */
+    __ASM volatile("zunpkd831 %0, %1" : "=r"(rd) : "r"(a));
+    return rd;
+}
+/* ===== Inline Function End for 3.188.4. ZUNPKD831 ===== */
+
+/* ===== Inline Function Start for 3.188.5. ZUNPKD832 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_SIMD_8B_UNPACK
+ * \brief ZUNPKD832 (Unsigned Unpacking Bytes 3 & 2)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * ZUNPKD8xy Rd, Rs1
+ * xy = {10, 20, 30, 31, 32}
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Unpack byte x and byte y of 32-bit chunks in a register into two 16-bit unsigned
+ * halfwords of 32-bit chunks in a register.
+ *
+ * **Description**:\n
+ * For the `ZUNPKD8(x)(y)` instruction, it unpacks byte *x* and byte *y* of 32-bit chunks in Rs1 into
+ * two 16-bit unsigned halfwords and writes the results to the top part and the bottom part of 32-bit
+ * chunks in Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.W[m].H[1] = ZE16(Rs1.W[m].B[x])
+ * Rd.W[m].H[0] = ZE16(Rs1.W[m].B[y])
+ * // ZUNPKD810, x=1,y=0
+ * // ZUNPKD820, x=2,y=0
+ * // ZUNPKD830, x=3,y=0
+ * // ZUNPKD831, x=3,y=1
+ * // ZUNPKD832, x=3,y=2
+ * for RV32: m=0,
+ * for RV64: m=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_ZUNPKD832(unsigned long a)
+{
+    register unsigned long rd; /* Rd (%0): bytes 3 and 2 of a (Rs1) zero-extended to halfwords */
+    __ASM volatile("zunpkd832 %0, %1" : "=r"(rd) : "r"(a));
+    return rd;
+}
+/* ===== Inline Function End for 3.188.5. ZUNPKD832 ===== */
+
+#if (__RISCV_XLEN == 64) || defined(__ONLY_FOR_DOXYGEN_DOCUMENT_GENERATION__)
+
+/* ===== Inline Function Start for 4.1. ADD32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB
+ * \brief ADD32 (SIMD 32-bit Addition)
+ * \details
+ * **Type**: SIMD (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * ADD32 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 32-bit integer element additions simultaneously.
+ *
+ * **Description**:\n
+ * This instruction adds the 32-bit integer elements in Rs1 with the 32-bit integer
+ * elements in Rs2, and then writes the 32-bit element results to Rd.
+ *
+ * **Note**:\n
+ * This instruction can be used for either signed or unsigned addition.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.W[x] = Rs1.W[x] + Rs2.W[x];
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_ADD32(unsigned long a, unsigned long b)
+{
+    register unsigned long rd; /* Rd (%0): per-word sums of a (Rs1) and b (Rs2) */
+    __ASM volatile("add32 %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 4.1. ADD32 ===== */
+
+/* ===== Inline Function Start for 4.2. CRAS32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB
+ * \brief CRAS32 (SIMD 32-bit Cross Addition & Subtraction)
+ * \details
+ * **Type**: SIMD (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * CRAS32 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 32-bit integer element addition and 32-bit integer element subtraction in a 64-bit
+ * chunk simultaneously. Operands are from crossed 32-bit elements.
+ *
+ * **Description**:\n
+ * This instruction adds the 32-bit integer element in [63:32] of Rs1 with the 32-bit
+ * integer element in [31:0] of Rs2, and writes the result to [63:32] of Rd; at the same time, it subtracts
+ * the 32-bit integer element in [63:32] of Rs2 from the 32-bit integer element in [31:0] of Rs1, and
+ * writes the result to [31:0] of Rd.
+ *
+ * **Note**:\n
+ * This instruction can be used for either signed or unsigned operations.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.W[1] = Rs1.W[1] + Rs2.W[0];
+ * Rd.W[0] = Rs1.W[0] - Rs2.W[1];
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_CRAS32(unsigned long a, unsigned long b)
+{
+    register unsigned long rd; /* Rd (%0): cross add (upper word) / sub (lower word); a is Rs1, b is Rs2 */
+    __ASM volatile("cras32 %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 4.2. CRAS32 ===== */
+
+/* ===== Inline Function Start for 4.3. CRSA32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB
+ * \brief CRSA32 (SIMD 32-bit Cross Subtraction & Addition)
+ * \details
+ * **Type**: SIMD (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * CRSA32 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 32-bit integer element subtraction and 32-bit integer element addition in a 64-bit chunk simultaneously. Operands are from crossed 32-bit elements.
+ *
+ * **Description**:\n
+ * This instruction subtracts the 32-bit integer element in [31:0] of Rs2 from the 32-bit integer element
+ * in [63:32] of Rs1, and writes the result to [63:32] of Rd; at the same time, it adds the 32-bit integer
+ * element in [31:0] of Rs1 with the 32-bit integer element in [63:32] of Rs2, and writes the result to
+ * [31:0] of Rd
+ *
+ * **Note**:\n
+ * This instruction can be used for either signed or unsigned operations.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.W[1] = Rs1.W[1] - Rs2.W[0];
+ * Rd.W[0] = Rs1.W[0] + Rs2.W[1];
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_CRSA32(unsigned long a, unsigned long b)
+{
+    register unsigned long rd; /* Rd (%0): cross sub (upper word) / add (lower word); a is Rs1, b is Rs2 */
+    __ASM volatile("crsa32 %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 4.3. CRSA32 ===== */
+
+/* ===== Inline Function Start for 4.4. KABS32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_MISC
+ * \brief KABS32 (Scalar 32-bit Absolute Value with Saturation)
+ * \details
+ * **Type**: DSP (RV64 Only)
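+ *
+ * **Encoding**:\n
+ * Fields for bits 24..0 (the field above bit 24 is not listed):
+ * ~~~
+ * Bits    | Field  | Value
+ * --------+--------+--------
+ * 24..20  | KABS32 | 10010
+ * 19..15  | Rs1    |
+ * 14..12  |        | 000
+ * 11..7   | Rd     |
+ * 6..0    | GE80B  | 1111111
+ * ~~~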
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KABS32 Rd, Rs1
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Get the absolute value of signed 32-bit integer elements in a general register.
+ *
+ * **Description**:\n
+ * This instruction calculates the absolute value of signed 32-bit integer elements stored
+ * in Rs1. The results are written to Rd. This instruction with the minimum negative integer input of
+ * 0x80000000 will produce a saturated output of maximum positive integer of 0x7fffffff and the OV
+ * flag will be set to 1.
+ *
+ * **Operations**:\n
+ * ~~~
+ * if (Rs1.W[x] >= 0) {
+ *   res[x] = Rs1.W[x];
+ * } else {
+ *   If (Rs1.W[x] == 0x80000000) {
+ *     res[x] = 0x7fffffff;
+ *     OV = 1;
+ *   } else {
+ *     res[x] = -Rs1.W[x];
+ *   }
+ * }
+ * Rd.W[x] = res[x];
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_KABS32(unsigned long a)
+{
+    register unsigned long rd; /* Rd (%0): per-word saturated absolute values of a (Rs1) */
+    __ASM volatile("kabs32 %0, %1" : "=r"(rd) : "r"(a));
+    return rd;
+}
+/* ===== Inline Function End for 4.4. KABS32 ===== */
+
+/* ===== Inline Function Start for 4.5. KADD32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB
+ * \brief KADD32 (SIMD 32-bit Signed Saturating Addition)
+ * \details
+ * **Type**: SIMD (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KADD32 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 32-bit signed integer element saturating additions simultaneously.
+ *
+ * **Description**:\n
+ * This instruction adds the 32-bit signed integer elements in Rs1 with the 32-bit signed
+ * integer elements in Rs2. If any of the results are beyond the Q31 number range (-2^31 <= Q31 <= 2^31-1),
+ * they are saturated to the range and the OV bit is set to 1. The saturated results are written to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * res[x] = Rs1.W[x] + Rs2.W[x];
+ * if (res[x] > (2^31)-1) {
+ *   res[x] = (2^31)-1;
+ *   OV = 1;
+ * } else if (res[x] < -2^31) {
+ *   res[x] = -2^31;
+ *   OV = 1;
+ * }
+ * Rd.W[x] = res[x];
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_KADD32(unsigned long a, unsigned long b)
+{
+    register unsigned long rd; /* Rd (%0): per-word saturated signed sums of a (Rs1) and b (Rs2) */
+    __ASM volatile("kadd32 %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 4.5. KADD32 ===== */
+
+/* ===== Inline Function Start for 4.6. KCRAS32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB
+ * \brief KCRAS32 (SIMD 32-bit Signed Saturating Cross Addition & Subtraction)
+ * \details
+ * **Type**: SIMD (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KCRAS32 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 32-bit signed integer element saturating addition and 32-bit signed integer element
+ * saturating subtraction in a 64-bit chunk simultaneously. Operands are from crossed 32-bit elements.
+ *
+ * **Description**:\n
+ * This instruction adds the 32-bit integer element in [63:32] of Rs1 with the 32-bit
+ * integer element in [31:0] of Rs2; at the same time, it subtracts the 32-bit integer element in [63:32] of
+ * Rs2 from the 32-bit integer element in [31:0] of Rs1. If any of the results are beyond the Q31 number
+ * range (-2^31 <= Q31 <= 2^31-1), they are saturated to the range and the OV bit is set to 1. The saturated
+ * results are written to [63:32] of Rd for addition and [31:0] of Rd for subtraction.
+ *
+ * **Operations**:\n
+ * ~~~
+ * res[1] = Rs1.W[1] + Rs2.W[0];
+ * res[0] = Rs1.W[0] - Rs2.W[1];
+ * if (res[x] > (2^31)-1) {
+ *   res[x] = (2^31)-1;
+ *   OV = 1;
+ * } else if (res[x] < -2^31) {
+ *   res[x] = -2^31;
+ *   OV = 1;
+ * }
+ * Rd.W[1] = res[1];
+ * Rd.W[0] = res[0];
+ * for RV64, x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_KCRAS32(unsigned long a, unsigned long b)
+{
+    unsigned long rd; /* Rd: [63:32] = saturated sum, [31:0] = saturated difference */
+    __ASM volatile("kcras32 %[rd], %[rs1], %[rs2]" : [rd] "=r"(rd) : [rs1] "r"(a), [rs2] "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 4.6. KCRAS32 ===== */
+
+/* ===== Inline Function Start for 4.7. KCRSA32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB
+ * \brief KCRSA32 (SIMD 32-bit Signed Saturating Cross Subtraction & Addition)
+ * \details
+ * **Type**: SIMD (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KCRSA32 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 32-bit signed integer element saturating subtraction and 32-bit signed integer element
+ * saturating addition in a 64-bit chunk simultaneously. Operands are from crossed 32-bit elements.
+ * **Description**:\n
+ * This instruction subtracts the 32-bit integer element in [31:0] of Rs2 from the 32-bit integer element
+ * in [63:32] of Rs1; at the same time, it adds the 32-bit integer element in [31:0] of Rs1 with the 32-bit
+ * integer element in [63:32] of Rs2. If any of the results are beyond the Q31 number range (-2^31 <= Q31
+ * <= 2^31-1), they are saturated to the range and the OV bit is set to 1. The saturated results are written to
+ * [63:32] of Rd for subtraction and [31:0] of Rd for addition.
+ *
+ * **Operations**:\n
+ * ~~~
+ * res[1] = Rs1.W[1] - Rs2.W[0];
+ * res[0] = Rs1.W[0] + Rs2.W[1];
+ * if (res[x] > (2^31)-1) {
+ *   res[x] = (2^31)-1;
+ *   OV = 1;
+ * } else if (res[x] < -2^31) {
+ *   res[x] = -2^31;
+ *   OV = 1;
+ * }
+ * Rd.W[1] = res[1];
+ * Rd.W[0] = res[0];
+ * for RV64, x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_KCRSA32(unsigned long a, unsigned long b)
+{
+    unsigned long rd; /* Rd: [63:32] = saturated difference, [31:0] = saturated sum */
+    __ASM volatile("kcrsa32 %[rd], %[rs1], %[rs2]" : [rd] "=r"(rd) : [rs1] "r"(a), [rs2] "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 4.7. KCRSA32 ===== */
+
+/* ===== Inline Function Start for 4.8.1. KDMBB16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_Q15_SAT_MULT
+ * \brief KDMBB16 (SIMD Signed Saturating Double Multiply B16 x B16)
+ * \details
+ * **Type**: SIMD (RV64 only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KDMxy16 Rd, Rs1, Rs2 (xy = BB, BT, TT)
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the signed Q15 integer contents of two 16-bit data in the corresponding portion
+ * of the 32-bit chunks in registers and then double and saturate the Q31 results into the 32-bit chunks
+ * in the destination register. If saturation happens, an overflow flag OV will be set.
+ *
+ * **Description**:\n
+ * Multiply the top or bottom 16-bit Q15 content of the 32-bit portions in Rs1 with the top
+ * or bottom 16-bit Q15 content of the 32-bit portions in Rs2. The Q30 results are then doubled and
+ * saturated into Q31 values. The Q31 values are then written into the 32-bit chunks in Rd. When both
+ * the two Q15 inputs are 0x8000, saturation will happen. The result will be saturated to 0x7FFFFFFF
+ * and the overflow flag OV will be set.
+ *
+ * **Operations**:\n
+ * ~~~
+ * // KDMBB16: (x,y,z)=(0,0,0),(2,2,1)
+ * // KDMBT16: (x,y,z)=(0,1,0),(2,3,1)
+ * // KDMTT16: (x,y,z)=(1,1,0),(3,3,1)
+ * aop[z] = Rs1.H[x]; bop[z] = Rs2.H[y];
+ * If (0x8000 != aop[z] | 0x8000 != bop[z]) {
+ *   Mresult[z] = aop[z] * bop[z];
+ *   resQ31[z] = Mresult[z] << 1;
+ * } else {
+ *   resQ31[z] = 0x7FFFFFFF;
+ *   OV = 1;
+ * }
+ * Rd.W[z] = resQ31[z];
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_KDMBB16(unsigned long a, unsigned long b)
+{
+    unsigned long rd; /* Rd: one doubled, saturated Q31 product per 32-bit chunk */
+    __ASM volatile("kdmbb16 %[rd], %[rs1], %[rs2]" : [rd] "=r"(rd) : [rs1] "r"(a), [rs2] "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 4.8.1. KDMBB16 ===== */
+
+/* ===== Inline Function Start for 4.8.2. KDMBT16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_Q15_SAT_MULT
+ * \brief KDMBT16 (SIMD Signed Saturating Double Multiply B16 x T16)
+ * \details
+ * **Type**: SIMD (RV64 only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KDMxy16 Rd, Rs1, Rs2 (xy = BB, BT, TT)
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the signed Q15 integer contents of two 16-bit data in the corresponding portion
+ * of the 32-bit chunks in registers and then double and saturate the Q31 results into the 32-bit chunks
+ * in the destination register. If saturation happens, an overflow flag OV will be set.
+ *
+ * **Description**:\n
+ * Multiply the top or bottom 16-bit Q15 content of the 32-bit portions in Rs1 with the top
+ * or bottom 16-bit Q15 content of the 32-bit portions in Rs2. The Q30 results are then doubled and
+ * saturated into Q31 values. The Q31 values are then written into the 32-bit chunks in Rd. When both
+ * the two Q15 inputs are 0x8000, saturation will happen. The result will be saturated to 0x7FFFFFFF
+ * and the overflow flag OV will be set.
+ *
+ * **Operations**:\n
+ * ~~~
+ * // KDMBB16: (x,y,z)=(0,0,0),(2,2,1)
+ * // KDMBT16: (x,y,z)=(0,1,0),(2,3,1)
+ * // KDMTT16: (x,y,z)=(1,1,0),(3,3,1)
+ * aop[z] = Rs1.H[x]; bop[z] = Rs2.H[y];
+ * If (0x8000 != aop[z] | 0x8000 != bop[z]) {
+ *   Mresult[z] = aop[z] * bop[z];
+ *   resQ31[z] = Mresult[z] << 1;
+ * } else {
+ *   resQ31[z] = 0x7FFFFFFF;
+ *   OV = 1;
+ * }
+ * Rd.W[z] = resQ31[z];
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_KDMBT16(unsigned long a, unsigned long b)
+{
+    unsigned long rd; /* Rd: one doubled, saturated Q31 product per 32-bit chunk */
+    __ASM volatile("kdmbt16 %[rd], %[rs1], %[rs2]" : [rd] "=r"(rd) : [rs1] "r"(a), [rs2] "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 4.8.2. KDMBT16 ===== */
+
+/* ===== Inline Function Start for 4.8.3. KDMTT16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_Q15_SAT_MULT
+ * \brief KDMTT16 (SIMD Signed Saturating Double Multiply T16 x T16)
+ * \details
+ * **Type**: SIMD (RV64 only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KDMxy16 Rd, Rs1, Rs2 (xy = BB, BT, TT)
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the signed Q15 integer contents of two 16-bit data in the corresponding portion
+ * of the 32-bit chunks in registers and then double and saturate the Q31 results into the 32-bit chunks
+ * in the destination register. If saturation happens, an overflow flag OV will be set.
+ *
+ * **Description**:\n
+ * Multiply the top or bottom 16-bit Q15 content of the 32-bit portions in Rs1 with the top
+ * or bottom 16-bit Q15 content of the 32-bit portions in Rs2. The Q30 results are then doubled and
+ * saturated into Q31 values. The Q31 values are then written into the 32-bit chunks in Rd. When both
+ * the two Q15 inputs are 0x8000, saturation will happen. The result will be saturated to 0x7FFFFFFF
+ * and the overflow flag OV will be set.
+ *
+ * **Operations**:\n
+ * ~~~
+ * // KDMBB16: (x,y,z)=(0,0,0),(2,2,1)
+ * // KDMBT16: (x,y,z)=(0,1,0),(2,3,1)
+ * // KDMTT16: (x,y,z)=(1,1,0),(3,3,1)
+ * aop[z] = Rs1.H[x]; bop[z] = Rs2.H[y];
+ * If (0x8000 != aop[z] | 0x8000 != bop[z]) {
+ *   Mresult[z] = aop[z] * bop[z];
+ *   resQ31[z] = Mresult[z] << 1;
+ * } else {
+ *   resQ31[z] = 0x7FFFFFFF;
+ *   OV = 1;
+ * }
+ * Rd.W[z] = resQ31[z];
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_KDMTT16(unsigned long a, unsigned long b)
+{
+    unsigned long rd; /* Rd: one doubled, saturated Q31 product per 32-bit chunk */
+    __ASM volatile("kdmtt16 %[rd], %[rs1], %[rs2]" : [rd] "=r"(rd) : [rs1] "r"(a), [rs2] "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 4.8.3. KDMTT16 ===== */
+
+/* ===== Inline Function Start for 4.9.1. KDMABB16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_Q15_SAT_MULT
+ * \brief KDMABB16 (SIMD Signed Saturating Double Multiply Addition B16 x B16)
+ * \details
+ * **Type**: SIMD (RV64 only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KDMAxy16 Rd, Rs1, Rs2 (xy = BB, BT, TT)
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the signed Q15 integer contents of two 16-bit data in the corresponding portion
+ * of the 32-bit chunks in registers and then double and saturate the Q31 results, add the results with
+ * the values of the corresponding 32-bit chunks from the destination register and write the saturated
+ * addition results back into the corresponding 32-bit chunks of the destination register. If saturation
+ * happens, an overflow flag OV will be set.
+ *
+ * **Description**:\n
+ * Multiply the top or bottom 16-bit Q15 content of the 32-bit portions in Rs1 with the top
+ * or bottom 16-bit Q15 content of the corresponding 32-bit portions in Rs2. The Q30 results are then
+ * doubled and saturated into Q31 values. The Q31 values are then added with the content of the
+ * corresponding 32-bit portions of Rd. If the addition results are beyond the Q31 number range (-2^31 <=
+ * Q31 <= 2^31-1), they are saturated to the range and the OV flag is set to 1. The results after saturation
+ * are written back to Rd.
+ * When both the two Q15 inputs are 0x8000, saturation will happen and the overflow flag OV will be
+ * set.
+ *
+ * **Operations**:\n
+ * ~~~
+ * // KDMABB16: (x,y,z)=(0,0,0),(2,2,1)
+ * // KDMABT16: (x,y,z)=(0,1,0),(2,3,1)
+ * // KDMATT16: (x,y,z)=(1,1,0),(3,3,1)
+ * aop[z] = Rs1.H[x]; bop[z] = Rs2.H[y];
+ * If (0x8000 != aop[z] | 0x8000 != bop[z]) {
+ *   Mresult[z] = aop[z] * bop[z];
+ *   resQ31[z] = Mresult[z] << 1;
+ * } else {
+ *   resQ31[z] = 0x7FFFFFFF;
+ *   OV = 1;
+ * }
+ * resadd[z] = Rd.W[z] + resQ31[z];
+ * if (resadd[z] > (2^31)-1) {
+ *   resadd[z] = (2^31)-1;
+ *   OV = 1;
+ * } else if (resadd[z] < -2^31) {
+ *   resadd[z] = -2^31;
+ *   OV = 1;
+ * }
+ * Rd.W[z] = resadd[z];
+ * ~~~
+ *
+ * \param [in]  t    unsigned long type of value stored in t
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_KDMABB16(unsigned long t, unsigned long a, unsigned long b)
+{
+    __ASM volatile("kdmabb16 %[rd], %[rs1], %[rs2]" : [rd] "+r"(t) : [rs1] "r"(a), [rs2] "r"(b));
+    return t; /* t is the read-write Rd operand: 32-bit accumulators in, saturated sums out */
+}
+/* ===== Inline Function End for 4.9.1. KDMABB16 ===== */
+
+/* ===== Inline Function Start for 4.9.2. KDMABT16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_Q15_SAT_MULT
+ * \brief KDMABT16 (SIMD Signed Saturating Double Multiply Addition B16 x T16)
+ * \details
+ * **Type**: SIMD (RV64 only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KDMAxy16 Rd, Rs1, Rs2 (xy = BB, BT, TT)
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the signed Q15 integer contents of two 16-bit data in the corresponding portion
+ * of the 32-bit chunks in registers and then double and saturate the Q31 results, add the results with
+ * the values of the corresponding 32-bit chunks from the destination register and write the saturated
+ * addition results back into the corresponding 32-bit chunks of the destination register. If saturation
+ * happens, an overflow flag OV will be set.
+ *
+ * **Description**:\n
+ * Multiply the top or bottom 16-bit Q15 content of the 32-bit portions in Rs1 with the top
+ * or bottom 16-bit Q15 content of the corresponding 32-bit portions in Rs2. The Q30 results are then
+ * doubled and saturated into Q31 values. The Q31 values are then added with the content of the
+ * corresponding 32-bit portions of Rd. If the addition results are beyond the Q31 number range (-2^31 <=
+ * Q31 <= 2^31-1), they are saturated to the range and the OV flag is set to 1. The results after saturation
+ * are written back to Rd.
+ * When both the two Q15 inputs are 0x8000, saturation will happen and the overflow flag OV will be
+ * set.
+ *
+ * **Operations**:\n
+ * ~~~
+ * // KDMABB16: (x,y,z)=(0,0,0),(2,2,1)
+ * // KDMABT16: (x,y,z)=(0,1,0),(2,3,1)
+ * // KDMATT16: (x,y,z)=(1,1,0),(3,3,1)
+ * aop[z] = Rs1.H[x]; bop[z] = Rs2.H[y];
+ * If (0x8000 != aop[z] | 0x8000 != bop[z]) {
+ *   Mresult[z] = aop[z] * bop[z];
+ *   resQ31[z] = Mresult[z] << 1;
+ * } else {
+ *   resQ31[z] = 0x7FFFFFFF;
+ *   OV = 1;
+ * }
+ * resadd[z] = Rd.W[z] + resQ31[z];
+ * if (resadd[z] > (2^31)-1) {
+ *   resadd[z] = (2^31)-1;
+ *   OV = 1;
+ * } else if (resadd[z] < -2^31) {
+ *   resadd[z] = -2^31;
+ *   OV = 1;
+ * }
+ * Rd.W[z] = resadd[z];
+ * ~~~
+ *
+ * \param [in]  t    unsigned long type of value stored in t
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_KDMABT16(unsigned long t, unsigned long a, unsigned long b)
+{
+    __ASM volatile("kdmabt16 %[rd], %[rs1], %[rs2]" : [rd] "+r"(t) : [rs1] "r"(a), [rs2] "r"(b));
+    return t; /* t is the read-write Rd operand: 32-bit accumulators in, saturated sums out */
+}
+/* ===== Inline Function End for 4.9.2. KDMABT16 ===== */
+
+/* ===== Inline Function Start for 4.9.3. KDMATT16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_Q15_SAT_MULT
+ * \brief KDMATT16 (SIMD Signed Saturating Double Multiply Addition T16 x T16)
+ * \details
+ * **Type**: SIMD (RV64 only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KDMAxy16 Rd, Rs1, Rs2 (xy = BB, BT, TT)
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the signed Q15 integer contents of two 16-bit data in the corresponding portion
+ * of the 32-bit chunks in registers and then double and saturate the Q31 results, add the results with
+ * the values of the corresponding 32-bit chunks from the destination register and write the saturated
+ * addition results back into the corresponding 32-bit chunks of the destination register. If saturation
+ * happens, an overflow flag OV will be set.
+ *
+ * **Description**:\n
+ * Multiply the top or bottom 16-bit Q15 content of the 32-bit portions in Rs1 with the top
+ * or bottom 16-bit Q15 content of the corresponding 32-bit portions in Rs2. The Q30 results are then
+ * doubled and saturated into Q31 values. The Q31 values are then added with the content of the
+ * corresponding 32-bit portions of Rd. If the addition results are beyond the Q31 number range (-2^31 <=
+ * Q31 <= 2^31-1), they are saturated to the range and the OV flag is set to 1. The results after saturation
+ * are written back to Rd.
+ * When both the two Q15 inputs are 0x8000, saturation will happen and the overflow flag OV will be
+ * set.
+ *
+ * **Operations**:\n
+ * ~~~
+ * // KDMABB16: (x,y,z)=(0,0,0),(2,2,1)
+ * // KDMABT16: (x,y,z)=(0,1,0),(2,3,1)
+ * // KDMATT16: (x,y,z)=(1,1,0),(3,3,1)
+ * aop[z] = Rs1.H[x]; bop[z] = Rs2.H[y];
+ * If (0x8000 != aop[z] | 0x8000 != bop[z]) {
+ *   Mresult[z] = aop[z] * bop[z];
+ *   resQ31[z] = Mresult[z] << 1;
+ * } else {
+ *   resQ31[z] = 0x7FFFFFFF;
+ *   OV = 1;
+ * }
+ * resadd[z] = Rd.W[z] + resQ31[z];
+ * if (resadd[z] > (2^31)-1) {
+ *   resadd[z] = (2^31)-1;
+ *   OV = 1;
+ * } else if (resadd[z] < -2^31) {
+ *   resadd[z] = -2^31;
+ *   OV = 1;
+ * }
+ * Rd.W[z] = resadd[z];
+ * ~~~
+ *
+ * \param [in]  t    unsigned long type of value stored in t
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_KDMATT16(unsigned long t, unsigned long a, unsigned long b)
+{
+    __ASM volatile("kdmatt16 %[rd], %[rs1], %[rs2]" : [rd] "+r"(t) : [rs1] "r"(a), [rs2] "r"(b));
+    return t; /* t is the read-write Rd operand: 32-bit accumulators in, saturated sums out */
+}
+/* ===== Inline Function End for 4.9.3. KDMATT16 ===== */
+
+/* ===== Inline Function Start for 4.10.1. KHMBB16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_Q15_SAT_MULT
+ * \brief KHMBB16 (SIMD Signed Saturating Half Multiply B16 x B16)
+ * \details
+ * **Type**: SIMD (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KHMxy16 Rd, Rs1, Rs2 (xy = BB, BT, TT)
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the signed Q15 integer contents of two 16-bit data in the corresponding portion
+ * of the 32-bit chunks in registers and then right-shift 15 bits to turn the Q30 results into Q15
+ * numbers again and saturate the Q15 results into the destination register. If saturation happens, an
+ * overflow flag OV will be set.
+ *
+ * **Description**:\n
+ * Multiply the top or bottom 16-bit Q15 content of the 32-bit portions in Rs1 with the top
+ * or bottom 16-bit Q15 content of the 32-bit portion in Rs2. The Q30 results are then right-shifted
+ * 15 bits and saturated into Q15 values. The 32-bit Q15 values are then written into the 32-bit chunks in
+ * Rd. When both the two Q15 inputs are 0x8000, saturation will happen. The result will be saturated
+ * to 0x7FFF and the overflow flag OV will be set.
+ *
+ * **Operations**:\n
+ * ~~~
+ * // KHMBB16: (x,y,z)=(0,0,0),(2,2,1)
+ * // KHMBT16: (x,y,z)=(0,1,0),(2,3,1)
+ * // KHMTT16: (x,y,z)=(1,1,0),(3,3,1)
+ * aop = Rs1.H[x]; bop = Rs2.H[y];
+ * If (0x8000 != aop | 0x8000 != bop) {
+ *   Mresult[31:0] = aop * bop;
+ *   res[15:0] = Mresult[30:15];
+ * } else {
+ *   res[15:0] = 0x7FFF;
+ *   OV = 1;
+ * }
+ * Rd.W[z] = SE32(res[15:0]);
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_KHMBB16(unsigned long a, unsigned long b)
+{
+    unsigned long rd; /* Rd: one saturated Q15 result, sign-extended, per 32-bit chunk */
+    __ASM volatile("khmbb16 %[rd], %[rs1], %[rs2]" : [rd] "=r"(rd) : [rs1] "r"(a), [rs2] "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 4.10.1. KHMBB16 ===== */
+
+/* ===== Inline Function Start for 4.10.2. KHMBT16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_Q15_SAT_MULT
+ * \brief KHMBT16 (SIMD Signed Saturating Half Multiply B16 x T16)
+ * \details
+ * **Type**: SIMD (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KHMxy16 Rd, Rs1, Rs2 (xy = BB, BT, TT)
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the signed Q15 integer contents of two 16-bit data in the corresponding portion
+ * of the 32-bit chunks in registers and then right-shift 15 bits to turn the Q30 results into Q15
+ * numbers again and saturate the Q15 results into the destination register. If saturation happens, an
+ * overflow flag OV will be set.
+ *
+ * **Description**:\n
+ * Multiply the top or bottom 16-bit Q15 content of the 32-bit portions in Rs1 with the top
+ * or bottom 16-bit Q15 content of the 32-bit portion in Rs2. The Q30 results are then right-shifted
+ * 15 bits and saturated into Q15 values. The 32-bit Q15 values are then written into the 32-bit chunks in
+ * Rd. When both the two Q15 inputs are 0x8000, saturation will happen. The result will be saturated
+ * to 0x7FFF and the overflow flag OV will be set.
+ *
+ * **Operations**:\n
+ * ~~~
+ * // KHMBB16: (x,y,z)=(0,0,0),(2,2,1)
+ * // KHMBT16: (x,y,z)=(0,1,0),(2,3,1)
+ * // KHMTT16: (x,y,z)=(1,1,0),(3,3,1)
+ * aop = Rs1.H[x]; bop = Rs2.H[y];
+ * If (0x8000 != aop | 0x8000 != bop) {
+ *   Mresult[31:0] = aop * bop;
+ *   res[15:0] = Mresult[30:15];
+ * } else {
+ *   res[15:0] = 0x7FFF;
+ *   OV = 1;
+ * }
+ * Rd.W[z] = SE32(res[15:0]);
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_KHMBT16(unsigned long a, unsigned long b)
+{
+    unsigned long rd; /* Rd: one saturated Q15 result, sign-extended, per 32-bit chunk */
+    __ASM volatile("khmbt16 %[rd], %[rs1], %[rs2]" : [rd] "=r"(rd) : [rs1] "r"(a), [rs2] "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 4.10.2. KHMBT16 ===== */
+
+/* ===== Inline Function Start for 4.10.3. KHMTT16 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_Q15_SAT_MULT
+ * \brief KHMTT16 (SIMD Signed Saturating Half Multiply T16 x T16)
+ * \details
+ * **Type**: SIMD (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KHMxy16 Rd, Rs1, Rs2 (xy = BB, BT, TT)
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the signed Q15 integer contents of two 16-bit data in the corresponding portion
+ * of the 32-bit chunks in registers and then right-shift 15 bits to turn the Q30 results into Q15
+ * numbers again and saturate the Q15 results into the destination register. If saturation happens, an
+ * overflow flag OV will be set.
+ *
+ * **Description**:\n
+ * Multiply the top or bottom 16-bit Q15 content of the 32-bit portions in Rs1 with the top
+ * or bottom 16-bit Q15 content of the 32-bit portion in Rs2. The Q30 results are then right-shifted
+ * 15 bits and saturated into Q15 values. The 32-bit Q15 values are then written into the 32-bit chunks in
+ * Rd. When both the two Q15 inputs are 0x8000, saturation will happen. The result will be saturated
+ * to 0x7FFF and the overflow flag OV will be set.
+ *
+ * **Operations**:\n
+ * ~~~
+ * // KHMBB16: (x,y,z)=(0,0,0),(2,2,1)
+ * // KHMBT16: (x,y,z)=(0,1,0),(2,3,1)
+ * // KHMTT16: (x,y,z)=(1,1,0),(3,3,1)
+ * aop = Rs1.H[x]; bop = Rs2.H[y];
+ * If (0x8000 != aop | 0x8000 != bop) {
+ *   Mresult[31:0] = aop * bop;
+ *   res[15:0] = Mresult[30:15];
+ * } else {
+ *   res[15:0] = 0x7FFF;
+ *   OV = 1;
+ * }
+ * Rd.W[z] = SE32(res[15:0]);
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_KHMTT16(unsigned long a, unsigned long b)
+{
+    unsigned long rd; /* Rd: one saturated Q15 result, sign-extended, per 32-bit chunk */
+    __ASM volatile("khmtt16 %[rd], %[rs1], %[rs2]" : [rd] "=r"(rd) : [rs1] "r"(a), [rs2] "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 4.10.3. KHMTT16 ===== */
+
+/* ===== Inline Function Start for 4.11.1. KMABB32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_32B_MULT_ADD
+ * \brief KMABB32 (Saturating Signed Multiply Bottom Words & Add)
+ * \details
+ * **Type**: DSP (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KMABB32 Rd, Rs1, Rs2
+ * KMABT32 Rd, Rs1, Rs2
+ * KMATT32 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the signed 32-bit element in a register with the 32-bit element in another register
+ * and add the result to the content of 64-bit data in the third register. The addition result may be
+ * saturated and is written to the third register.
+ * * KMABB32: rd + bottom*bottom
+ * * KMABT32: rd + bottom*top
+ * * KMATT32: rd + top*top
+ *
+ * **Description**:\n
+ * For the `KMABB32` instruction, it multiplies the bottom 32-bit element in Rs1 with the bottom 32-bit
+ * element in Rs2.
+ * For the `KMABT32` instruction, it multiplies the bottom 32-bit element in Rs1 with the top 32-bit
+ * element in Rs2.
+ * For the `KMATT32` instruction, it multiplies the top 32-bit element in Rs1 with the top 32-bit
+ * element in Rs2.
+ * The multiplication result is added to the content of 64-bit data in Rd. If the addition result is beyond
+ * the Q63 number range (-2^63 <= Q63 <= 2^63-1), it is saturated to the range and the OV bit is set to 1. The
+ * result after saturation is written to Rd. The 32-bit contents of Rs1 and Rs2 are treated as signed
+ * integers.
+ *
+ * **Operations**:\n
+ * ~~~
+ * res = Rd + (Rs1.W[0] * Rs2.W[0]); // KMABB32
+ * res = Rd + (Rs1.W[0] * Rs2.W[1]); // KMABT32
+ * res = Rd + (Rs1.W[1] * Rs2.W[1]); // KMATT32
+ * if (res > (2^63)-1) {
+ *   res = (2^63)-1;
+ *   OV = 1;
+ * } else if (res < -2^63) {
+ *   res = -2^63;
+ *   OV = 1;
+ * }
+ * Rd = res;
+ * // Exceptions: None
+ * ~~~
+ *
+ * \param [in]  t    long type of value stored in t
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_KMABB32(long t, unsigned long a, unsigned long b)
+{
+    __ASM volatile("kmabb32 %[rd], %[rs1], %[rs2]" : [rd] "+r"(t) : [rs1] "r"(a), [rs2] "r"(b));
+    return t; /* t is the read-write Rd operand: 64-bit accumulator in, saturated result out */
+}
+/* ===== Inline Function End for 4.11.1. KMABB32 ===== */
+
+/* ===== Inline Function Start for 4.11.2. KMABT32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_32B_MULT_ADD
+ * \brief KMABT32 (Saturating Signed Multiply Bottom & Top Words & Add)
+ * \details
+ * **Type**: DSP (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KMABB32 Rd, Rs1, Rs2
+ * KMABT32 Rd, Rs1, Rs2
+ * KMATT32 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the signed 32-bit element in a register with the 32-bit element in another register
+ * and add the result to the content of 64-bit data in the third register. The addition result may be
+ * saturated and is written to the third register.
+ * * KMABB32: rd + bottom*bottom
+ * * KMABT32: rd + bottom*top
+ * * KMATT32: rd + top*top
+ *
+ * **Description**:\n
+ * For the `KMABB32` instruction, it multiplies the bottom 32-bit element in Rs1 with the bottom 32-bit
+ * element in Rs2.
+ * For the `KMABT32` instruction, it multiplies the bottom 32-bit element in Rs1 with the top 32-bit
+ * element in Rs2.
+ * For the `KMATT32` instruction, it multiplies the top 32-bit element in Rs1 with the top 32-bit
+ * element in Rs2.
+ * The multiplication result is added to the content of 64-bit data in Rd. If the addition result is beyond
+ * the Q63 number range (-2^63 <= Q63 <= 2^63-1), it is saturated to the range and the OV bit is set to 1. The
+ * result after saturation is written to Rd. The 32-bit contents of Rs1 and Rs2 are treated as signed
+ * integers.
+ *
+ * **Operations**:\n
+ * ~~~
+ * res = Rd + (Rs1.W[0] * Rs2.W[0]); // KMABB32
+ * res = Rd + (Rs1.W[0] * Rs2.W[1]); // KMABT32
+ * res = Rd + (Rs1.W[1] * Rs2.W[1]); // KMATT32
+ * if (res > (2^63)-1) {
+ *   res = (2^63)-1;
+ *   OV = 1;
+ * } else if (res < -2^63) {
+ *   res = -2^63;
+ *   OV = 1;
+ * }
+ * Rd = res;
+ * // Exceptions: None
+ * ~~~
+ *
+ * \param [in]  t    long type of value stored in t
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_KMABT32(long t, unsigned long a, unsigned long b)
+{
+    __ASM volatile("kmabt32 %[rd], %[rs1], %[rs2]" : [rd] "+r"(t) : [rs1] "r"(a), [rs2] "r"(b));
+    return t; /* t is the read-write Rd operand: 64-bit accumulator in, saturated result out */
+}
+/* ===== Inline Function End for 4.11.2. KMABT32 ===== */
+
+/* ===== Inline Function Start for 4.11.3. KMATT32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_32B_MULT_ADD
+ * \brief KMATT32 (Saturating Signed Multiply Top Words & Add)
+ * \details
+ * **Type**: DSP (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KMABB32 Rd, Rs1, Rs2
+ * KMABT32 Rd, Rs1, Rs2
+ * KMATT32 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the signed 32-bit element in a register with the 32-bit element in another register
+ * and add the result to the content of 64-bit data in the third register. The addition result may be
+ * saturated and is written to the third register.
+ * * KMABB32: rd + bottom*bottom
+ * * KMABT32: rd + bottom*top
+ * * KMATT32: rd + top*top
+ *
+ * **Description**:\n
+ * For the `KMABB32` instruction, it multiplies the bottom 32-bit element in Rs1 with the bottom 32-bit
+ * element in Rs2.
+ * For the `KMABT32` instruction, it multiplies the bottom 32-bit element in Rs1 with the top 32-bit
+ * element in Rs2.
+ * For the `KMATT32` instruction, it multiplies the top 32-bit element in Rs1 with the top 32-bit
+ * element in Rs2.
+ * The multiplication result is added to the content of 64-bit data in Rd. If the addition result is beyond
+ * the Q63 number range (-2^63 <= Q63 <= 2^63-1), it is saturated to the range and the OV bit is set to 1. The
+ * result after saturation is written to Rd. The 32-bit contents of Rs1 and Rs2 are treated as signed
+ * integers.
+ *
+ * **Operations**:\n
+ * ~~~
+ * res = Rd + (Rs1.W[0] * Rs2.W[0]); // KMABB32
+ * res = Rd + (Rs1.W[0] * Rs2.W[1]); // KMABT32
+ * res = Rd + (Rs1.W[1] * Rs2.W[1]); // KMATT32
+ * if (res > (2^63)-1) {
+ *   res = (2^63)-1;
+ *   OV = 1;
+ * } else if (res < -2^63) {
+ *   res = -2^63;
+ *   OV = 1;
+ * }
+ * Rd = res;
+ * // Exceptions: None
+ * ~~~
+ *
+ * \param [in]  t    long type of value stored in t
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_KMATT32(long t, unsigned long a, unsigned long b)
+{
+    __ASM volatile("kmatt32 %[rd], %[rs1], %[rs2]" : [rd] "+r"(t) : [rs1] "r"(a), [rs2] "r"(b));
+    return t; /* t is the read-write Rd operand: 64-bit accumulator in, saturated result out */
+}
+/* ===== Inline Function End for 4.11.3. KMATT32 ===== */
+
+/* ===== Inline Function Start for 4.12.1. KMADA32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_32B_PARALLEL_MAC
+ * \brief KMADA32 (Saturating Signed Multiply Two Words and Two Adds)
+ * \details
+ * **Type**: DSP (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KMADA32 Rd, Rs1, Rs2
+ * KMAXDA32 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do two signed 32-bit multiplications from 32-bit data in two registers; and then adds the
+ * two 64-bit results and 64-bit data in a third register together. The addition result may be saturated.
+ * * KMADA32: rd + top*top + bottom*bottom
+ * * KMAXDA32: rd + top*bottom + bottom*top
+ *
+ * **Description**:\n
+ * For the `KMADA32` instruction, it multiplies the bottom 32-bit element in Rs1 with the bottom
+ * 32-bit element in Rs2 and then adds the result to the result of multiplying the top 32-bit element in Rs1
+ * with the top 32-bit element in Rs2. It is actually an alias of the `KMAR64` instruction.
+ * For the `KMAXDA32` instruction, it multiplies the top 32-bit element in Rs1 with the bottom 32-bit
+ * element in Rs2 and then adds the result to the result of multiplying the bottom 32-bit element in Rs1
+ * with the top 32-bit element in Rs2.
+ * The result is added to the content of 64-bit data in Rd. If the addition result is beyond the Q63
+ * number range (-2^63 <= Q63 <= 2^63-1), it is saturated to the range and the OV bit is set to 1. The 64-bit
+ * result is written to Rd. The 32-bit contents of Rs1 and Rs2 are treated as signed integers.
+ *
+ * **Operations**:\n
+ * ~~~
+ * res = Rd + (Rs1.W[1] * Rs2.W[1]) + (Rs1.W[0] * Rs2.W[0]); // KMADA32
+ * res = Rd + (Rs1.W[1] * Rs2.W[0]) + (Rs1.W[0] * Rs2.W[1]); // KMAXDA32
+ * if (res > (2^63)-1) {
+ *   res = (2^63)-1;
+ *   OV = 1;
+ * } else if (res < -2^63) {
+ *   res = -2^63;
+ *   OV = 1;
+ * }
+ * Rd = res;
+ * ~~~
+ *
+ * \param [in]  t    long type of value stored in t
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_KMADA32(long t, unsigned long a, unsigned long b)
+{
+    __ASM volatile("kmada32 %[rd], %[rs1], %[rs2]" : [rd] "+r"(t) : [rs1] "r"(a), [rs2] "r"(b));
+    return t; /* t is the read-write Rd operand: 64-bit accumulator in, saturated result out */
+}
+/* ===== Inline Function End for 4.12.1. KMADA32 ===== */
+
+/* ===== Inline Function Start for 4.12.2. KMAXDA32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_32B_PARALLEL_MAC
+ * \brief KMAXDA32 (Saturating Signed Crossed Multiply Two Words and Two Adds)
+ * \details
+ * **Type**: DSP (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KMADA32 Rd, Rs1, Rs2
+ * KMAXDA32 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do two signed 32-bit multiplications from 32-bit data in two registers; and then adds the
+ * two 64-bit results and 64-bit data in a third register together. The addition result may be saturated.
+ * * KMADA32: rd + top*top + bottom*bottom
+ * * KMAXDA32: rd + top*bottom + bottom*top
+ *
+ * **Description**:\n
+ * For the `KMADA32` instruction, it multiplies the bottom 32-bit element in Rs1 with the bottom 32-
+ * bit element in Rs2 and then adds the result to the result of multiplying the top 32-bit element in Rs1
+ * with the top 32-bit element in Rs2. It is actually an alias of the `KMAR64` instruction.
+ * For the `KMAXDA32` instruction, it multiplies the top 32-bit element in Rs1 with the bottom 32-bit
+ * element in Rs2 and then adds the result to the result of multiplying the bottom 32-bit element in Rs1
+ * with the top 32-bit element in Rs2.
+ * The result is added to the content of 64-bit data in Rd. If the addition result is beyond the Q63
+ * number range (-2^63 <= Q63 <= 2^63-1), it is saturated to the range and the OV bit is set to 1. The 64-bit
+ * result is written to Rd. The 32-bit contents of Rs1 and Rs2 are treated as signed integers.
+ *
+ * **Operations**:\n
+ * ~~~
+ * res = Rd + (Rs1.W[1] * Rs2.W[1]) + (Rs1.W[0] * Rs2.W[0]); // KMADA32
+ * res = Rd + (Rs1.W[1] * Rs2.W[0]) + (Rs1.W[0] * Rs2.W[1]); // KMAXDA32
+ * if (res > (2^63)-1) {
+ *   res = (2^63)-1;
+ *   OV = 1;
+ * } else if (res < -2^63) {
+ *   res = -2^63;
+ *   OV = 1;
+ * }
+ * Rd = res;
+ * ~~~
+ *
+ * \param [in]  t    long type of value stored in t
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_KMAXDA32(long t, unsigned long a, unsigned long b)
+{   /* t is the read-write Rd operand ("+r"): accumulator in, result out; a -> Rs1, b -> Rs2. */
+    __ASM volatile("kmaxda32 %0, %1, %2" : "+r"(t) : "r"(a), "r"(b));
+    return t; /* value left in Rd by kmaxda32 */
+}
+/* ===== Inline Function End for 4.12.2. KMAXDA32 ===== */
+
+/* ===== Inline Function Start for 4.13.1. KMDA32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_32B_PARALLEL_MAC
+ * \brief KMDA32 (Signed Multiply Two Words and Add)
+ * \details
+ * **Type**: DSP (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KMDA32 Rd, Rs1, Rs2
+ * KMXDA32 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do two signed 32-bit multiplications from the 32-bit element of two registers; and then
+ * adds the two 64-bit results together. The addition result may be saturated.
+ * * KMDA32: top*top + bottom*bottom
+ * * KMXDA32: top*bottom + bottom*top
+ *
+ * **Description**:\n
+ * For the `KMDA32` instruction, it multiplies the bottom 32-bit element of Rs1 with the bottom 32-bit
+ * element of Rs2 and then adds the result to the result of multiplying the top 32-bit element of Rs1
+ * with the top 32-bit element of Rs2.
+ * For the `KMXDA32` instruction, it multiplies the bottom 32-bit element of Rs1 with the top 32-bit
+ * element of Rs2 and then adds the result to the result of multiplying the top 32-bit element of Rs1
+ * with the bottom 32-bit element of Rs2.
+ * The addition result is checked for saturation. If saturation happens, the result is saturated to 2^63-1.
+ * The final result is written to Rd. The 32-bit contents are treated as signed integers.
+ *
+ * **Operations**:\n
+ * ~~~
+ * if ((Rs1 != 0x8000000080000000) or (Rs2 != 0x8000000080000000)) {
+ *   Rd = (Rs1.W[1] * Rs2.W[1]) + (Rs1.W[0] * Rs2.W[0]); // KMDA32
+ *   Rd = (Rs1.W[1] * Rs2.W[0]) + (Rs1.W[0] * Rs2.W[1]); // KMXDA32
+ * } else {
+ *   Rd = 0x7fffffffffffffff;
+ *   OV = 1;
+ * }
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_KMDA32(unsigned long a, unsigned long b)
+{
+    long rd; /* write-only destination operand of kmda32 */
+    __ASM volatile("kmda32 %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b)); /* a -> Rs1, b -> Rs2 */
+    return rd;
+}
+/* ===== Inline Function End for 4.13.1. KMDA32 ===== */
+
+/* ===== Inline Function Start for 4.13.2. KMXDA32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_32B_PARALLEL_MAC
+ * \brief KMXDA32 (Signed Crossed Multiply Two Words and Add)
+ * \details
+ * **Type**: DSP (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KMDA32 Rd, Rs1, Rs2
+ * KMXDA32 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do two signed 32-bit multiplications from the 32-bit element of two registers; and then
+ * adds the two 64-bit results together. The addition result may be saturated.
+ * * KMDA32: top*top + bottom*bottom
+ * * KMXDA32: top*bottom + bottom*top
+ *
+ * **Description**:\n
+ * For the `KMDA32` instruction, it multiplies the bottom 32-bit element of Rs1 with the bottom 32-bit
+ * element of Rs2 and then adds the result to the result of multiplying the top 32-bit element of Rs1
+ * with the top 32-bit element of Rs2.
+ * For the `KMXDA32` instruction, it multiplies the bottom 32-bit element of Rs1 with the top 32-bit
+ * element of Rs2 and then adds the result to the result of multiplying the top 32-bit element of Rs1
+ * with the bottom 32-bit element of Rs2.
+ * The addition result is checked for saturation. If saturation happens, the result is saturated to 2^63-1.
+ * The final result is written to Rd. The 32-bit contents are treated as signed integers.
+ *
+ * **Operations**:\n
+ * ~~~
+ * if ((Rs1 != 0x8000000080000000) or (Rs2 != 0x8000000080000000)) {
+ *   Rd = (Rs1.W[1] * Rs2.W[1]) + (Rs1.W[0] * Rs2.W[0]); // KMDA32
+ *   Rd = (Rs1.W[1] * Rs2.W[0]) + (Rs1.W[0] * Rs2.W[1]); // KMXDA32
+ * } else {
+ *   Rd = 0x7fffffffffffffff;
+ *   OV = 1;
+ * }
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_KMXDA32(unsigned long a, unsigned long b)
+{
+    long rd; /* write-only destination operand of kmxda32 */
+    __ASM volatile("kmxda32 %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b)); /* a -> Rs1, b -> Rs2 */
+    return rd;
+}
+/* ===== Inline Function End for 4.13.2. KMXDA32 ===== */
+
+/* ===== Inline Function Start for 4.14.1. KMADS32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_32B_PARALLEL_MAC
+ * \brief KMADS32 (Saturating Signed Multiply Two Words & Subtract & Add)
+ * \details
+ * **Type**: DSP (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KMADS32 Rd, Rs1, Rs2
+ * KMADRS32 Rd, Rs1, Rs2
+ * KMAXDS32 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do two signed 32-bit multiplications from 32-bit elements in two registers; and then
+ * perform a subtraction operation between the two 64-bit results. Then add the subtraction result to
+ * 64-bit data in a third register. The addition result may be saturated.
+ * * KMADS32: rd + (top*top - bottom*bottom)
+ * * KMADRS32: rd + (bottom*bottom - top*top)
+ * * KMAXDS32: rd + (top*bottom - bottom*top)
+ *
+ * **Description**:\n
+ * For the `KMADS32` instruction, it multiplies the bottom 32-bit element in Rs1 with the bottom 32-bit
+ * element in Rs2 and then subtracts the result from the result of multiplying the top 32-bit element in
+ * Rs1 with the top 32-bit element in Rs2.
+ * For the `KMADRS32` instruction, it multiplies the top 32-bit element in Rs1 with the top 32-bit
+ * element in Rs2 and then subtracts the result from the result of multiplying the bottom 32-bit
+ * element in Rs1 with the bottom 32-bit element in Rs2.
+ * For the `KMAXDS32` instruction, it multiplies the bottom 32-bit element in Rs1 with the top 32-bit
+ * element in Rs2 and then subtracts the result from the result of multiplying the top 32-bit element in
+ * Rs1 with the bottom 32-bit element in Rs2.
+ * The subtraction result is then added to the content of 64-bit data in Rd. If the addition result is
+ * beyond the Q63 number range (-2^63 <= Q63 <= 2^63-1), it is saturated to the range and the OV bit is set to
+ * 1. The 64-bit result after saturation is written to Rd. The 32-bit contents of Rs1 and Rs2 are treated
+ * as signed integers.
+ *
+ * **Operations**:\n
+ * ~~~
+ * res = Rd + (Rs1.W[1] * Rs2.W[1]) - (Rs1.W[0] * Rs2.W[0]); // KMADS32
+ * res = Rd + (Rs1.W[0] * Rs2.W[0]) - (Rs1.W[1] * Rs2.W[1]); // KMADRS32
+ * res = Rd + (Rs1.W[1] * Rs2.W[0]) - (Rs1.W[0] * Rs2.W[1]); // KMAXDS32
+ * if (res > (2^63)-1) {
+ *   res = (2^63)-1;
+ *   OV = 1;
+ * } else if (res < -2^63) {
+ *   res = -2^63;
+ *   OV = 1;
+ * }
+ * Rd = res;
+ * ~~~
+ *
+ * \param [in]  t    long type of value stored in t
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_KMADS32(long t, unsigned long a, unsigned long b)
+{   /* t is the read-write Rd operand ("+r"): accumulator in, result out; a -> Rs1, b -> Rs2. */
+    __ASM volatile("kmads32 %0, %1, %2" : "+r"(t) : "r"(a), "r"(b));
+    return t; /* value left in Rd by kmads32 */
+}
+/* ===== Inline Function End for 4.14.1. KMADS32 ===== */
+
+/* ===== Inline Function Start for 4.14.2. KMADRS32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_32B_PARALLEL_MAC
+ * \brief KMADRS32 (Saturating Signed Multiply Two Words & Reverse Subtract & Add)
+ * \details
+ * **Type**: DSP (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KMADS32 Rd, Rs1, Rs2
+ * KMADRS32 Rd, Rs1, Rs2
+ * KMAXDS32 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do two signed 32-bit multiplications from 32-bit elements in two registers; and then
+ * perform a subtraction operation between the two 64-bit results. Then add the subtraction result to
+ * 64-bit data in a third register. The addition result may be saturated.
+ * * KMADS32: rd + (top*top - bottom*bottom)
+ * * KMADRS32: rd + (bottom*bottom - top*top)
+ * * KMAXDS32: rd + (top*bottom - bottom*top)
+ *
+ * **Description**:\n
+ * For the `KMADS32` instruction, it multiplies the bottom 32-bit element in Rs1 with the bottom 32-bit
+ * element in Rs2 and then subtracts the result from the result of multiplying the top 32-bit element in
+ * Rs1 with the top 32-bit element in Rs2.
+ * For the `KMADRS32` instruction, it multiplies the top 32-bit element in Rs1 with the top 32-bit
+ * element in Rs2 and then subtracts the result from the result of multiplying the bottom 32-bit
+ * element in Rs1 with the bottom 32-bit element in Rs2.
+ * For the `KMAXDS32` instruction, it multiplies the bottom 32-bit element in Rs1 with the top 32-bit
+ * element in Rs2 and then subtracts the result from the result of multiplying the top 32-bit element in
+ * Rs1 with the bottom 32-bit element in Rs2.
+ * The subtraction result is then added to the content of 64-bit data in Rd. If the addition result is
+ * beyond the Q63 number range (-2^63 <= Q63 <= 2^63-1), it is saturated to the range and the OV bit is set to
+ * 1. The 64-bit result after saturation is written to Rd. The 32-bit contents of Rs1 and Rs2 are treated
+ * as signed integers.
+ *
+ * **Operations**:\n
+ * ~~~
+ * res = Rd + (Rs1.W[1] * Rs2.W[1]) - (Rs1.W[0] * Rs2.W[0]); // KMADS32
+ * res = Rd + (Rs1.W[0] * Rs2.W[0]) - (Rs1.W[1] * Rs2.W[1]); // KMADRS32
+ * res = Rd + (Rs1.W[1] * Rs2.W[0]) - (Rs1.W[0] * Rs2.W[1]); // KMAXDS32
+ * if (res > (2^63)-1) {
+ *   res = (2^63)-1;
+ *   OV = 1;
+ * } else if (res < -2^63) {
+ *   res = -2^63;
+ *   OV = 1;
+ * }
+ * Rd = res;
+ * ~~~
+ *
+ * \param [in]  t    long type of value stored in t
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_KMADRS32(long t, unsigned long a, unsigned long b)
+{   /* t is the read-write Rd operand ("+r"): accumulator in, result out; a -> Rs1, b -> Rs2. */
+    __ASM volatile("kmadrs32 %0, %1, %2" : "+r"(t) : "r"(a), "r"(b));
+    return t; /* value left in Rd by kmadrs32 */
+}
+/* ===== Inline Function End for 4.14.2. KMADRS32 ===== */
+
+/* ===== Inline Function Start for 4.14.3. KMAXDS32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_32B_PARALLEL_MAC
+ * \brief KMAXDS32 (Saturating Signed Crossed Multiply Two Words & Subtract & Add)
+ * \details
+ * **Type**: DSP (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KMADS32 Rd, Rs1, Rs2
+ * KMADRS32 Rd, Rs1, Rs2
+ * KMAXDS32 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do two signed 32-bit multiplications from 32-bit elements in two registers; and then
+ * perform a subtraction operation between the two 64-bit results. Then add the subtraction result to
+ * 64-bit data in a third register. The addition result may be saturated.
+ * * KMADS32: rd + (top*top - bottom*bottom)
+ * * KMADRS32: rd + (bottom*bottom - top*top)
+ * * KMAXDS32: rd + (top*bottom - bottom*top)
+ *
+ * **Description**:\n
+ * For the `KMADS32` instruction, it multiplies the bottom 32-bit element in Rs1 with the bottom 32-bit
+ * element in Rs2 and then subtracts the result from the result of multiplying the top 32-bit element in
+ * Rs1 with the top 32-bit element in Rs2.
+ * For the `KMADRS32` instruction, it multiplies the top 32-bit element in Rs1 with the top 32-bit
+ * element in Rs2 and then subtracts the result from the result of multiplying the bottom 32-bit
+ * element in Rs1 with the bottom 32-bit element in Rs2.
+ * For the `KMAXDS32` instruction, it multiplies the bottom 32-bit element in Rs1 with the top 32-bit
+ * element in Rs2 and then subtracts the result from the result of multiplying the top 32-bit element in
+ * Rs1 with the bottom 32-bit element in Rs2.
+ * The subtraction result is then added to the content of 64-bit data in Rd. If the addition result is
+ * beyond the Q63 number range (-2^63 <= Q63 <= 2^63-1), it is saturated to the range and the OV bit is set to
+ * 1. The 64-bit result after saturation is written to Rd. The 32-bit contents of Rs1 and Rs2 are treated
+ * as signed integers.
+ *
+ * **Operations**:\n
+ * ~~~
+ * res = Rd + (Rs1.W[1] * Rs2.W[1]) - (Rs1.W[0] * Rs2.W[0]); // KMADS32
+ * res = Rd + (Rs1.W[0] * Rs2.W[0]) - (Rs1.W[1] * Rs2.W[1]); // KMADRS32
+ * res = Rd + (Rs1.W[1] * Rs2.W[0]) - (Rs1.W[0] * Rs2.W[1]); // KMAXDS32
+ * if (res > (2^63)-1) {
+ *   res = (2^63)-1;
+ *   OV = 1;
+ * } else if (res < -2^63) {
+ *   res = -2^63;
+ *   OV = 1;
+ * }
+ * Rd = res;
+ * ~~~
+ *
+ * \param [in]  t    long type of value stored in t
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_KMAXDS32(long t, unsigned long a, unsigned long b)
+{   /* t is the read-write Rd operand ("+r"): accumulator in, result out; a -> Rs1, b -> Rs2. */
+    __ASM volatile("kmaxds32 %0, %1, %2" : "+r"(t) : "r"(a), "r"(b));
+    return t; /* value left in Rd by kmaxds32 */
+}
+/* ===== Inline Function End for 4.14.3. KMAXDS32 ===== */
+
+/* ===== Inline Function Start for 4.15.1. KMSDA32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_32B_PARALLEL_MAC
+ * \brief KMSDA32 (Saturating Signed Multiply Two Words & Add & Subtract)
+ * \details
+ * **Type**: DSP (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KMSDA32 Rd, Rs1, Rs2
+ * KMSXDA32 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do two signed 32-bit multiplications from the 32-bit element of two registers; and then
+ * subtracts the two 64-bit results from a third register. The subtraction result may be saturated.
+ * * KMSDA32: rd - top*top - bottom*bottom
+ * * KMSXDA32: rd - top*bottom - bottom*top
+ *
+ * **Description**:\n
+ * For the `KMSDA32` instruction, it multiplies the bottom 32-bit element of Rs1 with the bottom 32-bit
+ * element of Rs2 and multiplies the top 32-bit element of Rs1 with the top 32-bit element of Rs2.
+ * For the `KMSXDA32` instruction, it multiplies the bottom 32-bit element of Rs1 with the top 32-bit
+ * element of Rs2 and multiplies the top 32-bit element of Rs1 with the bottom 32-bit element of Rs2.
+ * The two 64-bit multiplication results are then subtracted from the content of Rd. If the subtraction
+ * result is beyond the Q63 number range (-2^63 <= Q63 <= 2^63-1), it is saturated to the range and the OV bit
+ * is set to 1. The result after saturation is written to Rd. The 32-bit contents are treated as signed
+ * integers.
+ *
+ * **Operations**:\n
+ * ~~~
+ * res = Rd - (Rs1.W[1] * Rs2.W[1]) - (Rs1.W[0] * Rs2.W[0]); // KMSDA32
+ * res = Rd - (Rs1.W[1] * Rs2.W[0]) - (Rs1.W[0] * Rs2.W[1]); // KMSXDA32
+ * if (res > (2^63)-1) {
+ *   res = (2^63)-1;
+ *   OV = 1;
+ * } else if (res < -2^63) {
+ *   res = -2^63;
+ *   OV = 1;
+ * }
+ * Rd = res;
+ * ~~~
+ *
+ * \param [in]  t    long type of value stored in t
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_KMSDA32(long t, unsigned long a, unsigned long b)
+{   /* t is the read-write Rd operand ("+r"): minuend in, result out; a -> Rs1, b -> Rs2. */
+    __ASM volatile("kmsda32 %0, %1, %2" : "+r"(t) : "r"(a), "r"(b));
+    return t; /* value left in Rd by kmsda32 */
+}
+/* ===== Inline Function End for 4.15.1. KMSDA32 ===== */
+
+/* ===== Inline Function Start for 4.15.2. KMSXDA32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_32B_PARALLEL_MAC
+ * \brief KMSXDA32 (Saturating Signed Crossed Multiply Two Words & Add & Subtract)
+ * \details
+ * **Type**: DSP (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KMSDA32 Rd, Rs1, Rs2
+ * KMSXDA32 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do two signed 32-bit multiplications from the 32-bit element of two registers; and then
+ * subtracts the two 64-bit results from a third register. The subtraction result may be saturated.
+ * * KMSDA32: rd - top*top - bottom*bottom
+ * * KMSXDA32: rd - top*bottom - bottom*top
+ *
+ * **Description**:\n
+ * For the `KMSDA32` instruction, it multiplies the bottom 32-bit element of Rs1 with the bottom 32-bit
+ * element of Rs2 and multiplies the top 32-bit element of Rs1 with the top 32-bit element of Rs2.
+ * For the `KMSXDA32` instruction, it multiplies the bottom 32-bit element of Rs1 with the top 32-bit
+ * element of Rs2 and multiplies the top 32-bit element of Rs1 with the bottom 32-bit element of Rs2.
+ * The two 64-bit multiplication results are then subtracted from the content of Rd. If the subtraction
+ * result is beyond the Q63 number range (-2^63 <= Q63 <= 2^63-1), it is saturated to the range and the OV bit
+ * is set to 1. The result after saturation is written to Rd. The 32-bit contents are treated as signed
+ * integers.
+ *
+ * **Operations**:\n
+ * ~~~
+ * res = Rd - (Rs1.W[1] * Rs2.W[1]) - (Rs1.W[0] * Rs2.W[0]); // KMSDA32
+ * res = Rd - (Rs1.W[1] * Rs2.W[0]) - (Rs1.W[0] * Rs2.W[1]); // KMSXDA32
+ * if (res > (2^63)-1) {
+ *   res = (2^63)-1;
+ *   OV = 1;
+ * } else if (res < -2^63) {
+ *   res = -2^63;
+ *   OV = 1;
+ * }
+ * Rd = res;
+ * ~~~
+ *
+ * \param [in]  t    long type of value stored in t
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_KMSXDA32(long t, unsigned long a, unsigned long b)
+{   /* t is the read-write Rd operand ("+r"): minuend in, result out; a -> Rs1, b -> Rs2. */
+    __ASM volatile("kmsxda32 %0, %1, %2" : "+r"(t) : "r"(a), "r"(b));
+    return t; /* value left in Rd by kmsxda32 */
+}
+/* ===== Inline Function End for 4.15.2. KMSXDA32 ===== */
+
+/* ===== Inline Function Start for 4.16. KSLL32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_SHIFT
+ * \brief KSLL32 (SIMD 32-bit Saturating Shift Left Logical)
+ * \details
+ * **Type**: SIMD (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KSLL32 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 32-bit elements logical left shift operations with saturation simultaneously. The shift
+ * amount is a variable from a GPR.
+ *
+ * **Description**:\n
+ * The 32-bit data elements in Rs1 are left-shifted logically. The shifted out bits are filled
+ * with zero and the shift amount is specified by the low-order 5-bits of the value in the Rs2 register.
+ * Any shifted value greater than 2^31-1 is saturated to 2^31-1. Any shifted value smaller than -2^31 is
+ * saturated to -2^31. And the saturated results are written to Rd. If any saturation is performed, set OV
+ * bit to 1.
+ *
+ * **Operations**:\n
+ * ~~~
+ * sa = Rs2[4:0];
+ * if (sa != 0) {
+ *   res[(31+sa):0] = Rs1.W[x] << sa;
+ *   if (res > (2^31)-1) {
+ *     res = 0x7fffffff; OV = 1;
+ *   } else if (res < -2^31) {
+ *     res = 0x80000000; OV = 1;
+ *   }
+ *   Rd.W[x] = res[31:0];
+ * } else {
+ *   Rd = Rs1;
+ * }
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned int type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_KSLL32(unsigned long a, unsigned int b)
+{
+    unsigned long rd; /* write-only destination operand of ksll32 */
+    __ASM volatile("ksll32 %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b)); /* a -> Rs1, b -> Rs2 (shift amount register) */
+    return rd;
+}
+/* ===== Inline Function End for 4.16. KSLL32 ===== */
+
+/* ===== Inline Function Start for 4.17. KSLLI32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_SHIFT
+ * \brief KSLLI32 (SIMD 32-bit Saturating Shift Left Logical Immediate)
+ * \details
+ * **Type**: SIMD (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KSLLI32 Rd, Rs1, imm5u
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 32-bit elements logical left shift operations with saturation simultaneously. The shift
+ * amount is an immediate value.
+ *
+ * **Description**:\n
+ * The 32-bit data elements in Rs1 are left-shifted logically. The shifted out bits are filled
+ * with zero and the shift amount is specified by the imm5u constant. Any shifted value greater than
+ * 2^31-1 is saturated to 2^31-1. Any shifted value smaller than -2^31 is saturated to -2^31. And the saturated
+ * results are written to Rd. If any saturation is performed, set OV bit to 1.
+ *
+ * **Operations**:\n
+ * ~~~
+ * sa = imm5u[4:0];
+ * if (sa != 0) {
+ *   res[(31+sa):0] = Rs1.W[x] << sa;
+ *   if (res > (2^31)-1) {
+ *     res = 0x7fffffff; OV = 1;
+ *   } else if (res < -2^31) {
+ *     res = 0x80000000; OV = 1;
+ *   }
+ *   Rd.W[x] = res[31:0];
+ * } else {
+ *   Rd = Rs1;
+ * }
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned int type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_KSLLI32(unsigned long a, unsigned int b)
+{   /* b must be a compile-time constant in 0..31: kslli32 encodes it as imm5u. Use __RV_KSLL32 for run-time shift amounts. */
+    unsigned long result; /* write-only destination operand (Rd) */
+    __ASM volatile("kslli32 %0, %1, %2" : "=r"(result) : "r"(a), "K"(b)); /* "K" = 5-bit unsigned immediate; "r" would emit a register name */
+    return result;
+}
+/* ===== Inline Function End for 4.17. KSLLI32 ===== */
+
+/* ===== Inline Function Start for 4.18.1. KSLRA32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_SHIFT
+ * \brief KSLRA32 (SIMD 32-bit Shift Left Logical with Saturation or Shift Right Arithmetic)
+ * \details
+ * **Type**: SIMD (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KSLRA32 Rd, Rs1, Rs2
+ * KSLRA32.u Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 32-bit elements logical left (positive) or arithmetic right (negative) shift operation with
+ * Q31 saturation for the left shift. The `.u` form performs additional rounding up operations for the
+ * right shift.
+ *
+ * **Description**:\n
+ * The 32-bit data elements of Rs1 are left-shifted logically or right-shifted arithmetically
+ * based on the value of Rs2[5:0]. Rs2[5:0] is in the signed range of [-2^5, 2^5-1]. A positive Rs2[5:0] means
+ * logical left shift and a negative Rs2[5:0] means arithmetic right shift. The shift amount is the
+ * absolute value of Rs2[5:0]. However, the behavior of `Rs2[5:0]==-2^5 (0x20)` is defined to be
+ * equivalent to the behavior of `Rs2[5:0]==-(2^5-1) (0x21)`.
+ * The left-shifted results are saturated to the 32-bit signed integer range of [-2^31, 2^31-1]. For the `.u`
+ * form of the instruction, the right-shifted results are added a 1 to the most significant discarded bit
+ * position for rounding effect. After the shift, saturation, or rounding, the final results are written to
+ * Rd. If any saturation happens, this instruction sets the OV flag. The value of Rs2[31:6] will not affect
+ * this instruction.
+ *
+ * **Operations**:\n
+ * ~~~
+ * if (Rs2[5:0] < 0) {
+ *   sa = -Rs2[5:0];
+ *   sa = (sa == 32)? 31 : sa;
+ *   if (`.u` form) {
+ *     res[31:-1] = SE33(Rs1.W[x][31:sa-1]) + 1;
+ *     Rd.W[x] = res[31:0];
+ *   } else {
+ *     Rd.W[x] = SE32(Rs1.W[x][31:sa]);
+ *   }
+ * } else {
+ *   sa = Rs2[4:0];
+ *   res[(31+sa):0] = Rs1.W[x] <<(logic) sa;
+ *   if (res > (2^31)-1) {
+ *     res[31:0] = 0x7fffffff; OV = 1;
+ *   } else if (res < -2^31) {
+ *     res[31:0] = 0x80000000; OV = 1;
+ *   }
+ *   Rd.W[x] = res[31:0];
+ * }
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    int type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_KSLRA32(unsigned long a, int b)
+{
+    unsigned long rd; /* write-only destination operand of kslra32 */
+    __ASM volatile("kslra32 %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b)); /* a -> Rs1, b -> Rs2 (signed shift amount) */
+    return rd;
+}
+/* ===== Inline Function End for 4.18.1. KSLRA32 ===== */
+
+/* ===== Inline Function Start for 4.18.2. KSLRA32.u ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_SHIFT
+ * \brief KSLRA32.u (SIMD 32-bit Shift Left Logical with Saturation or Rounding Shift Right Arithmetic)
+ * \details
+ * **Type**: SIMD (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KSLRA32 Rd, Rs1, Rs2
+ * KSLRA32.u Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 32-bit elements logical left (positive) or arithmetic right (negative) shift operation with
+ * Q31 saturation for the left shift. The `.u` form performs additional rounding up operations for the
+ * right shift.
+ *
+ * **Description**:\n
+ * The 32-bit data elements of Rs1 are left-shifted logically or right-shifted arithmetically
+ * based on the value of Rs2[5:0]. Rs2[5:0] is in the signed range of [-2^5, 2^5-1]. A positive Rs2[5:0] means
+ * logical left shift and a negative Rs2[5:0] means arithmetic right shift. The shift amount is the
+ * absolute value of Rs2[5:0]. However, the behavior of `Rs2[5:0]==-2^5 (0x20)` is defined to be
+ * equivalent to the behavior of `Rs2[5:0]==-(2^5-1) (0x21)`.
+ * The left-shifted results are saturated to the 32-bit signed integer range of [-2^31, 2^31-1]. For the `.u`
+ * form of the instruction, the right-shifted results are added a 1 to the most significant discarded bit
+ * position for rounding effect. After the shift, saturation, or rounding, the final results are written to
+ * Rd. If any saturation happens, this instruction sets the OV flag. The value of Rs2[31:6] will not affect
+ * this instruction.
+ *
+ * **Operations**:\n
+ * ~~~
+ * if (Rs2[5:0] < 0) {
+ *   sa = -Rs2[5:0];
+ *   sa = (sa == 32)? 31 : sa;
+ *   if (`.u` form) {
+ *     res[31:-1] = SE33(Rs1.W[x][31:sa-1]) + 1;
+ *     Rd.W[x] = res[31:0];
+ *   } else {
+ *     Rd.W[x] = SE32(Rs1.W[x][31:sa]);
+ *   }
+ * } else {
+ *   sa = Rs2[4:0];
+ *   res[(31+sa):0] = Rs1.W[x] <<(logic) sa;
+ *   if (res > (2^31)-1) {
+ *     res[31:0] = 0x7fffffff; OV = 1;
+ *   } else if (res < -2^31) {
+ *     res[31:0] = 0x80000000; OV = 1;
+ *   }
+ *   Rd.W[x] = res[31:0];
+ * }
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    int type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_KSLRA32_U(unsigned long a, int b)
+{
+    unsigned long rd; /* write-only destination operand of kslra32.u */
+    __ASM volatile("kslra32.u %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b)); /* a -> Rs1, b -> Rs2 (signed shift amount) */
+    return rd;
+}
+/* ===== Inline Function End for 4.18.2. KSLRA32.u ===== */
+
+/* ===== Inline Function Start for 4.19. KSTAS32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB
+ * \brief KSTAS32 (SIMD 32-bit Signed Saturating Straight Addition & Subtraction)
+ * \details
+ * **Type**: SIMD (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KSTAS32 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 32-bit signed integer element saturating addition and 32-bit signed integer element
+ * saturating subtraction in a 64-bit chunk simultaneously. Operands are from corresponding 32-bit
+ * elements.
+ *
+ * **Description**:\n
+ * This instruction adds the 32-bit integer element in [63:32] of Rs1 with the 32-bit
+ * integer element in [63:32] of Rs2; at the same time, it subtracts the 32-bit integer element in [31:0] of
+ * Rs2 from the 32-bit integer element in [31:0] of Rs1. If any of the results are beyond the Q31 number
+ * range (-2^31 <= Q31 <= 2^31-1), they are saturated to the range and the OV bit is set to 1. The saturated
+ * results are written to [63:32] of Rd for addition and [31:0] of Rd for subtraction.
+ *
+ * **Operations**:\n
+ * ~~~
+ * res[1] = Rs1.W[1] + Rs2.W[1];
+ * res[0] = Rs1.W[0] - Rs2.W[0];
+ * if (res[x] > (2^31)-1) {
+ *   res[x] = (2^31)-1;
+ *   OV = 1;
+ * } else if (res[x] < -2^31) {
+ *   res[x] = -2^31;
+ *   OV = 1;
+ * }
+ * Rd.W[1] = res[1];
+ * Rd.W[0] = res[0];
+ * for RV64, x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_KSTAS32(unsigned long a, unsigned long b)
+{
+    unsigned long rd; /* write-only destination operand of kstas32 */
+    __ASM volatile("kstas32 %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b)); /* a -> Rs1, b -> Rs2 */
+    return rd;
+}
+/* ===== Inline Function End for 4.19. KSTAS32 ===== */
+
+/* ===== Inline Function Start for 4.20. KSTSA32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB
+ * \brief KSTSA32 (SIMD 32-bit Signed Saturating Straight Subtraction & Addition)
+ * \details
+ * **Type**: SIMD (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KSTSA32 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 32-bit signed integer element saturating subtraction and 32-bit signed integer element
+ * saturating addition in a 64-bit chunk simultaneously. Operands are from corresponding 32-bit elements.
+ *
+ * **Description**:\n
+ * This instruction subtracts the 32-bit integer element in [63:32] of Rs2 from the 32-bit integer
+ * element in [63:32] of Rs1; at the same time, it adds the 32-bit integer element in [31:0] of Rs1 with
+ * the 32-bit integer element in [31:0] of Rs2. If any of the results are beyond the Q31 number range
+ * (-2^31 <= Q31 <= 2^31-1), they are saturated to the range and the OV bit is set to 1. The saturated results are
+ * written to [63:32] of Rd for subtraction and [31:0] of Rd for addition.
+ *
+ * **Operations**:\n
+ * ~~~
+ * res[1] = Rs1.W[1] - Rs2.W[1];
+ * res[0] = Rs1.W[0] + Rs2.W[0];
+ * if (res[x] > (2^31)-1) {
+ *   res[x] = (2^31)-1;
+ *   OV = 1;
+ * } else if (res[x] < -2^31) {
+ *   res[x] = -2^31;
+ *   OV = 1;
+ * }
+ * Rd.W[1] = res[1];
+ * Rd.W[0] = res[0];
+ * for RV64, x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_KSTSA32(unsigned long a, unsigned long b)
+{
+    unsigned long rd; /* write-only destination operand of kstsa32 */
+    __ASM volatile("kstsa32 %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b)); /* a -> Rs1, b -> Rs2 */
+    return rd;
+}
+/* ===== Inline Function End for 4.20. KSTSA32 ===== */
+
+/* ===== Inline Function Start for 4.21. KSUB32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB
+ * \brief KSUB32 (SIMD 32-bit Signed Saturating Subtraction)
+ * \details
+ * **Type**: SIMD (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * KSUB32 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 32-bit signed integer elements saturating subtractions simultaneously.
+ *
+ * **Description**:\n
+ * This instruction subtracts the 32-bit signed integer elements in Rs2 from the 32-bit
+ * signed integer elements in Rs1. If any of the results are beyond the Q31 number range (-2^31 <= Q31 <=
+ * 2^31-1), they are saturated to the range and the OV bit is set to 1. The saturated results are written to
+ * Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * res[x] = Rs1.W[x] - Rs2.W[x];
+ * if (res[x] > (2^31)-1) {
+ *   res[x] = (2^31)-1;
+ *   OV = 1;
+ * } else if (res[x] < -2^31) {
+ *   res[x] = -2^31;
+ *   OV = 1;
+ * }
+ * Rd.W[x] = res[x];
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_KSUB32(unsigned long a, unsigned long b)
+{
+    unsigned long rd; /* per-word saturated a.W[x] - b.W[x] */
+    __ASM volatile("ksub32 %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 4.21. KSUB32 ===== */
+
+/* ===== Inline Function Start for 4.22.1. PKBB32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_32B_PACK
+ * \brief PKBB32 (Pack Two 32-bit Data from Both Bottom Half)
+ * \details
+ * **Type**: DSP (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * PKBB32 Rd, Rs1, Rs2
+ * PKBT32 Rd, Rs1, Rs2
+ * PKTT32 Rd, Rs1, Rs2
+ * PKTB32 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Pack 32-bit data from 64-bit chunks in two registers.
+ * * PKBB32: bottom.bottom
+ * * PKBT32: bottom.top
+ * * PKTT32: top.top
+ * * PKTB32: top.bottom
+ *
+ * **Description**:\n
+ * (PKBB32) moves Rs1.W[0] to Rd.W[1] and moves Rs2.W[0] to Rd.W[0].
+ * (PKBT32) moves Rs1.W[0] to Rd.W[1] and moves Rs2.W[1] to Rd.W[0].
+ * (PKTT32) moves Rs1.W[1] to Rd.W[1] and moves Rs2.W[1] to Rd.W[0].
+ * (PKTB32) moves Rs1.W[1] to Rd.W[1] and moves Rs2.W[0] to Rd.W[0].
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd = CONCAT(Rs1.W[0], Rs2.W[0]); // PKBB32
+ * Rd = CONCAT(Rs1.W[0], Rs2.W[1]); // PKBT32
+ * Rd = CONCAT(Rs1.W[1], Rs2.W[1]); // PKTT32
+ * Rd = CONCAT(Rs1.W[1], Rs2.W[0]); // PKTB32
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_PKBB32(unsigned long a, unsigned long b)
+{
+    unsigned long packed; /* CONCAT(a.W[0], b.W[0]) */
+    __ASM volatile("pkbb32 %0, %1, %2" : "=r"(packed) : "r"(a), "r"(b));
+    return packed;
+}
+/* ===== Inline Function End for 4.22.1. PKBB32 ===== */
+
+/* ===== Inline Function Start for 4.22.2. PKBT32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_32B_PACK
+ * \brief PKBT32 (Pack Two 32-bit Data from Bottom and Top Half)
+ * \details
+ * **Type**: DSP (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * PKBB32 Rd, Rs1, Rs2
+ * PKBT32 Rd, Rs1, Rs2
+ * PKTT32 Rd, Rs1, Rs2
+ * PKTB32 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Pack 32-bit data from 64-bit chunks in two registers.
+ * * PKBB32: bottom.bottom
+ * * PKBT32: bottom.top
+ * * PKTT32: top.top
+ * * PKTB32: top.bottom
+ *
+ * **Description**:\n
+ * (PKBB32) moves Rs1.W[0] to Rd.W[1] and moves Rs2.W[0] to Rd.W[0].
+ * (PKBT32) moves Rs1.W[0] to Rd.W[1] and moves Rs2.W[1] to Rd.W[0].
+ * (PKTT32) moves Rs1.W[1] to Rd.W[1] and moves Rs2.W[1] to Rd.W[0].
+ * (PKTB32) moves Rs1.W[1] to Rd.W[1] and moves Rs2.W[0] to Rd.W[0].
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd = CONCAT(Rs1.W[0], Rs2.W[0]); // PKBB32
+ * Rd = CONCAT(Rs1.W[0], Rs2.W[1]); // PKBT32
+ * Rd = CONCAT(Rs1.W[1], Rs2.W[1]); // PKTT32
+ * Rd = CONCAT(Rs1.W[1], Rs2.W[0]); // PKTB32
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_PKBT32(unsigned long a, unsigned long b)
+{
+    unsigned long packed; /* CONCAT(a.W[0], b.W[1]) */
+    __ASM volatile("pkbt32 %0, %1, %2" : "=r"(packed) : "r"(a), "r"(b));
+    return packed;
+}
+/* ===== Inline Function End for 4.22.2. PKBT32 ===== */
+
+/* ===== Inline Function Start for 4.22.3. PKTT32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_32B_PACK
+ * \brief PKTT32 (Pack Two 32-bit Data from Both Top Half)
+ * \details
+ * **Type**: DSP (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * PKBB32 Rd, Rs1, Rs2
+ * PKBT32 Rd, Rs1, Rs2
+ * PKTT32 Rd, Rs1, Rs2
+ * PKTB32 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Pack 32-bit data from 64-bit chunks in two registers.
+ * * PKBB32: bottom.bottom
+ * * PKBT32: bottom.top
+ * * PKTT32: top.top
+ * * PKTB32: top.bottom
+ *
+ * **Description**:\n
+ * (PKBB32) moves Rs1.W[0] to Rd.W[1] and moves Rs2.W[0] to Rd.W[0].
+ * (PKBT32) moves Rs1.W[0] to Rd.W[1] and moves Rs2.W[1] to Rd.W[0].
+ * (PKTT32) moves Rs1.W[1] to Rd.W[1] and moves Rs2.W[1] to Rd.W[0].
+ * (PKTB32) moves Rs1.W[1] to Rd.W[1] and moves Rs2.W[0] to Rd.W[0].
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd = CONCAT(Rs1.W[0], Rs2.W[0]); // PKBB32
+ * Rd = CONCAT(Rs1.W[0], Rs2.W[1]); // PKBT32
+ * Rd = CONCAT(Rs1.W[1], Rs2.W[1]); // PKTT32
+ * Rd = CONCAT(Rs1.W[1], Rs2.W[0]); // PKTB32
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_PKTT32(unsigned long a, unsigned long b)
+{
+    unsigned long packed; /* CONCAT(a.W[1], b.W[1]) */
+    __ASM volatile("pktt32 %0, %1, %2" : "=r"(packed) : "r"(a), "r"(b));
+    return packed;
+}
+/* ===== Inline Function End for 4.22.3. PKTT32 ===== */
+
+/* ===== Inline Function Start for 4.22.4. PKTB32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_32B_PACK
+ * \brief PKTB32 (Pack Two 32-bit Data from Top and Bottom Half)
+ * \details
+ * **Type**: DSP (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * PKBB32 Rd, Rs1, Rs2
+ * PKBT32 Rd, Rs1, Rs2
+ * PKTT32 Rd, Rs1, Rs2
+ * PKTB32 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Pack 32-bit data from 64-bit chunks in two registers.
+ * * PKBB32: bottom.bottom
+ * * PKBT32: bottom.top
+ * * PKTT32: top.top
+ * * PKTB32: top.bottom
+ *
+ * **Description**:\n
+ * (PKBB32) moves Rs1.W[0] to Rd.W[1] and moves Rs2.W[0] to Rd.W[0].
+ * (PKBT32) moves Rs1.W[0] to Rd.W[1] and moves Rs2.W[1] to Rd.W[0].
+ * (PKTT32) moves Rs1.W[1] to Rd.W[1] and moves Rs2.W[1] to Rd.W[0].
+ * (PKTB32) moves Rs1.W[1] to Rd.W[1] and moves Rs2.W[0] to Rd.W[0].
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd = CONCAT(Rs1.W[0], Rs2.W[0]); // PKBB32
+ * Rd = CONCAT(Rs1.W[0], Rs2.W[1]); // PKBT32
+ * Rd = CONCAT(Rs1.W[1], Rs2.W[1]); // PKTT32
+ * Rd = CONCAT(Rs1.W[1], Rs2.W[0]); // PKTB32
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_PKTB32(unsigned long a, unsigned long b)
+{
+    unsigned long packed; /* CONCAT(a.W[1], b.W[0]) */
+    __ASM volatile("pktb32 %0, %1, %2" : "=r"(packed) : "r"(a), "r"(b));
+    return packed;
+}
+/* ===== Inline Function End for 4.22.4. PKTB32 ===== */
+
+/* ===== Inline Function Start for 4.23. RADD32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB
+ * \brief RADD32 (SIMD 32-bit Signed Halving Addition)
+ * \details
+ * **Type**: SIMD (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * RADD32 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 32-bit signed integer element additions simultaneously. The results are halved to avoid
+ * overflow or saturation.
+ *
+ * **Description**:\n
+ * This instruction adds the 32-bit signed integer elements in Rs1 with the 32-bit signed
+ * integer elements in Rs2. The results are first arithmetically right-shifted by 1 bit and then written to
+ * Rd.
+ *
+ * **Examples**:\n
+ * ~~~
+ * * Rs1 = 0x7FFFFFFF, Rs2 = 0x7FFFFFFF, Rd = 0x7FFFFFFF
+ * * Rs1 = 0x80000000, Rs2 = 0x80000000, Rd = 0x80000000
+ * * Rs1 = 0x40000000, Rs2 = 0x80000000, Rd = 0xE0000000
+ * ~~~
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.W[x] = (Rs1.W[x] + Rs2.W[x]) s>> 1;
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_RADD32(unsigned long a, unsigned long b)
+{
+    unsigned long rd; /* per-word (a.W[x] + b.W[x]) s>> 1 */
+    __ASM volatile("radd32 %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 4.23. RADD32 ===== */
+
+/* ===== Inline Function Start for 4.24. RCRAS32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB
+ * \brief RCRAS32 (SIMD 32-bit Signed Halving Cross Addition & Subtraction)
+ * \details
+ * **Type**: SIMD (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * RCRAS32 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 32-bit signed integer element addition and 32-bit signed integer element subtraction in
+ * a 64-bit chunk simultaneously. Operands are from crossed 32-bit elements. The results are halved to
+ * avoid overflow or saturation.
+ *
+ * **Description**:\n
+ * This instruction adds the 32-bit signed integer element in [63:32] of Rs1 with the 32-bit
+ * signed integer element in [31:0] of Rs2, and subtracts the 32-bit signed integer element in [63:32] of
+ * Rs2 from the 32-bit signed integer element in [31:0] of Rs1. The element results are first
+ * arithmetically right-shifted by 1 bit and then written to [63:32] of Rd for addition and [31:0] of Rd
+ * for subtraction.
+ *
+ * **Examples**:\n
+ * ~~~
+ * Please see `RADD32` and `RSUB32` instructions.
+ * ~~~
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.W[1] = (Rs1.W[1] + Rs2.W[0]) s>> 1;
+ * Rd.W[0] = (Rs1.W[0] - Rs2.W[1]) s>> 1;
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_RCRAS32(unsigned long a, unsigned long b)
+{
+    unsigned long rd; /* W[1]: (a.W[1] + b.W[0]) s>> 1, W[0]: (a.W[0] - b.W[1]) s>> 1 */
+    __ASM volatile("rcras32 %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 4.24. RCRAS32 ===== */
+
+/* ===== Inline Function Start for 4.25. RCRSA32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB
+ * \brief RCRSA32 (SIMD 32-bit Signed Halving Cross Subtraction & Addition)
+ * \details
+ * **Type**: SIMD (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * RCRSA32 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 32-bit signed integer element subtraction and 32-bit signed integer element addition in
+ * a 64-bit chunk simultaneously. Operands are from crossed 32-bit elements. The results are halved to
+ * avoid overflow or saturation.
+ *
+ * **Description**:\n
+ * This instruction subtracts the 32-bit signed integer element in [31:0] of Rs2 from the
+ * 32-bit signed integer element in [63:32] of Rs1, and adds the 32-bit signed integer element in [31:0]
+ * of Rs1 with the 32-bit signed integer element in [63:32] of Rs2. The two results are first
+ * arithmetically right-shifted by 1 bit and then written to [63:32] of Rd for subtraction and [31:0] of
+ * Rd for addition.
+ *
+ * **Examples**:\n
+ * ~~~
+ * Please see `RADD32` and `RSUB32` instructions.
+ * ~~~
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.W[1] = (Rs1.W[1] - Rs2.W[0]) s>> 1;
+ * Rd.W[0] = (Rs1.W[0] + Rs2.W[1]) s>> 1;
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_RCRSA32(unsigned long a, unsigned long b)
+{
+    unsigned long rd; /* W[1]: (a.W[1] - b.W[0]) s>> 1, W[0]: (a.W[0] + b.W[1]) s>> 1 */
+    __ASM volatile("rcrsa32 %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 4.25. RCRSA32 ===== */
+
+/* ===== Inline Function Start for 4.26. RSTAS32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB
+ * \brief RSTAS32 (SIMD 32-bit Signed Halving Straight Addition & Subtraction)
+ * \details
+ * **Type**: SIMD (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * RSTAS32 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 32-bit signed integer element addition and 32-bit signed integer element subtraction in
+ * a 64-bit chunk simultaneously. Operands are from corresponding 32-bit elements. The results are
+ * halved to avoid overflow or saturation.
+ *
+ * **Description**:\n
+ * This instruction adds the 32-bit signed integer element in [63:32] of Rs1 with the 32-bit
+ * signed integer element in [63:32] of Rs2, and subtracts the 32-bit signed integer element in [31:0] of
+ * Rs2 from the 32-bit signed integer element in [31:0] of Rs1. The element results are first
+ * arithmetically right-shifted by 1 bit and then written to [63:32] of Rd for addition and [31:0] of Rd
+ * for subtraction.
+ *
+ * **Examples**:\n
+ * ~~~
+ * Please see `RADD32` and `RSUB32` instructions.
+ * ~~~
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.W[1] = (Rs1.W[1] + Rs2.W[1]) s>> 1;
+ * Rd.W[0] = (Rs1.W[0] - Rs2.W[0]) s>> 1;
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_RSTAS32(unsigned long a, unsigned long b)
+{
+    unsigned long rd; /* W[1]: (a.W[1] + b.W[1]) s>> 1, W[0]: (a.W[0] - b.W[0]) s>> 1 */
+    __ASM volatile("rstas32 %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 4.26. RSTAS32 ===== */
+
+/* ===== Inline Function Start for 4.27. RSTSA32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB
+ * \brief RSTSA32 (SIMD 32-bit Signed Halving Straight Subtraction & Addition)
+ * \details
+ * **Type**: SIMD (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * RSTSA32 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 32-bit signed integer element subtraction and 32-bit signed integer element addition in
+ * a 64-bit chunk simultaneously. Operands are from corresponding 32-bit elements. The results are
+ * halved to avoid overflow or saturation.
+ *
+ * **Description**:\n
+ * This instruction subtracts the 32-bit signed integer element in [63:32] of Rs2 from the
+ * 32-bit signed integer element in [63:32] of Rs1, and adds the 32-bit signed integer element in [31:0]
+ * of Rs1 with the 32-bit signed integer element in [31:0] of Rs2. The two results are first arithmetically
+ * right-shifted by 1 bit and then written to [63:32] of Rd for subtraction and [31:0] of Rd for addition.
+ *
+ * **Examples**:\n
+ * ~~~
+ * Please see `RADD32` and `RSUB32` instructions.
+ * ~~~
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.W[1] = (Rs1.W[1] - Rs2.W[1]) s>> 1;
+ * Rd.W[0] = (Rs1.W[0] + Rs2.W[0]) s>> 1;
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_RSTSA32(unsigned long a, unsigned long b)
+{
+    unsigned long rd; /* W[1]: (a.W[1] - b.W[1]) s>> 1, W[0]: (a.W[0] + b.W[0]) s>> 1 */
+    __ASM volatile("rstsa32 %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 4.27. RSTSA32 ===== */
+
+/* ===== Inline Function Start for 4.28. RSUB32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB
+ * \brief RSUB32 (SIMD 32-bit Signed Halving Subtraction)
+ * \details
+ * **Type**: SIMD (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * RSUB32 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 32-bit signed integer element subtractions simultaneously. The results are halved to
+ * avoid overflow or saturation.
+ *
+ * **Description**:\n
+ * This instruction subtracts the 32-bit signed integer elements in Rs2 from the 32-bit
+ * signed integer elements in Rs1. The results are first arithmetically right-shifted by 1 bit and then
+ * written to Rd.
+ *
+ * **Examples**:\n
+ * ~~~
+ * * Rs1 = 0x7FFFFFFF, Rs2 = 0x80000000, Rd = 0x7FFFFFFF
+ * * Rs1 = 0x80000000, Rs2 = 0x7FFFFFFF, Rd = 0x80000000
+ * * Rs1 = 0x80000000, Rs2 = 0x40000000, Rd = 0xA0000000
+ * ~~~
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.W[x] = (Rs1.W[x] - Rs2.W[x]) s>> 1;
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_RSUB32(unsigned long a, unsigned long b)
+{
+    unsigned long rd; /* per-word (a.W[x] - b.W[x]) s>> 1 */
+    __ASM volatile("rsub32 %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 4.28. RSUB32 ===== */
+
+/* ===== Inline Function Start for 4.29. SLL32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_SHIFT
+ * \brief SLL32 (SIMD 32-bit Shift Left Logical)
+ * \details
+ * **Type**: SIMD (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SLL32 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 32-bit elements logical left shift operations simultaneously. The shift amount is a
+ * variable from a GPR.
+ *
+ * **Description**:\n
+ * The 32-bit elements in Rs1 are left-shifted logically and the results are written to Rd.
+ * The vacated low-order bits are filled with zero and the shift amount is specified by the low-order 5 bits of
+ * the value in the Rs2 register.
+ *
+ * **Operations**:\n
+ * ~~~
+ * sa = Rs2[4:0];
+ * Rd.W[x] = Rs1.W[x] << sa;
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned int type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_SLL32(unsigned long a, unsigned int b)
+{
+    unsigned long shifted; /* each 32-bit word of a shifted left by b[4:0] */
+    __ASM volatile("sll32 %0, %1, %2" : "=r"(shifted) : "r"(a), "r"(b));
+    return shifted;
+}
+/* ===== Inline Function End for 4.29. SLL32 ===== */
+
+/* ===== Inline Function Start for 4.30. SLLI32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_SHIFT
+ * \brief SLLI32 (SIMD 32-bit Shift Left Logical Immediate)
+ * \details
+ * **Type**: SIMD (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SLLI32 Rd, Rs1, imm5u[4:0]
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 32-bit element logical left shift operations simultaneously. The shift amount is an
+ * immediate value.
+ *
+ * **Description**:\n
+ * The 32-bit elements in Rs1 are left-shifted logically. The shifted out bits are filled with
+ * zero and the shift amount is specified by the imm5u[4:0] constant. And the results are written to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * sa = imm5u[4:0];
+ * Rd.W[x] = Rs1.W[x] << sa;
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned int type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_SLLI32(unsigned long a, unsigned int b)
+{
+    register unsigned long result;
+    __ASM volatile("slli32 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b));
+    return result;
+}
+/* ===== Inline Function End for 4.30. SLLI32 ===== */
+
+/* ===== Inline Function Start for 4.31. SMAX32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_MISC
+ * \brief SMAX32 (SIMD 32-bit Signed Maximum)
+ * \details
+ * **Type**: SIMD (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SMAX32 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 32-bit signed integer elements finding maximum operations simultaneously.
+ *
+ * **Description**:\n
+ * This instruction compares the 32-bit signed integer elements in Rs1 with the 32-bit
+ * signed integer elements in Rs2 and selects, for each pair, the element that is greater. The
+ * selected results are written to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.W[x] = (Rs1.W[x] > Rs2.W[x])? Rs1.W[x] : Rs2.W[x];
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_SMAX32(unsigned long a, unsigned long b)
+{
+    unsigned long rd; /* per-word signed maximum of a.W[x] and b.W[x] */
+    __ASM volatile("smax32 %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 4.31. SMAX32 ===== */
+
+/* ===== Inline Function Start for 4.32.1. SMBB32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_32B_MULT
+ * \brief SMBB32 (Signed Multiply Bottom Word & Bottom Word)
+ * \details
+ * **Type**: DSP (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SMBB32 Rd, Rs1, Rs2
+ * SMBT32 Rd, Rs1, Rs2
+ * SMTT32 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the signed 32-bit element of a register with the signed 32-bit element of another
+ * register and write the 64-bit result to a third register.
+ * * SMBB32: bottom*bottom
+ * * SMBT32: bottom*top
+ * * SMTT32: top*top
+ *
+ * **Description**:\n
+ * For the `SMBB32` instruction, it multiplies the bottom 32-bit element of Rs1 with the bottom 32-bit
+ * element of Rs2. It is actually an alias of `MULSR64` instruction.
+ * For the `SMBT32` instruction, it multiplies the bottom 32-bit element of Rs1 with the top 32-bit
+ * element of Rs2.
+ * For the `SMTT32` instruction, it multiplies the top 32-bit element of Rs1 with the top 32-bit element
+ * of Rs2.
+ * The 64-bit multiplication result is written to Rd. The 32-bit contents of Rs1 and Rs2 are treated as
+ * signed integers.
+ *
+ * **Operations**:\n
+ * ~~~
+ * res = Rs1.W[i] * Rs2.W[j]; // (i, j): SMBB32 = (0, 0), SMBT32 = (0, 1), SMTT32 = (1, 1)
+ * Rd = res;
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_SMBB32(unsigned long a, unsigned long b)
+{
+    long product; /* signed a.W[0] * b.W[0] */
+    __ASM volatile("smbb32 %0, %1, %2" : "=r"(product) : "r"(a), "r"(b));
+    return product;
+}
+/* ===== Inline Function End for 4.32.1. SMBB32 ===== */
+
+/* ===== Inline Function Start for 4.32.2. SMBT32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_32B_MULT
+ * \brief SMBT32 (Signed Multiply Bottom Word & Top Word)
+ * \details
+ * **Type**: DSP (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SMBB32 Rd, Rs1, Rs2
+ * SMBT32 Rd, Rs1, Rs2
+ * SMTT32 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the signed 32-bit element of a register with the signed 32-bit element of another
+ * register and write the 64-bit result to a third register.
+ * * SMBB32: bottom*bottom
+ * * SMBT32: bottom*top
+ * * SMTT32: top*top
+ *
+ * **Description**:\n
+ * For the `SMBB32` instruction, it multiplies the bottom 32-bit element of Rs1 with the bottom 32-bit
+ * element of Rs2. It is actually an alias of `MULSR64` instruction.
+ * For the `SMBT32` instruction, it multiplies the bottom 32-bit element of Rs1 with the top 32-bit
+ * element of Rs2.
+ * For the `SMTT32` instruction, it multiplies the top 32-bit element of Rs1 with the top 32-bit element
+ * of Rs2.
+ * The 64-bit multiplication result is written to Rd. The 32-bit contents of Rs1 and Rs2 are treated as
+ * signed integers.
+ *
+ * **Operations**:\n
+ * ~~~
+ * res = Rs1.W[i] * Rs2.W[j]; // (i, j): SMBB32 = (0, 0), SMBT32 = (0, 1), SMTT32 = (1, 1)
+ * Rd = res;
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_SMBT32(unsigned long a, unsigned long b)
+{
+    long product; /* signed a.W[0] * b.W[1] */
+    __ASM volatile("smbt32 %0, %1, %2" : "=r"(product) : "r"(a), "r"(b));
+    return product;
+}
+/* ===== Inline Function End for 4.32.2. SMBT32 ===== */
+
+/* ===== Inline Function Start for 4.32.3. SMTT32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_32B_MULT
+ * \brief SMTT32 (Signed Multiply Top Word & Top Word)
+ * \details
+ * **Type**: DSP (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SMBB32 Rd, Rs1, Rs2
+ * SMBT32 Rd, Rs1, Rs2
+ * SMTT32 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Multiply the signed 32-bit element of a register with the signed 32-bit element of another
+ * register and write the 64-bit result to a third register.
+ * * SMBB32: bottom*bottom
+ * * SMBT32: bottom*top
+ * * SMTT32: top*top
+ *
+ * **Description**:\n
+ * For the `SMBB32` instruction, it multiplies the bottom 32-bit element of Rs1 with the bottom 32-bit
+ * element of Rs2. It is actually an alias of `MULSR64` instruction.
+ * For the `SMBT32` instruction, it multiplies the bottom 32-bit element of Rs1 with the top 32-bit
+ * element of Rs2.
+ * For the `SMTT32` instruction, it multiplies the top 32-bit element of Rs1 with the top 32-bit element
+ * of Rs2.
+ * The 64-bit multiplication result is written to Rd. The 32-bit contents of Rs1 and Rs2 are treated as
+ * signed integers.
+ *
+ * **Operations**:\n
+ * ~~~
+ * res = Rs1.W[i] * Rs2.W[j]; // (i, j): SMBB32 = (0, 0), SMBT32 = (0, 1), SMTT32 = (1, 1)
+ * Rd = res;
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_SMTT32(unsigned long a, unsigned long b)
+{
+    long product; /* signed a.W[1] * b.W[1] */
+    __ASM volatile("smtt32 %0, %1, %2" : "=r"(product) : "r"(a), "r"(b));
+    return product;
+}
+/* ===== Inline Function End for 4.32.3. SMTT32 ===== */
+
+/* ===== Inline Function Start for 4.33.1. SMDS32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_32B_PARALLEL_MAC
+ * \brief SMDS32 (Signed Multiply Two Words and Subtract)
+ * \details
+ * **Type**: DSP (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SMDS32 Rd, Rs1, Rs2
+ * SMDRS32 Rd, Rs1, Rs2
+ * SMXDS32 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do two signed 32-bit multiplications from the 32-bit elements of two registers; and then
+ * perform a subtraction operation between the two 64-bit results.
+ * * SMDS32: top*top - bottom*bottom
+ * * SMDRS32: bottom*bottom - top*top
+ * * SMXDS32: top*bottom - bottom*top
+ *
+ * **Description**:\n
+ * For the `SMDS32` instruction, it multiplies the bottom 32-bit element of Rs1 with the bottom 32-bit
+ * element of Rs2 and then subtracts the result from the result of multiplying the top 32-bit element of
+ * Rs1 with the top 32-bit element of Rs2.
+ * For the `SMDRS32` instruction, it multiplies the top 32-bit element of Rs1 with the top 32-bit
+ * element of Rs2 and then subtracts the result from the result of multiplying the bottom 32-bit
+ * element of Rs1 with the bottom 32-bit element of Rs2.
+ * For the `SMXDS32` instruction, it multiplies the bottom 32-bit element of Rs1 with the top 32-bit
+ * element of Rs2 and then subtracts the result from the result of multiplying the top 32-bit element of
+ * Rs1 with the bottom 32-bit element of Rs2.
+ * The subtraction result is written to Rd. The 32-bit contents of Rs1 and Rs2 are treated as signed
+ * integers.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd = (Rs1.W[1] * Rs2.W[1]) - (Rs1.W[0] * Rs2.W[0]); // SMDS32
+ * Rd = (Rs1.W[0] * Rs2.W[0]) - (Rs1.W[1] * Rs2.W[1]); // SMDRS32
+ * Rd = (Rs1.W[1] * Rs2.W[0]) - (Rs1.W[0] * Rs2.W[1]); // SMXDS32
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_SMDS32(unsigned long a, unsigned long b)
+{
+    long diff; /* (a.W[1] * b.W[1]) - (a.W[0] * b.W[0]) */
+    __ASM volatile("smds32 %0, %1, %2" : "=r"(diff) : "r"(a), "r"(b));
+    return diff;
+}
+/* ===== Inline Function End for 4.33.1. SMDS32 ===== */
+
+/* ===== Inline Function Start for 4.33.2. SMDRS32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_32B_PARALLEL_MAC
+ * \brief SMDRS32 (Signed Multiply Two Words and Reverse Subtract)
+ * \details
+ * **Type**: DSP (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SMDS32 Rd, Rs1, Rs2
+ * SMDRS32 Rd, Rs1, Rs2
+ * SMXDS32 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do two signed 32-bit multiplications from the 32-bit elements of two registers; and then
+ * perform a subtraction operation between the two 64-bit results.
+ * * SMDS32: top*top - bottom*bottom
+ * * SMDRS32: bottom*bottom - top*top
+ * * SMXDS32: top*bottom - bottom*top
+ *
+ * **Description**:\n
+ * For the `SMDS32` instruction, it multiplies the bottom 32-bit element of Rs1 with the bottom 32-bit
+ * element of Rs2 and then subtracts the result from the result of multiplying the top 32-bit element of
+ * Rs1 with the top 32-bit element of Rs2.
+ * For the `SMDRS32` instruction, it multiplies the top 32-bit element of Rs1 with the top 32-bit
+ * element of Rs2 and then subtracts the result from the result of multiplying the bottom 32-bit
+ * element of Rs1 with the bottom 32-bit element of Rs2.
+ * For the `SMXDS32` instruction, it multiplies the bottom 32-bit element of Rs1 with the top 32-bit
+ * element of Rs2 and then subtracts the result from the result of multiplying the top 32-bit element of
+ * Rs1 with the bottom 32-bit element of Rs2.
+ * The subtraction result is written to Rd. The 32-bit contents of Rs1 and Rs2 are treated as signed
+ * integers.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd = (Rs1.W[1] * Rs2.W[1]) - (Rs1.W[0] * Rs2.W[0]); // SMDS32
+ * Rd = (Rs1.W[0] * Rs2.W[0]) - (Rs1.W[1] * Rs2.W[1]); // SMDRS32
+ * Rd = (Rs1.W[1] * Rs2.W[0]) - (Rs1.W[0] * Rs2.W[1]); // SMXDS32
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_SMDRS32(unsigned long a, unsigned long b)
+{
+    long diff; /* (a.W[0] * b.W[0]) - (a.W[1] * b.W[1]) */
+    __ASM volatile("smdrs32 %0, %1, %2" : "=r"(diff) : "r"(a), "r"(b));
+    return diff;
+}
+/* ===== Inline Function End for 4.33.2. SMDRS32 ===== */
+
+/* ===== Inline Function Start for 4.33.3. SMXDS32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_32B_PARALLEL_MAC
+ * \brief SMXDS32 (Signed Crossed Multiply Two Words and Subtract)
+ * \details
+ * **Type**: DSP (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SMDS32 Rd, Rs1, Rs2
+ * SMDRS32 Rd, Rs1, Rs2
+ * SMXDS32 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do two signed 32-bit multiplications from the 32-bit elements of two registers; and then
+ * perform a subtraction operation between the two 64-bit results.
+ * * SMDS32: top*top - bottom*bottom
+ * * SMDRS32: bottom*bottom - top*top
+ * * SMXDS32: top*bottom - bottom*top
+ *
+ * **Description**:\n
+ * For the `SMDS32` instruction, it multiplies the bottom 32-bit element of Rs1 with the bottom 32-bit
+ * element of Rs2 and then subtracts the result from the result of multiplying the top 32-bit element of
+ * Rs1 with the top 32-bit element of Rs2.
+ * For the `SMDRS32` instruction, it multiplies the top 32-bit element of Rs1 with the top 32-bit
+ * element of Rs2 and then subtracts the result from the result of multiplying the bottom 32-bit
+ * element of Rs1 with the bottom 32-bit element of Rs2.
+ * For the `SMXDS32` instruction, it multiplies the bottom 32-bit element of Rs1 with the top 32-bit
+ * element of Rs2 and then subtracts the result from the result of multiplying the top 32-bit element of
+ * Rs1 with the bottom 32-bit element of Rs2.
+ * The subtraction result is written to Rd. The 32-bit contents of Rs1 and Rs2 are treated as signed
+ * integers.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd = (Rs1.W[1] * Rs2.W[1]) - (Rs1.W[0] * Rs2.W[0]); // SMDS32
+ * Rd = (Rs1.W[0] * Rs2.W[0]) - (Rs1.W[1] * Rs2.W[1]); // SMDRS32
+ * Rd = (Rs1.W[1] * Rs2.W[0]) - (Rs1.W[0] * Rs2.W[1]); // SMXDS32
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_SMXDS32(unsigned long a, unsigned long b)
+{
+    long diff; /* (a.W[1] * b.W[0]) - (a.W[0] * b.W[1]) */
+    __ASM volatile("smxds32 %0, %1, %2" : "=r"(diff) : "r"(a), "r"(b));
+    return diff;
+}
+/* ===== Inline Function End for 4.33.3. SMXDS32 ===== */
+
+/* ===== Inline Function Start for 4.34. SMIN32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_MISC
+ * \brief SMIN32 (SIMD 32-bit Signed Minimum)
+ * \details
+ * **Type**: SIMD (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SMIN32 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 32-bit signed integer elements finding minimum operations simultaneously.
+ *
+ * **Description**:\n
+ * This instruction compares the 32-bit signed integer elements in Rs1 with the 32-bit
+ * signed integer elements in Rs2 and, for each pair, selects the smaller element. The selected
+ * results are written to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.W[x] = (Rs1.W[x] < Rs2.W[x])? Rs1.W[x] : Rs2.W[x];
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_SMIN32(unsigned long a, unsigned long b)
+{
+    register unsigned long rd; /* Rd: per-word signed minimum of a and b */
+    __ASM volatile("smin32 %[rd], %[rs1], %[rs2]" : [rd] "=r"(rd) : [rs1] "r"(a), [rs2] "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 4.34. SMIN32 ===== */
+
+/* ===== Inline Function Start for 4.35.1. SRA32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_SHIFT
+ * \brief SRA32 (SIMD 32-bit Shift Right Arithmetic)
+ * \details
+ * **Type**: SIMD (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SRA32 Rd, Rs1, Rs2
+ * SRA32.u Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 32-bit element arithmetic right shift operations simultaneously. The shift amount is a
+ * variable from a GPR. The `.u` form performs additional rounding up operations on the shifted
+ * results.
+ *
+ * **Description**:\n
+ * The 32-bit data elements in Rs1 are right-shifted arithmetically, that is, the shifted out
+ * bits are filled with the sign-bit of the data elements. The shift amount is specified by the low-order
+ * 5-bits of the value in the Rs2 register. For the rounding operation of the `.u` form, a value of 1 is
+ * added to the most significant discarded bit of each 32-bit data element to calculate the final results.
+ * And the results are written to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * sa = Rs2[4:0];
+ * if (sa > 0) {
+ *   if (`.u` form) { // SRA32.u
+ *     res[31:-1] = SE33(Rs1.W[x][31:sa-1]) + 1;
+ *     Rd.W[x] = res[31:0];
+ *   } else { // SRA32
+ *     Rd.W[x] = SE32(Rs1.W[x][31:sa])
+ *   }
+ * } else {
+ *   Rd = Rs1;
+ * }
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned int type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_SRA32(unsigned long a, unsigned int b)
+{
+    register unsigned long rd; /* Rd: each 32-bit word of a shifted right arithmetically by b[4:0] */
+    __ASM volatile("sra32 %[rd], %[rs1], %[rs2]" : [rd] "=r"(rd) : [rs1] "r"(a), [rs2] "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 4.35.1. SRA32 ===== */
+
+/* ===== Inline Function Start for 4.35.2. SRA32.u ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_SHIFT
+ * \brief SRA32.u (SIMD 32-bit Rounding Shift Right Arithmetic)
+ * \details
+ * **Type**: SIMD (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SRA32 Rd, Rs1, Rs2
+ * SRA32.u Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 32-bit element arithmetic right shift operations simultaneously. The shift amount is a
+ * variable from a GPR. The `.u` form performs additional rounding up operations on the shifted
+ * results.
+ *
+ * **Description**:\n
+ * The 32-bit data elements in Rs1 are right-shifted arithmetically, that is, the shifted out
+ * bits are filled with the sign-bit of the data elements. The shift amount is specified by the low-order
+ * 5-bits of the value in the Rs2 register. For the rounding operation of the `.u` form, a value of 1 is
+ * added to the most significant discarded bit of each 32-bit data element to calculate the final results.
+ * And the results are written to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * sa = Rs2[4:0];
+ * if (sa > 0) {
+ *   if (`.u` form) { // SRA32.u
+ *     res[31:-1] = SE33(Rs1.W[x][31:sa-1]) + 1;
+ *     Rd.W[x] = res[31:0];
+ *   } else { // SRA32
+ *     Rd.W[x] = SE32(Rs1.W[x][31:sa])
+ *   }
+ * } else {
+ *   Rd = Rs1;
+ * }
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned int type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_SRA32_U(unsigned long a, unsigned int b)
+{
+    register unsigned long rd; /* Rd: words of a shifted right arithmetically by b[4:0], with rounding */
+    __ASM volatile("sra32.u %[rd], %[rs1], %[rs2]" : [rd] "=r"(rd) : [rs1] "r"(a), [rs2] "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 4.35.2. SRA32.u ===== */
+
+/* ===== Inline Function Start for 4.36.1. SRAI32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_SHIFT
+ * \brief SRAI32 (SIMD 32-bit Shift Right Arithmetic Immediate)
+ * \details
+ * **Type**: DSP (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SRAI32 Rd, Rs1, imm5u
+ * SRAI32.u Rd, Rs1, imm5u
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 32-bit elements arithmetic right shift operations simultaneously. The shift amount is
+ * an immediate value. The `.u` form performs additional rounding up operations on the shifted
+ * results.
+ *
+ * **Description**:\n
+ * The 32-bit data elements in Rs1 are right-shifted arithmetically, that is, the shifted out
+ * bits are filled with the sign-bit of the 32-bit data elements. The shift amount is specified by the
+ * imm5u constant. For the rounding operation of the `.u` form, a value of 1 is added to the most
+ * significant discarded bit of each 32-bit data to calculate the final results. And the results are written
+ * to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * sa = imm5u[4:0];
+ * if (sa > 0) {
+ *   if (`.u` form) { // SRAI32.u
+ *     res[31:-1] = SE33(Rs1.W[x][31:sa-1]) + 1;
+ *     Rd.W[x] = res[31:0];
+ *   } else { // SRAI32
+ *     Rd.W[x] = SE32(Rs1.W[x][31:sa]);
+ *   }
+ * } else {
+ *   Rd = Rs1;
+ * }
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned int shift amount; encoded as imm5u, so only values 0..31 are meaningful
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_SRAI32(unsigned long a, unsigned int b)
+{
+    register unsigned long result; /* srai32 takes imm5u, not a register: "K" binds b as a 5-bit constant (0..31) */
+    __ASM volatile("srai32 %0, %1, %2" : "=r"(result) : "r"(a), "K"(b));
+    return result;
+}
+/* ===== Inline Function End for 4.36.1. SRAI32 ===== */
+
+/* ===== Inline Function Start for 4.36.2. SRAI32.u ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_SHIFT
+ * \brief SRAI32.u (SIMD 32-bit Rounding Shift Right Arithmetic Immediate)
+ * \details
+ * **Type**: DSP (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SRAI32 Rd, Rs1, imm5u
+ * SRAI32.u Rd, Rs1, imm5u
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 32-bit elements arithmetic right shift operations simultaneously. The shift amount is
+ * an immediate value. The `.u` form performs additional rounding up operations on the shifted
+ * results.
+ *
+ * **Description**:\n
+ * The 32-bit data elements in Rs1 are right-shifted arithmetically, that is, the shifted out
+ * bits are filled with the sign-bit of the 32-bit data elements. The shift amount is specified by the
+ * imm5u constant. For the rounding operation of the `.u` form, a value of 1 is added to the most
+ * significant discarded bit of each 32-bit data to calculate the final results. And the results are written
+ * to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * sa = imm5u[4:0];
+ * if (sa > 0) {
+ *   if (`.u` form) { // SRAI32.u
+ *     res[31:-1] = SE33(Rs1.W[x][31:sa-1]) + 1;
+ *     Rd.W[x] = res[31:0];
+ *   } else { // SRAI32
+ *     Rd.W[x] = SE32(Rs1.W[x][31:sa]);
+ *   }
+ * } else {
+ *   Rd = Rs1;
+ * }
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned int shift amount; encoded as imm5u, so only values 0..31 are meaningful
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_SRAI32_U(unsigned long a, unsigned int b)
+{
+    register unsigned long result; /* srai32.u takes imm5u, not a register: "K" binds b as a 5-bit constant (0..31) */
+    __ASM volatile("srai32.u %0, %1, %2" : "=r"(result) : "r"(a), "K"(b));
+    return result;
+}
+/* ===== Inline Function End for 4.36.2. SRAI32.u ===== */
+
+/* ===== Inline Function Start for 4.37. SRAIW.u ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_NON_SIMD_32B_SHIFT
+ * \brief SRAIW.u (Rounding Shift Right Arithmetic Immediate Word)
+ * \details
+ * **Type**: DSP (RV64 only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SRAIW.u Rd, Rs1, imm5u
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Perform a 32-bit arithmetic right shift operation with rounding. The shift amount is an
+ * immediate value.
+ *
+ * **Description**:\n
+ * This instruction right-shifts the lower 32-bit content of Rs1 arithmetically. The shifted
+ * out bits are filled with the sign-bit Rs1(31) and the shift amount is specified by the imm5u constant.
+ * For the rounding operation, a value of 1 is added to the most significant discarded bit of the data to
+ * calculate the final result. And the result is sign-extended and written to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * sa = imm5u;
+ * if (sa != 0) {
+ *   res[31:-1] = SE33(Rs1[31:(sa-1)]) + 1;
+ *   Rd = SE32(res[31:0]);
+ * } else {
+ *   Rd = SE32(Rs1.W[0]);
+ * }
+ * ~~~
+ *
+ * \param [in]  a    int type of value stored in a
+ * \param [in]  b    unsigned int shift amount; encoded as imm5u, so only values 0..31 are meaningful
+ * \return value stored in long type
+ */
+__STATIC_FORCEINLINE long __RV_SRAIW_U(int a, unsigned int b)
+{
+    register long result; /* sraiw.u takes imm5u, not a register: "K" binds b as a 5-bit constant (0..31) */
+    __ASM volatile("sraiw.u %0, %1, %2" : "=r"(result) : "r"(a), "K"(b));
+    return result;
+}
+/* ===== Inline Function End for 4.37. SRAIW.u ===== */
+
+/* ===== Inline Function Start for 4.38.1. SRL32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_SHIFT
+ * \brief SRL32 (SIMD 32-bit Shift Right Logical)
+ * \details
+ * **Type**: SIMD (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SRL32 Rd, Rs1, Rs2
+ * SRL32.u Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 32-bit element logical right shift operations simultaneously. The shift amount is a
+ * variable from a GPR. The `.u` form performs additional rounding up operations on the shifted
+ * results.
+ *
+ * **Description**:\n
+ * The 32-bit data elements in Rs1 are right-shifted logically, that is, the shifted out bits
+ * are filled with zero. The shift amount is specified by the low-order 5-bits of the value in the Rs2
+ * register. For the rounding operation of the `.u` form, a value of 1 is added to the most significant
+ * discarded bit of each 32-bit data element to calculate the final results. And the results are written to
+ * Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * sa = Rs2[4:0];
+ * if (sa > 0) {
+ *   if (`.u` form) { // SRL32.u
+ *     res[31:-1] = ZE33(Rs1.W[x][31:sa-1]) + 1;
+ *     Rd.W[x] = res[31:0];
+ *   } else { // SRL32
+ *     Rd.W[x] = ZE32(Rs1.W[x][31:sa])
+ *   }
+ * } else {
+ *   Rd = Rs1;
+ * }
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned int type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_SRL32(unsigned long a, unsigned int b)
+{
+    register unsigned long rd; /* Rd: each 32-bit word of a shifted right logically by b[4:0] */
+    __ASM volatile("srl32 %[rd], %[rs1], %[rs2]" : [rd] "=r"(rd) : [rs1] "r"(a), [rs2] "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 4.38.1. SRL32 ===== */
+
+/* ===== Inline Function Start for 4.38.2. SRL32.u ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_SHIFT
+ * \brief SRL32.u (SIMD 32-bit Rounding Shift Right Logical)
+ * \details
+ * **Type**: SIMD (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SRL32 Rd, Rs1, Rs2
+ * SRL32.u Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 32-bit element logical right shift operations simultaneously. The shift amount is a
+ * variable from a GPR. The `.u` form performs additional rounding up operations on the shifted
+ * results.
+ *
+ * **Description**:\n
+ * The 32-bit data elements in Rs1 are right-shifted logically, that is, the shifted out bits
+ * are filled with zero. The shift amount is specified by the low-order 5-bits of the value in the Rs2
+ * register. For the rounding operation of the `.u` form, a value of 1 is added to the most significant
+ * discarded bit of each 32-bit data element to calculate the final results. And the results are written to
+ * Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * sa = Rs2[4:0];
+ * if (sa > 0) {
+ *   if (`.u` form) { // SRL32.u
+ *     res[31:-1] = ZE33(Rs1.W[x][31:sa-1]) + 1;
+ *     Rd.W[x] = res[31:0];
+ *   } else { // SRL32
+ *     Rd.W[x] = ZE32(Rs1.W[x][31:sa])
+ *   }
+ * } else {
+ *   Rd = Rs1;
+ * }
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned int type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_SRL32_U(unsigned long a, unsigned int b)
+{
+    register unsigned long rd; /* Rd: words of a shifted right logically by b[4:0], with rounding */
+    __ASM volatile("srl32.u %[rd], %[rs1], %[rs2]" : [rd] "=r"(rd) : [rs1] "r"(a), [rs2] "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 4.38.2. SRL32.u ===== */
+
+/* ===== Inline Function Start for 4.39.1. SRLI32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_SHIFT
+ * \brief SRLI32 (SIMD 32-bit Shift Right Logical Immediate)
+ * \details
+ * **Type**: SIMD (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SRLI32 Rd, Rs1, imm5u
+ * SRLI32.u Rd, Rs1, imm5u
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 32-bit elements logical right shift operations simultaneously. The shift amount is an
+ * immediate value. The `.u` form performs additional rounding up operations on the shifted results.
+ *
+ * **Description**:\n
+ * The 32-bit data elements in Rs1 are right-shifted logically, that is, the shifted out bits
+ * are filled with zero. The shift amount is specified by the imm5u constant. For the rounding
+ * operation of the `.u` form, a value of 1 is added to the most significant discarded bit of each 32-bit
+ * data to calculate the final results. And the results are written to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * sa = imm5u[4:0];
+ * if (sa > 0) {
+ *   if (`.u` form) { // SRLI32.u
+ *     res[31:-1] = ZE33(Rs1.W[x][31:sa-1]) + 1;
+ *     Rd.W[x] = res[31:0];
+ *   } else { // SRLI32
+ *     Rd.W[x] = ZE32(Rs1.W[x][31:sa]);
+ *   }
+ * } else {
+ *   Rd = Rs1;
+ * }
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned int shift amount; encoded as imm5u, so only values 0..31 are meaningful
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_SRLI32(unsigned long a, unsigned int b)
+{
+    register unsigned long result; /* srli32 takes imm5u, not a register: "K" binds b as a 5-bit constant (0..31) */
+    __ASM volatile("srli32 %0, %1, %2" : "=r"(result) : "r"(a), "K"(b));
+    return result;
+}
+/* ===== Inline Function End for 4.39.1. SRLI32 ===== */
+
+/* ===== Inline Function Start for 4.39.2. SRLI32.u ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_SHIFT
+ * \brief SRLI32.u (SIMD 32-bit Rounding Shift Right Logical Immediate)
+ * \details
+ * **Type**: SIMD (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SRLI32 Rd, Rs1, imm5u
+ * SRLI32.u Rd, Rs1, imm5u
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 32-bit elements logical right shift operations simultaneously. The shift amount is an
+ * immediate value. The `.u` form performs additional rounding up operations on the shifted results.
+ *
+ * **Description**:\n
+ * The 32-bit data elements in Rs1 are right-shifted logically, that is, the shifted out bits
+ * are filled with zero. The shift amount is specified by the imm5u constant. For the rounding
+ * operation of the `.u` form, a value of 1 is added to the most significant discarded bit of each 32-bit
+ * data to calculate the final results. And the results are written to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * sa = imm5u[4:0];
+ * if (sa > 0) {
+ *   if (`.u` form) { // SRLI32.u
+ *     res[31:-1] = ZE33(Rs1.W[x][31:sa-1]) + 1;
+ *     Rd.W[x] = res[31:0];
+ *   } else { // SRLI32
+ *     Rd.W[x] = ZE32(Rs1.W[x][31:sa]);
+ *   }
+ * } else {
+ *   Rd = Rs1;
+ * }
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned int shift amount; encoded as imm5u, so only values 0..31 are meaningful
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_SRLI32_U(unsigned long a, unsigned int b)
+{
+    register unsigned long result; /* srli32.u takes imm5u, not a register: "K" binds b as a 5-bit constant (0..31) */
+    __ASM volatile("srli32.u %0, %1, %2" : "=r"(result) : "r"(a), "K"(b));
+    return result;
+}
+/* ===== Inline Function End for 4.39.2. SRLI32.u ===== */
+
+/* ===== Inline Function Start for 4.40. STAS32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB
+ * \brief STAS32 (SIMD 32-bit Straight Addition & Subtraction)
+ * \details
+ * **Type**: SIMD (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * STAS32 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 32-bit integer element addition and 32-bit integer element subtraction in a 64-bit
+ * chunk simultaneously. Operands are from corresponding 32-bit elements.
+ *
+ * **Description**:\n
+ * This instruction adds the 32-bit integer element in [63:32] of Rs1 with the 32-bit
+ * integer element in [63:32] of Rs2, and writes the result to [63:32] of Rd; at the same time, it subtracts
+ * the 32-bit integer element in [31:0] of Rs2 from the 32-bit integer element in [31:0] of Rs1, and
+ * writes the result to [31:0] of Rd.
+ *
+ * **Note**:\n
+ * This instruction can be used for either signed or unsigned operations.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.W[1] = Rs1.W[1] + Rs2.W[1];
+ * Rd.W[0] = Rs1.W[0] - Rs2.W[0];
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_STAS32(unsigned long a, unsigned long b)
+{
+    register unsigned long rd; /* Rd.W[1] = a.W[1] + b.W[1]; Rd.W[0] = a.W[0] - b.W[0] */
+    __ASM volatile("stas32 %[rd], %[rs1], %[rs2]" : [rd] "=r"(rd) : [rs1] "r"(a), [rs2] "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 4.40. STAS32 ===== */
+
+/* ===== Inline Function Start for 4.41. STSA32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB
+ * \brief STSA32 (SIMD 32-bit Straight Subtraction & Addition)
+ * \details
+ * **Type**: SIMD (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * STSA32 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 32-bit integer element subtraction and 32-bit integer element addition in a 64-bit
+ * chunk simultaneously. Operands are from corresponding 32-bit elements.
+ *
+ * **Description**:\n
+ * This instruction subtracts the 32-bit integer element in [63:32] of Rs2 from the 32-bit integer element in
+ * [63:32] of Rs1, and writes the result to [63:32] of Rd; at the same time, it adds the 32-bit integer element
+ * in [31:0] of Rs1 with the 32-bit integer element in [31:0] of Rs2, and writes the result to [31:0] of Rd.
+ *
+ * **Note**:\n
+ * This instruction can be used for either signed or unsigned operations.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.W[1] = Rs1.W[1] - Rs2.W[1];
+ * Rd.W[0] = Rs1.W[0] + Rs2.W[0];
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_STSA32(unsigned long a, unsigned long b)
+{
+    register unsigned long rd; /* Rd.W[1] = a.W[1] - b.W[1]; Rd.W[0] = a.W[0] + b.W[0] */
+    __ASM volatile("stsa32 %[rd], %[rs1], %[rs2]" : [rd] "=r"(rd) : [rs1] "r"(a), [rs2] "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 4.41. STSA32 ===== */
+
+/* ===== Inline Function Start for 4.42. SUB32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB
+ * \brief SUB32 (SIMD 32-bit Subtraction)
+ * \details
+ * **Type**: DSP (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * SUB32 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 32-bit integer element subtractions simultaneously.
+ *
+ * **Description**:\n
+ * This instruction subtracts the 32-bit integer elements in Rs2 from the 32-bit integer
+ * elements in Rs1, and then writes the results to Rd.
+ *
+ * **Note**:\n
+ * This instruction can be used for either signed or unsigned subtraction.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.W[x] = Rs1.W[x] - Rs2.W[x];
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_SUB32(unsigned long a, unsigned long b)
+{
+    register unsigned long rd; /* Rd.W[x] = a.W[x] - b.W[x] for x = 1, 0 */
+    __ASM volatile("sub32 %[rd], %[rs1], %[rs2]" : [rd] "=r"(rd) : [rs1] "r"(a), [rs2] "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 4.42. SUB32 ===== */
+
+/* ===== Inline Function Start for 4.43. UKADD32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB
+ * \brief UKADD32 (SIMD 32-bit Unsigned Saturating Addition)
+ * \details
+ * **Type**: SIMD (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * UKADD32 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 32-bit unsigned integer element saturating additions simultaneously.
+ *
+ * **Description**:\n
+ * This instruction adds the 32-bit unsigned integer elements in Rs1 with the 32-bit
+ * unsigned integer elements in Rs2. If any of the results are beyond the 32-bit unsigned number
+ * range (0 <= RES <= 2^32-1), they are saturated to the range and the OV bit is set to 1. The saturated
+ * results are written to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * res[x] = Rs1.W[x] + Rs2.W[x];
+ * if (res[x] > (2^32)-1) {
+ *   res[x] = (2^32)-1;
+ *   OV = 1;
+ * }
+ * Rd.W[x] = res[x];
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_UKADD32(unsigned long a, unsigned long b)
+{
+    register unsigned long rd; /* Rd: per-word unsigned sum of a and b, saturated at 2^32-1 */
+    __ASM volatile("ukadd32 %[rd], %[rs1], %[rs2]" : [rd] "=r"(rd) : [rs1] "r"(a), [rs2] "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 4.43. UKADD32 ===== */
+
+/* ===== Inline Function Start for 4.44. UKCRAS32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB
+ * \brief UKCRAS32 (SIMD 32-bit Unsigned Saturating Cross Addition & Subtraction)
+ * \details
+ * **Type**: SIMD (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * UKCRAS32 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do one 32-bit unsigned integer element saturating addition and one 32-bit unsigned
+ * integer element saturating subtraction in a 64-bit chunk simultaneously. Operands are from crossed
+ * 32-bit elements.
+ *
+ * **Description**:\n
+ * This instruction adds the 32-bit unsigned integer element in [63:32] of Rs1 with the 32-
+ * bit unsigned integer element in [31:0] of Rs2; at the same time, it subtracts the 32-bit unsigned
+ * integer element in [63:32] of Rs2 from the 32-bit unsigned integer element in [31:0] Rs1. If any of the
+ * results are beyond the 32-bit unsigned number range (0 <= RES <= 2^32-1), they are saturated to the
+ * range and the OV bit is set to 1. The saturated results are written to [63:32] of Rd for addition and
+ * [31:0] of Rd for subtraction.
+ *
+ * **Operations**:\n
+ * ~~~
+ * res1 = Rs1.W[1] + Rs2.W[0];
+ * res2 = Rs1.W[0] - Rs2.W[1];
+ * if (res1 > (2^32)-1) {
+ *   res1 = (2^32)-1;
+ *   OV = 1;
+ * }
+ * if (res2 < 0) {
+ *   res2 = 0;
+ *   OV = 1;
+ * }
+ * Rd.W[1] = res1;
+ * Rd.W[0] = res2;
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_UKCRAS32(unsigned long a, unsigned long b)
+{
+    register unsigned long rd; /* Rd.W[1] = sat(a.W[1] + b.W[0]); Rd.W[0] = sat(a.W[0] - b.W[1]) */
+    __ASM volatile("ukcras32 %[rd], %[rs1], %[rs2]" : [rd] "=r"(rd) : [rs1] "r"(a), [rs2] "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 4.44. UKCRAS32 ===== */
+
+/* ===== Inline Function Start for 4.45. UKCRSA32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB
+ * \brief UKCRSA32 (SIMD 32-bit Unsigned Saturating Cross Subtraction & Addition)
+ * \details
+ * **Type**: SIMD (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * UKCRSA32 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do one 32-bit unsigned integer element saturating subtraction and one 32-bit unsigned
+ * integer element saturating addition in a 64-bit chunk simultaneously. Operands are from crossed
+ * 32-bit elements.
+ *
+ * **Description**:\n
+ * This instruction subtracts the 32-bit unsigned integer element in [31:0] of Rs2 from the
+ * 32-bit unsigned integer element in [63:32] of Rs1; at the same time, it adds the 32-bit unsigned
+ * integer element in [63:32] of Rs2 with the 32-bit unsigned integer element in [31:0] Rs1. If any of the
+ * results are beyond the 32-bit unsigned number range (0 <= RES <= 2^32-1), they are saturated to the
+ * range and the OV bit is set to 1. The saturated results are written to [63:32] of Rd for subtraction and
+ * [31:0] of Rd for addition.
+ *
+ * **Operations**:\n
+ * ~~~
+ * res1 = Rs1.W[1] - Rs2.W[0];
+ * res2 = Rs1.W[0] + Rs2.W[1];
+ * if (res1 < 0) {
+ *   res1 = 0;
+ *   OV = 1;
+ * }
+ * if (res2 > (2^32)-1) {
+ *   res2 = (2^32)-1; OV = 1;
+ * }
+ * Rd.W[1] = res1;
+ * Rd.W[0] = res2;
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_UKCRSA32(unsigned long a, unsigned long b)
+{
+    register unsigned long rd; /* Rd.W[1] = sat(a.W[1] - b.W[0]); Rd.W[0] = sat(a.W[0] + b.W[1]) */
+    __ASM volatile("ukcrsa32 %[rd], %[rs1], %[rs2]" : [rd] "=r"(rd) : [rs1] "r"(a), [rs2] "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 4.45. UKCRSA32 ===== */
+
+/* ===== Inline Function Start for 4.46. UKSTAS32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB
+ * \brief UKSTAS32 (SIMD 32-bit Unsigned Saturating Straight Addition & Subtraction)
+ * \details
+ * **Type**: SIMD (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * UKSTAS32 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do one 32-bit unsigned integer element saturating addition and one 32-bit unsigned
+ * integer element saturating subtraction in a 64-bit chunk simultaneously. Operands are from
+ * corresponding 32-bit elements.
+ *
+ * **Description**:\n
+ * This instruction adds the 32-bit unsigned integer element in [63:32] of Rs1 with the 32-
+ * bit unsigned integer element in [63:32] of Rs2; at the same time, it subtracts the 32-bit unsigned
+ * integer element in [31:0] of Rs2 from the 32-bit unsigned integer element in [31:0] Rs1. If any of the
+ * results are beyond the 32-bit unsigned number range (0 <= RES <= 2^32-1), they are saturated to the
+ * range and the OV bit is set to 1. The saturated results are written to [63:32] of Rd for addition and
+ * [31:0] of Rd for subtraction.
+ *
+ * **Operations**:\n
+ * ~~~
+ * res1 = Rs1.W[1] + Rs2.W[1];
+ * res2 = Rs1.W[0] - Rs2.W[0];
+ * if (res1 > (2^32)-1) {
+ *   res1 = (2^32)-1;
+ *   OV = 1;
+ * }
+ * if (res2 < 0) {
+ *   res2 = 0;
+ *   OV = 1;
+ * }
+ * Rd.W[1] = res1;
+ * Rd.W[0] = res2;
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_UKSTAS32(unsigned long a, unsigned long b)
+{
+    register unsigned long rd; /* Rd.W[1] = sat(a.W[1] + b.W[1]); Rd.W[0] = sat(a.W[0] - b.W[0]) */
+    __ASM volatile("ukstas32 %[rd], %[rs1], %[rs2]" : [rd] "=r"(rd) : [rs1] "r"(a), [rs2] "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 4.46. UKSTAS32 ===== */
+
+/* ===== Inline Function Start for 4.47. UKSTSA32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB
+ * \brief UKSTSA32 (SIMD 32-bit Unsigned Saturating Straight Subtraction & Addition)
+ * \details
+ * **Type**: SIMD (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * UKSTSA32 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do one 32-bit unsigned integer element saturating subtraction and one 32-bit unsigned
+ * integer element saturating addition in a 64-bit chunk simultaneously. Operands are from
+ * corresponding 32-bit elements.
+ *
+ * **Description**:\n
+ * This instruction subtracts the 32-bit unsigned integer element in [63:32] of Rs2 from
+ * the 32-bit unsigned integer element in [63:32] of Rs1; at the same time, it adds the 32-bit unsigned
+ * integer element in [31:0] of Rs2 with the 32-bit unsigned integer element in [31:0] Rs1. If any of the
+ * results are beyond the 32-bit unsigned number range (0 <= RES <= 2^32-1), they are saturated to the
+ * range and the OV bit is set to 1. The saturated results are written to [63:32] of Rd for subtraction and
+ * [31:0] of Rd for addition.
+ *
+ * **Operations**:\n
+ * ~~~
+ * res1 = Rs1.W[1] - Rs2.W[1];
+ * res2 = Rs1.W[0] + Rs2.W[0];
+ * if (res1 < 0) {
+ *   res1 = 0;
+ *   OV = 1;
+ * }
+ * if (res2 > (2^32)-1) {
+ *   res2 = (2^32)-1; OV = 1;
+ * }
+ * Rd.W[1] = res1;
+ * Rd.W[0] = res2;
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_UKSTSA32(unsigned long a, unsigned long b)
+{
+    register unsigned long rd; /* Rd.W[1] = sat(a.W[1] - b.W[1]); Rd.W[0] = sat(a.W[0] + b.W[0]) */
+    __ASM volatile("ukstsa32 %[rd], %[rs1], %[rs2]" : [rd] "=r"(rd) : [rs1] "r"(a), [rs2] "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 4.47. UKSTSA32 ===== */
+
+/* ===== Inline Function Start for 4.48. UKSUB32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB
+ * \brief UKSUB32 (SIMD 32-bit Unsigned Saturating Subtraction)
+ * \details
+ * **Type**: SIMD (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * UKSUB32 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 32-bit unsigned integer elements saturating subtractions simultaneously.
+ *
+ * **Description**:\n
+ * This instruction subtracts the 32-bit unsigned integer elements in Rs2 from the 32-bit
+ * unsigned integer elements in Rs1. If any of the results are beyond the 32-bit unsigned number
+ * range (0 <= RES <= 2^32-1), they are saturated to the range and the OV bit is set to 1. The saturated
+ * results are written to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * res[x] = Rs1.W[x] - Rs2.W[x];
+ * if (res[x] < 0) {
+ *   res[x] = 0;
+ *   OV = 1;
+ * }
+ * Rd.W[x] = res[x];
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_UKSUB32(unsigned long a, unsigned long b)
+{
+    register unsigned long rd; /* Rd: per-word unsigned difference a - b, saturated at 0 */
+    __ASM volatile("uksub32 %[rd], %[rs1], %[rs2]" : [rd] "=r"(rd) : [rs1] "r"(a), [rs2] "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for 4.48. UKSUB32 ===== */
+
+/* ===== Inline Function Start for 4.49. UMAX32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_MISC
+ * \brief UMAX32 (SIMD 32-bit Unsigned Maximum)
+ * \details
+ * **Type**: SIMD (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * UMAX32 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 32-bit unsigned integer elements finding maximum operations simultaneously.
+ *
+ * **Description**:\n
+ * This instruction compares the 32-bit unsigned integer elements in Rs1 with the 32-bit
+ * unsigned integer elements in Rs2 and, for each pair, selects the greater element. The
+ * selected results are written to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.W[x] = (Rs1.W[x] u> Rs2.W[x])? Rs1.W[x] : Rs2.W[x];
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_UMAX32(unsigned long a, unsigned long b)
+{
+    unsigned long result; /* plain local: the "=r" constraint already selects a GPR */
+    __ASM volatile("umax32 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b));
+    return result;
+}
+/* ===== Inline Function End for 4.49. UMAX32 ===== */
+
+/* ===== Inline Function Start for 4.50. UMIN32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_MISC
+ * \brief UMIN32 (SIMD 32-bit Unsigned Minimum)
+ * \details
+ * **Type**: SIMD (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * UMIN32 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 32-bit unsigned integer elements finding minimum operations simultaneously.
+ *
+ * **Description**:\n
+ * This instruction compares the 32-bit unsigned integer elements in Rs1 with the 32-bit
+ * unsigned integer elements in Rs2 and selects the lesser number of each pair. The
+ * selected results are written to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.W[x] = (Rs1.W[x] <u Rs2.W[x])? Rs1.W[x] : Rs2.W[x];
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_UMIN32(unsigned long a, unsigned long b)
+{
+    unsigned long result; /* plain local: the "=r" constraint already selects a GPR */
+    __ASM volatile("umin32 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b));
+    return result;
+}
+/* ===== Inline Function End for 4.50. UMIN32 ===== */
+
+/* ===== Inline Function Start for 4.51. URADD32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB
+ * \brief URADD32 (SIMD 32-bit Unsigned Halving Addition)
+ * \details
+ * **Type**: SIMD (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * URADD32 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 32-bit unsigned integer element additions simultaneously. The results are halved to
+ * avoid overflow or saturation.
+ *
+ * **Description**:\n
+ * This instruction adds the 32-bit unsigned integer elements in Rs1 with the 32-bit
+ * unsigned integer elements in Rs2. The results are first logically right-shifted by 1 bit and then
+ * written to Rd.
+ *
+ * **Examples**:\n
+ * ~~~
+ * * Ra = 0x7FFFFFFF, Rb = 0x7FFFFFFF, Rt = 0x7FFFFFFF
+ * * Ra = 0x80000000, Rb = 0x80000000, Rt = 0x80000000
+ * * Ra = 0x40000000, Rb = 0x80000000, Rt = 0x60000000
+ * ~~~
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.W[x] = (Rs1.W[x] + Rs2.W[x]) u>> 1;
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_URADD32(unsigned long a, unsigned long b)
+{
+    unsigned long result; /* plain local: the "=r" constraint already selects a GPR */
+    __ASM volatile("uradd32 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b));
+    return result;
+}
+/* ===== Inline Function End for 4.51. URADD32 ===== */
+
+/* ===== Inline Function Start for 4.52. URCRAS32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB
+ * \brief URCRAS32 (SIMD 32-bit Unsigned Halving Cross Addition & Subtraction)
+ * \details
+ * **Type**: SIMD (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * URCRAS32 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 32-bit unsigned integer element addition and 32-bit unsigned integer element
+ * subtraction in a 64-bit chunk simultaneously. Operands are from crossed 32-bit elements. The
+ * results are halved to avoid overflow or saturation.
+ *
+ * **Description**:\n
+ * This instruction adds the 32-bit unsigned integer element in [63:32] of Rs1 with the 32-
+ * bit unsigned integer element in [31:0] of Rs2, and subtracts the 32-bit unsigned integer element in
+ * [63:32] of Rs2 from the 32-bit unsigned integer element in [31:0] of Rs1. The element results are first
+ * logically right-shifted by 1 bit and then written to [63:32] of Rd for addition and [31:0] of Rd for
+ * subtraction.
+ *
+ * **Examples**:\n
+ * ~~~
+ * Please see `URADD32` and `URSUB32` instructions.
+ * ~~~
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.W[1] = (Rs1.W[1] + Rs2.W[0]) u>> 1;
+ * Rd.W[0] = (Rs1.W[0] - Rs2.W[1]) u>> 1;
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_URCRAS32(unsigned long a, unsigned long b)
+{
+    unsigned long result; /* plain local: the "=r" constraint already selects a GPR */
+    __ASM volatile("urcras32 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b));
+    return result;
+}
+/* ===== Inline Function End for 4.52. URCRAS32 ===== */
+
+/* ===== Inline Function Start for 4.53. URCRSA32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB
+ * \brief URCRSA32 (SIMD 32-bit Unsigned Halving Cross Subtraction & Addition)
+ * \details
+ * **Type**: SIMD (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * URCRSA32 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 32-bit unsigned integer element subtraction and 32-bit unsigned integer element
+ * addition in a 64-bit chunk simultaneously. Operands are from crossed 32-bit elements. The results
+ * are halved to avoid overflow or saturation.
+ *
+ * **Description**:\n
+ * This instruction subtracts the 32-bit unsigned integer element in [31:0] of Rs2 from the
+ * 32-bit unsigned integer element in [63:32] of Rs1, and adds the 32-bit unsigned integer element in
+ * [31:0] of Rs1 with the 32-bit unsigned integer element in [63:32] of Rs2. The two results are first
+ * logically right-shifted by 1 bit and then written to [63:32] of Rd for subtraction and [31:0] of Rd for
+ * addition.
+ *
+ * **Examples**:\n
+ * ~~~
+ * Please see `URADD32` and `URSUB32` instructions.
+ * ~~~
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.W[1] = (Rs1.W[1] - Rs2.W[0]) u>> 1;
+ * Rd.W[0] = (Rs1.W[0] + Rs2.W[1]) u>> 1;
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_URCRSA32(unsigned long a, unsigned long b)
+{
+    unsigned long result; /* plain local: the "=r" constraint already selects a GPR */
+    __ASM volatile("urcrsa32 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b));
+    return result;
+}
+/* ===== Inline Function End for 4.53. URCRSA32 ===== */
+
+/* ===== Inline Function Start for 4.54. URSTAS32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB
+ * \brief URSTAS32 (SIMD 32-bit Unsigned Halving Straight Addition & Subtraction)
+ * \details
+ * **Type**: SIMD (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * URSTAS32 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 32-bit unsigned integer element addition and 32-bit unsigned integer element
+ * subtraction in a 64-bit chunk simultaneously. Operands are from corresponding 32-bit elements.
+ * The results are halved to avoid overflow or saturation.
+ *
+ * **Description**:\n
+ * This instruction adds the 32-bit unsigned integer element in [63:32] of Rs1 with the 32-
+ * bit unsigned integer element in [63:32] of Rs2, and subtracts the 32-bit unsigned integer element in
+ * [31:0] of Rs2 from the 32-bit unsigned integer element in [31:0] of Rs1. The element results are first
+ * logically right-shifted by 1 bit and then written to [63:32] of Rd for addition and [31:0] of Rd for
+ * subtraction.
+ *
+ * **Examples**:\n
+ * ~~~
+ * Please see `URADD32` and `URSUB32` instructions.
+ * ~~~
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.W[1] = (Rs1.W[1] + Rs2.W[1]) u>> 1;
+ * Rd.W[0] = (Rs1.W[0] - Rs2.W[0]) u>> 1;
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_URSTAS32(unsigned long a, unsigned long b)
+{
+    unsigned long result; /* plain local: the "=r" constraint already selects a GPR */
+    __ASM volatile("urstas32 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b));
+    return result;
+}
+/* ===== Inline Function End for 4.54. URSTAS32 ===== */
+
+/* ===== Inline Function Start for 4.55. URSTSA32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB
+ * \brief URSTSA32 (SIMD 32-bit Unsigned Halving Straight Subtraction & Addition)
+ * \details
+ * **Type**: SIMD (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * URSTSA32 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 32-bit unsigned integer element subtraction and 32-bit unsigned integer element
+ * addition in a 64-bit chunk simultaneously. Operands are from corresponding 32-bit elements. The
+ * results are halved to avoid overflow or saturation.
+ *
+ * **Description**:\n
+ * This instruction subtracts the 32-bit unsigned integer element in [63:32] of Rs2 from
+ * the 32-bit unsigned integer element in [63:32] of Rs1, and adds the 32-bit unsigned integer element
+ * in [31:0] of Rs1 with the 32-bit unsigned integer element in [31:0] of Rs2. The two results are first
+ * logically right-shifted by 1 bit and then written to [63:32] of Rd for subtraction and [31:0] of Rd for
+ * addition.
+ *
+ * **Examples**:\n
+ * ~~~
+ * Please see `URADD32` and `URSUB32` instructions.
+ * ~~~
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.W[1] = (Rs1.W[1] - Rs2.W[1]) u>> 1;
+ * Rd.W[0] = (Rs1.W[0] + Rs2.W[0]) u>> 1;
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_URSTSA32(unsigned long a, unsigned long b)
+{
+    unsigned long result; /* plain local: the "=r" constraint already selects a GPR */
+    __ASM volatile("urstsa32 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b));
+    return result;
+}
+/* ===== Inline Function End for 4.55. URSTSA32 ===== */
+
+/* ===== Inline Function Start for 4.56. URSUB32 ===== */
+/**
+ * \ingroup NMSIS_Core_DSP_Intrinsic_RV64_SIMD_32B_ADDSUB
+ * \brief URSUB32 (SIMD 32-bit Unsigned Halving Subtraction)
+ * \details
+ * **Type**: SIMD (RV64 Only)
+ *
+ * **Syntax**:\n
+ * ~~~
+ * URSUB32 Rd, Rs1, Rs2
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 32-bit unsigned integer element subtractions simultaneously. The results are halved to
+ * avoid overflow or saturation.
+ *
+ * **Description**:\n
+ * This instruction subtracts the 32-bit unsigned integer elements in Rs2 from the 32-bit
+ * unsigned integer elements in Rs1. The results are first logically right-shifted by 1 bit and then
+ * written to Rd.
+ *
+ * **Examples**:\n
+ * ~~~
+ * * Ra = 0x7FFFFFFF, Rb = 0x80000000, Rt = 0xFFFFFFFF
+ * * Ra = 0x80000000, Rb = 0x7FFFFFFF, Rt = 0x00000000
+ * * Ra = 0x80000000, Rb = 0x40000000, Rt = 0x20000000
+ * ~~~
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.W[x] = (Rs1.W[x] - Rs2.W[x]) u>> 1;
+ * for RV64: x=1...0
+ * ~~~
+ *
+ * \param [in]  a    unsigned long type of value stored in a
+ * \param [in]  b    unsigned long type of value stored in b
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_URSUB32(unsigned long a, unsigned long b)
+{
+    unsigned long result; /* plain local: the "=r" constraint already selects a GPR */
+    __ASM volatile("ursub32 %0, %1, %2" : "=r"(result) : "r"(a), "r"(b));
+    return result;
+}
+/* ===== Inline Function End for 4.56. URSUB32 ===== */
+
+#endif /* __RISCV_XLEN == 64 */
+
+
+#if (__RISCV_XLEN == 32) || defined(__ONLY_FOR_DOXYGEN_DOCUMENT_GENERATION__)
+/* XXXXX Nuclei Extended DSP Instructions for RV32 XXXXX */
+/**
+ * \defgroup NMSIS_Core_DSP_Intrinsic_NUCLEI_CUSTOM      Nuclei Customized DSP Instructions
+ * \ingroup  NMSIS_Core_DSP_Intrinsic
+ * \brief    (RV32 only) Nuclei Customized DSP Instructions
+ * \details  These are Nuclei customized DSP instructions, available on RV32 only
+ */
+/* ===== Inline Function Start for A.1. DKHM8 ===== */
+/**
+ * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_CUSTOM
+ * \brief DKHM8 (64-bit SIMD Signed Saturating Q7 Multiply)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * DKHM8 Rd, Rs1, Rs2
+ * # Rd, Rs1, Rs2 are all even/odd pair of registers
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do Q7xQ7 element multiplications simultaneously. The Q14 results are then reduced to Q7
+ * numbers again.
+ *
+ * **Description**:\n
+ * For the `DKHM8` instruction, multiply the top 8-bit Q7 content of 16-bit chunks in Rs1
+ * with the top 8-bit Q7 content of 16-bit chunks in Rs2. At the same time, multiply the bottom 8-bit Q7
+ * content of 16-bit chunks in Rs1 with the bottom 8-bit Q7 content of 16-bit chunks in Rs2.
+ *
+ * The Q14 results are then right-shifted 7-bits and saturated into Q7 values. The Q7 results are then
+ * written into Rd. When both the two Q7 inputs of a multiplication are 0x80, saturation will happen.
+ * The result will be saturated to 0x7F and the overflow flag OV will be set.
+ *
+ * **Operations**:\n
+ * ~~~
+ * op1t = Rs1.B[x+1]; op2t = Rs2.B[x+1]; // top
+ * op1b = Rs1.B[x]; op2b = Rs2.B[x]; // bottom
+ * for ((aop,bop,res) in [(op1t,op2t,rest), (op1b,op2b,resb)]) {
+ *   if (0x80 != aop | 0x80 != bop) {
+ *     res = (aop s* bop) >> 7;
+ *   } else {
+ *     res= 0x7F;
+ *     OV = 1;
+ *   }
+ * }
+ * Rd.H[x/2] = concat(rest, resb);
+ * for RV32, x=0,2,4,6
+ * ~~~
+ *
+ * \param [in]  a unsigned long long type of value stored in a
+ * \param [in]  b unsigned long long type of value stored in b
+ * \return value stored in unsigned long long type
+ */
+__STATIC_FORCEINLINE unsigned long long __RV_DKHM8(unsigned long long a, unsigned long long b)
+{
+    unsigned long long rd; /* receives the packed 64-bit Rd value */
+    __ASM volatile("dkhm8 %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for A.1. DKHM8 ===== */
+
+/* ===== Inline Function Start for A.2. DKHM16 ===== */
+/**
+ * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_CUSTOM
+ * \brief DKHM16 (64-bit SIMD Signed Saturating Q15 Multiply)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * DKHM16 Rd, Rs1, Rs2
+ * # Rd, Rs1, Rs2 are all even/odd pair of registers
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do Q15xQ15 element multiplications simultaneously. The Q30 results are then reduced to
+ * Q15 numbers again.
+ *
+ * **Description**:\n
+ * For the `DKHM16` instruction, multiply the top 16-bit Q15 content of 32-bit chunks in
+ * Rs1 with the top 16-bit Q15 content of 32-bit chunks in Rs2. At the same time, multiply the bottom
+ * 16-bit Q15 content of 32-bit chunks in Rs1 with the bottom 16-bit Q15 content of 32-bit chunks in
+ * Rs2.
+ *
+ * The Q30 results are then right-shifted 15-bits and saturated into Q15 values. The Q15 results are
+ * then written into Rd. When both the two Q15 inputs of a multiplication are 0x8000, saturation will
+ * happen. The result will be saturated to 0x7FFF and the overflow flag OV will be set.
+ *
+ * **Operations**:\n
+ * ~~~
+ * op1t = Rs1.H[x+1]; op2t = Rs2.H[x+1]; // top
+ * op1b = Rs1.H[x]; op2b = Rs2.H[x]; // bottom
+ * for ((aop,bop,res) in [(op1t,op2t,rest), (op1b,op2b,resb)]) {
+ *   if (0x8000 != aop | 0x8000 != bop) {
+ *     res = (aop s* bop) >> 15;
+ *   } else {
+ *     res= 0x7FFF;
+ *     OV = 1;
+ *   }
+ * }
+ * Rd.W[x/2] = concat(rest, resb);
+ * for RV32: x=0, 2
+ * ~~~
+ *
+ * \param [in]  a unsigned long long type of value stored in a
+ * \param [in]  b unsigned long long type of value stored in b
+ * \return value stored in unsigned long long type
+ */
+__STATIC_FORCEINLINE unsigned long long __RV_DKHM16(unsigned long long a, unsigned long long b)
+{
+    unsigned long long rd; /* receives the packed 64-bit Rd value */
+    __ASM volatile("dkhm16 %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for A.2. DKHM16 ===== */
+
+/* ===== Inline Function Start for A.3. DKABS8 ===== */
+/**
+ * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_CUSTOM
+ * \brief DKABS8 (64-bit SIMD 8-bit Saturating Absolute)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * DKABS8 Rd, Rs1
+ * # Rd, Rs1 are all even/odd pair of registers
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Get the absolute value of 8-bit signed integer elements simultaneously.
+ *
+ * **Description**:\n
+ * This instruction calculates the absolute value of 8-bit signed integer elements stored
+ * in Rs1 and writes the element results to Rd. If the input number is 0x80, this instruction generates
+ * 0x7f as the output and sets the OV bit to 1.
+ *
+ * **Operations**:\n
+ * ~~~
+ * src = Rs1.B[x];
+ * if (src == 0x80) {
+ *   src = 0x7f;
+ *   OV = 1;
+ * } else if (src[7] == 1) {
+ *   src = -src;
+ * }
+ * Rd.B[x] = src;
+ * for RV32: x=7...0,
+ * ~~~
+ *
+ * \param [in]  a unsigned long long type of value stored in a
+ * \return value stored in unsigned long long type
+ */
+__STATIC_FORCEINLINE unsigned long long __RV_DKABS8(unsigned long long a)
+{
+    unsigned long long rd; /* receives the packed 64-bit Rd value */
+    __ASM volatile("dkabs8 %0, %1" : "=r"(rd) : "r"(a));
+    return rd;
+}
+/* ===== Inline Function End for A.3. DKABS8 ===== */
+
+/* ===== Inline Function Start for A.4. DKABS16 ===== */
+/**
+ * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_CUSTOM
+ * \brief DKABS16 (64-bit SIMD 16-bit Saturating Absolute)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * DKABS16 Rd, Rs1
+ * # Rd, Rs1 are all even/odd pair of registers
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Get the absolute value of 16-bit signed integer elements simultaneously.
+ *
+ * **Description**:\n
+ * This instruction calculates the absolute value of 16-bit signed integer elements stored
+ * in Rs1 and writes the element results to Rd. If the input number is 0x8000, this instruction
+ * generates 0x7fff as the output and sets the OV bit to 1.
+ *
+ * **Operations**:\n
+ * ~~~
+ * src = Rs1.H[x];
+ * if (src == 0x8000) {
+ *   src = 0x7fff;
+ *   OV = 1;
+ * } else if (src[15] == 1) {
+ *   src = -src;
+ * }
+ * Rd.H[x] = src;
+ * for RV32: x=3...0,
+ * ~~~
+ *
+ * \param [in]  a unsigned long long type of value stored in a
+ * \return value stored in unsigned long long type
+ */
+__STATIC_FORCEINLINE unsigned long long __RV_DKABS16(unsigned long long a)
+{
+    unsigned long long rd; /* receives the packed 64-bit Rd value */
+    __ASM volatile("dkabs16 %0, %1" : "=r"(rd) : "r"(a));
+    return rd;
+}
+/* ===== Inline Function End for A.4. DKABS16 ===== */
+
+/* ===== Inline Function Start for A.5. DKSLRA8 ===== */
+/**
+ * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_CUSTOM
+ * \brief DKSLRA8 (64-bit SIMD 8-bit Shift Left Logical with Saturation or Shift Right Arithmetic)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * DKSLRA8 Rd, Rs1, Rs2
+ * # Rd, Rs1 are all even/odd pair of registers
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 8-bit elements logical left (positive) or arithmetic right (negative) shift operation with
+ * Q7 saturation for the left shift.
+ *
+ * **Description**:\n
+ * The 8-bit data elements of Rs1 are left-shifted logically or right-shifted arithmetically
+ * based on the value of Rs2[3:0]. Rs2[3:0] is in the signed range of [-2^3, 2^3-1]. A positive Rs2[3:0] means
+ * logical left shift and a negative Rs2[3:0] means arithmetic right shift. The shift amount is the
+ * absolute value of Rs2[3:0]. However, the behavior of `Rs2[3:0]==-2^3 (0x8)` is defined to be
+ * equivalent to the behavior of `Rs2[3:0]==-(2^3-1) (0x9)`.
+ * The left-shifted results are saturated to the 8-bit signed integer range of [-2^7, 2^7-1].
+ * If any saturation happens, this instruction sets the OV flag. The value of Rs2[31:4] will not affect
+ * this instruction.
+ *
+ * **Operations**:\n
+ * ~~~
+ * if (Rs2[3:0] < 0) {
+ *   sa = -Rs2[3:0];
+ *   sa = (sa == 8)? 7 : sa;
+ *   Rd.B[x] = SE8(Rs1.B[x][7:sa]);
+ * } else {
+ *   sa = Rs2[2:0];
+ *   res[(7+sa):0] = Rs1.B[x] <<(logic) sa;
+ *   if (res > (2^7)-1) {
+ *     res[7:0] = 0x7f; OV = 1;
+ *   } else if (res < -2^7) {
+ *     res[7:0] = 0x80; OV = 1;
+ *   }
+ *   Rd.B[x] = res[7:0];
+ * }
+ * for RV32: x=7...0,
+ * ~~~
+ *
+ * \param [in]  a unsigned long long type of value stored in a
+ * \param [in]  b int type of value stored in b
+ * \return value stored in unsigned long long type
+ */
+__STATIC_FORCEINLINE unsigned long long __RV_DKSLRA8(unsigned long long a, int b)
+{
+    unsigned long long rd; /* receives the packed 64-bit Rd value */
+    __ASM volatile("dkslra8 %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for A.5. DKSLRA8 ===== */
+
+/* ===== Inline Function Start for A.6. DKSLRA16 ===== */
+/**
+ * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_CUSTOM
+ * \brief DKSLRA16 (64-bit SIMD 16-bit Shift Left Logical with Saturation or Shift Right Arithmetic)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * DKSLRA16 Rd, Rs1, Rs2
+ * # Rd, Rs1 are all even/odd pair of registers
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 16-bit elements logical left (positive) or arithmetic right (negative) shift operation with
+ * Q15 saturation for the left shift.
+ *
+ * **Description**:\n
+ * The 16-bit data elements of Rs1 are left-shifted logically or right-shifted arithmetically
+ * based on the value of Rs2[4:0]. Rs2[4:0] is in the signed range of [-2^4, 2^4-1]. A positive Rs2[4:0] means
+ * logical left shift and a negative Rs2[4:0] means arithmetic right shift. The shift amount is the
+ * absolute value of Rs2[4:0]. However, the behavior of `Rs2[4:0]==-2^4 (0x10)` is defined to be
+ * equivalent to the behavior of `Rs2[4:0]==-(2^4-1) (0x11)`.
+ * The left-shifted results are saturated to the 16-bit signed integer range of [-2^15, 2^15-1].
+ * After the shift, saturation, or rounding, the final results are written to
+ * Rd. If any saturation happens, this instruction sets the OV flag. The value of Rs2[31:5] will not affect
+ * this instruction.
+ *
+ * **Operations**:\n
+ * ~~~
+ * if (Rs2[4:0] < 0) {
+ *   sa = -Rs2[4:0];
+ *   sa = (sa == 16)? 15 : sa;
+ *   Rd.H[x] = SE16(Rs1.H[x][15:sa]);
+ * } else {
+ *   sa = Rs2[3:0];
+ *   res[(15+sa):0] = Rs1.H[x] <<(logic) sa;
+ *   if (res > (2^15)-1) {
+ *     res[15:0] = 0x7fff; OV = 1;
+ *   } else if (res < -2^15) {
+ *     res[15:0] = 0x8000; OV = 1;
+ *   }
+ *   Rd.H[x] = res[15:0];
+ * }
+ * for RV32: x=3...0,
+ * ~~~
+ *
+ * \param [in]  a unsigned long long type of value stored in a
+ * \param [in]  b int type of value stored in b
+ * \return value stored in unsigned long long type
+ */
+__STATIC_FORCEINLINE unsigned long long __RV_DKSLRA16(unsigned long long a, int b)
+{
+    unsigned long long rd; /* receives the packed 64-bit Rd value */
+    __ASM volatile("dkslra16 %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for A.6. DKSLRA16 ===== */
+
+/* ===== Inline Function Start for A.7. DKADD8 ===== */
+/**
+ * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_CUSTOM
+ * \brief DKADD8 (64-bit SIMD 8-bit Signed Saturating Addition)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * DKADD8 Rd, Rs1, Rs2
+ * # Rd, Rs1, Rs2 are all even/odd pair of registers
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 8-bit signed integer element saturating additions simultaneously.
+ *
+ * **Description**:\n
+ * This instruction adds the 8-bit signed integer elements in Rs1 with the 8-bit signed
+ * integer elements in Rs2. If any of the results are beyond the Q7 number range (-2^7 <= Q7 <= 2^7-1), they
+ * are saturated to the range and the OV bit is set to 1. The saturated results are written to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * res[x] = Rs1.B[x] + Rs2.B[x];
+ * if (res[x] > 127) {
+ *   res[x] = 127;
+ *   OV = 1;
+ * } else if (res[x] < -128) {
+ *   res[x] = -128;
+ *   OV = 1;
+ * }
+ * Rd.B[x] = res[x];
+ * for RV32: x=7...0,
+ * ~~~
+ *
+ * \param [in]  a unsigned long long type of value stored in a
+ * \param [in]  b unsigned long long type of value stored in b
+ * \return value stored in unsigned long long type
+ */
+__STATIC_FORCEINLINE unsigned long long __RV_DKADD8(unsigned long long a, unsigned long long b)
+{
+    unsigned long long rd; /* receives the packed 64-bit Rd value */
+    __ASM volatile("dkadd8 %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for A.7. DKADD8 ===== */
+
+/* ===== Inline Function Start for A.8. DKADD16 ===== */
+/**
+ * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_CUSTOM
+ * \brief DKADD16 (64-bit SIMD 16-bit Signed Saturating Addition)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * DKADD16 Rd, Rs1, Rs2
+ * # Rd, Rs1, Rs2 are all even/odd pair of registers
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 16-bit signed integer element saturating additions simultaneously.
+ *
+ * **Description**:\n
+ * This instruction adds the 16-bit signed integer elements in Rs1 with the 16-bit signed
+ * integer elements in Rs2. If any of the results are beyond the Q15 number range (-2^15 <= Q15 <= 2^15-1),
+ * they are saturated to the range and the OV bit is set to 1. The saturated results are written to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * res[x] = Rs1.H[x] + Rs2.H[x];
+ * if (res[x] > 32767) {
+ *   res[x] = 32767;
+ *   OV = 1;
+ * } else if (res[x] < -32768) {
+ *   res[x] = -32768;
+ *   OV = 1;
+ * }
+ * Rd.H[x] = res[x];
+ * for RV32: x=3...0,
+ * ~~~
+ *
+ * \param [in]  a unsigned long long type of value stored in a
+ * \param [in]  b unsigned long long type of value stored in b
+ * \return value stored in unsigned long long type
+ */
+__STATIC_FORCEINLINE unsigned long long __RV_DKADD16(unsigned long long a, unsigned long long b)
+{
+    unsigned long long rd; /* receives the packed 64-bit Rd value */
+    __ASM volatile("dkadd16 %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for A.8. DKADD16 ===== */
+
+/* ===== Inline Function Start for A.9. DKSUB8 ===== */
+/**
+ * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_CUSTOM
+ * \brief DKSUB8 (64-bit SIMD 8-bit Signed Saturating Subtraction)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * DKSUB8 Rd, Rs1, Rs2
+ * # Rd, Rs1, Rs2 are all even/odd pair of registers
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 8-bit signed elements saturating subtractions simultaneously.
+ *
+ * **Description**:\n
+ * This instruction subtracts the 8-bit signed integer elements in Rs2 from the 8-bit
+ * signed integer elements in Rs1. If any of the results are beyond the Q7 number range (-2^7 <= Q7 <= 2^7-1),
+ * they are saturated to the range and the OV bit is set to 1. The saturated results are written to Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * res[x] = Rs1.B[x] - Rs2.B[x];
+ * if (res[x] > (2^7)-1) {
+ *   res[x] = (2^7)-1;
+ *   OV = 1;
+ * } else if (res[x] < -2^7) {
+ *   res[x] = -2^7;
+ *   OV = 1;
+ * }
+ * Rd.B[x] = res[x];
+ * for RV32: x=7...0,
+ * ~~~
+ *
+ * \param [in]  a unsigned long long type of value stored in a
+ * \param [in]  b unsigned long long type of value stored in b
+ * \return value stored in unsigned long long type
+ */
+__STATIC_FORCEINLINE unsigned long long __RV_DKSUB8(unsigned long long a, unsigned long long b)
+{
+    unsigned long long rd; /* receives the packed 64-bit Rd value */
+    __ASM volatile("dksub8 %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for A.9. DKSUB8 ===== */
+
+/* ===== Inline Function Start for A.10. DKSUB16 ===== */
+/**
+ * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_CUSTOM
+ * \brief DKSUB16 (64-bit SIMD 16-bit Signed Saturating Subtraction)
+ * \details
+ * **Type**: SIMD
+ *
+ * **Syntax**:\n
+ * ~~~
+ * DKSUB16 Rd, Rs1, Rs2
+ * # Rd, Rs1, Rs2 are all even/odd pair of registers
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Do 16-bit signed integer elements saturating subtractions simultaneously.
+ *
+ * **Description**:\n
+ * This instruction subtracts the 16-bit signed integer elements in Rs2 from the 16-bit
+ * signed integer elements in Rs1. If any of the results are beyond the Q15 number range (-2^15 <= Q15 <=
+ * 2^15-1), they are saturated to the range and the OV bit is set to 1. The saturated results are written to
+ * Rd.
+ *
+ * **Operations**:\n
+ * ~~~
+ * res[x] = Rs1.H[x] - Rs2.H[x];
+ * if (res[x] > (2^15)-1) {
+ *   res[x] = (2^15)-1;
+ *   OV = 1;
+ * } else if (res[x] < -2^15) {
+ *   res[x] = -2^15;
+ *   OV = 1;
+ * }
+ * Rd.H[x] = res[x];
+ * for RV32: x=3...0,
+ * ~~~
+ *
+ * \param [in]  a unsigned long long type of value stored in a
+ * \param [in]  b unsigned long long type of value stored in b
+ * \return value stored in unsigned long long type
+ */
+__STATIC_FORCEINLINE unsigned long long __RV_DKSUB16(unsigned long long a, unsigned long long b)
+{
+    unsigned long long rd; /* receives the packed 64-bit Rd value */
+    __ASM volatile("dksub16 %0, %1, %2" : "=r"(rd) : "r"(a), "r"(b));
+    return rd;
+}
+/* ===== Inline Function End for A.10. DKSUB16 ===== */
+
+/* ===== Inline Function Start for A.11.1. EXPD80 ===== */
+/**
+ * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_CUSTOM
+ * \brief EXPD80 (Expand and Copy Byte 0 to 32bit)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * EXPD80 Rd, Rs1
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Copy 8-bit data from 32-bit chunks into 4 bytes in a register.
+ *
+ * **Description**:\n
+ * Moves Rs1.B[0][7:0] to Rd.B[0][7:0], Rd.B[1][7:0], Rd.B[2][7:0], Rd.B[3][7:0]
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.W[x][31:0] = CONCAT(Rs1.B[0][7:0], Rs1.B[0][7:0], Rs1.B[0][7:0], Rs1.B[0][7:0]);
+ * for RV32: x=0
+ * ~~~
+ *
+ * \param [in]  a unsigned long type of value stored in a
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_EXPD80(unsigned long a)
+{
+    unsigned long rd; /* receives the expanded Rd value */
+    __ASM volatile("expd80 %0, %1" : "=r"(rd) : "r"(a));
+    return rd;
+}
+/* ===== Inline Function End for A.11.1. EXPD80 ===== */
+
+/* ===== Inline Function Start for A.11.2. EXPD81 ===== */
+/**
+ * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_CUSTOM
+ * \brief EXPD81 (Expand and Copy Byte 1 to 32bit)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * EXPD81 Rd, Rs1
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Copy 8-bit data from 32-bit chunks into 4 bytes in a register.
+ *
+ * **Description**:\n
+ * Moves Rs1.B[1][7:0] to Rd.B[0][7:0], Rd.B[1][7:0], Rd.B[2][7:0], Rd.B[3][7:0]
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.W[x][31:0] = CONCAT(Rs1.B[1][7:0], Rs1.B[1][7:0], Rs1.B[1][7:0], Rs1.B[1][7:0]);
+ * for RV32: x=0
+ * ~~~
+ *
+ * \param [in]  a unsigned long type of value stored in a
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_EXPD81(unsigned long a)
+{
+    unsigned long rd; /* receives the expanded Rd value */
+    __ASM volatile("expd81 %0, %1" : "=r"(rd) : "r"(a));
+    return rd;
+}
+/* ===== Inline Function End for A.11.2. EXPD81 ===== */
+
+/* ===== Inline Function Start for A.11.3. EXPD82 ===== */
+/**
+ * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_CUSTOM
+ * \brief EXPD82 (Expand and Copy Byte 2 to 32bit)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * EXPD82 Rd, Rs1
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Copy 8-bit data from 32-bit chunks into 4 bytes in a register.
+ *
+ * **Description**:\n
+ * Moves Rs1.B[2][7:0] to Rd.B[0][7:0], Rd.B[1][7:0], Rd.B[2][7:0], Rd.B[3][7:0]
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.W[x][31:0] = CONCAT(Rs1.B[2][7:0], Rs1.B[2][7:0], Rs1.B[2][7:0], Rs1.B[2][7:0]);
+ * for RV32: x=0
+ * ~~~
+ *
+ * \param [in]  a unsigned long type of value stored in a
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_EXPD82(unsigned long a)
+{
+    unsigned long rd; /* receives the expanded Rd value */
+    __ASM volatile("expd82 %0, %1" : "=r"(rd) : "r"(a));
+    return rd;
+}
+/* ===== Inline Function End for A.11.3. EXPD82 ===== */
+
+/* ===== Inline Function Start for A.11.4. EXPD83 ===== */
+/**
+ * \ingroup  NMSIS_Core_DSP_Intrinsic_NUCLEI_CUSTOM
+ * \brief EXPD83 (Expand and Copy Byte 3 to 32bit)
+ * \details
+ * **Type**: DSP
+ *
+ * **Syntax**:\n
+ * ~~~
+ * EXPD83 Rd, Rs1
+ * ~~~
+ *
+ * **Purpose**:\n
+ * Copy 8-bit data from 32-bit chunks into 4 bytes in a register.
+ *
+ * **Description**:\n
+ * Moves Rs1.B[3][7:0] to Rd.B[0][7:0], Rd.B[1][7:0], Rd.B[2][7:0], Rd.B[3][7:0]
+ *
+ * **Operations**:\n
+ * ~~~
+ * Rd.W[x][31:0] = CONCAT(Rs1.B[3][7:0], Rs1.B[3][7:0], Rs1.B[3][7:0], Rs1.B[3][7:0]);
+ * for RV32: x=0
+ * ~~~
+ *
+ * \param [in]  a unsigned long type of value stored in a
+ * \return value stored in unsigned long type
+ */
+__STATIC_FORCEINLINE unsigned long __RV_EXPD83(unsigned long a)
+{
+    unsigned long rd;  /* destination: byte 3 of a copied into all four bytes */
+    __ASM volatile("expd83 %0, %1" : "=r"(rd) : "r"(a));
+    return rd;
+}
+/* ===== Inline Function End for A11.4. EXPD83 ===== */
+#endif /* __RISCV_XLEN == 32 */
+
+#if defined(__RISCV_FEATURE_DSP) && (__RISCV_FEATURE_DSP == 1)
+/* XXXXX ARM Compatible SIMD API XXXXX */
+/** \brief Q setting quad 8-bit saturating addition. */
+#define __QADD8(x, y)               __RV_KADD8((x), (y))
+/** \brief Q setting quad 8-bit saturating subtract. */
+#define __QSUB8(x, y)               __RV_KSUB8((x), (y))
+/** \brief Q setting dual 16-bit saturating addition. */
+#define __QADD16(x, y)              __RV_KADD16((x), (y))
+/** \brief Dual 16-bit signed addition with halved results. */
+#define __SHADD16(x, y)             __RV_RADD16((x), (y))
+/** \brief Q setting dual 16-bit saturating subtract. */
+#define __QSUB16(x, y)              __RV_KSUB16((x), (y))
+/** \brief Dual 16-bit signed subtraction with halved results. */
+#define __SHSUB16(x, y)             __RV_RSUB16((x), (y))
+/** \brief Q setting dual 16-bit add and subtract with exchange. */
+#define __QASX(x, y)                __RV_KCRAS16((x), (y))
+/** \brief Dual 16-bit signed addition and subtraction with halved results. */
+#define __SHASX(x, y)               __RV_RCRAS16((x), (y))
+/** \brief Q setting dual 16-bit subtract and add with exchange. */
+#define __QSAX(x, y)                __RV_KCRSA16((x), (y))
+/** \brief Dual 16-bit signed subtraction and addition with halved results. */
+#define __SHSAX(x, y)               __RV_RCRSA16((x), (y))
+/** \brief Dual 16-bit signed multiply with exchange returning difference. Note: x and y are passed to __RV_SMXDS in swapped order. */
+#define __SMUSDX(x, y)              __RV_SMXDS((y), (x))
+/** \brief Q setting sum of dual 16-bit signed multiply with exchange (wraps __RV_KMXDA; result cast to int32_t). */
+__STATIC_FORCEINLINE int32_t __SMUADX (int32_t op1, int32_t op2)
+{
+    return (int32_t)__RV_KMXDA(op1, op2);
+}
+/** \brief Q setting saturating add. */
+#define __QADD(x, y)                __RV_KADDW((x), (y))
+/** \brief Q setting saturating subtract. */
+#define __QSUB(x, y)                __RV_KSUBW((x), (y))
+/** \brief Q setting dual 16-bit signed multiply with single 32-bit accumulator (wraps __RV_KMADA; the accumulator op3 is passed as its first argument). */
+__STATIC_FORCEINLINE int32_t __SMLAD(int32_t op1, int32_t op2, int32_t op3)
+{
+    return (int32_t)__RV_KMADA(op3, op1, op2);
+}
+/** \brief Q setting pre-exchanged dual 16-bit signed multiply with single 32-bit accumulator (wraps __RV_KMAXDA; the accumulator op3 is passed as its first argument). */
+__STATIC_FORCEINLINE int32_t __SMLADX(int32_t op1, int32_t op2, int32_t op3)
+{
+    return (int32_t)__RV_KMAXDA(op3, op1, op2);
+}
+/** \brief Q setting dual 16-bit signed multiply with exchange subtract with 32-bit accumulate (computed as op3 minus __RV_SMXDS(op1, op2)). */
+__STATIC_FORCEINLINE int32_t __SMLSDX(int32_t op1, int32_t op2, int32_t op3)
+{
+    return (op3 - (int32_t)__RV_SMXDS(op1, op2));
+}
+/** \brief Dual 16-bit signed multiply with single 64-bit accumulator (wraps __RV_SMALDA; the accumulator acc is passed as its first argument). */
+__STATIC_FORCEINLINE int64_t __SMLALD(int32_t op1, int32_t op2, int64_t acc)
+{
+    return (int64_t)__RV_SMALDA(acc, op1, op2);
+}
+/** \brief Dual 16-bit signed multiply with exchange with single 64-bit accumulator (wraps __RV_SMALXDA; the accumulator acc is passed as its first argument). */
+__STATIC_FORCEINLINE int64_t __SMLALDX(int32_t op1, int32_t op2, int64_t acc)
+{
+    return (int64_t)__RV_SMALXDA(acc, op1, op2);
+}
+/** \brief Q setting sum of dual 16-bit signed multiply (wraps __RV_KMDA; result cast to int32_t). */
+__STATIC_FORCEINLINE int32_t __SMUAD(int32_t op1, int32_t op2)
+{
+    return (int32_t)__RV_KMDA(op1, op2);
+}
+/** \brief Dual 16-bit signed multiply returning difference (wraps __RV_SMDRS; result cast to int32_t). */
+__STATIC_FORCEINLINE int32_t __SMUSD(int32_t op1, int32_t op2)
+{
+    return (int32_t)__RV_SMDRS(op1, op2);
+}
+/** \brief Dual extract 8-bits and sign extend each to 16-bits. */
+#define __SXTB16(x)             __RV_SUNPKD820(x)
+/** \brief Dual extracted 8-bit to 16-bit signed addition: unpacks op2 with __RV_SUNPKD820 (same bytes as __SXTB16) and adds the halfwords to op1. TODO Need test */
+__STATIC_FORCEINLINE int32_t __SXTAB16(uint32_t op1, uint32_t op2)
+{
+    return __RV_ADD16(op1, __RV_SUNPKD820(op2));
+}
+/** \brief 32-bit signed multiply with 32-bit truncated accumulator: returns op3 plus __RV_SMMUL(op1, op2). */
+__STATIC_FORCEINLINE int32_t __SMMLA(int32_t op1, int32_t op2, int32_t op3)
+{
+    /* Narrow the intrinsic's result to 32 bits before accumulating. */
+    const int32_t product = (int32_t)__RV_SMMUL(op1, op2);
+    return op3 + product;
+}
+#define __DKHM8                 __RV_DKHM8
+#define __DKHM16                __RV_DKHM16
+#define __DKSUB16               __RV_DKSUB16
+#define __SMAQA                 __RV_SMAQA
+#define __MULSR64               __RV_MULSR64
+#define __DQADD8                __RV_DKADD8
+#define __DQSUB8                __RV_DKSUB8
+#define __DKADD16               __RV_DKADD16
+#define __PKBB16                __RV_PKBB16
+#define __DKSLRA16              __RV_DKSLRA16
+#define __DKSLRA8               __RV_DKSLRA8
+#define __KABSW                 __RV_KABSW
+#define __DKABS8                __RV_DKABS8
+#define __DKABS16               __RV_DKABS16
+#define __SMALDA                __RV_SMALDA
+#define __SMSLDA                __RV_SMSLDA
+#define __SMALBB                __RV_SMALBB
+#define __SUB64                 __RV_SUB64
+#define __ADD64                 __RV_ADD64
+#define __SMBB16                __RV_SMBB16
+#define __SMBT16                __RV_SMBT16
+#define __SMTT16                __RV_SMTT16
+#define __EXPD80                __RV_EXPD80
+#define __SMAX8                 __RV_SMAX8
+#define __SMAX16                __RV_SMAX16
+#define __PKTT16                __RV_PKTT16
+#define __KADD16                __RV_KADD16
+#define __SADD16                __RV_ADD16
+
+#endif /* (__RISCV_FEATURE_DSP == 1) */
+
+#endif /* defined(__DSP_PRESENT) && (__DSP_PRESENT == 1) */
+
+/** \brief Halfword packing. Combines bits[15:0] of ARG1 with bits[31:16] of ARG2 left-shifted by ARG3. */
+#define __PKHBT(ARG1,ARG2,ARG3)          ( ((((uint32_t)(ARG1))          ) & 0x0000FFFFUL) |  \
+                                           ((((uint32_t)(ARG2)) << (ARG3)) & 0xFFFF0000UL)  )
+/** \brief Halfword packing. Combines bits[31:16] of ARG1 with bits[15:0] of ARG2 right-shifted by ARG3 (logical shift on the uint32_t value). */
+#define __PKHTB(ARG1,ARG2,ARG3)          ( ((((uint32_t)(ARG1))          ) & 0xFFFF0000UL) |  \
+                                           ((((uint32_t)(ARG2)) >> (ARG3)) & 0x0000FFFFUL)  )
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __CORE_FEATURE_DSP__ */
diff --git a/external/arch/riscv/nuclei/NMSIS/Core/Include/core_feature_eclic.h b/external/arch/riscv/nuclei/NMSIS/Core/Include/core_feature_eclic.h
new file mode 100644
index 000000000..e2075471a
--- /dev/null
+++ b/external/arch/riscv/nuclei/NMSIS/Core/Include/core_feature_eclic.h
@@ -0,0 +1,897 @@
+/*
+ * Copyright (c) 2019 Nuclei Limited. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __CORE_FEATURE_ECLIC__
+#define __CORE_FEATURE_ECLIC__
+/*!
+ * @file     core_feature_eclic.h
+ * @brief    ECLIC feature API header file for Nuclei N/NX Core
+ */
+/*
+ * ECLIC Feature Configuration Macro:
+ * 1. __ECLIC_PRESENT:  Define whether Enhanced Core Local Interrupt Controller (ECLIC) Unit is present or not
+ *   * 0: Not present
+ *   * 1: Present
+ * 2. __ECLIC_BASEADDR:  Base address of the ECLIC unit.
+ * 3. ECLIC_GetInfoCtlbits():  Returns the number of hardware bits that are actually implemented in the clicintctl registers.
+ *   Valid number is 1 - 8.
+ * 4. __ECLIC_INTNUM  : Define the external interrupt number of ECLIC Unit
+ *
+ */
+#ifdef __cplusplus
+ extern "C" {
+#endif
+
+#if defined(__ECLIC_PRESENT) && (__ECLIC_PRESENT == 1)
+/**
+ * \defgroup NMSIS_Core_ECLIC_Registers     Register Define and Type Definitions Of ECLIC
+ * \ingroup NMSIS_Core_Registers
+ * \brief   Type definitions and defines for eclic registers.
+ *
+ * @{
+ */
+
+/**
+ * \brief  Union type to access CLICCFG configure register.
+ */
+typedef union
+{
+    struct {
+        uint8_t _reserved0:1;                   /*!< bit:     0   Reserved */
+        uint8_t nlbits:4;                       /*!< bit:  1..4   nlbits, width of the level field in CLICINTCTL[i] (matches CLIC_CLICCFG_NLBIT_Pos/_Msk) */
+        uint8_t _reserved1:2;                   /*!< bit:  5..6   Reserved */
+        uint8_t _reserved2:1;                   /*!< bit:     7   Reserved */
+    } b;                                        /*!< Structure used for bit  access */
+    uint8_t w;                                  /*!< Type      used for byte access */
+} CLICCFG_Type;
+
+/**
+ * \brief  Union type to access CLICINFO information register (bit-fields total exactly 32 bits).
+ */
+typedef union {
+    struct {
+        uint32_t numint:13;                     /*!< bit:  0..12   number of maximum interrupt inputs supported */
+        uint32_t version:8;                     /*!< bit:  13..20  20:17 for architecture version,16:13 for implementation version */
+        uint32_t intctlbits:4;                  /*!< bit:  21..24  specifies how many hardware bits are actually implemented in the clicintctl registers */
+        uint32_t _reserved0:7;                  /*!< bit:  25..31  Reserved (7 bits; a wider field would spill into a second 32-bit word) */
+    } b;                                        /*!< Structure used for bit  access */
+    uint32_t w;                                 /*!< Type      used for word access */
+} CLICINFO_Type;
+
+/**
+ * \brief Per-interrupt ECLIC control registers: four byte-wide registers, one group per entry of CLIC_Type::CTRL.
+ */
+typedef struct {
+    __IOM uint8_t  INTIP;                       /*!< Offset: 0x000 (R/W)  Interrupt pending register (bit 0 = IP, see CLIC_INTIP_IP_Msk) */
+    __IOM uint8_t  INTIE;                       /*!< Offset: 0x001 (R/W)  Interrupt enable register (bit 0 = IE, see CLIC_INTIE_IE_Msk) */
+    __IOM uint8_t  INTATTR;                     /*!< Offset: 0x002 (R/W)  Interrupt attributes register (bit 0 = SHV, bits 2:1 = TRIG) */
+    __IOM uint8_t  INTCTRL;                     /*!< Offset: 0x003 (R/W)  Interrupt configure register */
+} CLIC_CTRL_Type;
+
+typedef struct {
+    __IOM uint8_t   CFG;                        /*!< Offset: 0x000 (R/W)  CLIC configuration register */
+    uint8_t RESERVED0[3];                       /*!< Offset: 0x001..0x003 Reserved, pads up to INFO */
+    __IM uint32_t  INFO;                        /*!< Offset: 0x004 (R/ )  CLIC information register */
+    uint8_t RESERVED1[3];                       /*!< Offset: 0x008..0x00A Reserved, pads up to MTH */
+    __IOM uint8_t  MTH;                         /*!< Offset: 0x00B (R/W)  CLIC machine mode threshold register */
+    uint32_t RESERVED2[0x3FD];                  /*!< Offset: 0x00C..0xFFF Reserved, pads up to CTRL at 0x1000 */
+    CLIC_CTRL_Type CTRL[4096];                  /*!< Offset: 0x1000 (R/W) CLIC register structure for INTIP, INTIE, INTATTR, INTCTRL; indexed by interrupt number */
+} CLIC_Type;
+
+#define CLIC_CLICCFG_NLBIT_Pos                 1U                                       /*!< CLIC CLICCFG: NLBIT Position */
+#define CLIC_CLICCFG_NLBIT_Msk                 (0xFUL << CLIC_CLICCFG_NLBIT_Pos)        /*!< CLIC CLICCFG: NLBIT Mask */
+
+#define CLIC_CLICINFO_CTLBIT_Pos                21U                                     /*!< CLIC CLICINFO: CLICINTCTLBITS Position, read by __ECLIC_GetInfoCtlbits() */
+#define CLIC_CLICINFO_CTLBIT_Msk                (0xFUL << CLIC_CLICINFO_CTLBIT_Pos)     /*!< CLIC CLICINFO: CLICINTCTLBITS Mask, read by __ECLIC_GetInfoCtlbits() */
+
+#define CLIC_CLICINFO_VER_Pos                  13U                                      /*!< CLIC CLICINFO: VERSION Position */
+#define CLIC_CLICINFO_VER_Msk                  (0xFFUL << CLIC_CLICINFO_VER_Pos)        /*!< CLIC CLICINFO: VERSION Mask (8 bits, 20:13) */
+
+#define CLIC_CLICINFO_NUM_Pos                  0U                                       /*!< CLIC CLICINFO: NUM Position */
+#define CLIC_CLICINFO_NUM_Msk                  (0x1FFFUL << CLIC_CLICINFO_NUM_Pos)      /*!< CLIC CLICINFO: NUM Mask (13 bits, 12:0) */
+
+#define CLIC_INTIP_IP_Pos                      0U                                       /*!< CLIC INTIP: IP Position */
+#define CLIC_INTIP_IP_Msk                      (0x1UL << CLIC_INTIP_IP_Pos)             /*!< CLIC INTIP: IP Mask */
+
+#define CLIC_INTIE_IE_Pos                      0U                                       /*!< CLIC INTIE: IE Position */
+#define CLIC_INTIE_IE_Msk                      (0x1UL << CLIC_INTIE_IE_Pos)             /*!< CLIC INTIE: IE Mask */
+
+#define CLIC_INTATTR_TRIG_Pos                  1U                                       /*!< CLIC INTATTR: TRIG Position */
+#define CLIC_INTATTR_TRIG_Msk                  (0x3UL << CLIC_INTATTR_TRIG_Pos)         /*!< CLIC INTATTR: TRIG Mask */
+
+#define CLIC_INTATTR_SHV_Pos                   0U                                       /*!< CLIC INTATTR: SHV Position */
+#define CLIC_INTATTR_SHV_Msk                   (0x1UL << CLIC_INTATTR_SHV_Pos)          /*!< CLIC INTATTR: SHV Mask */
+
+#define ECLIC_MAX_NLBITS                       8U                                       /*!< Max nlbit of the CLICINTCTLBITS */
+#define ECLIC_MODE_MTVEC_Msk                   3U                                       /*!< ECLIC Mode mask for MTVT CSR Register */
+
+#define ECLIC_NON_VECTOR_INTERRUPT             0x0                                      /*!< Non-Vector Interrupt Mode of ECLIC */
+#define ECLIC_VECTOR_INTERRUPT                 0x1                                      /*!< Vector Interrupt Mode of ECLIC */
+
+/**\brief ECLIC Trigger Enum for different Trigger Type (value of the 2-bit trig field of INTATTR) */
+typedef enum ECLIC_TRIGGER {
+    ECLIC_LEVEL_TRIGGER = 0x0,          /*!< Level Triggered, trig[0] = 0 */
+    ECLIC_POSTIVE_EDGE_TRIGGER = 0x1,   /*!< Positive/Rising Edge Triggered, trig[0] = 1, trig[1] = 0 */
+    ECLIC_NEGTIVE_EDGE_TRIGGER = 0x3,   /*!< Negative/Falling Edge Triggered, trig[0] = 1, trig[1] = 1 */
+    ECLIC_MAX_TRIGGER = 0x3             /*!< MAX Supported Trigger Mode */
+} ECLIC_TRIGGER_Type;
+
+#ifndef __ECLIC_BASEADDR
+/* Base address of ECLIC(__ECLIC_BASEADDR) should be defined in <Device.h> */
+#error "__ECLIC_BASEADDR is not defined, please check!"
+#endif
+
+#ifndef __ECLIC_INTCTLBITS
+/* Define __ECLIC_INTCTLBITS to get via ECLIC->INFO if not defined */
+#define __ECLIC_INTCTLBITS                  (__ECLIC_GetInfoCtlbits())
+#endif
+
+/* ECLIC Memory mapping of Device */
+#define ECLIC_BASE                          __ECLIC_BASEADDR                            /*!< ECLIC Base Address */
+#define ECLIC                               ((CLIC_Type *) ECLIC_BASE)                  /*!< CLIC configuration struct */
+
+/** @} */ /* end of group NMSIS_Core_ECLIC_Registers */
+
+/* ##########################   ECLIC functions  #################################### */
+/**
+ * \defgroup   NMSIS_Core_IntExc        Interrupts and Exceptions
+ * \brief Functions that manage interrupts and exceptions via the ECLIC.
+ *
+ * @{
+ */
+
+/**
+ * \brief  Definition of IRQn numbers
+ * \details
+ * The core interrupt enumeration names for IRQn values are defined in the file <b><Device>.h</b>.
+ * - Interrupt ID(IRQn) from 0 to 18 are reserved for core internal interrupts.
+ * - Interrupt ID(IRQn) start from 19 represent device-specific external interrupts.
+ * - The first device-specific interrupt has the IRQn value 19.
+ *
+ * The table below describes the core interrupt names and their availability in various Nuclei Cores.
+ */
+/* The following enum IRQn definition in this file
+ * is only used for doxygen documentation generation,
+ * The <Device>.h is the real file to define it by vendor
+ */
+#if defined(__ONLY_FOR_DOXYGEN_DOCUMENT_GENERATION__)
+typedef enum IRQn {
+    /* ========= Nuclei N/NX Core Specific Interrupt Numbers  =========== */
+    /* Core Internal Interrupt IRQn definitions */
+    Reserved0_IRQn            =   0,              /*!<  Internal reserved */
+    Reserved1_IRQn            =   1,              /*!<  Internal reserved */
+    Reserved2_IRQn            =   2,              /*!<  Internal reserved */
+    SysTimerSW_IRQn           =   3,              /*!<  System Timer SW interrupt */
+    Reserved3_IRQn            =   4,              /*!<  Internal reserved */
+    Reserved4_IRQn            =   5,              /*!<  Internal reserved */
+    Reserved5_IRQn            =   6,              /*!<  Internal reserved */
+    SysTimer_IRQn             =   7,              /*!<  System Timer Interrupt */
+    Reserved6_IRQn            =   8,              /*!<  Internal reserved */
+    Reserved7_IRQn            =   9,              /*!<  Internal reserved */
+    Reserved8_IRQn            =  10,              /*!<  Internal reserved */
+    Reserved9_IRQn            =  11,              /*!<  Internal reserved */
+    Reserved10_IRQn           =  12,              /*!<  Internal reserved */
+    Reserved11_IRQn           =  13,              /*!<  Internal reserved */
+    Reserved12_IRQn           =  14,              /*!<  Internal reserved */
+    Reserved13_IRQn           =  15,              /*!<  Internal reserved */
+    Reserved14_IRQn           =  16,              /*!<  Internal reserved */
+    Reserved15_IRQn           =  17,              /*!<  Internal reserved */
+    Reserved16_IRQn           =  18,              /*!<  Internal reserved */
+
+    /* ========= Device Specific Interrupt Numbers  =================== */
+    /* ToDo: add here your device specific external interrupt numbers.
+     * 19~max(NUM_INTERRUPT, 1023) is reserved number for user.
+     * Maxmum interrupt supported could get from clicinfo.NUM_INTERRUPT.
+     * According the interrupt handlers defined in startup_Device.S
+     * eg.: Interrupt for Timer#1       eclic_tim1_handler   ->   TIM1_IRQn */
+    FirstDeviceSpecificInterrupt_IRQn    = 19,    /*!< First Device Specific Interrupt */
+    SOC_INT_MAX,                                  /*!< Number of total interrupts */
+} IRQn_Type;
+#endif /* __ONLY_FOR_DOXYGEN_DOCUMENT_GENERATION__ */
+
+#ifdef NMSIS_ECLIC_VIRTUAL
+    #ifndef NMSIS_ECLIC_VIRTUAL_HEADER_FILE
+        #define NMSIS_ECLIC_VIRTUAL_HEADER_FILE "nmsis_eclic_virtual.h"
+    #endif
+    #include NMSIS_ECLIC_VIRTUAL_HEADER_FILE
+#else
+    #define ECLIC_SetCfgNlbits            __ECLIC_SetCfgNlbits
+    #define ECLIC_GetCfgNlbits            __ECLIC_GetCfgNlbits
+    #define ECLIC_GetInfoVer              __ECLIC_GetInfoVer
+    #define ECLIC_GetInfoCtlbits          __ECLIC_GetInfoCtlbits
+    #define ECLIC_GetInfoNum              __ECLIC_GetInfoNum
+    #define ECLIC_SetMth                  __ECLIC_SetMth
+    #define ECLIC_GetMth                  __ECLIC_GetMth
+    #define ECLIC_EnableIRQ               __ECLIC_EnableIRQ
+    #define ECLIC_GetEnableIRQ            __ECLIC_GetEnableIRQ
+    #define ECLIC_DisableIRQ              __ECLIC_DisableIRQ
+    #define ECLIC_SetPendingIRQ           __ECLIC_SetPendingIRQ
+    #define ECLIC_GetPendingIRQ           __ECLIC_GetPendingIRQ
+    #define ECLIC_ClearPendingIRQ         __ECLIC_ClearPendingIRQ
+    #define ECLIC_SetTrigIRQ              __ECLIC_SetTrigIRQ
+    #define ECLIC_GetTrigIRQ              __ECLIC_GetTrigIRQ
+    #define ECLIC_SetShvIRQ               __ECLIC_SetShvIRQ
+    #define ECLIC_GetShvIRQ               __ECLIC_GetShvIRQ
+    #define ECLIC_SetCtrlIRQ              __ECLIC_SetCtrlIRQ
+    #define ECLIC_GetCtrlIRQ              __ECLIC_GetCtrlIRQ
+    #define ECLIC_SetLevelIRQ             __ECLIC_SetLevelIRQ
+    #define ECLIC_GetLevelIRQ             __ECLIC_GetLevelIRQ
+    #define ECLIC_SetPriorityIRQ          __ECLIC_SetPriorityIRQ
+    #define ECLIC_GetPriorityIRQ          __ECLIC_GetPriorityIRQ
+
+#endif /* NMSIS_ECLIC_VIRTUAL */
+
+#ifdef NMSIS_VECTAB_VIRTUAL
+    #ifndef NMSIS_VECTAB_VIRTUAL_HEADER_FILE
+        #define NMSIS_VECTAB_VIRTUAL_HEADER_FILE "nmsis_vectab_virtual.h"
+    #endif
+    #include NMSIS_VECTAB_VIRTUAL_HEADER_FILE
+#else
+    #define ECLIC_SetVector              __ECLIC_SetVector
+    #define ECLIC_GetVector              __ECLIC_GetVector
+#endif  /* (NMSIS_VECTAB_VIRTUAL) */
+
+/**
+ * \brief  Set nlbits value
+ * \details
+ * This function set the nlbits value of CLICCFG register.
+ * \param [in]    nlbits    nlbits value
+ * \remarks
+ * - nlbits is used to set the width of level in the CLICINTCTL[i].
+ * \sa
+ * - \ref ECLIC_GetCfgNlbits
+ */
+__STATIC_FORCEINLINE void __ECLIC_SetCfgNlbits(uint32_t nlbits)
+{
+    ECLIC->CFG &= ~CLIC_CLICCFG_NLBIT_Msk;  /* first read-modify-write: clear the nlbits field */
+    ECLIC->CFG |= (uint8_t)((nlbits <<CLIC_CLICCFG_NLBIT_Pos) & CLIC_CLICCFG_NLBIT_Msk);  /* second read-modify-write: OR in the new value, masked to the field */
+}
+
+/**
+ * \brief  Get nlbits value
+ * \details
+ * This function get the nlbits value of CLICCFG register.
+ * \return   nlbits value of CLICCFG register
+ * \remarks
+ * - nlbits is used to set the width of level in the CLICINTCTL[i].
+ * \sa
+ * - \ref ECLIC_SetCfgNlbits
+ */
+__STATIC_FORCEINLINE uint32_t __ECLIC_GetCfgNlbits(void)
+{
+    return ((uint32_t)((ECLIC->CFG & CLIC_CLICCFG_NLBIT_Msk) >> CLIC_CLICCFG_NLBIT_Pos));  /* isolate the 4-bit nlbits field and right-align it */
+}
+
+/**
+ * \brief  Get the ECLIC version number
+ * \details
+ * This function gets the hardware version information from CLICINFO register.
+ * \return   hardware version number in CLICINFO register.
+ * \remarks
+ * - The value is ECLIC->INFO masked with CLIC_CLICINFO_VER_Msk and shifted right by CLIC_CLICINFO_VER_Pos.
+ * - In CLICINFO, bits 20:17 hold the architecture version and bits 16:13 the implementation version.
+ * \sa
+ * - \ref ECLIC_GetInfoNum
+*/
+__STATIC_FORCEINLINE uint32_t __ECLIC_GetInfoVer(void)
+{
+    return ((uint32_t)((ECLIC->INFO & CLIC_CLICINFO_VER_Msk) >> CLIC_CLICINFO_VER_Pos));
+}
+
+/**
+ * \brief  Get CLICINTCTLBITS
+ * \details
+ * This function gets CLICINTCTLBITS from CLICINFO register.
+ * \return  CLICINTCTLBITS from CLICINFO register.
+ * \remarks
+ * - In the CLICINTCTL[i] registers, with 2 <= CLICINTCTLBITS <= 8.
+ * - The implemented bits are kept left-justified in the most-significant bits of each 8-bit
+ *   CLICINTCTL[I] register, with the lower unimplemented bits treated as hardwired to 1.
+ * \sa
+ * - \ref ECLIC_GetInfoNum
+ */
+__STATIC_FORCEINLINE uint32_t __ECLIC_GetInfoCtlbits(void)
+{
+    return ((uint32_t)((ECLIC->INFO & CLIC_CLICINFO_CTLBIT_Msk) >> CLIC_CLICINFO_CTLBIT_Pos));  /* 4-bit field at bits 24:21 of CLICINFO */
+}
+
+/**
+ * \brief  Get number of maximum interrupt inputs supported
+ * \details
+ * This function gets number of maximum interrupt inputs supported from CLICINFO register.
+ * \return  number of maximum interrupt inputs supported from CLICINFO register.
+ * \remarks
+ * - This function gets number of maximum interrupt inputs supported from CLICINFO register.
+ * - The num_interrupt field specifies the actual number of maximum interrupt inputs supported in this implementation.
+ * \sa
+ * - \ref ECLIC_GetInfoCtlbits
+ */
+__STATIC_FORCEINLINE uint32_t __ECLIC_GetInfoNum(void)
+{
+    return ((uint32_t)((ECLIC->INFO & CLIC_CLICINFO_NUM_Msk) >> CLIC_CLICINFO_NUM_Pos));  /* field starts at bit 0; width is whatever CLIC_CLICINFO_NUM_Msk selects */
+}
+
+/**
+ * \brief  Set Machine Mode Interrupt Level Threshold
+ * \details
+ * This function sets machine mode interrupt level threshold.
+ * \param [in]  mth       Interrupt Level Threshold.
+ * \sa
+ * - \ref ECLIC_GetMth
+ */
+__STATIC_FORCEINLINE void __ECLIC_SetMth(uint8_t mth)
+{
+    ECLIC->MTH = mth;  /* plain byte store, no read-modify-write */
+}
+
+/**
+ * \brief  Get Machine Mode Interrupt Level Threshold
+ * \details
+ * This function gets machine mode interrupt level threshold.
+ * \return       Interrupt Level Threshold.
+ * \sa
+ * - \ref ECLIC_SetMth
+ */
+__STATIC_FORCEINLINE uint8_t __ECLIC_GetMth(void)
+{
+    return (ECLIC->MTH);  /* single byte read of the threshold register */
+}
+
+
+/**
+ * \brief  Enable a specific interrupt
+ * \details
+ * This function enables the specific interrupt \em IRQn.
+ * \param [in]  IRQn  Interrupt number
+ * \remarks
+ * - IRQn must not be negative.
+ * \sa
+ * - \ref ECLIC_DisableIRQ
+ */
+__STATIC_FORCEINLINE void __ECLIC_EnableIRQ(IRQn_Type IRQn)
+{
+    ECLIC->CTRL[IRQn].INTIE |= CLIC_INTIE_IE_Msk;  /* read-modify-write that sets only the IE bit (bit 0) */
+}
+
+/**
+ * \brief  Get a specific interrupt enable status
+ * \details
+ * This function returns the interrupt enable status for the specific interrupt \em IRQn.
+ * \param [in]  IRQn  Interrupt number
+ * \returns
+ * - 0  Interrupt is not enabled
+ * - 1  Interrupt is enabled
+ * \remarks
+ * - IRQn must not be negative.
+ * \sa
+ * - \ref ECLIC_EnableIRQ
+ * - \ref ECLIC_DisableIRQ
+ */
+__STATIC_FORCEINLINE uint32_t __ECLIC_GetEnableIRQ(IRQn_Type IRQn)
+{
+    return((uint32_t) (ECLIC->CTRL[IRQn].INTIE) & CLIC_INTIE_IE_Msk);
+}
+
+/**
+ * \brief  Disable a specific interrupt
+ * \details
+ * This function disables the specific interrupt \em IRQn.
+ * \param [in]  IRQn  Number of the external interrupt to disable
+ * \remarks
+ * - IRQn must not be negative.
+ * \sa
+ * - \ref ECLIC_EnableIRQ
+ */
+__STATIC_FORCEINLINE void __ECLIC_DisableIRQ(IRQn_Type IRQn)
+{
+    ECLIC->CTRL[IRQn].INTIE &= ~CLIC_INTIE_IE_Msk;  /* read-modify-write that clears only the IE bit (bit 0) */
+}
+
+/**
+ * \brief  Get the pending specific interrupt
+ * \details
+ * This function returns the pending status of the specific interrupt \em IRQn.
+ * \param [in]      IRQn  Interrupt number
+ * \returns
+ * - 0  Interrupt is not pending
+ * - 1  Interrupt is pending
+ * \remarks
+ * - IRQn must not be negative.
+ * \sa
+ * - \ref ECLIC_SetPendingIRQ
+ * - \ref ECLIC_ClearPendingIRQ
+ */
+__STATIC_FORCEINLINE int32_t __ECLIC_GetPendingIRQ(IRQn_Type IRQn)
+{
+    return((uint32_t)(ECLIC->CTRL[IRQn].INTIP) & CLIC_INTIP_IP_Msk);  /* IP is bit 0, so the result is 0 or 1 */
+}
+
+/**
+ * \brief  Set a specific interrupt to pending
+ * \details
+ * This function sets the pending bit for the specific interrupt \em IRQn.
+ * \param [in]      IRQn  Interrupt number
+ * \remarks
+ * - IRQn must not be negative.
+ * \sa
+ * - \ref ECLIC_GetPendingIRQ
+ * - \ref ECLIC_ClearPendingIRQ
+ */
+__STATIC_FORCEINLINE void __ECLIC_SetPendingIRQ(IRQn_Type IRQn)
+{
+    ECLIC->CTRL[IRQn].INTIP |= CLIC_INTIP_IP_Msk;  /* read-modify-write that sets only the IP bit (bit 0) */
+}
+
+/**
+ * \brief  Clear a specific interrupt from pending
+ * \details
+ * This function removes the pending state of the specific interrupt \em IRQn.
+ * \em IRQn cannot be a negative number.
+ * \param [in]      IRQn  Interrupt number
+ * \remarks
+ * - IRQn must not be negative.
+ * \sa
+ * - \ref ECLIC_SetPendingIRQ
+ * - \ref ECLIC_GetPendingIRQ
+ */
+__STATIC_FORCEINLINE void __ECLIC_ClearPendingIRQ(IRQn_Type IRQn)
+{
+    ECLIC->CTRL[IRQn].INTIP &= ~ CLIC_INTIP_IP_Msk;  /* read-modify-write that clears only the IP bit (bit 0) */
+}
+
+/**
+ * \brief  Set trigger mode and polarity for a specific interrupt
+ * \details
+ * This function sets the trigger mode and polarity of the specific interrupt \em IRQn.
+ * \param [in]      IRQn  Interrupt number
+ * \param [in]      trig
+ *                   - 00  level trigger, \ref ECLIC_LEVEL_TRIGGER
+ *                   - 01  positive edge trigger, \ref ECLIC_POSTIVE_EDGE_TRIGGER
+ *                   - 02  level trigger, \ref ECLIC_LEVEL_TRIGGER
+ *                   - 03  negative edge trigger, \ref ECLIC_NEGTIVE_EDGE_TRIGGER
+ * \remarks
+ * - IRQn must not be negative.
+ * - Only the low two bits of \em trig are used; higher bits are masked off so they cannot alter other INTATTR bits.
+ * \sa
+ * - \ref ECLIC_GetTrigIRQ
+ */
+__STATIC_FORCEINLINE void __ECLIC_SetTrigIRQ(IRQn_Type IRQn, uint32_t trig)
+{
+    ECLIC->CTRL[IRQn].INTATTR &= ~CLIC_INTATTR_TRIG_Msk;
+    ECLIC->CTRL[IRQn].INTATTR |= (uint8_t)((trig << CLIC_INTATTR_TRIG_Pos) & CLIC_INTATTR_TRIG_Msk);
+}
+
+/**
+ * \brief  Get trigger mode and polarity for a specific interrupt
+ * \details
+ * This function get trigger mode and polarity of the specific interrupt \em IRQn.
+ * \param [in]      IRQn  Interrupt number
+ * \return
+ *                 - 00  level trigger, \ref ECLIC_LEVEL_TRIGGER
+ *                 - 01  positive edge trigger, \ref ECLIC_POSTIVE_EDGE_TRIGGER
+ *                 - 02  level trigger, \ref ECLIC_LEVEL_TRIGGER
+ *                 - 03  negative edge trigger, \ref ECLIC_NEGTIVE_EDGE_TRIGGER
+ * \remarks
+ *     - IRQn must not be negative.
+ * \sa
+ *     - \ref ECLIC_SetTrigIRQ
+ */
+__STATIC_FORCEINLINE uint32_t __ECLIC_GetTrigIRQ(IRQn_Type IRQn)
+{
+    return ((uint32_t)ECLIC->CTRL[IRQn].INTATTR & CLIC_INTATTR_TRIG_Msk) >> CLIC_INTATTR_TRIG_Pos;  /* 2-bit trig field, right-aligned */
+}
+
+/**
+ * \brief  Set interrupt working mode for a specific interrupt
+ * \details
+ * This function set selective hardware vector or non-vector working mode of the specific interrupt \em IRQn.
+ * \param [in]      IRQn  Interrupt number
+ * \param [in]      shv
+ *                        - 0  non-vector mode, \ref ECLIC_NON_VECTOR_INTERRUPT
+ *                        - 1  vector mode, \ref ECLIC_VECTOR_INTERRUPT
+ * \remarks
+ * - IRQn must not be negative.
+ * \sa
+ * - \ref ECLIC_GetShvIRQ
+ */
+__STATIC_FORCEINLINE void __ECLIC_SetShvIRQ(IRQn_Type IRQn, uint32_t shv)
+{
+    ECLIC->CTRL[IRQn].INTATTR &= ~CLIC_INTATTR_SHV_Msk;  /* clear the SHV bit first */
+    ECLIC->CTRL[IRQn].INTATTR |= (uint8_t)((shv << CLIC_INTATTR_SHV_Pos) & CLIC_INTATTR_SHV_Msk);  /* masked so shv > 1 cannot spill into the trigger bits */
+}
+
+/**
+ * \brief  Get interrupt working mode for a specific interrupt
+ * \details
+ * This function get selective hardware vector or non-vector working mode of the specific interrupt \em IRQn.
+ * \param [in]      IRQn  Interrupt number
+ * \return       shv
+ *                        - 0  non-vector mode, \ref ECLIC_NON_VECTOR_INTERRUPT
+ *                        - 1  vector mode, \ref ECLIC_VECTOR_INTERRUPT
+ * \remarks
+ * - IRQn must not be negative.
+ * \sa
+ * - \ref ECLIC_SetShvIRQ
+ */
+__STATIC_FORCEINLINE uint32_t __ECLIC_GetShvIRQ(IRQn_Type IRQn)
+{
+    return ((uint32_t)ECLIC->CTRL[IRQn].INTATTR & CLIC_INTATTR_SHV_Msk) >> CLIC_INTATTR_SHV_Pos;  /* single SHV bit, 0 or 1 */
+}
+
+/**
+ * \brief  Modify ECLIC Interrupt Input Control Register for a specific interrupt
+ * \details
+ * This function modify ECLIC Interrupt Input Control(CLICINTCTL[i]) register of the specific interrupt \em IRQn.
+ * \param [in]      IRQn  Interrupt number
+ * \param [in]      intctrl  Set value for CLICINTCTL[i] register
+ * \remarks
+ * - IRQn must not be negative.
+ * \sa
+ * - \ref ECLIC_GetCtrlIRQ
+ */
+__STATIC_FORCEINLINE void __ECLIC_SetCtrlIRQ(IRQn_Type IRQn, uint8_t intctrl)
+{
+    ECLIC->CTRL[IRQn].INTCTRL = intctrl;  /* overwrites the whole byte; callers merge level/priority bits themselves */
+}
+
+/**
+ * \brief  Get ECLIC Interrupt Input Control Register value for a specific interrupt
+ * \details
+ * This function reads the ECLIC Interrupt Input Control (CLICINTCTL[i]) register of the specific interrupt \em IRQn.
+ * \param [in]      IRQn  Interrupt number
+ * \return       value of ECLIC Interrupt Input Control register
+ * \remarks
+ * - IRQn must not be negative.
+ * \sa
+ * - \ref ECLIC_SetCtrlIRQ
+ */
+__STATIC_FORCEINLINE uint8_t __ECLIC_GetCtrlIRQ(IRQn_Type IRQn)
+{
+    return (ECLIC->CTRL[IRQn].INTCTRL);
+}
+
+/**
+ * \brief  Set ECLIC Interrupt level of a specific interrupt
+ * \details
+ * This function sets the interrupt level of the specific interrupt \em IRQn.
+ * \param [in]      IRQn  Interrupt number
+ * \param [in]      lvl_abs   Interrupt level
+ * \remarks
+ * - IRQn must not be negative.
+ * - If lvl_abs to be set is larger than the max level allowed, it will be forced to the max level.
+ * - The level width is cliccfg.nlbits, clamped to CLICINTCTLBITS (the number of bits
+ *   implemented in CLICINTCTL); the maximum level is (1 << width) - 1.
+ * - If cliccfg.nlbits is 0 the function returns without writing CLICINTCTL.
+ * \sa
+ * - \ref ECLIC_GetLevelIRQ
+ */
+__STATIC_FORCEINLINE void __ECLIC_SetLevelIRQ(IRQn_Type IRQn, uint8_t lvl_abs)
+{
+    uint8_t nlbits = __ECLIC_GetCfgNlbits();
+    uint8_t intctlbits = (uint8_t)__ECLIC_INTCTLBITS;
+
+    if (nlbits == 0) {
+        return;                                     /* no level bits configured: nothing to set */
+    }
+
+    if (nlbits > intctlbits) {
+        nlbits = intctlbits;                        /* level width cannot exceed the implemented bits */
+    }
+    uint8_t maxlvl = ((1 << nlbits) - 1);
+    if (lvl_abs > maxlvl) {
+        lvl_abs = maxlvl;                           /* saturate instead of wrapping */
+    }
+    uint8_t lvl = lvl_abs << (ECLIC_MAX_NLBITS - nlbits);   /* left-justify the level in the top nlbits bits */
+    uint8_t cur_ctrl = __ECLIC_GetCtrlIRQ(IRQn);
+    cur_ctrl = cur_ctrl << nlbits;                  /* uint8_t truncation drops the old level bits... */
+    cur_ctrl = cur_ctrl >> nlbits;                  /* ...leaving only the bits below the level field */
+    __ECLIC_SetCtrlIRQ(IRQn, (cur_ctrl | lvl));
+}
+
+/**
+ * \brief  Get ECLIC Interrupt level of a specific interrupt
+ * \details
+ * This function gets the interrupt level of the specific interrupt \em IRQn.
+ * \param [in]      IRQn  Interrupt number
+ * \return         Interrupt level, or 0 when the configured nlbits is 0
+ * \remarks
+ * - IRQn must not be negative.
+ * \sa
+ * - \ref ECLIC_SetLevelIRQ
+ */
+__STATIC_FORCEINLINE uint8_t __ECLIC_GetLevelIRQ(IRQn_Type IRQn)
+{
+    uint8_t nlbits = __ECLIC_GetCfgNlbits();
+    uint8_t intctlbits = (uint8_t)__ECLIC_INTCTLBITS;
+
+    if (nlbits == 0) {
+        return 0;
+    }
+
+    if (nlbits > intctlbits) {
+        nlbits = intctlbits;
+    }
+    uint8_t intctrl = __ECLIC_GetCtrlIRQ(IRQn);
+    uint8_t lvl_abs = intctrl >> (ECLIC_MAX_NLBITS - nlbits); /* keep only the top nlbits bits */
+    return lvl_abs;
+}
+
+/**
+ * \brief  Set ECLIC Interrupt priority of a specific interrupt
+ * \details
+ * This function sets the interrupt priority of the specific interrupt \em IRQn.
+ * \param [in]      IRQn  Interrupt number
+ * \param [in]      pri   Interrupt priority
+ * \remarks
+ * - IRQn must not be negative.
+ * - If pri is larger than the max priority allowed, it is clamped to the max priority.
+ * - Priority width is CLICINTCTLBITS minus the configured nlbits if nlbits
+ *   is less than CLICINTCTLBITS. Otherwise priority width is 0 and nothing is written.
+ * \sa
+ * - \ref ECLIC_GetPriorityIRQ
+ */
+__STATIC_FORCEINLINE void __ECLIC_SetPriorityIRQ(IRQn_Type IRQn, uint8_t pri)
+{
+    uint8_t nlbits = __ECLIC_GetCfgNlbits();
+    uint8_t intctlbits = (uint8_t)__ECLIC_INTCTLBITS;
+    if (nlbits < intctlbits) {
+        uint8_t maxpri = ((1 << (intctlbits - nlbits)) - 1);
+        if (pri > maxpri) {
+            pri = maxpri;
+        }
+        pri = pri << (ECLIC_MAX_NLBITS - intctlbits);
+        uint8_t mask = ((uint8_t)(-1)) >> intctlbits; /* bits below the top intctlbits bits, all set */
+        pri = pri | mask;
+        uint8_t cur_ctrl = __ECLIC_GetCtrlIRQ(IRQn);
+        cur_ctrl = cur_ctrl >> (ECLIC_MAX_NLBITS - nlbits); /* shift out and back: keeps only the level bits */
+        cur_ctrl = cur_ctrl << (ECLIC_MAX_NLBITS - nlbits);
+        __ECLIC_SetCtrlIRQ(IRQn, (cur_ctrl | pri));
+    }
+}
+
+/**
+ * \brief  Get ECLIC Interrupt priority of a specific interrupt
+ * \details
+ * This function gets the interrupt priority of the specific interrupt \em IRQn.
+ * \param [in]      IRQn  Interrupt number
+ * \return   Interrupt priority, or 0 when nlbits is not less than __ECLIC_INTCTLBITS
+ * \remarks
+ * - IRQn must not be negative.
+ * \sa
+ * - \ref ECLIC_SetPriorityIRQ
+ */
+__STATIC_FORCEINLINE uint8_t __ECLIC_GetPriorityIRQ(IRQn_Type IRQn)
+{
+    uint8_t nlbits = __ECLIC_GetCfgNlbits();
+    uint8_t intctlbits = (uint8_t)__ECLIC_INTCTLBITS;
+    if (nlbits < intctlbits) {
+        uint8_t cur_ctrl = __ECLIC_GetCtrlIRQ(IRQn);
+        uint8_t pri = cur_ctrl << nlbits; /* shift out and back in uint8_t: drops the level bits */
+        pri = pri >> nlbits;
+        pri = pri >> (ECLIC_MAX_NLBITS - intctlbits); /* drop the bits below the top intctlbits bits */
+        return pri;
+    } else {
+        return 0;
+    }
+}
+
+/**
+ * \brief  Set Interrupt Vector of a specific interrupt
+ * \details
+ * This function sets the interrupt handler address of the specific interrupt \em IRQn.
+ * \param [in]      IRQn  Interrupt number
+ * \param [in]      vector   Interrupt handler address
+ * \remarks
+ * - IRQn must not be negative.
+ * - The table base is read from \ref CSR_MTVT; entry IRQn is at base + IRQn * 4 on RV32, base + IRQn * 8 otherwise.
+ * - If your vector table is placed in readonly section, the vector for IRQn will not be modified.
+ *   For this case, you need to use the correct irq handler name defined in your vector table as
+ *   your irq handler function name.
+ * - This function will only work correctly when the vector table is placed in a read-write enabled section.
+ * \sa
+ * - \ref ECLIC_GetVector
+ */
+__STATIC_FORCEINLINE void __ECLIC_SetVector(IRQn_Type IRQn, rv_csr_t vector)
+{
+#if __RISCV_XLEN == 32
+    volatile uint32_t vec_base;
+    vec_base = ((uint32_t)__RV_CSR_READ(CSR_MTVT));
+    (* (unsigned long *) (vec_base + ((int32_t)IRQn) * 4)) = vector;
+#elif __RISCV_XLEN == 64
+    volatile uint64_t vec_base;
+    vec_base = ((uint64_t)__RV_CSR_READ(CSR_MTVT));
+    (* (unsigned long *) (vec_base + ((int32_t)IRQn) * 8)) = vector;
+#else // TODO Need cover for XLEN=128 case in future
+    volatile uint64_t vec_base;
+    vec_base = ((uint64_t)__RV_CSR_READ(CSR_MTVT));
+    (* (unsigned long *) (vec_base + ((int32_t)IRQn) * 8)) = vector;
+#endif
+}
+
+/**
+ * \brief  Get Interrupt Vector of a specific interrupt
+ * \details
+ * This function gets the interrupt handler address of the specific interrupt \em IRQn.
+ * \param [in]      IRQn  Interrupt number
+ * \return        Interrupt handler address
+ * \remarks
+ * - IRQn must not be negative.
+ * - The table base is read from \ref CSR_MTVT; entry IRQn is at base + IRQn * 4 on RV32, base + IRQn * 8 otherwise.
+ * \sa
+ *     - \ref ECLIC_SetVector
+ */
+__STATIC_FORCEINLINE rv_csr_t __ECLIC_GetVector(IRQn_Type IRQn)
+{
+#if __RISCV_XLEN == 32
+    return (*(uint32_t *)(__RV_CSR_READ(CSR_MTVT)+IRQn*4));
+#elif __RISCV_XLEN == 64
+    return (*(uint64_t *)(__RV_CSR_READ(CSR_MTVT)+IRQn*8));
+#else // TODO Need cover for XLEN=128 case in future
+    return (*(uint64_t *)(__RV_CSR_READ(CSR_MTVT)+IRQn*8));
+#endif
+}
+
+/**
+ * \brief  Set Exception entry address
+ * \details
+ * This function sets the exception handler address in 'CSR_MTVEC'.
+ * \param [in]      addr  Exception handler address
+ * \remarks
+ * - The low 6 bits of addr are cleared (64-byte alignment) and ECLIC_MODE_MTVEC_Msk is OR-ed in before writing 'CSR_MTVEC'.
+ * \sa
+ * - \ref __get_exc_entry
+ */
+__STATIC_FORCEINLINE void __set_exc_entry(rv_csr_t addr)
+{
+    addr &= (rv_csr_t)(~0x3F);
+    addr |= ECLIC_MODE_MTVEC_Msk;
+    __RV_CSR_WRITE(CSR_MTVEC, addr);
+}
+
+/**
+ * \brief  Get Exception entry address
+ * \details
+ * This function gets the exception handler address from 'CSR_MTVEC'.
+ * \return       Exception handler address
+ * \remarks
+ * - The ECLIC_MODE_MTVEC_Msk bits are stripped; the mask is widened to rv_csr_t before inversion so a 32-bit mask cannot clear upper address bits on RV64.
+ * \sa
+ * - \ref __set_exc_entry
+ */
+__STATIC_FORCEINLINE rv_csr_t __get_exc_entry(void)
+{
+    rv_csr_t addr = __RV_CSR_READ(CSR_MTVEC);
+    return (addr & ~(rv_csr_t)(ECLIC_MODE_MTVEC_Msk));
+}
+
+/**
+ * \brief  Set Non-vector interrupt entry address
+ * \details
+ * This function sets the Non-vector interrupt entry address.
+ * \param [in]      addr  Non-vector interrupt entry address
+ * \remarks
+ * - If 'CSR_MTVT2' bit0 is 1, addr is written to 'CSR_MTVT2' with bit0 kept set.
+ * - If 'CSR_MTVT2' bit0 is 0, addr (low 6 bits cleared, ECLIC_MODE_MTVEC_Msk set) is written to 'CSR_MTVEC'.
+ * \sa
+ * - \ref __get_nonvec_entry
+ */
+__STATIC_FORCEINLINE void __set_nonvec_entry(rv_csr_t addr)
+{
+    if (__RV_CSR_READ(CSR_MTVT2) & 0x1){
+        __RV_CSR_WRITE(CSR_MTVT2, addr | 0x01);
+    } else {
+        addr &= (rv_csr_t)(~0x3F);
+        addr |= ECLIC_MODE_MTVEC_Msk;
+        __RV_CSR_WRITE(CSR_MTVEC, addr);
+    }
+}
+
+/**
+ * \brief  Get Non-vector interrupt entry address
+ * \details
+ * This function gets the Non-vector interrupt entry address.
+ * \return      Non-vector interrupt handler address
+ * \remarks
+ * - If 'CSR_MTVT2' bit0 is 1, the address is 'CSR_MTVT2' with bit0 cleared.
+ * - If 'CSR_MTVT2' bit0 is 0, it is 'CSR_MTVEC' with the ECLIC_MODE_MTVEC_Msk bits cleared (mask widened to rv_csr_t first).
+ * \sa
+ * - \ref __set_nonvec_entry
+ */
+__STATIC_FORCEINLINE rv_csr_t __get_nonvec_entry(void)
+{
+    if (__RV_CSR_READ(CSR_MTVT2) & 0x1) {
+        return __RV_CSR_READ(CSR_MTVT2) & (~(rv_csr_t)(0x1));
+    } else {
+        rv_csr_t addr = __RV_CSR_READ(CSR_MTVEC);
+        return (addr & ~(rv_csr_t)(ECLIC_MODE_MTVEC_Msk));
+    }
+}
+
+/**
+ * \brief  Get NMI interrupt entry from 'CSR_MNVEC'
+ * \details
+ * This function gets the NMI interrupt handler address from 'CSR_MNVEC'.
+ * \return      NMI interrupt handler address
+ * \remarks
+ * - This function returns the value read from 'CSR_MNVEC'. If CSR_MMISC_CTL[9] = 1, 'CSR_MNVEC'
+ *   will be equal to mtvec. If CSR_MMISC_CTL[9] = 0, 'CSR_MNVEC' will be equal to the reset vector.
+ * - NMI entry is defined via \ref CSR_MMISC_CTL, writing to \ref CSR_MNVEC will be ignored.
+ */
+__STATIC_FORCEINLINE rv_csr_t __get_nmi_entry(void)
+{
+    return __RV_CSR_READ(CSR_MNVEC);
+}
+
+/**
+ * \brief   Save necessary CSRs into variables for vector interrupt nesting
+ * \details
+ * This macro declares the variables used for saving the
+ * CSRs (MCAUSE, MEPC, MSUBM), reads the content of these CSRs into
+ * these variables, and then enables interrupts. It needs to be used in
+ * a vector interrupt if nesting is required.
+ * \remarks
+ * - Interrupt will be enabled after this macro is called
+ * - It needs to be used together with \ref RESTORE_IRQ_CSR_CONTEXT
+ * - Don't use variable names __mcause, __mepc, __msubm in your ISR code
+ * - If you want to enable interrupt nesting feature for vector interrupt,
+ * you can do it like this:
+ * \code
+ * // __INTERRUPT attribute will generate function entry and exit sequences suitable
+ * // for use in an interrupt handler when this attribute is present
+ * __INTERRUPT void eclic_mtip_handler(void)
+ * {
+ *     // Must call this to save CSRs
+ *     SAVE_IRQ_CSR_CONTEXT();
+ *     // !!!Interrupt is enabled here!!!
+ *     // !!!Higher priority interrupt could nest it!!!
+ *
+ *     // put your own interrupt handling code here
+ *
+ *     // Must call this to restore CSRs
+ *     RESTORE_IRQ_CSR_CONTEXT();
+ * }
+ * \endcode
+ */
+#define SAVE_IRQ_CSR_CONTEXT()                                              \
+        rv_csr_t __mcause = __RV_CSR_READ(CSR_MCAUSE);                      \
+        rv_csr_t __mepc = __RV_CSR_READ(CSR_MEPC);                          \
+        rv_csr_t __msubm = __RV_CSR_READ(CSR_MSUBM);                        \
+        __enable_irq();
+
+/**
+ * \brief   Restore necessary CSRs from variables for vector interrupt nesting
+ * \details
+ * This macro disables interrupts and then restores the CSRs (MSUBM, MEPC, MCAUSE)
+ * from the variables declared by the \ref SAVE_IRQ_CSR_CONTEXT macro.
+ * \remarks
+ * - Interrupt will be disabled after this macro is called
+ * - It needs to be used together with \ref SAVE_IRQ_CSR_CONTEXT
+ */
+#define RESTORE_IRQ_CSR_CONTEXT()                                           \
+        __disable_irq();                                                    \
+        __RV_CSR_WRITE(CSR_MSUBM, __msubm);                                 \
+        __RV_CSR_WRITE(CSR_MEPC, __mepc);                                   \
+        __RV_CSR_WRITE(CSR_MCAUSE, __mcause);
+
+/** @} */ /* End of Doxygen Group NMSIS_Core_IntExc */
+
+#endif /* defined(__ECLIC_PRESENT) && (__ECLIC_PRESENT == 1) */
+
+#ifdef __cplusplus
+}
+#endif
+#endif /** __CORE_FEATURE_ECLIC__ */
diff --git a/external/arch/riscv/nuclei/NMSIS/Core/Include/core_feature_fpu.h b/external/arch/riscv/nuclei/NMSIS/Core/Include/core_feature_fpu.h
new file mode 100644
index 000000000..c9e13b79d
--- /dev/null
+++ b/external/arch/riscv/nuclei/NMSIS/Core/Include/core_feature_fpu.h
@@ -0,0 +1,304 @@
+/*
+ * Copyright (c) 2019 Nuclei Limited. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __CORE_FEATURE_FPU_H__
+#define __CORE_FEATURE_FPU_H__
+/*!
+ * @file     core_feature_fpu.h
+ * @brief    FPU feature API header file for Nuclei N/NX Core
+ */
+/*
+ * FPU Feature Configuration Macro:
+ * 1. __FPU_PRESENT:  Define whether Floating Point Unit(FPU) is present or not
+ *   * 0: Not present
+ *   * 1: Single precision FPU present, __RISCV_FLEN == 32
+ *   * 2: Double precision FPU present, __RISCV_FLEN == 64
+ */
+#ifdef __cplusplus
+ extern "C" {
+#endif
+
+/* ===== FPU Operations ===== */
+/**
+ * \defgroup NMSIS_Core_FPU_Functions   FPU Functions
+ * \ingroup  NMSIS_Core
+ * \brief    Functions that related to the RISC-V FPU (F and D extension).
+ * \details
+ *
+ * Nuclei provided floating point unit by RISC-V F and D extension.
+ * * `F extension` adds single-precision floating-point computational
+ * instructions compliant with the IEEE 754-2008 arithmetic standard, __RISCV_FLEN = 32.
+ *   The F extension adds 32 floating-point registers, f0-f31, each 32 bits wide,
+ *   and a floating-point control and status register fcsr, which contains the
+ *   operating mode and exception status of the floating-point unit.
+ * * `D extension` adds double-precision floating-point computational instructions
+ * compliant with the IEEE 754-2008 arithmetic standard.
+ *   The D extension widens the 32 floating-point registers, f0-f31, to 64 bits, __RISCV_FLEN = 64
+ *   @{
+ */
+#if defined(__FPU_PRESENT) && (__FPU_PRESENT > 0)
+
+#if __FPU_PRESENT == 1
+  /** \brief Refer to the width of the floating point register in bits(either 32 or 64) */
+  #define __RISCV_FLEN          32
+#elif __FPU_PRESENT == 2
+  #define __RISCV_FLEN          64
+#else
+  #define __RISCV_FLEN          __riscv_flen
+#endif /* __FPU_PRESENT == 1 */
+
+/** \brief Get FCSR CSR Register */
+#define __get_FCSR()            __RV_CSR_READ(CSR_FCSR)
+/** \brief Set FCSR CSR Register with val */
+#define __set_FCSR(val)         __RV_CSR_WRITE(CSR_FCSR, (val))
+/** \brief Get FRM CSR Register */
+#define __get_FRM()             __RV_CSR_READ(CSR_FRM)
+/** \brief Set FRM CSR Register with val */
+#define __set_FRM(val)          __RV_CSR_WRITE(CSR_FRM, (val))
+/** \brief Get FFLAGS CSR Register */
+#define __get_FFLAGS()          __RV_CSR_READ(CSR_FFLAGS)
+/** \brief Set FFLAGS CSR Register with val */
+#define __set_FFLAGS(val)       __RV_CSR_WRITE(CSR_FFLAGS, (val))
+
+/** \brief Enable FPU Unit */
+#define __enable_FPU()          __RV_CSR_SET(CSR_MSTATUS, MSTATUS_FS)
+/**
+ * \brief Disable FPU Unit
+ * \details
+ * * We can save power by disable FPU Unit.
+ * * When FPU Unit is disabled, any access to FPU related CSR registers
+ * and FPU instructions will cause illegal Instruction Exception.
+ * */
+#define __disable_FPU()         __RV_CSR_CLEAR(CSR_MSTATUS, MSTATUS_FS)
+
+
+/**
+ * \brief   Load a single-precision value from memory into float point register freg using flw instruction
+ * \details The FLW instruction loads a single-precision floating point value from memory
+ * address (addr + ofs) into floating point register freg(f0-f31)
+ * \param [in]    freg   The floating point register, eg. FREG(0), f0
+ * \param [in]    addr   The memory base address, 4 byte aligned required
+ * \param [in]    ofs    a 12-bit immediate signed byte offset value, must be a constant value
+ * \remarks
+ * * FLW and FSW operations need to make sure the address is 4 bytes aligned,
+ *   otherwise it will cause exception code 4(Load address misaligned) or 6 (Store/AMO address misaligned)
+ * * FLW and FSW do not modify the bits being transferred; in particular, the payloads of non-canonical
+ * NaNs are preserved
+ *
+ */
+#define __RV_FLW(freg, addr, ofs)                              \
+    ({                                                         \
+        register rv_csr_t __addr = (rv_csr_t)(addr);           \
+        __ASM volatile("flw " STRINGIFY(freg) ", %0(%1)  "     \
+                     : : "I"(ofs), "r"(__addr)                 \
+                     : "memory");                              \
+    })
+
+/**
+ * \brief   Store a single-precision value from float point freg into memory using fsw instruction
+ * \details The FSW instruction stores a single-precision value from floating point register to memory
+ * \param [in]    freg   The floating point register(f0-f31), eg. FREG(0), f0
+ * \param [in]    addr   The memory base address, 4 byte aligned required
+ * \param [in]    ofs    a 12-bit immediate signed byte offset value, must be a constant value
+ * \remarks
+ * * FLW and FSW operations need to make sure the address is 4 bytes aligned,
+ *   otherwise it will cause exception code 4(Load address misaligned) or 6 (Store/AMO address misaligned)
+ * * FLW and FSW do not modify the bits being transferred; in particular, the payloads of non-canonical
+ * NaNs are preserved
+ *
+ */
+#define __RV_FSW(freg, addr, ofs)                              \
+    ({                                                         \
+        register rv_csr_t __addr = (rv_csr_t)(addr);           \
+        __ASM volatile("fsw " STRINGIFY(freg) ", %0(%1)  "     \
+                     : : "I"(ofs), "r"(__addr)                 \
+                     : "memory");                              \
+    })
+
+/**
+ * \brief   Load a double-precision value from memory into float point register freg using fld instruction
+ * \details The FLD instruction loads a double-precision floating point value from memory
+ * address (addr + ofs) into floating point register freg(f0-f31)
+ * \param [in]    freg   The floating point register, eg. FREG(0), f0
+ * \param [in]    addr   The memory base address, 8 byte aligned required
+ * \param [in]    ofs    a 12-bit immediate signed byte offset value, must be a constant value
+ * \attention
+ * * Function only available for double precision floating point unit, FLEN = 64
+ * \remarks
+ * * FLD and FSD operations need to make sure the address is 8 bytes aligned,
+ *   otherwise it will cause exception code 4(Load address misaligned) or 6 (Store/AMO address misaligned)
+ * * FLD and FSD do not modify the bits being transferred; in particular, the payloads of non-canonical
+ * NaNs are preserved.
+ */
+#define __RV_FLD(freg, addr, ofs)                              \
+    ({                                                         \
+        register rv_csr_t __addr = (rv_csr_t)(addr);           \
+        __ASM volatile("fld " STRINGIFY(freg) ", %0(%1)  "     \
+                     : : "I"(ofs), "r"(__addr)                 \
+                     : "memory");                              \
+    })
+
+/**
+ * \brief   Store a double-precision value from float point freg into memory using fsd instruction
+ * \details The FSD instruction stores double-precision value from floating point register to memory
+ * \param [in]    freg   The floating point register(f0-f31), eg. FREG(0), f0
+ * \param [in]    addr   The memory base address, 8 byte aligned required
+ * \param [in]    ofs    a 12-bit immediate signed byte offset value, must be a constant value
+ * \attention
+ * * Function only available for double precision floating point unit, FLEN = 64
+ * \remarks
+ * * FLD and FSD operations need to make sure the address is 8 bytes aligned,
+ *   otherwise it will cause exception code 4(Load address misaligned) or 6 (Store/AMO address misaligned)
+ * * FLD and FSD do not modify the bits being transferred; in particular, the payloads of non-canonical
+ * NaNs are preserved.
+ *
+ */
+#define __RV_FSD(freg, addr, ofs)                              \
+    ({                                                         \
+        register rv_csr_t __addr = (rv_csr_t)(addr);           \
+        __ASM volatile("fsd " STRINGIFY(freg) ", %0(%1)  "     \
+                     : : "I"(ofs), "r"(__addr)                 \
+                     : "memory");                              \
+    })
+
+/**
+ * \def __RV_FLOAD
+ * \brief   Load a float point value from memory into float point register freg using flw/fld instruction
+ * \details
+ * * For Single-Precision Floating-Point Mode(__FPU_PRESENT == 1, __RISCV_FLEN == 32):
+ *   It will call \ref __RV_FLW to load a single-precision floating point value from memory to floating point register
+ * * For Double-Precision Floating-Point Mode(__FPU_PRESENT == 2, __RISCV_FLEN == 64):
+ *   It will call \ref __RV_FLD to load a double-precision floating point value from memory to floating point register
+ *
+ * \attention
+ * Function behaviour is different for __FPU_PRESENT = 1 or 2, please see the real function this macro represent
+ */
+/**
+ * \def __RV_FSTORE
+ * \brief   Store a float value from float point freg into memory using fsw/fsd instruction
+ * \details
+ * * For Single-Precision Floating-Point Mode(__FPU_PRESENT == 1, __RISCV_FLEN == 32):
+ *   It will call \ref __RV_FSW to store floating point register into memory
+ * * For Double-Precision Floating-Point Mode(__FPU_PRESENT == 2, __RISCV_FLEN == 64):
+ *   It will call \ref __RV_FSD to store floating point register into memory
+ *
+ * \attention
+ * Function behaviour is different for __FPU_PRESENT = 1 or 2, please see the real function this macro represent
+ */
+#if __FPU_PRESENT == 1
+#define __RV_FLOAD              __RV_FLW
+#define __RV_FSTORE             __RV_FSW
+/** \brief Type of FPU register, depends on the FLEN defined in RISC-V */
+typedef uint32_t rv_fpu_t;
+#elif __FPU_PRESENT == 2
+#define __RV_FLOAD              __RV_FLD
+#define __RV_FSTORE             __RV_FSD
+/** \brief Type of FPU register, depends on the FLEN defined in RISC-V */
+typedef uint64_t rv_fpu_t;
+#endif /* __FPU_PRESENT == 2 */
+
+/**
+ * \brief   Save FPU context into variables for interrupt nesting
+ * \details
+ * This macro is used to declare variables which are used for saving
+ * FPU context, and it will store the necessary fpu registers into
+ * these variables, it need to be used in a interrupt when in this
+ * interrupt fpu registers are used.
+ * \remarks
+ * - It need to be used together with \ref RESTORE_FPU_CONTEXT
+ * - Don't use variable names __fpu_context in your ISR code
+ * - If your ISR code uses fpu registers and this interrupt can be nested,
+ * then you can do it like this:
+ * \code
+ * void eclic_mtip_handler(void)
+ * {
+ *     // !!!Interrupt is enabled here!!!
+ *     // !!!Higher priority interrupt could nest it!!!
+ *
+ *     // Necessary only when you need to use fpu registers
+ *     // in this isr handler functions
+ *     SAVE_FPU_CONTEXT();
+ *
+ *     // put you own interrupt handling code here
+ *
+ *     // pair of SAVE_FPU_CONTEXT()
+ *     RESTORE_FPU_CONTEXT();
+ * }
+ * \endcode
+ */
+#define SAVE_FPU_CONTEXT()                                                  \
+        rv_fpu_t __fpu_context[20];                                         \
+        __RV_FSTORE(FREG(0),  __fpu_context, 0  << LOG_FPREGBYTES);         \
+        __RV_FSTORE(FREG(1),  __fpu_context, 1  << LOG_FPREGBYTES);         \
+        __RV_FSTORE(FREG(2),  __fpu_context, 2  << LOG_FPREGBYTES);         \
+        __RV_FSTORE(FREG(3),  __fpu_context, 3  << LOG_FPREGBYTES);         \
+        __RV_FSTORE(FREG(4),  __fpu_context, 4  << LOG_FPREGBYTES);         \
+        __RV_FSTORE(FREG(5),  __fpu_context, 5  << LOG_FPREGBYTES);         \
+        __RV_FSTORE(FREG(6),  __fpu_context, 6  << LOG_FPREGBYTES);         \
+        __RV_FSTORE(FREG(7),  __fpu_context, 7  << LOG_FPREGBYTES);         \
+        __RV_FSTORE(FREG(10), __fpu_context, 8  << LOG_FPREGBYTES);         \
+        __RV_FSTORE(FREG(11), __fpu_context, 9  << LOG_FPREGBYTES);         \
+        __RV_FSTORE(FREG(12), __fpu_context, 10 << LOG_FPREGBYTES);         \
+        __RV_FSTORE(FREG(13), __fpu_context, 11 << LOG_FPREGBYTES);         \
+        __RV_FSTORE(FREG(14), __fpu_context, 12 << LOG_FPREGBYTES);         \
+        __RV_FSTORE(FREG(15), __fpu_context, 13 << LOG_FPREGBYTES);         \
+        __RV_FSTORE(FREG(16), __fpu_context, 14 << LOG_FPREGBYTES);         \
+        __RV_FSTORE(FREG(17), __fpu_context, 15 << LOG_FPREGBYTES);         \
+        __RV_FSTORE(FREG(28), __fpu_context, 16 << LOG_FPREGBYTES);         \
+        __RV_FSTORE(FREG(29), __fpu_context, 17 << LOG_FPREGBYTES);         \
+        __RV_FSTORE(FREG(30), __fpu_context, 18 << LOG_FPREGBYTES);         \
+        __RV_FSTORE(FREG(31), __fpu_context, 19 << LOG_FPREGBYTES);
+
+/**
+ * \brief   Restore necessary fpu registers from variables for interrupt nesting
+ * \details
+ * This macro is used restore necessary fpu registers from pre-defined variables
+ * in \ref SAVE_FPU_CONTEXT macro.
+ * \remarks
+ * - It need to be used together with \ref SAVE_FPU_CONTEXT
+ */
+#define RESTORE_FPU_CONTEXT()                                               \
+        __RV_FLOAD(FREG(0),  __fpu_context, 0  << LOG_FPREGBYTES);          \
+        __RV_FLOAD(FREG(1),  __fpu_context, 1  << LOG_FPREGBYTES);          \
+        __RV_FLOAD(FREG(2),  __fpu_context, 2  << LOG_FPREGBYTES);          \
+        __RV_FLOAD(FREG(3),  __fpu_context, 3  << LOG_FPREGBYTES);          \
+        __RV_FLOAD(FREG(4),  __fpu_context, 4  << LOG_FPREGBYTES);          \
+        __RV_FLOAD(FREG(5),  __fpu_context, 5  << LOG_FPREGBYTES);          \
+        __RV_FLOAD(FREG(6),  __fpu_context, 6  << LOG_FPREGBYTES);          \
+        __RV_FLOAD(FREG(7),  __fpu_context, 7  << LOG_FPREGBYTES);          \
+        __RV_FLOAD(FREG(10), __fpu_context, 8  << LOG_FPREGBYTES);          \
+        __RV_FLOAD(FREG(11), __fpu_context, 9  << LOG_FPREGBYTES);          \
+        __RV_FLOAD(FREG(12), __fpu_context, 10 << LOG_FPREGBYTES);          \
+        __RV_FLOAD(FREG(13), __fpu_context, 11 << LOG_FPREGBYTES);          \
+        __RV_FLOAD(FREG(14), __fpu_context, 12 << LOG_FPREGBYTES);          \
+        __RV_FLOAD(FREG(15), __fpu_context, 13 << LOG_FPREGBYTES);          \
+        __RV_FLOAD(FREG(16), __fpu_context, 14 << LOG_FPREGBYTES);          \
+        __RV_FLOAD(FREG(17), __fpu_context, 15 << LOG_FPREGBYTES);          \
+        __RV_FLOAD(FREG(28), __fpu_context, 16 << LOG_FPREGBYTES);          \
+        __RV_FLOAD(FREG(29), __fpu_context, 17 << LOG_FPREGBYTES);          \
+        __RV_FLOAD(FREG(30), __fpu_context, 18 << LOG_FPREGBYTES);          \
+        __RV_FLOAD(FREG(31), __fpu_context, 19 << LOG_FPREGBYTES);
+#else
+#define SAVE_FPU_CONTEXT()
+#define RESTORE_FPU_CONTEXT()
+#endif /* __FPU_PRESENT > 0 */
+/** @} */ /* End of Doxygen Group NMSIS_Core_FPU_Functions */
+
+#ifdef __cplusplus
+}
+#endif
+#endif /** __CORE_FEATURE_FPU_H__  */
diff --git a/external/arch/riscv/nuclei/NMSIS/Core/Include/core_feature_pmp.h b/external/arch/riscv/nuclei/NMSIS/Core/Include/core_feature_pmp.h
new file mode 100644
index 000000000..997dfaee1
--- /dev/null
+++ b/external/arch/riscv/nuclei/NMSIS/Core/Include/core_feature_pmp.h
@@ -0,0 +1,260 @@
+/*
+ * Copyright (c) 2019 Nuclei Limited. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __CORE_FEATURE_PMP_H__
+#define __CORE_FEATURE_PMP_H__
+/*!
+ * @file     core_feature_pmp.h
+ * @brief    PMP feature API header file for Nuclei N/NX Core
+ */
+/*
+ * PMP Feature Configuration Macro:
+ * 1. __PMP_PRESENT:  Define whether Physical Memory Protection(PMP) is present or not
+ *   * 0: Not present
+ *   * 1: Present
+ * 2. __PMP_ENTRY_NUM:  Define the number of PMP entries, only 8 or 16 is configurable.
+ */
+#ifdef __cplusplus
+ extern "C" {
+#endif
+
+#if defined(__PMP_PRESENT) && (__PMP_PRESENT == 1)
+/* ===== PMP Operations ===== */
+/**
+ * \defgroup NMSIS_Core_PMP_Functions   PMP Functions
+ * \ingroup  NMSIS_Core
+ * \brief    Functions that related to the RISCV Physical Memory Protection.
+ * \details
+ * Optional physical memory protection (PMP) unit provides per-hart machine-mode
+ * control registers to allow physical memory access privileges (read, write, execute)
+ * to be specified for each physical memory region.
+ *
+ * The PMP can support region access control settings as small as four bytes.
+ *
+ *   @{
+ */
+#ifndef __PMP_ENTRY_NUM
+/* numbers of PMP entries(__PMP_ENTRY_NUM) should be defined in <Device.h> */
+#error "__PMP_ENTRY_NUM is not defined, please check!"
+#endif
+
+/**
+ * \brief   Get 8bit PMPxCFG Register by PMP entry index
+ * \details Return the 8-bit configuration field of entry idx, extracted from the PMPCFG CSR that holds it.
+ * \param [in]    idx    PMP region index(0-15)
+ * \return               PMPxCFG Register value, or 0 if idx >= __PMP_ENTRY_NUM
+ */
+__STATIC_INLINE uint8_t __get_PMPxCFG(uint32_t idx)
+{
+    rv_csr_t pmpcfg = 0;
+
+    if (idx >= __PMP_ENTRY_NUM) return 0;
+#if __RISCV_XLEN == 32
+    if (idx < 4) {
+        pmpcfg = __RV_CSR_READ(CSR_PMPCFG0);
+    } else if ((idx >=4) && (idx < 8)) {
+        idx -= 4;
+        pmpcfg = __RV_CSR_READ(CSR_PMPCFG1);
+    } else if ((idx >=8) && (idx < 12)) {
+        idx -= 8;
+        pmpcfg = __RV_CSR_READ(CSR_PMPCFG2);
+    } else {
+        idx -= 12;
+        pmpcfg = __RV_CSR_READ(CSR_PMPCFG3);
+    }
+
+    idx = idx << 3; /* entry position within the CSR -> bit offset (8 bits per entry) */
+    return (uint8_t)((pmpcfg>>idx) & 0xFF);
+#elif __RISCV_XLEN == 64
+    if (idx < 8) {
+        pmpcfg = __RV_CSR_READ(CSR_PMPCFG0);
+    } else {
+        idx -= 8;
+        pmpcfg = __RV_CSR_READ(CSR_PMPCFG2);
+    }
+    idx = idx << 3; /* entry position within the CSR -> bit offset (8 bits per entry) */
+    return (uint8_t)((pmpcfg>>idx) & 0xFF);
+#else
+    // TODO Add RV128 Handling
+    return 0;
+#endif
+}
+
+/**
+ * \brief   Set 8bit PMPxCFG by pmp entry index
+ * \details Read-modify-write the PMPCFG CSR holding entry idx, replacing only its 8-bit field; does nothing if idx >= __PMP_ENTRY_NUM.
+ * \param [in]    idx      PMPx region index(0-15)
+ * \param [in]    pmpxcfg  PMPxCFG register value to set
+ */
+__STATIC_INLINE void __set_PMPxCFG(uint32_t idx, uint8_t pmpxcfg)
+{
+    rv_csr_t pmpcfgx = 0;
+    if (idx >= __PMP_ENTRY_NUM) return;
+
+#if __RISCV_XLEN == 32
+    if (idx < 4) {
+        pmpcfgx = __RV_CSR_READ(CSR_PMPCFG0);
+        idx = idx << 3;
+        pmpcfgx = (pmpcfgx & ~(0xFFUL << idx)) | ((rv_csr_t)pmpxcfg << idx);
+        __RV_CSR_WRITE(CSR_PMPCFG0, pmpcfgx);
+    } else if ((idx >=4) && (idx < 8)) {
+        idx -= 4;
+        pmpcfgx = __RV_CSR_READ(CSR_PMPCFG1);
+        idx = idx << 3;
+        pmpcfgx = (pmpcfgx & ~(0xFFUL << idx)) | ((rv_csr_t)pmpxcfg << idx);
+        __RV_CSR_WRITE(CSR_PMPCFG1, pmpcfgx);
+    } else if ((idx >=8) && (idx < 12)) {
+        idx -= 8;
+        pmpcfgx = __RV_CSR_READ(CSR_PMPCFG2);
+        idx = idx << 3;
+        pmpcfgx = (pmpcfgx & ~(0xFFUL << idx)) | ((rv_csr_t)pmpxcfg << idx);
+        __RV_CSR_WRITE(CSR_PMPCFG2, pmpcfgx);
+    } else {
+        idx -= 12;
+        pmpcfgx = __RV_CSR_READ(CSR_PMPCFG3);
+        idx = idx << 3;
+        pmpcfgx = (pmpcfgx & ~(0xFFUL << idx)) | ((rv_csr_t)pmpxcfg << idx);
+        __RV_CSR_WRITE(CSR_PMPCFG3, pmpcfgx);
+    }
+#elif __RISCV_XLEN == 64
+    if (idx < 8) {
+        pmpcfgx = __RV_CSR_READ(CSR_PMPCFG0);
+        idx = idx << 3;
+        pmpcfgx = (pmpcfgx & ~(0xFFULL << idx)) | ((rv_csr_t)pmpxcfg << idx);
+        __RV_CSR_WRITE(CSR_PMPCFG0, pmpcfgx);
+    } else {
+        idx -= 8;
+        pmpcfgx = __RV_CSR_READ(CSR_PMPCFG2);
+        idx = idx << 3;
+        pmpcfgx = (pmpcfgx & ~(0xFFULL << idx)) | ((rv_csr_t)pmpxcfg << idx);
+        __RV_CSR_WRITE(CSR_PMPCFG2, pmpcfgx);
+    }
+#else
+    // TODO Add RV128 Handling
+#endif
+}
+
+/**
+ * \brief   Get PMPCFGx Register by index
+ * \details Return the content of the PMPCFGx Register.
+ * \param [in]    idx    PMPCFG CSR index(0-3)
+ * \return               PMPCFGx Register value, or 0 if idx is greater than 3
+ * \remark
+ * - For RV64, only idx = 0 and idx = 2 is allowed.
+ *   pmpcfg0 and pmpcfg2 hold the configurations
+ *   for the 16 PMP entries, pmpcfg1 and pmpcfg3 are illegal
+ * - For RV32, pmpcfg0-pmpcfg3, hold the configurations
+ *   pmp0cfg-pmp15cfg for the 16 PMP entries
+ */
+__STATIC_INLINE rv_csr_t __get_PMPCFGx(uint32_t idx)
+{
+    switch (idx) {
+        case 0: return __RV_CSR_READ(CSR_PMPCFG0);
+        case 1: return __RV_CSR_READ(CSR_PMPCFG1);
+        case 2: return __RV_CSR_READ(CSR_PMPCFG2);
+        case 3: return __RV_CSR_READ(CSR_PMPCFG3);
+        default: return 0;
+    }
+}
+
+/**
+ * \brief   Set PMPCFGx by index
+ * \details Write the given value to the PMPCFGx Register; does nothing if idx is greater than 3.
+ * \param [in]    idx      PMPCFG CSR index(0-3)
+ * \param [in]    pmpcfg   PMPCFGx Register value to set
+ * \remark
+ * - For RV64, only idx = 0 and idx = 2 is allowed.
+ *   pmpcfg0 and pmpcfg2 hold the configurations
+ *   for the 16 PMP entries, pmpcfg1 and pmpcfg3 are illegal
+ * - For RV32, pmpcfg0-pmpcfg3, hold the configurations
+ *   pmp0cfg-pmp15cfg for the 16 PMP entries
+ */
+__STATIC_INLINE void __set_PMPCFGx(uint32_t idx, rv_csr_t pmpcfg)
+{
+    switch (idx) {
+        case 0: __RV_CSR_WRITE(CSR_PMPCFG0, pmpcfg); break;
+        case 1: __RV_CSR_WRITE(CSR_PMPCFG1, pmpcfg); break;
+        case 2: __RV_CSR_WRITE(CSR_PMPCFG2, pmpcfg); break;
+        case 3: __RV_CSR_WRITE(CSR_PMPCFG3, pmpcfg); break;
+        default: return;
+    }
+}
+
+/**
+ * \brief   Get PMPADDRx Register by index
+ * \details Return the content of the PMPADDRx Register. idx is not checked against __PMP_ENTRY_NUM here.
+ * \param [in]    idx    PMP region index(0-15)
+ * \return               PMPADDRx Register value, or 0 if idx is greater than 15
+ */
+__STATIC_INLINE rv_csr_t __get_PMPADDRx(uint32_t idx)
+{
+    switch (idx) {
+        case 0: return __RV_CSR_READ(CSR_PMPADDR0);
+        case 1: return __RV_CSR_READ(CSR_PMPADDR1);
+        case 2: return __RV_CSR_READ(CSR_PMPADDR2);
+        case 3: return __RV_CSR_READ(CSR_PMPADDR3);
+        case 4: return __RV_CSR_READ(CSR_PMPADDR4);
+        case 5: return __RV_CSR_READ(CSR_PMPADDR5);
+        case 6: return __RV_CSR_READ(CSR_PMPADDR6);
+        case 7: return __RV_CSR_READ(CSR_PMPADDR7);
+        case 8: return __RV_CSR_READ(CSR_PMPADDR8);
+        case 9: return __RV_CSR_READ(CSR_PMPADDR9);
+        case 10: return __RV_CSR_READ(CSR_PMPADDR10);
+        case 11: return __RV_CSR_READ(CSR_PMPADDR11);
+        case 12: return __RV_CSR_READ(CSR_PMPADDR12);
+        case 13: return __RV_CSR_READ(CSR_PMPADDR13);
+        case 14: return __RV_CSR_READ(CSR_PMPADDR14);
+        case 15: return __RV_CSR_READ(CSR_PMPADDR15);
+        default: return 0;
+    }
+}
+
+/**
+ * \brief   Set PMPADDRx by index
+ * \details Write the given value to the PMPADDRx Register; does nothing if idx is greater than 15.
+ * \param [in]    idx      PMP region index(0-15)
+ * \param [in]    pmpaddr  PMPADDRx Register value to set
+ */
+__STATIC_INLINE void __set_PMPADDRx(uint32_t idx, rv_csr_t pmpaddr)
+{
+    switch (idx) {
+        case 0: __RV_CSR_WRITE(CSR_PMPADDR0, pmpaddr); break;
+        case 1: __RV_CSR_WRITE(CSR_PMPADDR1, pmpaddr); break;
+        case 2: __RV_CSR_WRITE(CSR_PMPADDR2, pmpaddr); break;
+        case 3: __RV_CSR_WRITE(CSR_PMPADDR3, pmpaddr); break;
+        case 4: __RV_CSR_WRITE(CSR_PMPADDR4, pmpaddr); break;
+        case 5: __RV_CSR_WRITE(CSR_PMPADDR5, pmpaddr); break;
+        case 6: __RV_CSR_WRITE(CSR_PMPADDR6, pmpaddr); break;
+        case 7: __RV_CSR_WRITE(CSR_PMPADDR7, pmpaddr); break;
+        case 8: __RV_CSR_WRITE(CSR_PMPADDR8, pmpaddr); break;
+        case 9: __RV_CSR_WRITE(CSR_PMPADDR9, pmpaddr); break;
+        case 10: __RV_CSR_WRITE(CSR_PMPADDR10, pmpaddr); break;
+        case 11: __RV_CSR_WRITE(CSR_PMPADDR11, pmpaddr); break;
+        case 12: __RV_CSR_WRITE(CSR_PMPADDR12, pmpaddr); break;
+        case 13: __RV_CSR_WRITE(CSR_PMPADDR13, pmpaddr); break;
+        case 14: __RV_CSR_WRITE(CSR_PMPADDR14, pmpaddr); break;
+        case 15: __RV_CSR_WRITE(CSR_PMPADDR15, pmpaddr); break;
+        default: return;
+    }
+}
+/** @} */ /* End of Doxygen Group NMSIS_Core_PMP_Functions */
+#endif /* defined(__PMP_PRESENT) && (__PMP_PRESENT == 1) */
+
+#ifdef __cplusplus
+}
+#endif
+#endif /** __CORE_FEATURE_PMP_H__  */
diff --git a/external/arch/riscv/nuclei/NMSIS/Core/Include/core_feature_timer.h b/external/arch/riscv/nuclei/NMSIS/Core/Include/core_feature_timer.h
new file mode 100644
index 000000000..6e9b7af39
--- /dev/null
+++ b/external/arch/riscv/nuclei/NMSIS/Core/Include/core_feature_timer.h
@@ -0,0 +1,364 @@
+/*
+ * Copyright (c) 2019 Nuclei Limited. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __CORE_FEATURE_TIMER_H__
+#define __CORE_FEATURE_TIMER_H__
+/*!
+ * @file     core_feature_timer.h
+ * @brief    System Timer feature API header file for Nuclei N/NX Core
+ */
+/*
+ * System Timer Feature Configuration Macro:
+ * 1. __SYSTIMER_PRESENT:  Define whether Private System Timer is present or not.
+ *   * 0: Not present
+ *   * 1: Present
+ * 2. __SYSTIMER_BASEADDR:  Define the base address of the System Timer.
+ */
+#ifdef __cplusplus
+ extern "C" {
+#endif
+
+#if defined(__SYSTIMER_PRESENT) && (__SYSTIMER_PRESENT == 1)
+/**
+ * \defgroup NMSIS_Core_SysTimer_Registers     Register Define and Type Definitions Of System Timer
+ * \ingroup NMSIS_Core_Registers
+ * \brief   Type definitions and defines for system timer registers.
+ *
+ * @{
+ */
+/**
+ * \brief  Structure type to access the System Timer (SysTimer).
+ * \details
+ * Register layout of the system timer, accessed through the \ref SysTimer pointer.
+ * \remarks
+ * - MSFTRST register is introduced in Nuclei N Core version 1.3(\ref __NUCLEI_N_REV >= 0x0103)
+ * - MSTOP register is renamed to MTIMECTL register in Nuclei N Core version 1.4(\ref __NUCLEI_N_REV >= 0x0104)
+ * - CMPCLREN and CLKSRC bits in MTIMECTL register are introduced in Nuclei N Core version 1.4(\ref __NUCLEI_N_REV >= 0x0104)
+ */
+typedef struct {
+    __IOM uint64_t MTIMER;                  /*!< Offset: 0x000 (R/W)  System Timer current value 64bits Register */
+    __IOM uint64_t MTIMERCMP;               /*!< Offset: 0x008 (R/W)  System Timer compare Value 64bits Register */
+    __IOM uint32_t RESERVED0[0x3F8];        /*!< Offset: 0x010 - 0xFEC Reserved (0x3F8 words of padding) */
+    __IOM uint32_t MSFTRST;                 /*!< Offset: 0xFF0 (R/W)  System Timer Software Core Reset Register */
+    __IOM uint32_t RESERVED1;               /*!< Offset: 0xFF4 Reserved */
+    __IOM uint32_t MTIMECTL;                /*!< Offset: 0xFF8 (R/W)  System Timer Control Register, previously MSTOP register */
+    __IOM uint32_t MSIP;                    /*!< Offset: 0xFFC (R/W)  System Timer SW interrupt Register */
+} SysTimer_Type;
+
+/* Timer Control / Status Register Definitions */
+#define SysTimer_MTIMECTL_TIMESTOP_Pos      0U                                          /*!< SysTick Timer MTIMECTL: TIMESTOP bit Position */
+#define SysTimer_MTIMECTL_TIMESTOP_Msk      (1UL << SysTimer_MTIMECTL_TIMESTOP_Pos)     /*!< SysTick Timer MTIMECTL: TIMESTOP Mask */
+#define SysTimer_MTIMECTL_CMPCLREN_Pos      1U                                          /*!< SysTick Timer MTIMECTL: CMPCLREN bit Position */
+#define SysTimer_MTIMECTL_CMPCLREN_Msk      (1UL << SysTimer_MTIMECTL_CMPCLREN_Pos)     /*!< SysTick Timer MTIMECTL: CMPCLREN Mask */
+#define SysTimer_MTIMECTL_CLKSRC_Pos        2U                                          /*!< SysTick Timer MTIMECTL: CLKSRC bit Position */
+#define SysTimer_MTIMECTL_CLKSRC_Msk        (1UL << SysTimer_MTIMECTL_CLKSRC_Pos)       /*!< SysTick Timer MTIMECTL: CLKSRC Mask */
+
+#define SysTimer_MSIP_MSIP_Pos              0U                                          /*!< SysTick Timer MSIP: MSIP bit Position */
+#define SysTimer_MSIP_MSIP_Msk              (1UL << SysTimer_MSIP_MSIP_Pos)             /*!< SysTick Timer MSIP: MSIP Mask */
+
+#define SysTimer_MTIMER_Msk                 (0xFFFFFFFFFFFFFFFFULL)                     /*!< SysTick Timer MTIMER value Mask */
+#define SysTimer_MTIMERCMP_Msk              (0xFFFFFFFFFFFFFFFFULL)                     /*!< SysTick Timer MTIMERCMP value Mask */
+#define SysTimer_MTIMECTL_Msk               (0xFFFFFFFFUL)                              /*!< SysTick Timer MTIMECTL/MSTOP value Mask */
+#define SysTimer_MSIP_Msk                   (0xFFFFFFFFUL)                              /*!< SysTick Timer MSIP   value Mask */
+#define SysTimer_MSFTRST_Msk                (0xFFFFFFFFUL)                              /*!< SysTick Timer MSFTRST value Mask */
+
+#define SysTimer_MSFRST_KEY                 (0x80000A5FUL)                              /*!< SysTick Timer Software Reset Request Key */
+
+#ifndef __SYSTIMER_BASEADDR
+/* Base address of SYSTIMER(__SYSTIMER_BASEADDR) should be defined in <Device.h> */
+#error "__SYSTIMER_BASEADDR is not defined, please check!"
+#endif
+/* System Timer Memory mapping of Device  */
+#define SysTimer_BASE                       __SYSTIMER_BASEADDR                         /*!< SysTick Base Address */
+#define SysTimer                            ((SysTimer_Type *) SysTimer_BASE)           /*!< SysTick configuration struct */
+/** @} */ /* end of group NMSIS_Core_SysTimer_Registers */
+
+/* ##################################    SysTimer function  ############################################ */
+/**
+ * \defgroup NMSIS_Core_SysTimer SysTimer Functions
+ * \brief    Functions that configure the Core System Timer.
+ * @{
+ */
+/**
+ * \brief  Set system timer load value
+ * \details
+ * This function writes the given load value to the system timer MTIMER register.
+ * \param [in]  value   value to write to the system timer MTIMER register.
+ * \remarks
+ * - Load value is 64bits wide.
+ * - \ref SysTimer_GetLoadValue
+ */
+__STATIC_FORCEINLINE void SysTimer_SetLoadValue(uint64_t value)
+{
+    SysTimer->MTIMER = value;
+}
+
+/**
+ * \brief  Get system timer load value
+ * \details
+ * This function reads the current value of the system timer MTIMER register.
+ * \return  current value(64bit) of system timer MTIMER register.
+ * \remarks
+ * - Load value is 64bits wide.
+ * - \ref SysTimer_SetLoadValue
+ */
+__STATIC_FORCEINLINE uint64_t SysTimer_GetLoadValue(void)
+{
+    return SysTimer->MTIMER;
+}
+
+/**
+ * \brief  Set system timer compare value
+ * \details
+ * This function writes the given compare value to the system timer MTIMERCMP register.
+ * \param [in]  value   compare value to write to the system timer MTIMERCMP register.
+ * \remarks
+ * - Compare value is 64bits wide.
+ * - The timer interrupt is generated once the current value reaches the compare value (MTIMER >= MTIMERCMP).
+ * - Lower the load value or raise the compare value so that MTIMER < MTIMERCMP to clear the interrupt.
+ * - \ref SysTimer_GetCompareValue
+ */
+__STATIC_FORCEINLINE void SysTimer_SetCompareValue(uint64_t value)
+{
+    SysTimer->MTIMERCMP = value;
+}
+
+/**
+ * \brief  Get system timer compare value
+ * \details
+ * This function reads the compare value held in the system timer MTIMERCMP register.
+ * \return  compare value of system timer MTIMERCMP register.
+ * \remarks
+ * - Compare value is 64bits wide.
+ * - \ref SysTimer_SetCompareValue
+ */
+__STATIC_FORCEINLINE uint64_t SysTimer_GetCompareValue(void)
+{
+    return SysTimer->MTIMERCMP;
+}
+
+/**
+ * \brief  Enable system timer counter running
+ * \details
+ * Let the system timer counter run by clearing the TIMESTOP bit
+ * in the MTIMECTL register (read-modify-write, other bits are kept).
+ */
+__STATIC_FORCEINLINE void SysTimer_Start(void)
+{
+    SysTimer->MTIMECTL &= ~(SysTimer_MTIMECTL_TIMESTOP_Msk);
+}
+
+/**
+ * \brief  Stop system timer counter running
+ * \details
+ * Stop the system timer counter by setting the TIMESTOP bit
+ * in the MTIMECTL register (read-modify-write, other bits are kept).
+ */
+__STATIC_FORCEINLINE void SysTimer_Stop(void)
+{
+    SysTimer->MTIMECTL |= SysTimer_MTIMECTL_TIMESTOP_Msk;
+}
+
+/**
+ * \brief  Set system timer control value
+ * \details
+ * This function overwrites the whole system timer MTIMECTL register with the given value.
+ * \param [in]  mctl    value to write to the MTIMECTL register
+ * \remarks
+ * - Bit TIMESTOP is used to start and stop timer.
+ *   Clear TIMESTOP bit to 0 to start timer, otherwise to stop timer.
+ * - Bit CMPCLREN is used to enable auto MTIMER clear to zero when MTIMER >= MTIMERCMP.
+ *   Clear CMPCLREN bit to 0 to stop auto clear MTIMER feature, otherwise to enable it.
+ * - Bit CLKSRC is used to select timer clock source.
+ *   Clear CLKSRC bit to 0 to use *mtime_toggle_a*, otherwise use *core_clk_aon*
+ * - \ref SysTimer_GetControlValue
+ */
+__STATIC_FORCEINLINE void SysTimer_SetControlValue(uint32_t mctl)
+{
+    SysTimer->MTIMECTL = (mctl & SysTimer_MTIMECTL_Msk);
+}
+
+/**
+ * \brief  Get system timer control value
+ * \details
+ * This function reads the system timer MTIMECTL register value.
+ * \return  MTIMECTL register value
+ * \remarks
+ * - \ref SysTimer_SetControlValue
+ */
+__STATIC_FORCEINLINE uint32_t SysTimer_GetControlValue(void)
+{
+    return (SysTimer->MTIMECTL & SysTimer_MTIMECTL_Msk);
+}
+
+/**
+ * \brief  Trigger or set software interrupt via system timer
+ * \details
+ * This function sets the system timer MSIP bit in the MSIP register (read-modify-write).
+ * \remarks
+ * - Set system timer MSIP bit and generate a SW interrupt.
+ * - \ref SysTimer_ClearSWIRQ
+ * - \ref SysTimer_GetMsipValue
+ */
+__STATIC_FORCEINLINE void SysTimer_SetSWIRQ(void)
+{
+    SysTimer->MSIP |= SysTimer_MSIP_MSIP_Msk;
+}
+
+/**
+ * \brief  Clear system timer software interrupt pending request
+ * \details
+ * This function clears the system timer MSIP bit in the MSIP register (read-modify-write).
+ * \remarks
+ * - Clear system timer MSIP bit in MSIP register to clear the software interrupt pending.
+ * - \ref SysTimer_SetSWIRQ
+ * - \ref SysTimer_GetMsipValue
+ */
+__STATIC_FORCEINLINE void SysTimer_ClearSWIRQ(void)
+{
+    SysTimer->MSIP &= ~SysTimer_MSIP_MSIP_Msk;
+}
+
+/**
+ * \brief  Get system timer MSIP register value
+ * \details
+ * This function reads the system timer MSIP register value.
+ * \return    Value of Timer MSIP register.
+ * \remarks
+ * - Bit0 is SW interrupt flag.
+ *   Bit0 is 1 then SW interrupt set. Bit0 is 0 then SW interrupt clear.
+ * - \ref SysTimer_SetSWIRQ
+ * - \ref SysTimer_ClearSWIRQ
+ */
+__STATIC_FORCEINLINE uint32_t SysTimer_GetMsipValue(void)
+{
+    return (uint32_t)(SysTimer->MSIP & SysTimer_MSIP_Msk);
+}
+
+/**
+ * \brief  Set system timer MSIP register value
+ * \details
+ * This function overwrites the whole system timer MSIP register with the given value.
+ * \param [in]  msip   value to write to the MSIP register
+ */
+__STATIC_FORCEINLINE void SysTimer_SetMsipValue(uint32_t msip)
+{
+    SysTimer->MSIP = (msip & SysTimer_MSIP_Msk);
+}
+
+/**
+ * \brief  Do software reset request
+ * \details
+ * This function issues a software reset request through the system timer MSFTRST register
+ * - Software needs to write \ref SysTimer_MSFRST_KEY to generate the software reset request
+ * - The software reset request flag is cleared by the reset operation
+ * \remarks
+ * - The software reset is sent to the SoC, which needs to generate the reset signal and send it back to the Core
+ * - This function does not return; it spins in while(1) until the Core reset happens
+ */
+__STATIC_FORCEINLINE void SysTimer_SoftwareReset(void)
+{
+    SysTimer->MSFTRST = SysTimer_MSFRST_KEY;
+    while(1);
+}
+
+#if defined (__Vendor_SysTickConfig) && (__Vendor_SysTickConfig == 0U) && defined(__ECLIC_PRESENT) && (__ECLIC_PRESENT == 1)
+/**
+ * \brief   System Tick Configuration
+ * \details Initializes the System Timer and its non-vector interrupt, and starts the System Tick Timer.
+ *
+ *  In our default implementation, the timer counter is set to zero and a timer compare non-vector interrupt is armed,
+ *  which fires when the counter matches the ticks the user set; in the timer interrupt the user should reload the system tick
+ *  using \ref SysTick_Reload or a similar user-written function, so that it produces a periodic timer interrupt.
+ * \param [in]  ticks  Number of ticks between two interrupts.
+ * \return          0  Function succeeded.
+ * \return          1  Function failed.
+ * \remarks
+ * - For \ref __NUCLEI_N_REV >= 0x0104, the CMPCLREN bit in MTIMECTL is introduced,
+ *   but we assume that the CMPCLREN bit is set to 0, so MTIMER register will not be
+ *   auto cleared to 0 when MTIMER >= MTIMERCMP.
+ * - When the variable \ref __Vendor_SysTickConfig is set to 1, then the
+ *   function \ref SysTick_Config is not included.
+ * - In this case, the file <b><Device>.h</b> must contain a vendor-specific implementation
+ *   of this function.
+ * - If the user needs this function to start a periodic timer interrupt, then the timer interrupt handler
+ *   routine should call \ref SysTick_Reload with ticks to reload the timer.
+ * - This function only available when __SYSTIMER_PRESENT == 1 and __ECLIC_PRESENT == 1 and __Vendor_SysTickConfig == 0
+ * \sa
+ * - \ref SysTimer_SetCompareValue; SysTimer_SetLoadValue
+ */
+__STATIC_INLINE uint32_t SysTick_Config(uint64_t ticks)
+{
+    SysTimer_SetLoadValue(0);
+    SysTimer_SetCompareValue(ticks);
+    ECLIC_SetShvIRQ(SysTimer_IRQn, ECLIC_NON_VECTOR_INTERRUPT);
+    ECLIC_SetLevelIRQ(SysTimer_IRQn, 0);
+    ECLIC_EnableIRQ(SysTimer_IRQn);
+    return (0UL);
+}
+
+/**
+ * \brief   System Tick Reload
+ * \details Re-arm the System Timer tick by moving MTIMERCMP to the current MTIMER value plus ticks
+ *
+ * \param [in]  ticks  Number of ticks between two interrupts.
+ * \return          0  Function succeeded.
+ * \return          1  Function failed.
+ * \remarks
+ * - For \ref __NUCLEI_N_REV >= 0x0104, the CMPCLREN bit in MTIMECTL is introduced,
+ *   but for this \ref SysTick_Config function, we assume this CMPCLREN bit is set to 0,
+ *   so in interrupt handler function, user still need to set the MTIMERCMP or MTIMER to reload
+ *   the system tick, if vendor want to use this timer's auto clear feature, they can define
+ *   \ref __Vendor_SysTickConfig to 1, and implement \ref SysTick_Config and \ref SysTick_Reload functions.
+ * - When the variable \ref __Vendor_SysTickConfig is set to 1, then the
+ *   function \ref SysTick_Reload is not included.
+ * - In this case, the file <b><Device>.h</b> must contain a vendor-specific implementation
+ *   of this function.
+ * - This function only available when __SYSTIMER_PRESENT == 1 and __ECLIC_PRESENT == 1 and __Vendor_SysTickConfig == 0
+ * - If MTIMER + ticks wraps around 64 bits (or ticks is 0), MTIMER is reset to 0 and MTIMERCMP is set to ticks
+ * \sa
+ * - \ref SysTimer_SetCompareValue
+ * - \ref SysTimer_SetLoadValue
+ */
+__STATIC_FORCEINLINE uint32_t SysTick_Reload(uint64_t ticks)
+{
+    uint64_t cur_ticks = SysTimer->MTIMER;
+    uint64_t reload_ticks = ticks + cur_ticks;
+
+    if (__USUALLY(reload_ticks > cur_ticks)) {
+        SysTimer->MTIMERCMP = reload_ticks;
+    } else {
+        /* The sum did not move past the current MTIMER value,
+         * which means the 64-bit addition wrapped around (or ticks is 0),
+         * so we need to reset the counter to zero */
+        SysTimer->MTIMER = 0;
+        SysTimer->MTIMERCMP = ticks;
+    }
+
+    return (0UL);
+}
+
+#endif /* defined(__Vendor_SysTickConfig) && (__Vendor_SysTickConfig == 0U) */
+/** @} */ /* End of Doxygen Group NMSIS_Core_SysTimer */
+
+#endif /* defined(__SYSTIMER_PRESENT) && (__SYSTIMER_PRESENT == 1) */
+
+#ifdef __cplusplus
+}
+#endif
+#endif /** __CORE_FEATURE_TIMER_H__  */
+
diff --git a/external/arch/riscv/nuclei/NMSIS/Core/Include/nmsis_compiler.h b/external/arch/riscv/nuclei/NMSIS/Core/Include/nmsis_compiler.h
new file mode 100644
index 000000000..c5278db1b
--- /dev/null
+++ b/external/arch/riscv/nuclei/NMSIS/Core/Include/nmsis_compiler.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2019 Nuclei Limited. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NMSIS_COMPILER_H
+#define __NMSIS_COMPILER_H
+
+#include <stdint.h>
+
+/*!
+ * @file     nmsis_compiler.h
+ * @brief    NMSIS compiler generic header file
+ */
+#if defined ( __GNUC__ )
+  /** GNU GCC Compiler */
+  #include "nmsis_gcc.h"
+#else
+  #error Unknown compiler.
+#endif
+
+
+#endif /* __NMSIS_COMPILER_H */
+
diff --git a/external/arch/riscv/nuclei/NMSIS/Core/Include/nmsis_core.h b/external/arch/riscv/nuclei/NMSIS/Core/Include/nmsis_core.h
new file mode 100644
index 000000000..fa7821da1
--- /dev/null
+++ b/external/arch/riscv/nuclei/NMSIS/Core/Include/nmsis_core.h
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2009-2019 Arm Limited. All rights reserved.
+ * -- Adaptable modifications made for Nuclei Processors. --
+ * Copyright (c) 2019 Nuclei Limited. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __NMSIS_CORE_H__
+#define __NMSIS_CORE_H__
+
+#include <stdint.h>
+
+#ifdef __cplusplus
+ extern "C" {
+#endif
+
+#include "nmsis_version.h"
+
+/**
+ * \ingroup NMSIS_Core_VersionControl
+ * @{
+ */
+/* The following enum __NUCLEI_N_REV/__NUCLEI_NX_REV definition in this file
+ * is only used for doxygen documentation generation,
+ * The <device>.h is the real file to define it by vendor
+ */
+#if defined(__ONLY_FOR_DOXYGEN_DOCUMENT_GENERATION__)
+/**
+ * \brief Nuclei N class core revision number
+ * \details
+ * Revision number format: [15:8] revision number, [7:0] patch number
+ * \attention
+ * This define is exclusive with \ref __NUCLEI_NX_REV
+ */
+#define __NUCLEI_N_REV                   (0x0104)
+/**
+ * \brief Nuclei NX class core revision number
+ * \details
+ * Revision number format: [15:8] revision number, [7:0] patch number
+ * \attention
+ * This define is exclusive with \ref __NUCLEI_N_REV
+ */
+#define __NUCLEI_NX_REV                  (0x0100)
+#endif /* __ONLY_FOR_DOXYGEN_DOCUMENT_GENERATION__ */
+/** @} */ /* End of Group NMSIS_Core_VersionControl */
+
+#include "nmsis_compiler.h"     /* NMSIS compiler specific defines */
+
+/* === Include Nuclei Core Related Headers === */
+/* Include core base feature header file */
+#include "core_feature_base.h"
+
+#ifndef __NMSIS_GENERIC
+/* Include core eclic feature header file */
+#include "core_feature_eclic.h"
+/* Include core systimer feature header file */
+#include "core_feature_timer.h"
+#endif
+
+/* Include core fpu feature header file */
+#include "core_feature_fpu.h"
+/* Include core dsp feature header file */
+#include "core_feature_dsp.h"
+/* Include core pmp feature header file */
+#include "core_feature_pmp.h"
+/* Include core cache feature header file */
+#include "core_feature_cache.h"
+
+/* Include compatiable functions header file */
+#include "core_compatiable.h"
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* __NMSIS_CORE_H__ */
diff --git a/external/arch/riscv/nuclei/NMSIS/Core/Include/nmsis_gcc.h b/external/arch/riscv/nuclei/NMSIS/Core/Include/nmsis_gcc.h
new file mode 100644
index 000000000..9f7eb9d26
--- /dev/null
+++ b/external/arch/riscv/nuclei/NMSIS/Core/Include/nmsis_gcc.h
@@ -0,0 +1,265 @@
+/*
+ * Copyright (c) 2019 Nuclei Limited. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NMSIS_GCC_H__
+#define __NMSIS_GCC_H__
+/*!
+ * @file     nmsis_gcc.h
+ * @brief    NMSIS compiler GCC header file
+ */
+#include <stdint.h>
+#include "riscv_encoding.h"
+
+#ifdef __cplusplus
+ extern "C" {
+#endif
+
+/* #########################  Startup and Lowlevel Init  ######################## */
+/**
+ * \defgroup NMSIS_Core_CompilerControl    Compiler Control
+ * \ingroup  NMSIS_Core
+ * \brief    Compiler agnostic \#define symbols for generic c/c++ source code
+ * \details
+ *
+ * The NMSIS-Core provides the header file <b>nmsis_compiler.h</b> with consistent \#define symbols for generate C or C++ source files that should be compiler agnostic.
+ * Each NMSIS compliant compiler should support the functionality described in this section.
+ *
+ * The header file <b>nmsis_compiler.h</b> is also included by each Device Header File <device.h> so that these definitions are available.
+ *   @{
+ */
+/* ignore some GCC warnings */
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wsign-conversion"
+#pragma GCC diagnostic ignored "-Wconversion"
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+
+/* Fallback for __has_builtin */
+#ifndef __has_builtin
+  #define __has_builtin(x) (0)
+#endif
+
+/* NMSIS compiler specific defines */
+/** \brief Pass information from the compiler to the assembler. */
+#ifndef   __ASM
+  #define __ASM                                  __asm
+#endif
+
+/** \brief Recommend that function should be inlined by the compiler. */
+#ifndef   __INLINE
+  #define __INLINE                               inline
+#endif
+
+/** \brief Define a static function that may be inlined by the compiler. */
+#ifndef   __STATIC_INLINE
+  #define __STATIC_INLINE                        static inline
+#endif
+
+/** \brief Define a static function that should be always inlined by the compiler. */
+#ifndef   __STATIC_FORCEINLINE
+  #define __STATIC_FORCEINLINE                   __attribute__((always_inline)) static inline
+#endif
+
+/** \brief Inform the compiler that a function does not return. */
+#ifndef   __NO_RETURN
+  #define __NO_RETURN                            __attribute__((__noreturn__))
+#endif
+
+/** \brief Inform that a variable shall be retained in executable image. */
+#ifndef   __USED
+  #define __USED                                 __attribute__((used))
+#endif
+
+/** \brief Mark a symbol as weak so that another definition of it can override this one. */
+#ifndef   __WEAK
+  #define __WEAK                                 __attribute__((weak))
+#endif
+
+/** \brief specified the vector size of the variable, measured in bytes */
+#ifndef   __VECTOR_SIZE
+  #define __VECTOR_SIZE(x)                       __attribute__((vector_size(x)))
+#endif
+
+/** \brief Request smallest possible alignment. */
+#ifndef   __PACKED
+  #define __PACKED                               __attribute__((packed, aligned(1)))
+#endif
+
+/** \brief Request smallest possible alignment for a structure. */
+#ifndef   __PACKED_STRUCT
+  #define __PACKED_STRUCT                        struct __attribute__((packed, aligned(1)))
+#endif
+
+/** \brief Request smallest possible alignment for a union. */
+#ifndef   __PACKED_UNION
+  #define __PACKED_UNION                         union __attribute__((packed, aligned(1)))
+#endif
+
+#ifndef   __UNALIGNED_UINT16_WRITE
+  #pragma GCC diagnostic push
+  #pragma GCC diagnostic ignored "-Wpacked"
+  #pragma GCC diagnostic ignored "-Wattributes"
+  /** \brief Packed struct for unaligned uint16_t write access */
+  __PACKED_STRUCT T_UINT16_WRITE {
+      uint16_t v;
+  };
+  #pragma GCC diagnostic pop
+  /** \brief Pointer for unaligned write of a uint16_t variable. */
+  #define __UNALIGNED_UINT16_WRITE(addr, val)    (void)((((struct T_UINT16_WRITE *)(void *)(addr))->v) = (val))
+#endif
+
+#ifndef   __UNALIGNED_UINT16_READ
+  #pragma GCC diagnostic push
+  #pragma GCC diagnostic ignored "-Wpacked"
+  #pragma GCC diagnostic ignored "-Wattributes"
+  /** \brief Packed struct for unaligned uint16_t read access */
+  __PACKED_STRUCT T_UINT16_READ {
+      uint16_t v;
+  };
+  #pragma GCC diagnostic pop
+  /** \brief Pointer for unaligned read of a uint16_t variable. */
+  #define __UNALIGNED_UINT16_READ(addr)          (((const struct T_UINT16_READ *)(const void *)(addr))->v)
+#endif
+
+#ifndef   __UNALIGNED_UINT32_WRITE
+  #pragma GCC diagnostic push
+  #pragma GCC diagnostic ignored "-Wpacked"
+  #pragma GCC diagnostic ignored "-Wattributes"
+  /** \brief Packed struct for unaligned uint32_t write access */
+  __PACKED_STRUCT T_UINT32_WRITE {
+      uint32_t v;
+  };
+  #pragma GCC diagnostic pop
+  /** \brief Pointer for unaligned write of a uint32_t variable. */
+  #define __UNALIGNED_UINT32_WRITE(addr, val)    (void)((((struct T_UINT32_WRITE *)(void *)(addr))->v) = (val))
+#endif
+
+#ifndef   __UNALIGNED_UINT32_READ
+  #pragma GCC diagnostic push
+  #pragma GCC diagnostic ignored "-Wpacked"
+  #pragma GCC diagnostic ignored "-Wattributes"
+  /** \brief Packed struct for unaligned uint32_t read access */
+  __PACKED_STRUCT T_UINT32_READ {
+      uint32_t v;
+  };
+  #pragma GCC diagnostic pop
+  /** \brief Pointer for unaligned read of a uint32_t variable. */
+  #define __UNALIGNED_UINT32_READ(addr)          (((const struct T_UINT32_READ *)(const void *)(addr))->v)
+#endif
+
+/** \brief Minimum `x` bytes alignment for a variable. */
+#ifndef   __ALIGNED
+  #define __ALIGNED(x)                           __attribute__((aligned(x)))
+#endif
+
+/** \brief restrict pointer qualifier to enable additional optimizations. */
+#ifndef   __RESTRICT
+  #define __RESTRICT                             __restrict
+#endif
+
+/** \brief Barrier to prevent compiler from reordering instructions. */
+#ifndef   __COMPILER_BARRIER
+  #define __COMPILER_BARRIER()                   __ASM volatile("":::"memory")
+#endif
+
+/** \brief provide the compiler with branch prediction information, the branch is usually true */
+#ifndef   __USUALLY
+  #define __USUALLY(exp)                         __builtin_expect((exp), 1)
+#endif
+
+/** \brief provide the compiler with branch prediction information, the branch is rarely true */
+#ifndef   __RARELY
+  #define __RARELY(exp)                          __builtin_expect((exp), 0)
+#endif
+
+/** \brief Use this attribute to indicate that the specified function is an interrupt handler. */
+#ifndef   __INTERRUPT
+  #define __INTERRUPT                            __attribute__((interrupt))
+#endif
+
+/** @} */ /* End of Doxygen Group NMSIS_Core_CompilerControl */
+
+/* IO definitions (access restrictions to peripheral registers) */
+/**
+ * \defgroup NMSIS_Core_PeriphAccess     Peripheral Access
+ * \brief  Naming conventions and optional features for accessing peripherals.
+ *
+ * The section below describes the naming conventions, requirements, and optional features
+ * for accessing device specific peripherals.
+ * Most of the rules also apply to the core peripherals.
+ *
+ * The **Device Header File <device.h>** contains typically these definition
+ * and also includes the core specific header files.
+ *
+ * @{
+ */
+/** \brief Defines 'read only' permissions */
+#ifdef __cplusplus
+  #define   __I     volatile
+#else
+  #define   __I     volatile const
+#endif
+/** \brief Defines 'write only' permissions */
+#define     __O     volatile
+/** \brief Defines 'read / write' permissions */
+#define     __IO    volatile
+
+/* following defines should be used for structure members */
+/** \brief Defines 'read only' structure member permissions */
+#define     __IM     volatile const
+/** \brief Defines 'write only' structure member permissions */
+#define     __OM     volatile
+/** \brief Defines 'read/write' structure member permissions */
+#define     __IOM    volatile
+
+/**
+ * \brief   Mask and shift a bit field value for use in a register bit range.
+ * \details The macro \ref _VAL2FLD uses the #define's _Pos and _Msk of the related bit
+ * field to shift bit-field values for assigning to a register.
+ *
+ * **Example**:
+ * \code
+ * ECLIC->CFG = _VAL2FLD(CLIC_CLICCFG_NLBIT, 3);
+ * \endcode
+ * \param[in] field  Name of the register bit field.
+ * \param[in] value  Value of the bit field. This parameter is interpreted as an uint32_t type.
+ * \return           Masked and shifted value.
+ */
+#define _VAL2FLD(field, value)    (((uint32_t)(value) << field ## _Pos) & field ## _Msk)
+
+/**
+ * \brief   Mask and shift a register value to extract a bit field value.
+ * \details The macro \ref _FLD2VAL uses the #define's _Pos and _Msk of the related bit
+ * field to extract the value of a bit field from a register.
+ *
+ * **Example**:
+ * \code
+ * nlbits = _FLD2VAL(CLIC_CLICCFG_NLBIT, ECLIC->CFG);
+ * \endcode
+ * \param[in] field  Name of the register bit field.
+ * \param[in] value  Value of register. This parameter is interpreted as an uint32_t type.
+ * \return           Masked and shifted bit field value.
+ */
+#define _FLD2VAL(field, value)    (((uint32_t)(value) & field ## _Msk) >> field ## _Pos)
+
+/** @} */ /* end of group NMSIS_Core_PeriphAccess */
+
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* __NMSIS_GCC_H__ */
diff --git a/external/arch/riscv/nuclei/NMSIS/Core/Include/nmsis_version.h b/external/arch/riscv/nuclei/NMSIS/Core/Include/nmsis_version.h
new file mode 100644
index 000000000..7f05e327e
--- /dev/null
+++ b/external/arch/riscv/nuclei/NMSIS/Core/Include/nmsis_version.h
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2019 Nuclei Limited. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __NMSIS_VERSION_H
+#define __NMSIS_VERSION_H
+
+/**
+ * \defgroup NMSIS_Core_VersionControl    Version Control
+ * \ingroup  NMSIS_Core
+ * \brief    Version \#define symbols for NMSIS release specific C/C++ source code
+ * \details
+ *
+ * We followed the [semantic versioning 2.0.0](https://semver.org/) to control NMSIS version.
+ * The version format is **MAJOR.MINOR.PATCH**, increment the:
+ * 1. MAJOR version when you make incompatible API changes,
+ * 2. MINOR version when you add functionality in a backwards compatible manner, and
+ * 3. PATCH version when you make backwards compatible bug fixes.
+ *
+ * The header file `nmsis_version.h` is included by each core header so that these definitions are available.
+ *
+ * **Example Usage for NMSIS Version Check**:
+ * \code
+ *   #if defined(__NMSIS_VERSION) && (__NMSIS_VERSION >= 0x00010105)
+ *      #warning "Yes, we have NMSIS 1.1.5 or later"
+ *   #else
+ *      #error "We need NMSIS 1.1.5 or later!"
+ *   #endif
+ * \endcode
+ *
+ * @{
+ */
+
+/*!
+ * \file     nmsis_version.h
+ * \brief    NMSIS Version definitions
+ **/
+
+/**
+ * \brief   Represent the NMSIS major version
+ * \details
+ * The NMSIS major version can be used to
+ * differentiate between NMSIS major releases.
+ * */
+#define __NMSIS_VERSION_MAJOR            (1U)
+
+/**
+ * \brief   Represent the NMSIS minor version
+ * \details
+ * The NMSIS minor version can be used to
+ * query a NMSIS release update including new features.
+ *
+ **/
+#define __NMSIS_VERSION_MINOR            (0U)
+
+/**
+ * \brief   Represent the NMSIS patch version
+ * \details
+ * The NMSIS patch version can be used to
+ * show bug fixes in this package.
+ **/
+#define __NMSIS_VERSION_PATCH            (0U)
+/**
+ * \brief   Represent the NMSIS Version
+ * \details
+ * NMSIS Version format: **MAJOR.MINOR.PATCH**
+ * * MAJOR: \ref __NMSIS_VERSION_MAJOR, stored in `bits [31:16]` of \ref __NMSIS_VERSION
+ * * MINOR: \ref __NMSIS_VERSION_MINOR, stored in `bits [15:8]` of \ref __NMSIS_VERSION
+ * * PATCH: \ref __NMSIS_VERSION_PATCH, stored in `bits [7:0]` of \ref __NMSIS_VERSION
+ **/
+#define __NMSIS_VERSION                  ((__NMSIS_VERSION_MAJOR << 16U) | (__NMSIS_VERSION_MINOR << 8) | __NMSIS_VERSION_PATCH)
+
+/** @} */ /* End of Doxygen Group NMSIS_Core_VersionControl */
+#endif
diff --git a/external/arch/riscv/nuclei/NMSIS/Core/Include/riscv_bits.h b/external/arch/riscv/nuclei/NMSIS/Core/Include/riscv_bits.h
new file mode 100644
index 000000000..a18c16863
--- /dev/null
+++ b/external/arch/riscv/nuclei/NMSIS/Core/Include/riscv_bits.h
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2019 Nuclei Limited. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __RISCV_BITS_H__
+#define __RISCV_BITS_H__
+
+#ifdef __cplusplus
+ extern "C" {
+#endif
+
+#if __riscv_xlen == 64                  /* instruction mnemonics and register size used when XLEN is 64 */
+# define SLL32                  sllw
+# define STORE                  sd
+# define LOAD                   ld
+# define LWU                    lwu
+# define LOG_REGBYTES           3        /* log2 of REGBYTES: 1 << 3 = 8 bytes */
+#else                                   /* every other XLEN value selects the 32-bit mnemonics */
+# define SLL32                  sll
+# define STORE                  sw
+# define LOAD                   lw
+# define LWU                    lw       /* this branch maps LWU onto plain lw */
+# define LOG_REGBYTES           2        /* log2 of REGBYTES: 1 << 2 = 4 bytes */
+#endif /* __riscv_xlen */
+
+#define REGBYTES (1 << LOG_REGBYTES)    /* register size in bytes derived from LOG_REGBYTES (8 or 4) */
+
+#if __riscv_flen == 64                  /* floating-point load/store mnemonics used when FLEN is 64 */
+# define FPSTORE                fsd
+# define FPLOAD                 fld
+# define LOG_FPREGBYTES         3        /* log2 of FPREGBYTES: 1 << 3 = 8 bytes */
+#else                                   /* also taken when __riscv_flen is undefined (an undefined macro is 0 in #if) */
+# define FPSTORE                fsw
+# define FPLOAD                 flw
+# define LOG_FPREGBYTES         2        /* log2 of FPREGBYTES: 1 << 2 = 4 bytes */
+#endif /* __riscv_flen */
+#define FPREGBYTES              (1 << LOG_FPREGBYTES)   /* FP register size in bytes derived from LOG_FPREGBYTES (8 or 4) */
+
+#define __rv_likely(x)          __builtin_expect((x), 1)
+#define __rv_unlikely(x)        __builtin_expect((x), 0)
+
+#define __RV_ROUNDUP(a, b)      ((((a)-1)/(b)+1)*(b))
+#define __RV_ROUNDDOWN(a, b)    ((a)/(b)*(b))
+
+#define __RV_MAX(a, b)          ((a) > (b) ? (a) : (b))         /* larger of a and b; an argument may be evaluated twice */
+#define __RV_MIN(a, b)          ((a) < (b) ? (a) : (b))         /* smaller of a and b; an argument may be evaluated twice */
+#define __RV_CLAMP(a, lo, hi)   __RV_MIN(__RV_MAX(a, lo), hi)   /* limit a to [lo, hi]; uses the __RV_ helpers because this header defines no plain MIN/MAX */
+
+#define __RV_EXTRACT_FIELD(val, which)                  (((val) & (which)) / ((which) & ~((which)-1)))
+#define __RV_INSERT_FIELD(val, which, fieldval)         (((val) & ~(which)) | ((fieldval) * ((which) & ~((which)-1))))
+
+#ifdef __ASSEMBLY__                     /* __ASSEMBLY__ defined: strip type suffixes and casts from constants */
+#define _AC(X,Y)                X       /* constant X without the suffix Y */
+#define _AT(T,X)                X       /* value X without the cast to T */
+#else                                   /* __ASSEMBLY__ not defined: keep suffixes and casts */
+#define __AC(X,Y)               (X##Y)  /* paste suffix Y directly onto constant X */
+#define _AC(X,Y)                __AC(X,Y)   /* extra macro level so X and Y are expanded before being pasted */
+#define _AT(T,X)                ((T)(X))    /* cast X to type T */
+#endif /* __ASSEMBLY__ */
+
+#define _UL(x)                  (_AC(x, UL))    /* x with a UL suffix (suffix dropped under __ASSEMBLY__) */
+#define _ULL(x)                 (_AC(x, ULL))   /* x with a ULL suffix (suffix dropped under __ASSEMBLY__) */
+
+#define _BITUL(x)               (_UL(1) << (x))     /* UL constant 1 shifted left by x */
+#define _BITULL(x)              (_ULL(1) << (x))    /* ULL constant 1 shifted left by x */
+
+#define UL(x)                   (_UL(x))    /* unprefixed alias of _UL */
+#define ULL(x)                  (_ULL(x))   /* unprefixed alias of _ULL */
+
+#define STR(x)                  XSTR(x)     /* string literal of x after x has been macro-expanded */
+#define XSTR(x)                 #x          /* string literal of x exactly as written */
+#define __STR(s)                #s          /* string literal of s exactly as written */
+#define STRINGIFY(s)            __STR(s)    /* string literal of s after s has been macro-expanded */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /** __RISCV_BITS_H__  */
diff --git a/external/arch/riscv/nuclei/NMSIS/Core/Include/riscv_encoding.h b/external/arch/riscv/nuclei/NMSIS/Core/Include/riscv_encoding.h
new file mode 100644
index 000000000..899f8bbc6
--- /dev/null
+++ b/external/arch/riscv/nuclei/NMSIS/Core/Include/riscv_encoding.h
@@ -0,0 +1,617 @@
+/*
+ * Copyright (c) 2019 Nuclei Limited. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __RISCV_ENCODING_H__
+#define __RISCV_ENCODING_H__
+
+#include "riscv_bits.h"
+#ifdef __cplusplus
+ extern "C" {
+#endif
+/**
+ * \defgroup NMSIS_Core_CSR_Encoding    Core CSR Encodings
+ * \ingroup  NMSIS_Core
+ * \brief    NMSIS Core CSR Encodings
+ * \details
+ *
+ * The following macros are used for CSR encodings
+ *   @{
+ */
+#define MSTATUS_UIE         0x00000001
+#define MSTATUS_SIE         0x00000002
+#define MSTATUS_HIE         0x00000004
+#define MSTATUS_MIE         0x00000008
+#define MSTATUS_UPIE        0x00000010
+#define MSTATUS_SPIE        0x00000020
+#define MSTATUS_HPIE        0x00000040
+#define MSTATUS_MPIE        0x00000080
+#define MSTATUS_SPP         0x00000100
+#define MSTATUS_MPP         0x00001800
+#define MSTATUS_FS          0x00006000
+#define MSTATUS_XS          0x00018000
+#define MSTATUS_MPRV        0x00020000
+#define MSTATUS_PUM         0x00040000
+#define MSTATUS_MXR         0x00080000
+#define MSTATUS_VM          0x1F000000
+#define MSTATUS32_SD        0x80000000
+#define MSTATUS64_SD        0x8000000000000000
+
+#define MSTATUS_FS_INITIAL  0x00002000
+#define MSTATUS_FS_CLEAN    0x00004000
+#define MSTATUS_FS_DIRTY    0x00006000
+
+#define SSTATUS_UIE         0x00000001
+#define SSTATUS_SIE         0x00000002
+#define SSTATUS_UPIE        0x00000010
+#define SSTATUS_SPIE        0x00000020
+#define SSTATUS_SPP         0x00000100
+#define SSTATUS_FS          0x00006000
+#define SSTATUS_XS          0x00018000
+#define SSTATUS_PUM         0x00040000
+#define SSTATUS32_SD        0x80000000
+#define SSTATUS64_SD        0x8000000000000000
+
+#define CSR_MCACHE_CTL_IE   0x00000001
+#define CSR_MCACHE_CTL_DE   0x00010000
+
+#define DCSR_XDEBUGVER      (3U<<30)
+#define DCSR_NDRESET        (1<<29)
+#define DCSR_FULLRESET      (1<<28)
+#define DCSR_EBREAKM        (1<<15)
+#define DCSR_EBREAKH        (1<<14)
+#define DCSR_EBREAKS        (1<<13)
+#define DCSR_EBREAKU        (1<<12)
+#define DCSR_STOPCYCLE      (1<<10)
+#define DCSR_STOPTIME       (1<<9)
+#define DCSR_CAUSE          (7<<6)
+#define DCSR_DEBUGINT       (1<<5)
+#define DCSR_HALT           (1<<3)
+#define DCSR_STEP           (1<<2)
+#define DCSR_PRV            (3<<0)
+
+#define DCSR_CAUSE_NONE     0
+#define DCSR_CAUSE_SWBP     1
+#define DCSR_CAUSE_HWBP     2
+#define DCSR_CAUSE_DEBUGINT 3
+#define DCSR_CAUSE_STEP     4
+#define DCSR_CAUSE_HALT     5
+
+#define MCONTROL_TYPE(xlen)    (0xfULL<<((xlen)-4))
+#define MCONTROL_DMODE(xlen)   (1ULL<<((xlen)-5))
+#define MCONTROL_MASKMAX(xlen) (0x3fULL<<((xlen)-11))
+
+#define MCONTROL_SELECT     (1<<19)
+#define MCONTROL_TIMING     (1<<18)
+#define MCONTROL_ACTION     (0x3f<<12)
+#define MCONTROL_CHAIN      (1<<11)
+#define MCONTROL_MATCH      (0xf<<7)
+#define MCONTROL_M          (1<<6)
+#define MCONTROL_H          (1<<5)
+#define MCONTROL_S          (1<<4)
+#define MCONTROL_U          (1<<3)
+#define MCONTROL_EXECUTE    (1<<2)
+#define MCONTROL_STORE      (1<<1)
+#define MCONTROL_LOAD       (1<<0)
+
+#define MCONTROL_TYPE_NONE      0
+#define MCONTROL_TYPE_MATCH     2
+
+#define MCONTROL_ACTION_DEBUG_EXCEPTION   0
+#define MCONTROL_ACTION_DEBUG_MODE        1
+#define MCONTROL_ACTION_TRACE_START       2
+#define MCONTROL_ACTION_TRACE_STOP        3
+#define MCONTROL_ACTION_TRACE_EMIT        4
+
+#define MCONTROL_MATCH_EQUAL     0
+#define MCONTROL_MATCH_NAPOT     1
+#define MCONTROL_MATCH_GE        2
+#define MCONTROL_MATCH_LT        3
+#define MCONTROL_MATCH_MASK_LOW  4
+#define MCONTROL_MATCH_MASK_HIGH 5
+
+#define MIP_SSIP            (1 << IRQ_S_SOFT)
+#define MIP_HSIP            (1 << IRQ_H_SOFT)
+#define MIP_MSIP            (1 << IRQ_M_SOFT)
+#define MIP_STIP            (1 << IRQ_S_TIMER)
+#define MIP_HTIP            (1 << IRQ_H_TIMER)
+#define MIP_MTIP            (1 << IRQ_M_TIMER)
+#define MIP_SEIP            (1 << IRQ_S_EXT)
+#define MIP_HEIP            (1 << IRQ_H_EXT)
+#define MIP_MEIP            (1 << IRQ_M_EXT)
+
+#define MIE_SSIE            MIP_SSIP
+#define MIE_HSIE            MIP_HSIP
+#define MIE_MSIE            MIP_MSIP
+#define MIE_STIE            MIP_STIP
+#define MIE_HTIE            MIP_HTIP
+#define MIE_MTIE            MIP_MTIP
+#define MIE_SEIE            MIP_SEIP
+#define MIE_HEIE            MIP_HEIP
+#define MIE_MEIE            MIP_MEIP
+
+/* === Nuclei custom CSR bit mask === */
+
+#define WFE_WFE                     (0x1)
+#define TXEVT_TXEVT                 (0x1)
+#define SLEEPVALUE_SLEEPVALUE       (0x1)
+
+#define MCOUNTINHIBIT_IR            (1<<2)
+#define MCOUNTINHIBIT_CY            (1<<0)
+
+#define MILM_CTL_ILM_BPA            (((1ULL<<((__riscv_xlen)-10))-1)<<10)
+#define MILM_CTL_ILM_EN             (1<<0)
+
+#define MDLM_CTL_DLM_BPA            (((1ULL<<((__riscv_xlen)-10))-1)<<10)
+#define MDLM_CTL_DLM_EN             (1<<0)
+
+#define MSUBM_PTYP                  (0x3<<8)
+#define MSUBM_TYP                   (0x3<<6)
+
+#define MDCAUSE_MDCAUSE             (0x3)
+
+#define MMISC_CTL_NMI_CAUSE_FFF     (1<<9)
+#define MMISC_CTL_MISALIGN          (1<<6)
+#define MMISC_CTL_BPU               (1<<3)
+
+#define MCACHE_CTL_IC_EN            (1<<0)
+#define MCACHE_CTL_IC_SCPD_MOD      (1<<1)
+#define MCACHE_CTL_DC_EN            (1<<16)
+
+#define MTVT2_MTVT2EN               (1<<0)
+#define MTVT2_COMMON_CODE_ENTRY     (((1ULL<<((__riscv_xlen)-2))-1)<<2)
+
+#define MCFG_INFO_TEE               (1<<0)
+#define MCFG_INFO_ECC               (1<<1)
+#define MCFG_INFO_CLIC              (1<<2)
+#define MCFG_INFO_PLIC              (1<<3)
+#define MCFG_INFO_FIO               (1<<4)
+#define MCFG_INFO_PPI               (1<<5)
+#define MCFG_INFO_NICE              (1<<6)
+#define MCFG_INFO_ILM               (1<<7)
+#define MCFG_INFO_DLM               (1<<8)
+#define MCFG_INFO_ICACHE            (1<<9)
+#define MCFG_INFO_DCACHE            (1<<10)
+
+#define MICFG_IC_SET                (0xF<<0)
+#define MICFG_IC_WAY                (0x7<<4)
+#define MICFG_IC_LSIZE              (0x7<<7)
+#define MICFG_ILM_SIZE              (0x1F<<16)
+#define MICFG_ILM_XONLY             (1<<21)
+
+#define MDCFG_DC_SET                (0xF<<0)
+#define MDCFG_DC_WAY                (0x7<<4)
+#define MDCFG_DC_LSIZE              (0x7<<7)
+#define MDCFG_DLM_SIZE              (0x1F<<16)
+
+#define MPPICFG_INFO_PPI_SIZE       (0x1F<<1)
+#define MPPICFG_INFO_PPI_BPA        (((1ULL<<((__riscv_xlen)-10))-1)<<10)
+
+#define MFIOCFG_INFO_FIO_SIZE       (0x1F<<1)
+#define MFIOCFG_INFO_FIO_BPA        (((1ULL<<((__riscv_xlen)-10))-1)<<10)
+
+#define SIP_SSIP MIP_SSIP
+#define SIP_STIP MIP_STIP
+
+#define PRV_U 0
+#define PRV_S 1
+#define PRV_H 2
+#define PRV_M 3
+
+#define VM_MBARE 0
+#define VM_MBB   1
+#define VM_MBBID 2
+#define VM_SV32  8
+#define VM_SV39  9
+#define VM_SV48  10
+
+#define IRQ_S_SOFT   1
+#define IRQ_H_SOFT   2
+#define IRQ_M_SOFT   3
+#define IRQ_S_TIMER  5
+#define IRQ_H_TIMER  6
+#define IRQ_M_TIMER  7
+#define IRQ_S_EXT    9
+#define IRQ_H_EXT    10
+#define IRQ_M_EXT    11
+#define IRQ_COP      12
+#define IRQ_HOST     13
+
+#define DEFAULT_RSTVEC     0x00001000
+#define DEFAULT_NMIVEC     0x00001004
+#define DEFAULT_MTVEC      0x00001010
+#define CONFIG_STRING_ADDR 0x0000100C
+#define EXT_IO_BASE        0x40000000
+#define DRAM_BASE          0x80000000
+
+/* === FPU FRM Rounding Mode === */
+/** FPU Round to Nearest, ties to Even*/
+#define FRM_RNDMODE_RNE     0x0
+/** FPU Round Towards Zero */
+#define FRM_RNDMODE_RTZ     0x1
+/** FPU Round Down (towards -inf) */
+#define FRM_RNDMODE_RDN     0x2
+/** FPU Round Up (towards +inf) */
+#define FRM_RNDMODE_RUP     0x3
+/** FPU Round to nearest, ties to Max Magnitude */
+#define FRM_RNDMODE_RMM     0x4
+/**
+ * In an instruction's rm field, selects the dynamic rounding mode.
+ * In the rounding mode register this value is invalid. */
+#define FRM_RNDMODE_DYN     0x7
+
+/* === FPU FFLAGS Accrued Exceptions === */
+/** FPU Inexact */
+#define FFLAGS_AE_NX        (1<<0)
+/** FPU Underflow */
+#define FFLAGS_AE_UF        (1<<1)
+/** FPU Overflow */
+#define FFLAGS_AE_OF        (1<<2)
+/** FPU Divide by Zero */
+#define FFLAGS_AE_DZ        (1<<3)
+/** FPU Invalid Operation */
+#define FFLAGS_AE_NV        (1<<4)
+
+/** Floating Point Register f0-f31, eg. f0 -> FREG(0) */
+#define FREG(idx)           f##idx
+
+
+/* === PMP CFG Bits === */
+#define PMP_R				0x01
+#define PMP_W				0x02
+#define PMP_X				0x04
+#define PMP_A				0x18
+#define PMP_A_TOR			0x08
+#define PMP_A_NA4			0x10
+#define PMP_A_NAPOT			0x18
+#define PMP_L				0x80
+
+#define PMP_SHIFT			2
+#define PMP_COUNT			16
+
+// page table entry (PTE) fields
+#define PTE_V     0x001 // Valid
+#define PTE_R     0x002 // Read
+#define PTE_W     0x004 // Write
+#define PTE_X     0x008 // Execute
+#define PTE_U     0x010 // User
+#define PTE_G     0x020 // Global
+#define PTE_A     0x040 // Accessed
+#define PTE_D     0x080 // Dirty
+#define PTE_SOFT  0x300 // Reserved for Software
+
+#define PTE_PPN_SHIFT 10
+
+#define PTE_TABLE(PTE) (((PTE) & (PTE_V | PTE_R | PTE_W | PTE_X)) == PTE_V)
+
+#ifdef __riscv                  /* everything below is defined only when compiling for RISC-V */
+
+#if (__riscv_xlen == 64) || defined(__riscv64)   /* __riscv_xlen matches riscv_bits.h; legacy __riscv64 is still accepted but not reliably defined by current toolchains */
+# define MSTATUS_SD MSTATUS64_SD                 /* 64-bit build: SD mask taken from MSTATUS64_SD */
+# define SSTATUS_SD SSTATUS64_SD                 /* 64-bit build: SD mask taken from SSTATUS64_SD */
+# define RISCV_PGLEVEL_BITS 9
+#else
+# define MSTATUS_SD MSTATUS32_SD                 /* otherwise: SD mask taken from MSTATUS32_SD */
+# define SSTATUS_SD SSTATUS32_SD                 /* otherwise: SD mask taken from SSTATUS32_SD */
+# define RISCV_PGLEVEL_BITS 10
+#endif /* __riscv_xlen == 64 */
+
+#define RISCV_PGSHIFT 12
+#define RISCV_PGSIZE (1 << RISCV_PGSHIFT)        /* 1 << 12 = 4096 */
+
+#endif /* __riscv */
+
+#define DOWNLOAD_MODE_FLASHXIP  0
+#define DOWNLOAD_MODE_FLASH     1
+#define DOWNLOAD_MODE_ILM       2
+#define DOWNLOAD_MODE_DDR       3
+
+/**
+ * \defgroup NMSIS_Core_CSR_Registers    Core CSR Registers
+ * \ingroup  NMSIS_Core
+ * \brief    NMSIS Core CSR Register Definitions
+ * \details
+ *
+ * The following macros are used for CSR Register Definitions.
+ *   @{
+ */
+/* === Standard RISC-V CSR Registers === */
+#define CSR_USTATUS	0x0
+#define CSR_FFLAGS 0x1
+#define CSR_FRM 0x2
+#define CSR_FCSR 0x3
+#define CSR_CYCLE 0xc00
+#define CSR_TIME 0xc01
+#define CSR_INSTRET 0xc02
+#define CSR_HPMCOUNTER3 0xc03
+#define CSR_HPMCOUNTER4 0xc04
+#define CSR_HPMCOUNTER5 0xc05
+#define CSR_HPMCOUNTER6 0xc06
+#define CSR_HPMCOUNTER7 0xc07
+#define CSR_HPMCOUNTER8 0xc08
+#define CSR_HPMCOUNTER9 0xc09
+#define CSR_HPMCOUNTER10 0xc0a
+#define CSR_HPMCOUNTER11 0xc0b
+#define CSR_HPMCOUNTER12 0xc0c
+#define CSR_HPMCOUNTER13 0xc0d
+#define CSR_HPMCOUNTER14 0xc0e
+#define CSR_HPMCOUNTER15 0xc0f
+#define CSR_HPMCOUNTER16 0xc10
+#define CSR_HPMCOUNTER17 0xc11
+#define CSR_HPMCOUNTER18 0xc12
+#define CSR_HPMCOUNTER19 0xc13
+#define CSR_HPMCOUNTER20 0xc14
+#define CSR_HPMCOUNTER21 0xc15
+#define CSR_HPMCOUNTER22 0xc16
+#define CSR_HPMCOUNTER23 0xc17
+#define CSR_HPMCOUNTER24 0xc18
+#define CSR_HPMCOUNTER25 0xc19
+#define CSR_HPMCOUNTER26 0xc1a
+#define CSR_HPMCOUNTER27 0xc1b
+#define CSR_HPMCOUNTER28 0xc1c
+#define CSR_HPMCOUNTER29 0xc1d
+#define CSR_HPMCOUNTER30 0xc1e
+#define CSR_HPMCOUNTER31 0xc1f
+#define CSR_SSTATUS 0x100
+#define CSR_SIE 0x104
+#define CSR_STVEC 0x105
+#define CSR_SSCRATCH 0x140
+#define CSR_SEPC 0x141
+#define CSR_SCAUSE 0x142
+#define CSR_SBADADDR 0x143
+#define CSR_SIP 0x144
+#define CSR_SPTBR 0x180
+#define CSR_MSTATUS 0x300
+#define CSR_MISA 0x301
+#define CSR_MEDELEG 0x302
+#define CSR_MIDELEG 0x303
+#define CSR_MIE 0x304
+#define CSR_MTVEC 0x305
+#define CSR_MCOUNTEREN 0x306
+#define CSR_MSCRATCH 0x340
+#define CSR_MEPC 0x341
+#define CSR_MCAUSE 0x342
+#define CSR_MBADADDR 0x343
+#define CSR_MTVAL 0x343
+#define CSR_MIP 0x344
+#define CSR_PMPCFG0	0x3a0
+#define CSR_PMPCFG1	0x3a1
+#define CSR_PMPCFG2	0x3a2
+#define CSR_PMPCFG3	0x3a3
+#define CSR_PMPADDR0 0x3b0
+#define CSR_PMPADDR1 0x3b1
+#define CSR_PMPADDR2 0x3b2
+#define CSR_PMPADDR3 0x3b3
+#define CSR_PMPADDR4 0x3b4
+#define CSR_PMPADDR5 0x3b5
+#define CSR_PMPADDR6 0x3b6
+#define CSR_PMPADDR7 0x3b7
+#define CSR_PMPADDR8 0x3b8
+#define CSR_PMPADDR9 0x3b9
+#define CSR_PMPADDR10 0x3ba
+#define CSR_PMPADDR11 0x3bb
+#define CSR_PMPADDR12 0x3bc
+#define CSR_PMPADDR13 0x3bd
+#define CSR_PMPADDR14 0x3be
+#define CSR_PMPADDR15 0x3bf
+#define CSR_TSELECT 0x7a0
+#define CSR_TDATA1 0x7a1
+#define CSR_TDATA2 0x7a2
+#define CSR_TDATA3 0x7a3
+#define CSR_DCSR 0x7b0
+#define CSR_DPC 0x7b1
+#define CSR_DSCRATCH 0x7b2
+#define CSR_MCYCLE 0xb00
+#define CSR_MINSTRET 0xb02
+#define CSR_MHPMCOUNTER3 0xb03
+#define CSR_MHPMCOUNTER4 0xb04
+#define CSR_MHPMCOUNTER5 0xb05
+#define CSR_MHPMCOUNTER6 0xb06
+#define CSR_MHPMCOUNTER7 0xb07
+#define CSR_MHPMCOUNTER8 0xb08
+#define CSR_MHPMCOUNTER9 0xb09
+#define CSR_MHPMCOUNTER10 0xb0a
+#define CSR_MHPMCOUNTER11 0xb0b
+#define CSR_MHPMCOUNTER12 0xb0c
+#define CSR_MHPMCOUNTER13 0xb0d
+#define CSR_MHPMCOUNTER14 0xb0e
+#define CSR_MHPMCOUNTER15 0xb0f
+#define CSR_MHPMCOUNTER16 0xb10
+#define CSR_MHPMCOUNTER17 0xb11
+#define CSR_MHPMCOUNTER18 0xb12
+#define CSR_MHPMCOUNTER19 0xb13
+#define CSR_MHPMCOUNTER20 0xb14
+#define CSR_MHPMCOUNTER21 0xb15
+#define CSR_MHPMCOUNTER22 0xb16
+#define CSR_MHPMCOUNTER23 0xb17
+#define CSR_MHPMCOUNTER24 0xb18
+#define CSR_MHPMCOUNTER25 0xb19
+#define CSR_MHPMCOUNTER26 0xb1a
+#define CSR_MHPMCOUNTER27 0xb1b
+#define CSR_MHPMCOUNTER28 0xb1c
+#define CSR_MHPMCOUNTER29 0xb1d
+#define CSR_MHPMCOUNTER30 0xb1e
+#define CSR_MHPMCOUNTER31 0xb1f
+#define CSR_MUCOUNTEREN 0x320
+#define CSR_MSCOUNTEREN 0x321
+#define CSR_MHPMEVENT3 0x323
+#define CSR_MHPMEVENT4 0x324
+#define CSR_MHPMEVENT5 0x325
+#define CSR_MHPMEVENT6 0x326
+#define CSR_MHPMEVENT7 0x327
+#define CSR_MHPMEVENT8 0x328
+#define CSR_MHPMEVENT9 0x329
+#define CSR_MHPMEVENT10 0x32a
+#define CSR_MHPMEVENT11 0x32b
+#define CSR_MHPMEVENT12 0x32c
+#define CSR_MHPMEVENT13 0x32d
+#define CSR_MHPMEVENT14 0x32e
+#define CSR_MHPMEVENT15 0x32f
+#define CSR_MHPMEVENT16 0x330
+#define CSR_MHPMEVENT17 0x331
+#define CSR_MHPMEVENT18 0x332
+#define CSR_MHPMEVENT19 0x333
+#define CSR_MHPMEVENT20 0x334
+#define CSR_MHPMEVENT21 0x335
+#define CSR_MHPMEVENT22 0x336
+#define CSR_MHPMEVENT23 0x337
+#define CSR_MHPMEVENT24 0x338
+#define CSR_MHPMEVENT25 0x339
+#define CSR_MHPMEVENT26 0x33a
+#define CSR_MHPMEVENT27 0x33b
+#define CSR_MHPMEVENT28 0x33c
+#define CSR_MHPMEVENT29 0x33d
+#define CSR_MHPMEVENT30 0x33e
+#define CSR_MHPMEVENT31 0x33f
+#define CSR_MVENDORID 0xf11
+#define CSR_MARCHID 0xf12
+#define CSR_MIMPID 0xf13
+#define CSR_MHARTID 0xf14
+#define CSR_CYCLEH 0xc80
+#define CSR_TIMEH 0xc81
+#define CSR_INSTRETH 0xc82
+#define CSR_HPMCOUNTER3H 0xc83
+#define CSR_HPMCOUNTER4H 0xc84
+#define CSR_HPMCOUNTER5H 0xc85
+#define CSR_HPMCOUNTER6H 0xc86
+#define CSR_HPMCOUNTER7H 0xc87
+#define CSR_HPMCOUNTER8H 0xc88
+#define CSR_HPMCOUNTER9H 0xc89
+#define CSR_HPMCOUNTER10H 0xc8a
+#define CSR_HPMCOUNTER11H 0xc8b
+#define CSR_HPMCOUNTER12H 0xc8c
+#define CSR_HPMCOUNTER13H 0xc8d
+#define CSR_HPMCOUNTER14H 0xc8e
+#define CSR_HPMCOUNTER15H 0xc8f
+#define CSR_HPMCOUNTER16H 0xc90
+#define CSR_HPMCOUNTER17H 0xc91
+#define CSR_HPMCOUNTER18H 0xc92
+#define CSR_HPMCOUNTER19H 0xc93
+#define CSR_HPMCOUNTER20H 0xc94
+#define CSR_HPMCOUNTER21H 0xc95
+#define CSR_HPMCOUNTER22H 0xc96
+#define CSR_HPMCOUNTER23H 0xc97
+#define CSR_HPMCOUNTER24H 0xc98
+#define CSR_HPMCOUNTER25H 0xc99
+#define CSR_HPMCOUNTER26H 0xc9a
+#define CSR_HPMCOUNTER27H 0xc9b
+#define CSR_HPMCOUNTER28H 0xc9c
+#define CSR_HPMCOUNTER29H 0xc9d
+#define CSR_HPMCOUNTER30H 0xc9e
+#define CSR_HPMCOUNTER31H 0xc9f
+#define CSR_MCYCLEH 0xb80
+#define CSR_MINSTRETH 0xb82
+#define CSR_MHPMCOUNTER3H 0xb83
+#define CSR_MHPMCOUNTER4H 0xb84
+#define CSR_MHPMCOUNTER5H 0xb85
+#define CSR_MHPMCOUNTER6H 0xb86
+#define CSR_MHPMCOUNTER7H 0xb87
+#define CSR_MHPMCOUNTER8H 0xb88
+#define CSR_MHPMCOUNTER9H 0xb89
+#define CSR_MHPMCOUNTER10H 0xb8a
+#define CSR_MHPMCOUNTER11H 0xb8b
+#define CSR_MHPMCOUNTER12H 0xb8c
+#define CSR_MHPMCOUNTER13H 0xb8d
+#define CSR_MHPMCOUNTER14H 0xb8e
+#define CSR_MHPMCOUNTER15H 0xb8f
+#define CSR_MHPMCOUNTER16H 0xb90
+#define CSR_MHPMCOUNTER17H 0xb91
+#define CSR_MHPMCOUNTER18H 0xb92
+#define CSR_MHPMCOUNTER19H 0xb93
+#define CSR_MHPMCOUNTER20H 0xb94
+#define CSR_MHPMCOUNTER21H 0xb95
+#define CSR_MHPMCOUNTER22H 0xb96
+#define CSR_MHPMCOUNTER23H 0xb97
+#define CSR_MHPMCOUNTER24H 0xb98
+#define CSR_MHPMCOUNTER25H 0xb99
+#define CSR_MHPMCOUNTER26H 0xb9a
+#define CSR_MHPMCOUNTER27H 0xb9b
+#define CSR_MHPMCOUNTER28H 0xb9c
+#define CSR_MHPMCOUNTER29H 0xb9d
+#define CSR_MHPMCOUNTER30H 0xb9e
+#define CSR_MHPMCOUNTER31H 0xb9f
+
+/* === CLIC CSR Registers === */
+#define CSR_MTVT                0x307
+#define CSR_MNXTI               0x345
+#define CSR_MINTSTATUS          0x346
+#define CSR_MSCRATCHCSW         0x348
+#define CSR_MSCRATCHCSWL        0x349
+#define CSR_MCLICBASE           0x350
+
+/* === Nuclei custom CSR Registers === */
+#define CSR_MCOUNTINHIBIT       0x320
+#define CSR_MILM_CTL            0x7C0
+#define CSR_MDLM_CTL            0x7C1
+#define CSR_MNVEC               0x7C3
+#define CSR_MSUBM               0x7C4
+#define CSR_MDCAUSE             0x7C9
+#define CSR_MCACHE_CTL          0x7CA
+#define CSR_MMISC_CTL           0x7D0
+#define CSR_MSAVESTATUS         0x7D6
+#define CSR_MSAVEEPC1           0x7D7
+#define CSR_MSAVECAUSE1         0x7D8
+#define CSR_MSAVEEPC2           0x7D9
+#define CSR_MSAVECAUSE2         0x7DA
+#define CSR_MSAVEDCAUSE1        0x7DB
+#define CSR_MSAVEDCAUSE2        0x7DC
+#define CSR_PUSHMSUBM           0x7EB
+#define CSR_MTVT2               0x7EC
+#define CSR_JALMNXTI            0x7ED
+#define CSR_PUSHMCAUSE          0x7EE
+#define CSR_PUSHMEPC            0x7EF
+#define CSR_MPPICFG_INFO        0x7F0
+#define CSR_MFIOCFG_INFO        0x7F1
+#define CSR_SLEEPVALUE          0x811
+#define CSR_TXEVT               0x812
+#define CSR_WFE                 0x810
+#define CSR_MICFG_INFO          0xFC0
+#define CSR_MDCFG_INFO          0xFC1
+#define CSR_MCFG_INFO           0xFC2
+
+/** @} */ /** End of Doxygen Group NMSIS_Core_CSR_Registers **/
+
+/* Exception Code in MCAUSE CSR */
+#define CAUSE_MISALIGNED_FETCH 0x0
+#define CAUSE_FAULT_FETCH 0x1
+#define CAUSE_ILLEGAL_INSTRUCTION 0x2
+#define CAUSE_BREAKPOINT 0x3
+#define CAUSE_MISALIGNED_LOAD 0x4
+#define CAUSE_FAULT_LOAD 0x5
+#define CAUSE_MISALIGNED_STORE 0x6
+#define CAUSE_FAULT_STORE 0x7
+#define CAUSE_USER_ECALL 0x8
+#define CAUSE_SUPERVISOR_ECALL 0x9
+#define CAUSE_HYPERVISOR_ECALL 0xa
+#define CAUSE_MACHINE_ECALL 0xb
+
+/* Exception Subcode in MDCAUSE CSR */
+#define DCAUSE_FAULT_FETCH_PMP      0x1
+#define DCAUSE_FAULT_FETCH_INST     0x2
+
+#define DCAUSE_FAULT_LOAD_PMP       0x1
+#define DCAUSE_FAULT_LOAD_INST      0x2
+#define DCAUSE_FAULT_LOAD_NICE      0x3
+
+#define DCAUSE_FAULT_STORE_PMP      0x1
+#define DCAUSE_FAULT_STORE_INST     0x2
+
+/** @} */ /** End of Doxygen Group NMSIS_Core_CSR_Encoding **/
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* __RISCV_ENCODING_H__ */
diff --git a/external/arch/riscv/nuclei/NMSIS/rules.mk b/external/arch/riscv/nuclei/NMSIS/rules.mk
new file mode 100644
index 000000000..17c320bf9
--- /dev/null
+++ b/external/arch/riscv/nuclei/NMSIS/rules.mk
@@ -0,0 +1,3 @@
+LOCAL_DIR := $(GET_LOCAL_DIR)
+
+GLOBAL_INCLUDES += $(LOCAL_DIR)/Core/Include
diff --git a/external/platform/hbird/NMSIS/hbird.h b/external/platform/hbird/NMSIS/hbird.h
new file mode 100644
index 000000000..a2afb00ea
--- /dev/null
+++ b/external/platform/hbird/NMSIS/hbird.h
@@ -0,0 +1,459 @@
+/******************************************************************************
+ * @file     hbird.h
+ * @brief    NMSIS Core Peripheral Access Layer Header File for
+ *           Nuclei HummingBird evaluation SoC which support Nuclei N/NX class cores
+ * @version  V1.00
+ * @date     22. Nov 2019
+ ******************************************************************************/
+/*
+ * Copyright (c) 2019 Nuclei Limited. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __HBIRD_H__
+#define __HBIRD_H__
+
+#include <stddef.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** @addtogroup Nuclei
+  * @{
+  */
+
+
+/** @addtogroup hbird
+  * @{
+  */
+
+
+/** @addtogroup Configuration_of_NMSIS
+  * @{
+  */
+
+
+
+/* =========================================================================================================================== */
+/* ================                                Interrupt Number Definition                                ================ */
+/* =========================================================================================================================== */
+
+typedef enum IRQn
+{
+/* =======================================  Nuclei Core Specific Interrupt Numbers  ======================================== */
+
+    Reserved0_IRQn            =   0,              /*!<  Internal reserved */
+    Reserved1_IRQn            =   1,              /*!<  Internal reserved */
+    Reserved2_IRQn            =   2,              /*!<  Internal reserved */
+    SysTimerSW_IRQn           =   3,              /*!<  System Timer SW interrupt */
+    Reserved3_IRQn            =   4,              /*!<  Internal reserved */
+    Reserved4_IRQn            =   5,              /*!<  Internal reserved */
+    Reserved5_IRQn            =   6,              /*!<  Internal reserved */
+    SysTimer_IRQn             =   7,              /*!<  System Timer Interrupt */
+    Reserved6_IRQn            =   8,              /*!<  Internal reserved */
+    Reserved7_IRQn            =   9,              /*!<  Internal reserved */
+    Reserved8_IRQn            =  10,              /*!<  Internal reserved */
+    Reserved9_IRQn            =  11,              /*!<  Internal reserved */
+    Reserved10_IRQn           =  12,              /*!<  Internal reserved */
+    Reserved11_IRQn           =  13,              /*!<  Internal reserved */
+    Reserved12_IRQn           =  14,              /*!<  Internal reserved */
+    Reserved13_IRQn           =  15,              /*!<  Internal reserved */
+    Reserved14_IRQn           =  16,              /*!<  Internal reserved */
+    Reserved15_IRQn           =  17,              /*!<  Internal reserved */
+    Reserved16_IRQn           =  18,              /*!<  Internal reserved */
+
+/* ===========================================  hbird Specific Interrupt Numbers  ========================================= */
+/* ToDo: add your device-specific external interrupt numbers here. Numbers 19~1023 are available to the user; the maximum
+         number of interrupts supported can be read from clicinfo.NUM_INTERRUPT. Name them after the interrupt handlers
+         defined in startup_Device.s, e.g.: Interrupt for Timer#1       eclic_tim1_handler   ->   TIM1_IRQn */
+    SOC_INT19_IRQn           = 19,                /*!< Device Interrupt */
+    SOC_INT20_IRQn           = 20,                /*!< Device Interrupt */
+    SOC_INT21_IRQn           = 21,                /*!< Device Interrupt */
+    SOC_INT22_IRQn           = 22,                /*!< Device Interrupt */
+    SOC_INT23_IRQn           = 23,                /*!< Device Interrupt */
+    SOC_INT24_IRQn           = 24,                /*!< Device Interrupt */
+    SOC_INT25_IRQn           = 25,                /*!< Device Interrupt */
+    SOC_INT26_IRQn           = 26,                /*!< Device Interrupt */
+    SOC_INT27_IRQn           = 27,                /*!< Device Interrupt */
+    SOC_INT28_IRQn           = 28,                /*!< Device Interrupt */
+    SOC_INT29_IRQn           = 29,                /*!< Device Interrupt */
+    SOC_INT30_IRQn           = 30,                /*!< Device Interrupt */
+    SOC_INT31_IRQn           = 31,                /*!< Device Interrupt */
+    SOC_INT32_IRQn           = 32,                /*!< Device Interrupt */
+    SOC_INT33_IRQn           = 33,                /*!< Device Interrupt */
+    SOC_INT34_IRQn           = 34,                /*!< Device Interrupt */
+    SOC_INT35_IRQn           = 35,                /*!< Device Interrupt */
+    SOC_INT36_IRQn           = 36,                /*!< Device Interrupt */
+    SOC_INT37_IRQn           = 37,                /*!< Device Interrupt */
+    SOC_INT38_IRQn           = 38,                /*!< Device Interrupt */
+    SOC_INT39_IRQn           = 39,                /*!< Device Interrupt */
+    SOC_INT40_IRQn           = 40,                /*!< Device Interrupt */
+    SOC_INT41_IRQn           = 41,                /*!< Device Interrupt */
+    SOC_INT42_IRQn           = 42,                /*!< Device Interrupt */
+    SOC_INT43_IRQn           = 43,                /*!< Device Interrupt */
+    SOC_INT44_IRQn           = 44,                /*!< Device Interrupt */
+    SOC_INT45_IRQn           = 45,                /*!< Device Interrupt */
+    SOC_INT46_IRQn           = 46,                /*!< Device Interrupt */
+    SOC_INT47_IRQn           = 47,                /*!< Device Interrupt */
+    SOC_INT48_IRQn           = 48,                /*!< Device Interrupt */
+    SOC_INT49_IRQn           = 49,                /*!< Device Interrupt */
+    SOC_INT50_IRQn           = 50,                /*!< Device Interrupt */
+    SOC_INT_MAX,                                  /*!< One past the last interrupt number (evaluates to 51) */
+} IRQn_Type;
+
+/* =========================================================================================================================== */
+/* ================                                  Exception Code Definition                                ================ */
+/* =========================================================================================================================== */
+
+typedef enum EXCn {
+/* =======================================  Nuclei N/NX Specific Exception Code  ======================================== */
+    InsUnalign_EXCn          =   0,              /*!<  Instruction address misaligned */
+    InsAccFault_EXCn         =   1,              /*!<  Instruction access fault */
+    IlleIns_EXCn             =   2,              /*!<  Illegal instruction */
+    Break_EXCn               =   3,              /*!<  Breakpoint */
+    LdAddrUnalign_EXCn       =   4,              /*!<  Load address misaligned */
+    LdFault_EXCn             =   5,              /*!<  Load access fault */
+    StAddrUnalign_EXCn       =   6,              /*!<  Store or AMO address misaligned */
+    StAccessFault_EXCn       =   7,              /*!<  Store or AMO access fault */
+    UmodeEcall_EXCn          =   8,              /*!<  Environment call from User mode */
+    MmodeEcall_EXCn          =  11,              /*!<  Environment call from Machine mode */
+    NMI_EXCn                 = 0xfff,            /*!<  NMI interrupt (codes 9 and 10 are not defined in this enum) */
+} EXCn_Type;
+
+/* =========================================================================================================================== */
+/* ================                           Processor and Core Peripheral Section                           ================ */
+/* =========================================================================================================================== */
+
+/* ToDo: set the defines according to your device */
+/* ToDo: define the correct core revision; a build-supplied __NUCLEI_CORE_REV overrides the defaults below */
+#if __riscv_xlen == 32
+
+#ifndef __NUCLEI_CORE_REV
+#define __NUCLEI_N_REV            0x0104    /*!< Core Revision r1p4 */
+#else
+#define __NUCLEI_N_REV            __NUCLEI_CORE_REV
+#endif
+
+#elif __riscv_xlen == 64
+
+#ifndef __NUCLEI_CORE_REV
+#define __NUCLEI_NX_REV           0x0100    /*!< Core Revision r1p0 */
+#else
+#define __NUCLEI_NX_REV           __NUCLEI_CORE_REV
+#endif
+
+#endif /* __riscv_xlen == 32 / 64 */
+
+/* ToDo: define the correct core features for the hbird */
+#define __ECLIC_PRESENT           1                     /*!< Set to 1 if ECLIC is present */
+#define __ECLIC_BASEADDR          0x0C000000UL          /*!< Set to ECLIC baseaddr of your device */
+
+//#define __ECLIC_INTCTLBITS        3                     /*!< Set to 1 - 8, the number of hardware bits actually implemented in the clicintctl registers. */
+#define __ECLIC_INTNUM            51                    /*!< Set to 1 - 1024, total interrupt number of ECLIC Unit (same value as SOC_INT_MAX) */
+#define __SYSTIMER_PRESENT        1                     /*!< Set to 1 if System Timer is present */
+#define __SYSTIMER_BASEADDR       0x02000000UL          /*!< Set to SysTimer baseaddr of your device */
+
+/*!< Set to 0, 1, or 2: 0 = no FPU, 1 = single-precision FPU, 2 = double-precision FPU; derived from the compiler's __riscv_flen */
+#if !defined(__riscv_flen)
+#define __FPU_PRESENT             0
+#elif __riscv_flen == 32
+#define __FPU_PRESENT             1
+#else
+#define __FPU_PRESENT             2
+#endif
+
+#define __DSP_PRESENT             1                     /*!< Set to 1 if DSP is present */
+#define __PMP_PRESENT             1                     /*!< Set to 1 if PMP is present */
+#define __PMP_ENTRY_NUM           16                    /*!< Set to 8 or 16, the number of PMP entries */
+#define __ICACHE_PRESENT          0                     /*!< Set to 1 if I-Cache is present */
+#define __DCACHE_PRESENT          0                     /*!< Set to 1 if D-Cache is present */
+#define __Vendor_SysTickConfig    0                     /*!< Set to 1 if different SysTick Config is used */
+#define __Vendor_EXCEPTION        0                     /*!< Set to 1 if vendor exception handler is present */
+
+/** @} */ /* End of group Configuration_of_NMSIS */
+
+
+#include <nmsis_core.h>                         /*!< Nuclei N/NX class processor and core peripherals */
+/* ToDo: include your system_hbird.h file
+         replace 'Device' with your device name */
+#include "system_hbird.h"                    /*!< hbird System */
+
+
+/* ========================================  Start of section using anonymous unions  ======================================== */
+#if   defined (__GNUC__)
+  /* anonymous unions are enabled by default */
+#else
+  #warning Not supported compiler type
+#endif
+
+#define RTC_FREQ                    32768
+// The SoC timer frequency is defined to be the RTC frequency; hbird_common.c uses it as timer ticks per second
+#define SOC_TIMER_FREQ              RTC_FREQ
+/* =========================================================================================================================== */
+/* ================                            Device Specific Peripheral Section                             ================ */
+/* =========================================================================================================================== */
+
+
+/** @addtogroup Device_Peripheral_peripherals
+  * @{
+  */
+
+/****************************************************************************
+ * Platform definitions
+ *****************************************************************************/
+// IOF mappings: for SPI/UART/I2C each *_MASK has exactly the bits of the pin numbers listed under it
+#define IOF0_SPI1_MASK              _AC(0x000007FC,UL)  /* bits 2..10 */
+#define SPI11_NUM_SS                (4)                 /* NOTE(review): name looks like a typo for SPI1_NUM_SS; left unchanged for existing users */
+#define IOF_SPI1_SS0                (2u)
+#define IOF_SPI1_SS1                (8u)
+#define IOF_SPI1_SS2                (9u)
+#define IOF_SPI1_SS3                (10u)
+#define IOF_SPI1_MOSI               (3u)
+#define IOF_SPI1_MISO               (4u)
+#define IOF_SPI1_SCK                (5u)
+#define IOF_SPI1_DQ0                (3u)                /* same pin as IOF_SPI1_MOSI */
+#define IOF_SPI1_DQ1                (4u)                /* same pin as IOF_SPI1_MISO */
+#define IOF_SPI1_DQ2                (6u)
+#define IOF_SPI1_DQ3                (7u)
+
+#define IOF0_SPI2_MASK              _AC(0xFC000000,UL)  /* bits 26..31 */
+#define SPI2_NUM_SS                 (1)
+#define IOF_SPI2_SS0                (26u)
+#define IOF_SPI2_MOSI               (27u)
+#define IOF_SPI2_MISO               (28u)
+#define IOF_SPI2_SCK                (29u)
+#define IOF_SPI2_DQ0                (27u)               /* same pin as IOF_SPI2_MOSI */
+#define IOF_SPI2_DQ1                (28u)               /* same pin as IOF_SPI2_MISO */
+#define IOF_SPI2_DQ2                (30u)
+#define IOF_SPI2_DQ3                (31u)
+
+#define IOF0_UART0_MASK             _AC(0x00030000, UL) /* bits 16..17 */
+#define IOF_UART0_RX                (16u)
+#define IOF_UART0_TX                (17u)
+
+#define IOF0_UART1_MASK             _AC(0x03000000, UL) /* bits 24..25 */
+#define IOF_UART1_RX                (24u)
+#define IOF_UART1_TX                (25u)
+
+#define IOF0_I2C_MASK               _AC(0x00003000, UL) /* bits 12..13 */
+#define IOF_I2C_SDA                 (12u)
+#define IOF_I2C_SCL                 (13u)
+
+#define IOF1_PWM0_MASK              _AC(0x0000000F, UL) /* bits 0..3 */
+#define IOF1_PWM1_MASK              _AC(0x00780000, UL) /* bits 19..22 */
+#define IOF1_PWM2_MASK              _AC(0x00003C00, UL) /* bits 10..13 */
+
+// Interrupt numbers
+#define SOC_ECLIC_NUM_INTERRUPTS    32  /* NOTE(review): differs from __ECLIC_INTNUM (51) in this header; confirm which one is intended */
+#define SOC_ECLIC_INT_GPIO_BASE     19  /* same value as SOC_INT19_IRQn, the first device interrupt */
+
+// Interrupt handler names used for the machine timer and software interrupts
+#define SOC_MTIMER_HANDLER          eclic_mtip_handler
+#define SOC_SOFTINT_HANDLER         eclic_msip_handler
+
+#define GPIO_BIT_ALL_ZERO           (0x0)
+#define GPIO_BIT_ALL_ONE            (0xFFFFFFFF)
+
+/**
+  * @brief GPIO register block; member order fixes each register's byte offset (see GPIO_* offsets in hbird_gpio.h)
+  */
+typedef struct {  /*!< GPIO Structure */
+    __IOM uint32_t INPUT_VAL;   /*!< offset 0x00 */
+    __IOM uint32_t INPUT_EN;    /*!< offset 0x04 */
+    __IOM uint32_t OUTPUT_EN;   /*!< offset 0x08 */
+    __IOM uint32_t OUTPUT_VAL;  /*!< offset 0x0C */
+    __IOM uint32_t PULLUP_EN;   /*!< offset 0x10 */
+    __IOM uint32_t DRIVE;       /*!< offset 0x14 */
+    __IOM uint32_t RISE_IE;     /*!< offset 0x18 */
+    __IOM uint32_t RISE_IP;     /*!< offset 0x1C */
+    __IOM uint32_t FALL_IE;     /*!< offset 0x20 */
+    __IOM uint32_t FALL_IP;     /*!< offset 0x24 */
+    __IOM uint32_t HIGH_IE;     /*!< offset 0x28 */
+    __IOM uint32_t HIGH_IP;     /*!< offset 0x2C */
+    __IOM uint32_t LOW_IE;      /*!< offset 0x30 */
+    __IOM uint32_t LOW_IP;      /*!< offset 0x34 */
+    __IOM uint32_t IOF_EN;      /*!< offset 0x38 */
+    __IOM uint32_t IOF_SEL;     /*!< offset 0x3C */
+    __IOM uint32_t OUTPUT_XOR;  /*!< offset 0x40 */
+} GPIO_TypeDef;
+
+/**
+  * @brief UART register block; offsets match the UART_REG_* constants in hbird_uart.h
+  */
+typedef struct {
+    __IOM uint32_t TXFIFO;  /*!< offset 0x00 */
+    __IOM uint32_t RXFIFO;  /*!< offset 0x04 */
+    __IOM uint32_t TXCTRL;  /*!< offset 0x08 */
+    __IOM uint32_t RXCTRL;  /*!< offset 0x0C */
+    __IOM uint32_t IE;      /*!< offset 0x10 */
+    __IOM uint32_t IP;      /*!< offset 0x14 */
+    __IOM uint32_t DIV;     /*!< offset 0x18 */
+} UART_TypeDef;
+
+/**
+  * @brief PWM register block; RESERVEDn members only pad the layout to the offsets noted below
+  */
+typedef struct {
+    __IOM uint32_t CFG;             /*!< offset 0x00 */
+          uint32_t RESERVED0;       /*!< offset 0x04, padding */
+    __IOM uint32_t COUNT;           /*!< offset 0x08 */
+          uint32_t RESERVED1;       /*!< offset 0x0C, padding */
+    __IOM uint32_t S;               /*!< offset 0x10 */
+          uint32_t RESERVED2[3];    /*!< offsets 0x14-0x1C, padding */
+    __IOM uint32_t CMP0;            /*!< offset 0x20 */
+    __IOM uint32_t CMP1;            /*!< offset 0x24 */
+    __IOM uint32_t CMP2;            /*!< offset 0x28 */
+    __IOM uint32_t CMP3;            /*!< offset 0x2C */
+} PWM_TypeDef;
+
+/**
+  * @brief QSPI register block; RESERVEDn members only pad the layout to the offsets noted below
+  */
+typedef struct {
+    __IOM uint32_t SCKDIV;          /*!< offset 0x00 */
+    __IOM uint32_t SCKMODE;         /*!< offset 0x04 */
+    __IOM uint32_t RESERVED0[2];    /*!< offsets 0x08-0x0C, padding */
+    __IOM uint32_t CSID;            /*!< offset 0x10 */
+    __IOM uint32_t CSDEF;           /*!< offset 0x14 */
+    __IOM uint32_t CSMODE;          /*!< offset 0x18 */
+    __IOM uint32_t RESERVED1[3];    /*!< offsets 0x1C-0x24, padding */
+    __IOM uint32_t DELAY0;          /*!< offset 0x28 */
+    __IOM uint32_t DELAY1;          /*!< offset 0x2C */
+    __IOM uint32_t RESERVED2[4];    /*!< offsets 0x30-0x3C, padding */
+    __IOM uint32_t FMT;             /*!< offset 0x40 */
+    __IOM uint32_t RESERVED3;       /*!< offset 0x44, padding */
+    __IOM uint32_t TXDATA;          /*!< offset 0x48 */
+    __IOM uint32_t RXDATA;          /*!< offset 0x4C */
+    __IOM uint32_t TXMARK;          /*!< offset 0x50 */
+    __IOM uint32_t RXMARK;          /*!< offset 0x54 */
+    __IOM uint32_t RESERVED4[2];    /*!< offsets 0x58-0x5C, padding */
+    __IOM uint32_t FCTRL;           /*!< offset 0x60 */
+    __IOM uint32_t FFMT;            /*!< offset 0x64 */
+    __IOM uint32_t RESERVED5[2];    /*!< offsets 0x68-0x6C, padding */
+    __IOM uint32_t IE;              /*!< offset 0x70 */
+    __IOM uint32_t IP;              /*!< offset 0x74 */
+} QSPI_TypeDef;
+
+/**
+  * @brief I2C register block; five consecutive byte-wide registers at offsets 0..4
+  */
+typedef struct {
+    __IOM uint8_t PRERlo; /* offset 0 */
+    __IOM uint8_t PRERhi; /* offset 1 */
+    __IOM uint8_t CTR;    /* offset 2 */
+    __IOM uint8_t TXRXR; /* offset 3; TXR and RXR in same address */
+    __IOM uint8_t CSR; /* offset 4; CR and SR in same address */
+} I2C_TypeDef;
+
+/*@}*/ /* End of group Device_Peripheral_peripherals */
+
+
+/* =========================================  End of section using anonymous unions  ========================================= */
+#if defined (__GNUC__)
+  /* anonymous unions are enabled by default */
+#else
+  #warning Not supported compiler type
+#endif
+
+
+/* =========================================================================================================================== */
+/* ================                          Device Specific Peripheral Address Map                           ================ */
+/* =========================================================================================================================== */
+
+
+/* ToDo: add here your device peripherals base addresses
+         following is an example for timer */
+/** @addtogroup Device_Peripheral_peripheralAddr
+  * @{
+  */
+/* Memory region and peripheral window base addresses */
+#define QSPI_FLASH_BASE         (0x20000000UL)      /*!< (FLASH     ) Base Address */
+#define ONCHIP_ROM_BASE         (0x00001000UL)      /*!< (ROM       ) Base Address */
+#define ONCHIP_ILM_BASE         (0x80000000UL)      /*!< (ILM       ) Base Address */
+#define ONCHIP_DLM_BASE         (0x90000000UL)      /*!< (DLM       ) Base Address */
+#define HBIRD_PERIPH_BASE       (0x10000000UL)      /*!< (Peripheral) Base Address */
+
+/* Peripheral memory map: every base below is an offset from HBIRD_PERIPH_BASE */
+/* Fast-IO Interfaced IP */
+#define GPIO_BASE               (HBIRD_PERIPH_BASE + 0x12000)          /*!< (GPIO) Base Address */
+/* PPI Interfaced IP */
+#define UART0_BASE              (HBIRD_PERIPH_BASE + 0x13000)          /*!< (UART0) Base Address */
+#define QSPI0_BASE              (HBIRD_PERIPH_BASE + 0x14000)          /*!< (QSPI0) Base Address */
+#define PWM0_BASE               (HBIRD_PERIPH_BASE + 0x15000)          /*!< (PWM0) Base Address */
+#define UART1_BASE              (HBIRD_PERIPH_BASE + 0x23000)          /*!< (UART1) Base Address */
+#define QSPI1_BASE              (HBIRD_PERIPH_BASE + 0x24000)          /*!< (QSPI1) Base Address */
+#define PWM1_BASE               (HBIRD_PERIPH_BASE + 0x25000)          /*!< (PWM1) Base Address */
+#define QSPI2_BASE              (HBIRD_PERIPH_BASE + 0x34000)          /*!< (QSPI2) Base Address */
+#define PWM2_BASE               (HBIRD_PERIPH_BASE + 0x35000)          /*!< (PWM2) Base Address */
+#define I2C_BASE                (HBIRD_PERIPH_BASE + 0x42000)          /*!< (I2C Master) Base Address */
+
+/** @} */ /* End of group Device_Peripheral_peripheralAddr */
+
+
+/* =========================================================================================================================== */
+/* ================                                  Peripheral declaration                                   ================ */
+/* =========================================================================================================================== */
+
+
+/* ToDo: add here your device peripherals pointer definitions
+         following is an example for timer */
+/** @addtogroup Device_Peripheral_declaration
+  * @{
+  */
+#define GPIO                    ((GPIO_TypeDef *) GPIO_BASE)    /*!< GPIO registers viewed through GPIO_TypeDef */
+#define UART0                   ((UART_TypeDef *) UART0_BASE)   /*!< UART0 and UART1 share the UART_TypeDef layout */
+#define QSPI0                   ((QSPI_TypeDef *) QSPI0_BASE)   /*!< QSPI0..QSPI2 share the QSPI_TypeDef layout */
+#define PWM0                    ((PWM_TypeDef *) PWM0_BASE)     /*!< PWM0..PWM2 share the PWM_TypeDef layout */
+#define UART1                   ((UART_TypeDef *) UART1_BASE)
+#define QSPI1                   ((QSPI_TypeDef *) QSPI1_BASE)
+#define PWM1                    ((PWM_TypeDef *) PWM1_BASE)
+#define QSPI2                   ((QSPI_TypeDef *) QSPI2_BASE)
+#define PWM2                    ((PWM_TypeDef *) PWM2_BASE)
+#define I2C                     ((I2C_TypeDef *) I2C_BASE)      /*!< I2C master registers viewed through I2C_TypeDef */
+
+// Helper macros: _REG8/_REG32 yield a volatile lvalue at (p) + (i); _REG32P yields the pointer instead of dereferencing it
+#define _REG8(p, i)             (*(volatile uint8_t *) ((p) + (i)))
+#define _REG32(p, i)            (*(volatile uint32_t *) ((p) + (i)))
+#define _REG32P(p, i)           ((volatile uint32_t *) ((p) + (i)))
+
+#define GPIO_REG(offset)        _REG32(GPIO_BASE, offset)   /* *_BASE is an integer, so offset is in bytes */
+#define PWM0_REG(offset)        _REG32(PWM0_BASE, offset)
+#define PWM1_REG(offset)        _REG32(PWM1_BASE, offset)
+#define PWM2_REG(offset)        _REG32(PWM2_BASE, offset)
+#define SPI0_REG(offset)        _REG32(QSPI0_BASE, offset)  /* SPIn_REG maps onto the QSPIn base */
+#define SPI1_REG(offset)        _REG32(QSPI1_BASE, offset)
+#define SPI2_REG(offset)        _REG32(QSPI2_BASE, offset)
+#define UART0_REG(offset)       _REG32(UART0_BASE, offset)
+#define UART1_REG(offset)       _REG32(UART1_BASE, offset)
+#define I2C_REG(offset)         _REG8(I2C_BASE, offset)     /* byte-wide access, matching the uint8_t members of I2C_TypeDef */
+
+// Misc
+
+#define NUM_GPIO 32  /* same as the bit width of each uint32_t GPIO register */
+
+extern uint32_t get_cpu_freq(void);       /* defined in src/hbird_common.c; returns the result of a 100-tick measure_cpu_freq() run */
+extern void delay_1ms(uint32_t count);    /* defined in src/hbird_common.c; count is in milliseconds */
+
+/** @} */ /* End of group hbird */
+
+/** @} */ /* End of group Nuclei */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  /* __HBIRD_H__ */
diff --git a/external/platform/hbird/NMSIS/rules.mk b/external/platform/hbird/NMSIS/rules.mk
new file mode 100644
index 000000000..b28200a14
--- /dev/null
+++ b/external/platform/hbird/NMSIS/rules.mk
@@ -0,0 +1,4 @@
+LOCAL_DIR := $(GET_LOCAL_DIR)
+# Add this directory (hbird.h, system_hbird.h) to the global include path.
+GLOBAL_INCLUDES += $(LOCAL_DIR)
+
diff --git a/external/platform/hbird/NMSIS/system_hbird.h b/external/platform/hbird/NMSIS/system_hbird.h
new file mode 100644
index 000000000..0c81afbab
--- /dev/null
+++ b/external/platform/hbird/NMSIS/system_hbird.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2009-2018 Arm Limited. All rights reserved.
+ * Copyright (c) 2019 Nuclei Limited. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*******************************************************************************
+ * @file     system_hbird.h
+ * @brief    NMSIS Nuclei N/NX Device Peripheral Access Layer Header File for
+ *           the Nuclei HummingBird (hbird) evaluation SoC
+ * @version  V1.00
+ * @date     17. Dec 2019
+ ******************************************************************************/
+
+#ifndef __SYSTEM_HBIRD_H__   /* ToDo: replace '<Device>' with your device name */
+#define __SYSTEM_HBIRD_H__
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdint.h>
+
+extern uint32_t SystemCoreClock;     /*!< System Clock Frequency (Core Clock); declared here, defined elsewhere */
+
+/**
+ * \brief Set up the microcontroller system.
+ * \details
+ * Initializes the system and updates the SystemCoreClock variable.
+ */
+extern void SystemInit (void);
+
+/**
+ * \brief  Update the SystemCoreClock variable.
+ * \details
+ * Refreshes SystemCoreClock with the current core clock retrieved from CPU registers.
+ */
+extern void SystemCoreClockUpdate (void);
+
+/**
+ * \brief Register exc_handler (an unsigned long, presumably the handler's address) for exception code EXCn
+ */
+extern void Exception_Register_EXC(uint32_t EXCn, unsigned long exc_handler);
+
+/**
+ * \brief Get the handler currently registered for exception code EXCn, returned as an unsigned long
+ */
+extern unsigned long Exception_Get_EXC(uint32_t EXCn);
+
+/**
+ * \brief Initialize the ECLIC configuration
+ */
+extern void ECLIC_Init(void);
+
+/**
+ * \brief  Initialize a specific IRQ and register its handler
+ * \details
+ * Sets vector mode, trigger mode and polarity, interrupt level and priority for IRQn and assigns the handler to it.
+ * NOTE(review): IRQn_Type and ECLIC_TRIGGER_Type are not declared by this header; it relies on the includer.
+ */
+extern int32_t ECLIC_Register_IRQ(IRQn_Type IRQn, uint8_t shv, ECLIC_TRIGGER_Type trig_mode, uint8_t lvl, uint8_t priority, void *handler);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __SYSTEM_HBIRD_H__ */
diff --git a/external/platform/hbird/inc/hbird_gpio.h b/external/platform/hbird/inc/hbird_gpio.h
new file mode 100644
index 000000000..f6c880962
--- /dev/null
+++ b/external/platform/hbird/inc/hbird_gpio.h
@@ -0,0 +1,56 @@
+// See LICENSE for license details.
+#ifndef _HBIRD_GPIO_H
+#define _HBIRD_GPIO_H
+
+#ifdef __cplusplus
+ extern "C" {
+#endif
+
+#define GPIO_INPUT_VAL      (0x00)  /* GPIO register byte offsets, in the same order and spacing as the GPIO_TypeDef members */
+#define GPIO_INPUT_EN       (0x04)
+#define GPIO_OUTPUT_EN      (0x08)
+#define GPIO_OUTPUT_VAL     (0x0C)
+#define GPIO_PULLUP_EN      (0x10)
+#define GPIO_DRIVE          (0x14)
+#define GPIO_RISE_IE        (0x18)
+#define GPIO_RISE_IP        (0x1C)
+#define GPIO_FALL_IE        (0x20)
+#define GPIO_FALL_IP        (0x24)
+#define GPIO_HIGH_IE        (0x28)
+#define GPIO_HIGH_IP        (0x2C)
+#define GPIO_LOW_IE         (0x30)
+#define GPIO_LOW_IP         (0x34)
+#define GPIO_IOF_EN         (0x38)
+#define GPIO_IOF_SEL        (0x3C)
+#define GPIO_OUTPUT_XOR     (0x40)  /* last register of the block */
+
+typedef enum iof_func {
+    IOF_SEL_GPIO = 0,   /* gpio_iof_config() clears the masked IOF_EN bits */
+    IOF_SEL_0 = 1,      /* gpio_iof_config() clears the masked IOF_SEL bits, then sets IOF_EN */
+    IOF_SEL_1 = 2       /* gpio_iof_config() sets the masked IOF_SEL bits, then sets IOF_EN */
+} IOF_FUNC;
+
+typedef enum gpio_int_type {
+    GPIO_INT_RISE = 0,  /* presumably pairs with RISE_IE/RISE_IP -- TODO confirm in hbird_gpio.c */
+    GPIO_INT_FALL = 1,  /* presumably pairs with FALL_IE/FALL_IP */
+    GPIO_INT_HIGH = 2,  /* presumably pairs with HIGH_IE/HIGH_IP */
+    GPIO_INT_LOW = 3    /* presumably pairs with LOW_IE/LOW_IP */
+} GPIO_INT_TYPE;
+
+int32_t gpio_iof_config(GPIO_TypeDef *gpio, uint32_t mask, IOF_FUNC func);   /* -1 if gpio is NULL, otherwise 0 */
+int32_t gpio_enable_output(GPIO_TypeDef *gpio, uint32_t mask);               /* sets OUTPUT_EN bits, clears INPUT_EN bits */
+int32_t gpio_enable_input(GPIO_TypeDef *gpio, uint32_t mask);                /* sets INPUT_EN bits, clears OUTPUT_EN bits */
+int32_t gpio_write(GPIO_TypeDef *gpio, uint32_t mask, uint32_t value);       /* value != 0 sets the masked OUTPUT_VAL bits, 0 clears them */
+int32_t gpio_toggle(GPIO_TypeDef *gpio, uint32_t mask);                      /* XORs mask into OUTPUT_VAL */
+int32_t gpio_read(GPIO_TypeDef *gpio, uint32_t mask);                        /* INPUT_VAL & mask, or -1 if gpio is NULL */
+int32_t gpio_set_pue(GPIO_TypeDef *gpio, uint32_t mask, uint32_t value);     /* PULLUP_EN = (PULLUP_EN & ~mask) | value */
+int32_t gpio_set_ds(GPIO_TypeDef *gpio, uint32_t mask, uint32_t value);
+int32_t gpio_set_outxor(GPIO_TypeDef *gpio, uint32_t mask, uint32_t value);
+int32_t gpio_enable_interrupt(GPIO_TypeDef *gpio, uint32_t mask, GPIO_INT_TYPE type);
+int32_t gpio_disable_interrupt(GPIO_TypeDef *gpio, uint32_t mask, GPIO_INT_TYPE type);
+int32_t gpio_clear_interrupt(GPIO_TypeDef *gpio, uint32_t mask, GPIO_INT_TYPE type);
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* _HBIRD_GPIO_H */
diff --git a/external/platform/hbird/inc/hbird_uart.h b/external/platform/hbird/inc/hbird_uart.h
new file mode 100644
index 000000000..4af314cb6
--- /dev/null
+++ b/external/platform/hbird/inc/hbird_uart.h
@@ -0,0 +1,75 @@
+// See LICENSE for license details.
+
+#ifndef _HBIRD_UART_H
+#define _HBIRD_UART_H
+
+#ifdef __cplusplus
+ extern "C" {
+#endif
+
+/* Register offsets in bytes; same order and spacing as the UART_TypeDef members in hbird.h */
+#define UART_REG_TXFIFO         0x00
+#define UART_REG_RXFIFO         0x04
+#define UART_REG_TXCTRL         0x08
+#define UART_REG_RXCTRL         0x0c
+#define UART_REG_IE             0x10
+#define UART_REG_IP             0x14
+#define UART_REG_DIV            0x18
+
+/* TXCTRL register: enable bit and watermark field (unsigned mask keeps the shift out of the int sign bit) */
+#define UART_TXEN               0x1
+#define UART_TXWM(x)            (((x) & 0xffffU) << 16)
+
+/* RXCTRL register: enable bit and watermark field (unsigned mask keeps the shift out of the int sign bit) */
+#define UART_RXEN               0x1
+#define UART_RXWM(x)            (((x) & 0xffffU) << 16)
+
+/* IP register */
+#define UART_IP_TXWM            0x1
+#define UART_IP_RXWM            0x2
+/* Bit 31 flags; unsigned constants because 1<<31 does not fit in a 32-bit int (undefined behaviour) */
+#define UART_TXFIFO_FULL        (1U<<31)
+#define UART_RXFIFO_EMPTY       (1U<<31)
+
+#define UART_TXCTRL_TXCNT_OFS   (16)
+#define UART_TXCTRL_TXCNT_MASK  (0x7 << UART_TXCTRL_TXCNT_OFS)  /* 3-bit field at bits 16..18 */
+#define UART_TXCTRL_TXEN_OFS    (0)
+#define UART_TXCTRL_TXEN_MASK   (0x1 << UART_TXCTRL_TXEN_OFS)   /* bit 0 */
+#define UART_TXCTRL_NSTOP_OFS   (1)
+#define UART_TXCTRL_NSTOP_MASK  (0x1 << UART_TXCTRL_NSTOP_OFS)  /* bit 1; was built from TXEN_OFS and so aliased the TXEN bit */
+
+#define UART_RXCTRL_RXCNT_OFS   (16)
+#define UART_RXCTRL_RXCNT_MASK  (0x7 << UART_RXCTRL_RXCNT_OFS)  /* 3-bit field at bits 16..18 */
+#define UART_RXCTRL_RXEN_OFS    (0)
+#define UART_RXCTRL_RXEN_MASK   (0x1 << UART_RXCTRL_RXEN_OFS)   /* bit 0 */
+
+#define UART_IE_TXIE_OFS        (0)
+#define UART_IE_TXIE_MASK       (0x1 << UART_IE_TXIE_OFS)       /* bit 0 of IE */
+#define UART_IE_RXIE_OFS        (1)
+#define UART_IE_RXIE_MASK       (0x1 << UART_IE_RXIE_OFS)       /* bit 1 of IE */
+
+#define UART_IP_TXIP_OFS        (0)
+#define UART_IP_TXIP_MASK       (0x1 << UART_IP_TXIP_OFS)       /* bit 0 of IP, same value as UART_IP_TXWM */
+#define UART_IP_RXIP_OFS        (1)
+#define UART_IP_RXIP_MASK       (0x1 << UART_IP_RXIP_OFS)       /* bit 1 of IP, same value as UART_IP_RXWM */
+
+typedef enum uart_stop_bit {
+    UART_STOP_BIT_1 = 0,    /* one stop bit; presumably the value written to the TXCTRL NSTOP field -- TODO confirm */
+    UART_STOP_BIT_2 = 1     /* two stop bits */
+} UART_STOP_BIT;
+/* UART driver entry points. NOTE(review): bodies are presumably in src/hbird_uart.c (listed in rules.mk, not shown here) */
+int32_t uart_init(UART_TypeDef *uart, uint32_t baudrate);
+int32_t uart_config_stopbit(UART_TypeDef *uart, UART_STOP_BIT stopbit);
+int32_t uart_write(UART_TypeDef *uart, uint8_t val);
+uint8_t uart_read(UART_TypeDef *uart);
+int32_t uart_set_tx_watermark(UART_TypeDef *uart, uint32_t watermark);
+int32_t uart_enable_txint(UART_TypeDef *uart);
+int32_t uart_disable_txint(UART_TypeDef *uart);
+int32_t uart_set_rx_watermark(UART_TypeDef *uart, uint32_t watermark);
+int32_t uart_enable_rxint(UART_TypeDef *uart);
+int32_t uart_disable_rxint(UART_TypeDef *uart);
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* _HBIRD_UART_H */
diff --git a/external/platform/hbird/inc/nuclei_sdk_soc.h b/external/platform/hbird/inc/nuclei_sdk_soc.h
new file mode 100644
index 000000000..eb46f0243
--- /dev/null
+++ b/external/platform/hbird/inc/nuclei_sdk_soc.h
@@ -0,0 +1,17 @@
+// See LICENSE for license details.
+#ifndef _NUCLEI_SDK_SOC_H
+#define _NUCLEI_SDK_SOC_H
+
+#ifdef __cplusplus
+ extern "C" {
+#endif
+
+#include "hbird.h"
+#include "hbird_uart.h"
+#include "hbird_gpio.h"
+
+
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/external/platform/hbird/rules.mk b/external/platform/hbird/rules.mk
new file mode 100644
index 000000000..455925f80
--- /dev/null
+++ b/external/platform/hbird/rules.mk
@@ -0,0 +1,15 @@
+LOCAL_DIR := $(GET_LOCAL_DIR)
+
+MODULE := $(LOCAL_DIR)
+# Driver headers under inc/ go on the global include path.
+GLOBAL_INCLUDES += $(LOCAL_DIR)/inc
+
+MODULE_SRCS += \
+	$(LOCAL_DIR)/src/hbird_common.c \
+	$(LOCAL_DIR)/src/hbird_gpio.c \
+	$(LOCAL_DIR)/src/hbird_uart.c \
+	$(LOCAL_DIR)/src/system_hbird.c
+# The NMSIS fragment adds the device header directory; note that it reassigns LOCAL_DIR.
+include $(LOCAL_DIR)/NMSIS/rules.mk
+
+include make/module.mk
diff --git a/external/platform/hbird/src/hbird_common.c b/external/platform/hbird/src/hbird_common.c
new file mode 100644
index 000000000..20c764c52
--- /dev/null
+++ b/external/platform/hbird/src/hbird_common.c
@@ -0,0 +1,69 @@
+#include "nuclei_sdk_soc.h"
+
+static uint32_t get_timer_freq(void)
+{
+    return SOC_TIMER_FREQ; /* timer tick rate; hbird.h defines it as RTC_FREQ */
+}
+
+uint32_t measure_cpu_freq(uint32_t n)
+{
+    uint32_t start_mcycle, delta_mcycle;
+    uint32_t start_mtime, delta_mtime;
+    uint32_t mtime_freq = get_timer_freq();
+    // Returns mcycle counts per second, measured over at least n (and at least 1) system-timer ticks
+    // Don't start measuring until we see an mtime tick
+    uint32_t tmp = (uint32_t)SysTimer_GetLoadValue();
+    do {
+        start_mtime = (uint32_t)SysTimer_GetLoadValue();
+        start_mcycle = __RV_CSR_READ(CSR_MCYCLE);
+    } while (start_mtime == tmp);
+    // Keep sampling until n ticks have passed; never stop on a zero delta, it is the divisor below
+    do {
+        delta_mtime = (uint32_t)SysTimer_GetLoadValue() - start_mtime;
+        delta_mcycle = __RV_CSR_READ(CSR_MCYCLE) - start_mcycle;
+    } while (delta_mtime < n || delta_mtime == 0);
+    // cycles / ticks * tick rate; the second term adds back what the integer division dropped
+    return (delta_mcycle / delta_mtime) * mtime_freq
+           + ((delta_mcycle % delta_mtime) * mtime_freq) / delta_mtime;
+}
+
+uint32_t get_cpu_freq(void)
+{
+    /*
+     * The first, one-tick measurement is a throw-away warm-up run;
+     * its result is intentionally ignored.
+     */
+    (void)measure_cpu_freq(1);
+
+    /* The reported frequency comes from a 100-tick measurement. */
+    return measure_cpu_freq(100);
+}
+
+/**
+ * \brief      Busy-wait for a number of milliseconds
+ * \details
+ *             Spins on SysTimer_GetLoadValue() until enough timer ticks have elapsed
+ * \param[in]  count: delay length in milliseconds
+ * \remarks    The tick budget is SOC_TIMER_FREQ * count / 1000, rounded down
+ */
+void delay_1ms(uint32_t count)
+{
+    const uint64_t ticks_needed = (SOC_TIMER_FREQ * (uint64_t)count) / 1000;
+    const uint64_t t0 = SysTimer_GetLoadValue();
+
+    for (;;) {
+        if ((uint64_t)(SysTimer_GetLoadValue() - t0) >= ticks_needed) {
+            break;
+        }
+    }
+}
+
+#ifdef SIMULATION_XLSPIKE
+// Only built for xlspike; original note says "never return" -- presumably the simulator stops on the writes below (TODO confirm)
+void xlspike_exit(int status)
+{
+    // pass the exit status via the RXFIFO register, then send byte value 4 through uart_write()
+    UART0->RXFIFO = status;
+    uart_write(UART0, 4);
+}
+#endif
diff --git a/external/platform/hbird/src/hbird_gpio.c b/external/platform/hbird/src/hbird_gpio.c
new file mode 100644
index 000000000..1512f4463
--- /dev/null
+++ b/external/platform/hbird/src/hbird_gpio.c
@@ -0,0 +1,180 @@
+#include "hbird.h"
+#include "hbird_gpio.h"
+
+int32_t gpio_iof_config(GPIO_TypeDef *gpio, uint32_t mask, IOF_FUNC func)
+{
+    /* Select, for every pin in mask, between no IO function and IO function 0 or 1. */
+    if (__RARELY(gpio == NULL)) {
+        return -1;
+    }
+
+    if (func == IOF_SEL_GPIO) {
+        /* IO function switched off for these pins */
+        gpio->IOF_EN &= ~mask;
+    } else if (func == IOF_SEL_0) {
+        /* select function 0 first, then enable the IO function */
+        gpio->IOF_SEL &= ~mask;
+        gpio->IOF_EN |= mask;
+    } else if (func == IOF_SEL_1) {
+        /* select function 1 first, then enable the IO function */
+        gpio->IOF_SEL |= mask;
+        gpio->IOF_EN |= mask;
+    }
+    /* any other selector changes nothing and still reports success */
+    return 0;
+}
+
+int32_t gpio_enable_output(GPIO_TypeDef *gpio, uint32_t mask)
+{
+    int32_t rc = -1; /* reported when no GPIO block was given */
+    if (!__RARELY(gpio == NULL)) {
+        gpio->OUTPUT_EN |= mask;  /* output enable set for the selected pins ... */
+        gpio->INPUT_EN &= ~mask;  /* ... and their input enable cleared */
+        rc = 0;
+    }
+    return rc;
+}
+
+int32_t gpio_enable_input(GPIO_TypeDef *gpio, uint32_t mask)
+{
+    /* Make the pins in mask inputs: input enable is set first, output enable cleared second. */
+    if (__RARELY(gpio == NULL)) { return -1; }
+
+    gpio->INPUT_EN = gpio->INPUT_EN | mask;
+    gpio->OUTPUT_EN = gpio->OUTPUT_EN & ~mask;
+    return 0;
+}
+
+int32_t gpio_write(GPIO_TypeDef *gpio, uint32_t mask, uint32_t value)
+{
+    /* Drive every pin in mask to one level: low when value is 0, high otherwise. */
+    if (__RARELY(gpio == NULL)) {
+        return -1;
+    }
+    if (value == 0) {
+        gpio->OUTPUT_VAL &= ~mask;
+    } else {
+        gpio->OUTPUT_VAL |= mask;
+    }
+    return 0;
+}
+
+int32_t gpio_toggle(GPIO_TypeDef *gpio, uint32_t mask)
+{
+    /* Invert the output value of every pin selected by mask. */
+    if (__RARELY(gpio == NULL)) { return -1; }
+    const uint32_t current = gpio->OUTPUT_VAL;
+    gpio->OUTPUT_VAL = current ^ mask;
+    return 0;
+}
+
+
+int32_t gpio_read(GPIO_TypeDef *gpio, uint32_t mask)
+{
+    /* Sample the input register and keep only the pins in mask. */
+    /* -1 reports a NULL gpio; a result with bit 31 set also comes back negative, so callers cannot fully tell them apart. */
+    if (__RARELY(gpio == NULL)) { return -1; }
+    return gpio->INPUT_VAL & mask;
+}
+
+int32_t gpio_set_pue(GPIO_TypeDef *gpio, uint32_t mask, uint32_t value)
+{
+    if (__RARELY(gpio == NULL)) {
+        return -1;
+    }
+    const uint32_t untouched = gpio->PULLUP_EN & ~mask; /* bits outside mask keep their state */
+    gpio->PULLUP_EN = untouched | value; /* value is OR-ed in as given, it is not clipped to mask */
+    return 0;
+}
+
+int32_t gpio_set_ds(GPIO_TypeDef *gpio, uint32_t mask, uint32_t value)
+{
+    /* Replace the DRIVE bits selected by mask; value is OR-ed in unclipped. */
+    if (__RARELY(gpio == NULL)) { return -1; }
+
+    const uint32_t preserved = gpio->DRIVE & ~mask;
+    gpio->DRIVE = preserved | value;
+    return 0;
+}
+
+int32_t gpio_set_outxor(GPIO_TypeDef *gpio, uint32_t mask, uint32_t value)
+{
+    int32_t rc = -1; /* reported when no GPIO block was given */
+    if (!__RARELY(gpio == NULL)) {
+        const uint32_t rest = gpio->OUTPUT_XOR & ~mask; /* bits outside mask are kept */
+        gpio->OUTPUT_XOR = rest | value;                /* value is OR-ed in unclipped */
+        rc = 0;
+    }
+    return rc;
+}
+
+int32_t gpio_enable_interrupt(GPIO_TypeDef *gpio, uint32_t mask, GPIO_INT_TYPE type)
+{
+    /*
+     * Set the interrupt-enable bits of one interrupt type (rise, fall, high
+     * or low) for every pin selected by mask. Unknown types change nothing.
+     */
+    if (__RARELY(gpio == NULL)) {
+        return -1;
+    }
+
+    if (type == GPIO_INT_RISE) {
+        gpio->RISE_IE |= mask;
+    } else if (type == GPIO_INT_FALL) {
+        gpio->FALL_IE |= mask;
+    } else if (type == GPIO_INT_HIGH) {
+        gpio->HIGH_IE |= mask;
+    } else if (type == GPIO_INT_LOW) {
+        gpio->LOW_IE |= mask;
+    }
+
+    /* note: 0 is returned for unknown types as well */
+    return 0;
+}
+
+int32_t gpio_disable_interrupt(GPIO_TypeDef *gpio, uint32_t mask, GPIO_INT_TYPE type)
+{
+    /*
+     * Clear the interrupt-enable bits of one interrupt type for every pin
+     * selected by mask.
+     */
+    if (__RARELY(gpio == NULL)) {
+        return -1;
+    }
+
+    const uint32_t keep = ~mask; /* enable bits that must survive */
+
+    switch (type) {
+        case GPIO_INT_RISE: gpio->RISE_IE &= keep; break;
+        case GPIO_INT_FALL: gpio->FALL_IE &= keep; break;
+        case GPIO_INT_HIGH: gpio->HIGH_IE &= keep; break;
+        case GPIO_INT_LOW:  gpio->LOW_IE &= keep;  break;
+        default:            /* unknown type: nothing to do */ break;
+    }
+
+    /* 0 is returned for unknown types as well */
+    return 0;
+}
+
+int32_t gpio_clear_interrupt(GPIO_TypeDef *gpio, uint32_t mask, GPIO_INT_TYPE type)
+{
+    /* Acknowledge pending interrupts of one type for the pins in mask; returns -1 on a NULL gpio, else 0. */
+    /* The *_IP registers are write-1-to-clear, so only mask is written: `|=` would write back, and clear, every pending bit. */
+    if (__RARELY(gpio == NULL)) { return -1; }
+    switch (type) {
+        case GPIO_INT_RISE:
+            gpio->RISE_IP = mask;
+            break;
+        case GPIO_INT_FALL:
+            gpio->FALL_IP = mask;
+            break;
+        case GPIO_INT_HIGH:
+            gpio->HIGH_IP = mask;
+            break;
+        case GPIO_INT_LOW:
+            gpio->LOW_IP = mask;
+            break;
+        default: /* unknown type: nothing is acknowledged */
+            break;
+    }
+    return 0;
+}
+
diff --git a/external/platform/hbird/src/hbird_uart.c b/external/platform/hbird/src/hbird_uart.c
new file mode 100644
index 000000000..d2ba68405
--- /dev/null
+++ b/external/platform/hbird/src/hbird_uart.c
@@ -0,0 +1,105 @@
+#include "hbird.h"
+#include "hbird_uart.h"
+
+int32_t uart_init(UART_TypeDef *uart, uint32_t baudrate)
+{
+    if (__RARELY(uart == NULL || baudrate == 0)) { /* a zero baudrate would divide by zero below */
+        return -1;
+    }
+    uart->DIV = SystemCoreClock / baudrate - 1; /* divider derived from the current SystemCoreClock */
+    uart->TXCTRL |= UART_TXEN; /* enable the transmitter */
+    uart->RXCTRL |= UART_RXEN; /* enable the receiver */
+    return 0;
+}
+
+int32_t uart_config_stopbit(UART_TypeDef *uart, UART_STOP_BIT stopbit)
+{
+    if (__RARELY(uart == NULL)) {
+        return -1;
+    }
+    const uint32_t nstop_mask = (uint32_t)1 << UART_TXCTRL_NSTOP_OFS; /* stop-bit select, assumed to be a one-bit field */
+    const uint32_t stopval = ((uint32_t)stopbit << UART_TXCTRL_NSTOP_OFS) & nstop_mask;
+    uart->TXCTRL = (uart->TXCTRL & ~nstop_mask) | stopval; /* clear the field, then insert: a plain &= can never set it */
+    return 0;
+}
+
+int32_t uart_write(UART_TypeDef *uart, uint8_t val)
+{
+    /* Queue one byte for transmission, spinning while the TX FIFO reports full. */
+    if (__RARELY(uart == NULL)) { return -1; }
+
+#ifndef SIMULATION_XLSPIKE
+    while ((uart->TXFIFO & UART_TXFIFO_FULL) != 0) { /* busy-wait; compiled out for xlspike builds */ }
+#endif
+    uart->TXFIFO = val;
+    return 0;
+}
+
+uint8_t uart_read(UART_TypeDef *uart)
+{
+    /* Block until a byte arrives and return it; a NULL uart yields 0xFF ((uint8_t)-1), the same as a data byte 0xFF. */
+    if (__RARELY(uart == NULL)) {
+        return (uint8_t)-1;
+    }
+    uint32_t rx = uart->RXFIFO;
+    while (rx & UART_RXFIFO_EMPTY) {
+        rx = uart->RXFIFO; /* still empty: poll again */
+    }
+    return (uint8_t)(rx & 0xFF);
+}
+
+int32_t uart_set_tx_watermark(UART_TypeDef *uart, uint32_t watermark)
+{
+    if (__RARELY(uart == NULL)) {
+        return -1;
+    }
+    watermark = (watermark << UART_TXCTRL_TXCNT_OFS) & UART_TXCTRL_TXCNT_MASK; /* move into the TXCNT field, drop excess bits */
+    uart->TXCTRL = (uart->TXCTRL & ~UART_TXCTRL_TXCNT_MASK) | watermark; /* clear the field, then insert: &= alone cannot raise it */
+    return 0;
+}
+
+int32_t uart_enable_txint(UART_TypeDef *uart)
+{
+    /* Set the TX interrupt-enable bit in IE; -1 if uart is NULL. */
+    if (__RARELY(uart == NULL)) { return -1; }
+
+    uart->IE = uart->IE | UART_IE_TXIE_MASK;
+    return 0;
+}
+
+int32_t uart_disable_txint(UART_TypeDef *uart)
+{
+    /* Clear the TX interrupt-enable bit in IE; -1 if uart is NULL. */
+    if (__RARELY(uart == NULL)) { return -1; }
+
+    uart->IE = uart->IE & ~UART_IE_TXIE_MASK;
+    return 0;
+}
+
+int32_t uart_set_rx_watermark(UART_TypeDef *uart, uint32_t watermark)
+{
+    if (__RARELY(uart == NULL)) {
+        return -1;
+    }
+    watermark = (watermark << UART_RXCTRL_RXCNT_OFS) & UART_RXCTRL_RXCNT_MASK; /* move into the RXCNT field, drop excess bits */
+    uart->RXCTRL = (uart->RXCTRL & ~UART_RXCTRL_RXCNT_MASK) | watermark; /* clear the field, then insert: &= alone cannot raise it */
+    return 0;
+}
+
+int32_t uart_enable_rxint(UART_TypeDef *uart)
+{
+    /* Set the RX interrupt-enable bit in IE; -1 if uart is NULL. */
+    if (__RARELY(uart == NULL)) { return -1; }
+
+    uart->IE = uart->IE | UART_IE_RXIE_MASK;
+    return 0;
+}
+
+int32_t uart_disable_rxint(UART_TypeDef *uart)
+{
+    /* Clear the RX interrupt-enable bit in IE; -1 if uart is NULL. */
+    if (__RARELY(uart == NULL)) { return -1; }
+
+    uart->IE = uart->IE & ~UART_IE_RXIE_MASK;
+    return 0;
+}
diff --git a/external/platform/hbird/src/system_hbird.c b/external/platform/hbird/src/system_hbird.c
new file mode 100644
index 000000000..8cb5f38a8
--- /dev/null
+++ b/external/platform/hbird/src/system_hbird.c
@@ -0,0 +1,402 @@
+/*
+ * Copyright (c) 2009-2018 Arm Limited. All rights reserved.
+ * Copyright (c) 2019 Nuclei Limited. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/******************************************************************************
+ * @file     system_hbird.c
+ * @brief    NMSIS Nuclei Core Device Peripheral Access Layer Source File for
+ *           Nuclei HummingBird evaluation SoC which support Nuclei N/NX class cores
+ * @version  V1.00
+ * @date     22. Nov 2019
+ ******************************************************************************/
+#include <stdint.h>
+#include <stdio.h>
+#include "nuclei_sdk_hal.h"
+
+/*----------------------------------------------------------------------------
+  Define clocks
+ *----------------------------------------------------------------------------*/
+/* ToDo: add here your necessary defines for device initialization
+         following is an example for different system frequencies */
+#ifndef SYSTEM_CLOCK
+#define SYSTEM_CLOCK    (80000000UL) /* default core clock in Hz, used unless the build already defines SYSTEM_CLOCK */
+#endif
+
+/**
+ * \defgroup  NMSIS_Core_SystemConfig       System Device Configuration
+ * \brief Functions for system and clock setup available in system_<device>.c.
+ * \details
+ * Nuclei provides a template file **system_Device.c** that must be adapted by
+ * the silicon vendor to match their actual device. As a <b>minimum requirement</b>,
+ * this file must provide:
+ *  -  A device-specific system configuration function, \ref SystemInit.
+ *  -  A global variable that contains the system frequency, \ref SystemCoreClock.
+ *  -  A global eclic configuration initialization, \ref ECLIC_Init.
+ *  -  Global c library \ref _init and \ref _fini functions called right before calling main function.
+ *  -  Vendor customized interrupt, exception and nmi handling code, see \ref NMSIS_Core_IntExcNMI_Handling
+ *
+ * The file configures the device and, typically, initializes the oscillator (PLL) that is part
+ * of the microcontroller device. This file might export other functions or variables that provide
+ * a more flexible configuration of the microcontroller system.
+ *
+ * And this file also provided common interrupt, exception and NMI exception handling framework template,
+ * Silicon vendor can customize these template code as they want.
+ *
+ * \note Please pay special attention to the static variable \c SystemCoreClock. This variable might be
+ * used throughout the whole system initialization and runtime to calculate frequency/time related values.
+ * Thus one must assure that the variable always reflects the actual system clock speed.
+ *
+ * \attention
+ * Be aware that a value stored to \c SystemCoreClock during low level initialization (i.e. \c SystemInit()) might get
+ * overwritten by C library startup code and/or .bss section initialization.
+ * Thus it's highly recommended to call \ref SystemCoreClockUpdate at the beginning of the user \c main() routine.
+ *
+ * @{
+ */
+
+/*----------------------------------------------------------------------------
+  System Core Clock Variable
+ *----------------------------------------------------------------------------*/
+/* ToDo: initialize SystemCoreClock with the system core clock frequency value
+         achieved after system initialization.
+         This means system core clock frequency after call to SystemInit() */
+/**
+ * \brief      Variable to hold the system core clock value
+ * \details
+ * Holds the system core clock, which is the system clock frequency supplied to the SysTick
+ * timer and the processor core clock. This variable can be used by debuggers to query the
+ * frequency of the debug timer or to configure the trace clock speed.
+ *
+ * \attention
+ * Compilers must be configured to avoid removing this variable in case the application
+ * program is not using it. Debugging systems require the variable to be physically
+ * present in memory so that it can be examined to configure the debugger.
+ */
+uint32_t SystemCoreClock = SYSTEM_CLOCK;  /* System Clock Frequency (Core Clock) in Hz; _premain_init() replaces it with a measured value */
+
+/*----------------------------------------------------------------------------
+  Clock functions
+ *----------------------------------------------------------------------------*/
+
+/**
+ * \brief      Function to update the variable \ref SystemCoreClock
+ * \details
+ * Updates the variable \ref SystemCoreClock and must be called whenever the core clock is changed
+ * during program execution. The function evaluates the clock register settings and calculates
+ * the current core clock.
+ */
+void SystemCoreClockUpdate (void)            /* Get Core Clock Frequency */
+{
+    /* ToDo: add code to calculate the system frequency based upon the current
+     *    register settings.
+     * Note: This function can be used to retrieve the system core clock frequency
+     *    after user changed register settings.
+     */
+    SystemCoreClock = SYSTEM_CLOCK; /* for now: reset to the compile-time default, discarding any measured value */
+}
+
+/**
+ * \brief      Function to Initialize the system.
+ * \details
+ * Initializes the microcontroller system. Typically, this function configures the
+ * oscillator (PLL) that is part of the microcontroller device. For systems
+ * with a variable clock speed, it updates the variable \ref SystemCoreClock.
+ * SystemInit is called from the file <b>startup<i>_device</i></b>.
+ */
+void SystemInit (void)
+{
+    /* ToDo: add code to initialize the system
+     * Warn: do not use global variables because this function is called before
+     * reaching pre-main. The RW section may be overwritten afterwards.
+     */
+    SystemCoreClock = SYSTEM_CLOCK; /* NOTE(review): this is itself a global write, so per the warning above it may not survive startup */
+}
+
+/**
+ * \defgroup  NMSIS_Core_IntExcNMI_Handling   Interrupt and Exception and NMI Handling
+ * \brief Functions for interrupt, exception and nmi handle available in system_<device>.c.
+ * \details
+ * Nuclei provides a template for interrupt, exception and NMI handling. Silicon vendors can adapt it according
+ * to their requirements, implementing their own interface for the different exception codes and
+ * replacing the current implementation.
+ *
+ * @{
+ */
+/** \brief Number of regular exception handler slots; the NMI (0xFFF) slot is not counted */
+#define MAX_SYSTEM_EXCEPTION_NUM        12
+/**
+ * \brief      Store the exception handlers for each exception ID
+ * \note
+ * - SystemExceptionHandlers stores the handlers for all
+ * the exception codes the Nuclei N/NX core provides.
+ * - Exception codes 0 - 11, 12 exceptions in total, are mapped to SystemExceptionHandlers[0:11]
+ * - NMI is also re-routed to exception handling (exception code 0xFFF) by the startup code configuration; its handler is stored in SystemExceptionHandlers[MAX_SYSTEM_EXCEPTION_NUM]
+ */
+static unsigned long SystemExceptionHandlers[MAX_SYSTEM_EXCEPTION_NUM+1];
+
+/**
+ * \brief      Exception Handler Function Typedef
+ * \note
+ * This typedef is only used internally in this system_<Device>.c file.
+ * It is used to convert a registered handler (stored as unsigned long) back to a function pointer before calling it.
+ */
+typedef void (*EXC_HANDLER) (unsigned long mcause, unsigned long sp);
+
+/**
+ * \brief      System Default Exception Handler
+ * \details
+ * This function provided a default exception and NMI handling code for all exception ids.
+ * By default, It will just print some information for debug, Vendor can customize it according to its requirements.
+ */
+static void system_default_exception_handler(unsigned long mcause, unsigned long sp)
+{
+    /* Print the trap CSRs and the stack pointer through printf, then spin forever. */
+    printf("MCAUSE : 0x%lx\r\n", mcause);
+    printf("MDCAUSE: 0x%lx\r\n", __RV_CSR_READ(CSR_MDCAUSE));
+    printf("MEPC   : 0x%lx\r\n", __RV_CSR_READ(CSR_MEPC));
+    printf("MTVAL  : 0x%lx\r\n", __RV_CSR_READ(CSR_MBADADDR));
+    printf("SP     : 0x%lx\r\n", sp);
+    for (;;) { /* the default handler never returns */ }
+}
+
+/**
+ * \brief      Initialize all the default core exception handlers
+ * \details
+ * The core exception handler for each exception id will be initialized to \ref system_default_exception_handler.
+ * \note
+ * Called in \ref _init function, used to initialize default exception handlers for all exception IDs
+ */
+static void Exception_Init(void)
+{
+    const unsigned long fallback = (unsigned long)system_default_exception_handler;
+    unsigned int slot = MAX_SYSTEM_EXCEPTION_NUM + 1; /* the regular slots plus the NMI slot */
+    while (slot-- > 0) { SystemExceptionHandlers[slot] = fallback; }
+}
+
+/**
+ * \brief       Register an exception handler for exception code EXCn
+ * \details
+ * * For EXCn < \ref MAX_SYSTEM_EXCEPTION_NUM, it will be registered into SystemExceptionHandlers[EXCn].
+ * * For EXCn == NMI_EXCn, it will be registered into SystemExceptionHandlers[MAX_SYSTEM_EXCEPTION_NUM].
+ * \param   EXCn    See \ref EXCn_Type; any other value is silently ignored
+ * \param   exc_handler     The exception handler for this exception code EXCn
+ */
+void Exception_Register_EXC(uint32_t EXCn, unsigned long exc_handler)
+{
+    if (EXCn < MAX_SYSTEM_EXCEPTION_NUM) { /* EXCn is unsigned, so no lower-bound test is needed */
+        SystemExceptionHandlers[EXCn] = exc_handler;
+    } else if (EXCn == NMI_EXCn) {
+        SystemExceptionHandlers[MAX_SYSTEM_EXCEPTION_NUM] = exc_handler;
+    }
+}
+
+/**
+ * \brief       Get current exception handler for exception code EXCn
+ * \details
+ * * For EXCn < \ref MAX_SYSTEM_EXCEPTION_NUM, it will return SystemExceptionHandlers[EXCn].
+ * * For EXCn == NMI_EXCn, it will return SystemExceptionHandlers[MAX_SYSTEM_EXCEPTION_NUM].
+ * \param   EXCn    See \ref EXCn_Type
+ * \return  Current exception handler for exception code EXCn, if not found, return 0.
+ */
+unsigned long Exception_Get_EXC(uint32_t EXCn)
+{
+    if (EXCn < MAX_SYSTEM_EXCEPTION_NUM) { /* EXCn is unsigned, so no lower-bound test is needed */
+        return SystemExceptionHandlers[EXCn];
+    } else if (EXCn == NMI_EXCn) {
+        return SystemExceptionHandlers[MAX_SYSTEM_EXCEPTION_NUM];
+    } else {
+        return 0;
+    }
+}
+
+/**
+ * \brief      Common NMI and Exception handler entry
+ * \details
+ * This function provides a common entry for NMI and exceptions. Silicon vendors can modify
+ * this template implementation according to their requirements.
+ * \remarks
+ * - RISC-V provides a common entry for all types of exception. This is a proposed code template
+ *   for the exception entry function; silicon vendors can modify the implementation.
+ * - For the core_exception_handler template, we provide the register function \ref Exception_Register_EXC
+ *   which lets developers register an exception handler for a specific exception number.
+ */
+uint32_t core_exception_handler(unsigned long mcause, unsigned long sp)
+{
+    uint32_t EXCn = (uint32_t)(mcause & 0X00000fff); /* the low 12 bits of mcause are used as the exception code */
+    EXC_HANDLER exc_handler;
+
+    if (EXCn < MAX_SYSTEM_EXCEPTION_NUM) { /* EXCn is unsigned, so no lower-bound test is needed */
+        exc_handler = (EXC_HANDLER)SystemExceptionHandlers[EXCn];
+    } else if (EXCn == NMI_EXCn) {
+        exc_handler = (EXC_HANDLER)SystemExceptionHandlers[MAX_SYSTEM_EXCEPTION_NUM];
+    } else {
+        exc_handler = (EXC_HANDLER)system_default_exception_handler;
+    }
+    if (exc_handler != NULL) { /* a slot is NULL if a trap arrives before Exception_Init() has run, or if 0 was registered */
+        exc_handler(mcause, sp);
+    }
+    return 0;
+}
+/** @} */ /* End of Doxygen Group NMSIS_Core_ExceptionAndNMI */
+
+/** Print build time, download mode and CPU frequency; does nothing unless NUCLEI_BANNER == 1 */
+void SystemBannerPrint(void)
+{
+#if defined(NUCLEI_BANNER) && (NUCLEI_BANNER == 1)
+#ifndef DOWNLOAD_MODE
+#error DOWNLOAD_MODE is not defined via build system, please check!
+#endif
+    static const char *const download_modes[] = {"FLASHXIP", "FLASH", "ILM", "DDR"}; /* indexed by DOWNLOAD_MODE */
+    printf("Nuclei SDK Build Time: %s, %s\r\n", __DATE__, __TIME__);
+    printf("Download Mode: %s\r\n", download_modes[DOWNLOAD_MODE]);
+    printf("CPU Frequency %lu Hz\r\n", (unsigned long)SystemCoreClock); /* cast: uint32_t is not unsigned long on every ABI */
+#endif
+}
+
+/**
+ * \brief initialize eclic config
+ * \details
+ * ECLIC needs to be initialized after boot up.
+ * Vendors can change the initialization
+ * configuration.
+ */
+void ECLIC_Init(void)
+{
+    /* Global configuration of MTH and NLBits.
+     * TODO: Please adapt it according to your system requirement.
+     * In this file the function is called from _premain_init() */
+    ECLIC_SetMth(0);
+    ECLIC_SetCfgNlbits(__ECLIC_INTCTLBITS);
+}
+
+/**
+ * \brief  Configure one ECLIC interrupt source and enable it
+ * \details
+ * Sets vector mode, trigger mode and polarity, level and priority for IRQn, installs
+ * the handler in the vector table when one is given, then enables the interrupt.
+ * \param [in]  IRQn        interrupt number, rejected when greater than SOC_INT_MAX
+ * \param [in]  shv         \ref ECLIC_NON_VECTOR_INTERRUPT means non-vector mode, and \ref ECLIC_VECTOR_INTERRUPT is vector mode
+ * \param [in]  trig_mode   see \ref ECLIC_TRIGGER_Type
+ * \param [in]  lvl         interrupt level
+ * \param [in]  priority    interrupt priority
+ * \param [in]  handler     interrupt handler, if NULL, handler will not be installed
+ * \return       -1 means invalid input parameter. 0 means successful.
+ * \remarks
+ * - On success the interrupt is enabled even when handler is NULL.
+ * - If the vector table is placed in read-only section(FLASHXIP mode), handler could not be installed
+ */
+int32_t ECLIC_Register_IRQ(IRQn_Type IRQn, uint8_t shv, ECLIC_TRIGGER_Type trig_mode, uint8_t lvl, uint8_t priority, void *handler)
+{
+    /* reject out-of-range arguments before touching the ECLIC */
+    if (IRQn > SOC_INT_MAX) {
+        return -1;
+    }
+    if ((shv > ECLIC_VECTOR_INTERRUPT) || (trig_mode > ECLIC_NEGTIVE_EDGE_TRIGGER)) {
+        return -1;
+    }
+
+    /* per-interrupt attributes: vector mode, trigger/polarity, level, priority */
+    ECLIC_SetShvIRQ(IRQn, shv);
+    ECLIC_SetTrigIRQ(IRQn, trig_mode);
+    ECLIC_SetLevelIRQ(IRQn, lvl);
+    ECLIC_SetPriorityIRQ(IRQn, priority);
+
+    /* the vector table entry is only written when a handler was supplied */
+    if (handler != NULL) {
+        ECLIC_SetVector(IRQn, (rv_csr_t)handler);
+    }
+    ECLIC_EnableIRQ(IRQn); /* enabled even when no handler was installed */
+    return 0;
+}
+/** @} */ /* End of Doxygen Group NMSIS_Core_ExceptionAndNMI */
+
+/**
+ * \brief early init function before main
+ * \details
+ * This function is executed right before main function.
+ * For RISC-V gnu toolchain, _init function might not be called
+ * by __libc_init_array function, so we defined a new function
+ * to do initialization
+ */
+void _premain_init(void)
+{
+    /* TODO: Add your own initialization code here, called before main */
+    /* __ICACHE_PRESENT and __DCACHE_PRESENT are expected to come from hbird.h */
+#if defined(__ICACHE_PRESENT) && __ICACHE_PRESENT == 1
+    EnableICache();
+#endif
+#if defined(__DCACHE_PRESENT) && __DCACHE_PRESENT == 1
+    EnableDCache();
+#endif
+    SystemCoreClock = get_cpu_freq(); /* measured clock replaces the SYSTEM_CLOCK default; uart_init() below divides it by the baud rate */
+    gpio_iof_config(GPIO, IOF0_UART0_MASK, IOF_SEL_0); /* IO function 0 for the UART0 pins; return value not checked */
+    uart_init(SOC_DEBUG_UART, 115200); /* return value not checked */
+    /* Display banner after UART initialized */
+    SystemBannerPrint();
+    /* Initialize exception default handlers (note: only after the banner printf calls above) */
+    Exception_Init();
+    /* ECLIC initialization, mainly MTH and NLBIT */
+    ECLIC_Init();
+}
+
+/**
+ * \brief finish function after main
+ * \param [in]  status     status code return from main
+ * \details
+ * This function is executed right after main function.
+ * For RISC-V gnu toolchain, _fini function might not be called
+ * by __libc_fini_array function, so we defined a new function
+ * to do the finalization
+ */
+void _postmain_fini(int status)
+{
+    /* TODO: Add your own finishing code here, called after main */
+#ifdef SIMULATION_XLSPIKE
+extern void xlspike_exit(int status);
+    xlspike_exit(status); /* hands the exit status to the simulator; compiled only for xlspike builds */
+#endif
+}
+
+/**
+ * \brief _init function called in __libc_init_array()
+ * \details
+ * The `__libc_init_array()` function is called by the startup code and references `_init`.
+ * The user needs to provide this function, otherwise linking fails with
+ * error init.c:(.text.__libc_init_array+0x26): undefined reference to `_init'
+ * \note
+ * Please use \ref _premain_init function now
+ */
+void _init(void)
+{
+    /* Intentionally empty: don't put any code here, please use _premain_init now */
+}
+
+/**
+ * \brief _fini function called in __libc_fini_array()
+ * \details
+ * The `__libc_fini_array()` function is called when main exits and references `_fini`.
+ * The user needs to provide this function, otherwise linking fails with
+ * error fini.c:(.text.__libc_fini_array+0x28): undefined reference to `_fini'
+ * \note
+ * Please use \ref _postmain_fini function now
+ */
+void _fini(void)
+{
+    /* Intentionally empty: don't put any code here, please use _postmain_fini now */
+}
+
+/** @} */ /* End of Doxygen Group NMSIS_Core_SystemAndClock */
diff --git a/platform/nuclei-hbird/platform.c b/platform/nuclei-hbird/platform.c
new file mode 100644
index 000000000..357d3934e
--- /dev/null
+++ b/platform/nuclei-hbird/platform.c
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2015 Travis Geiselbrecht
+ *
+ * Use of this source code is governed by a MIT-style
+ * license that can be found in the LICENSE file or at
+ * https://opensource.org/licenses/MIT
+ */
+#include <lk/reg.h>
+#include <kernel/thread.h>
+#include <platform.h>
+#include <platform/interrupts.h>
+#include <platform/debug.h>
+#include <platform/timer.h>
+#include <sys/types.h>
+
+#include "platform_p.h"
+
+void platform_early_init(void) {
+    sifive_uart_early_init(); /* UART pin function + 115200 baud setup, see uart.c */
+}
+
+void platform_init(void) {
+    sifive_uart_init(); /* sets up the UART receive buffer, see uart.c */
+}
+
+void platform_dputc(char c) {
+    /* a newline is sent as carriage return + newline */
+    if (c == '\n') { sifive_uart_write('\r'); }
+    sifive_uart_write(c);
+}
+
+int platform_dgetc(char *c, bool wait) {
+    return sifive_uart_read(c, wait); /* 0 with *c set, or -1; with POLLING_RX the reader in uart.c ignores wait */
+}
+
+
diff --git a/platform/nuclei-hbird/platform_p.h b/platform/nuclei-hbird/platform_p.h
new file mode 100644
index 000000000..881817613
--- /dev/null
+++ b/platform/nuclei-hbird/platform_p.h
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2015 Travis Geiselbrecht
+ *
+ * Use of this source code is governed by a MIT-style
+ * license that can be found in the LICENSE file or at
+ * https://opensource.org/licenses/MIT
+ */
+#pragma once
+
+#include <stdbool.h>
+
+void sifive_uart_write(int c);            /* send one character on the debug UART */
+int sifive_uart_read(char *c, bool wait); /* 0 with *c set on success, -1 when no character was read */
+void sifive_uart_early_init(void);        /* UART pin function and baud rate setup */
+void sifive_uart_init(void);              /* sets up the receive buffer */
+
diff --git a/platform/nuclei-hbird/rules.mk b/platform/nuclei-hbird/rules.mk
new file mode 100644
index 000000000..c2b9a04eb
--- /dev/null
+++ b/platform/nuclei-hbird/rules.mk
@@ -0,0 +1,25 @@
+LOCAL_DIR := $(GET_LOCAL_DIR)
+
+MODULE := $(LOCAL_DIR)
+
+ARCH := riscv
+SUBARCH ?= 32
+VARIANT ?= nuclei
+
+MODULE_SRCS += $(LOCAL_DIR)/platform.c
+MODULE_SRCS += $(LOCAL_DIR)/uart.c
+MODULE_SRCS += $(LOCAL_DIR)/vectab.c
+
+ROMBASE ?= 0x80000000 # if running from rom, start here
+MEMBASE ?= 0x90000000
+MEMSIZE ?= 0x00010000 # default to 64KB
+
+# uses a two segment layout, select the appropriate linker script
+ARCH_RISCV_TWOSEGMENT := 1
+# sets a few options in the riscv arch
+ARCH_RISCV_EMBEDDED := 1
+
+MODULE_DEPS += platform/hbird \
+	lib/cbuf
+
+include make/module.mk
diff --git a/platform/nuclei-hbird/uart.c b/platform/nuclei-hbird/uart.c
new file mode 100644
index 000000000..c46fe82ff
--- /dev/null
+++ b/platform/nuclei-hbird/uart.c
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2018 Travis Geiselbrecht
+ *
+ * Use of this source code is governed by a MIT-style
+ * license that can be found in the LICENSE file or at
+ * https://opensource.org/licenses/MIT
+ */
+#include "nuclei_sdk_hal.h"
+
+#include <lk/reg.h>
+#include <lk/trace.h>
+#include <lib/cbuf.h>
+#include <kernel/thread.h>
+#include <platform.h>
+#include <platform/interrupts.h>
+#include <sys/types.h>
+
+#include "platform_p.h"
+
+#define LOCAL_TRACE 0
+#define POLLING_RX 1 /* 1: sifive_uart_read() reads the RX FIFO directly instead of the cbuf */
+
+// static volatile unsigned int *const uart_base = (unsigned int *)0;
+
+#define UART_TXDATA 0 /* register indices for the commented-out uart_base[] access; not used by the live code in this file */
+#define UART_RXDATA 1
+#define UART_TXCTRL 2
+#define UART_RXCTRL 3
+#define UART_IE     4
+#define UART_IP     5
+#define UART_DIV    6
+
+#define RXBUF_SIZE 128 /* size in bytes of the receive buffer storage below */
+static char uart_rx_buf_data[RXBUF_SIZE];
+static struct cbuf uart_rx_buf;
+
+void sifive_uart_write(int c) {
+    uart_write(SOC_DEBUG_UART, c); /* c is narrowed to uint8_t by uart_write(); its return value is dropped */
+}
+
+int sifive_uart_read(char *c, bool wait) {
+#if !POLLING_RX
+    if (cbuf_read_char(&uart_rx_buf, c, wait) == 1) {
+        return 0;
+    }
+    return -1;
+#else
+    // full polling mode for reads: wait is ignored, -1 is returned when bit 31 of RXFIFO is set
+    const uint32_t rx = SOC_DEBUG_UART->RXFIFO;
+    if (rx & (1U << 31)) // unsigned constant: 1<<31 overflows a signed int
+        return -1;
+    *c = rx & 0xff;
+    return 0;
+#endif
+}
+
+static enum handler_return sifive_uart_irq(void *unused) {
+    LTRACE;
+    // Move every byte waiting in the RX FIFO into uart_rx_buf; INT_RESCHEDULE if at least one was moved
+    enum handler_return ret = INT_NO_RESCHEDULE;
+    for (;;) {
+        const uint32_t c = SOC_DEBUG_UART->RXFIFO;
+        if (c & (1U << 31)) // unsigned constant: 1<<31 overflows a signed int
+            break; // nothing in the fifo
+
+        // stuff this char in the cbuf and try again
+        cbuf_write_char(&uart_rx_buf, c & 0xff, false);
+        ret = INT_RESCHEDULE;
+    }
+
+    return ret;
+}
+
+void sifive_uart_early_init(void) {
+    // The two lines below replace the direct register setup this code was derived from:
+    //   uart_base[UART_DIV] = SOC_FREQ / 115200; uart_base[UART_TXCTRL] = 1; // txen
+    gpio_iof_config(GPIO, IOF0_UART0_MASK, IOF_SEL_0); // select IO function 0 for the UART0 pins
+    uart_init(SOC_DEBUG_UART, 115200); // divider from SystemCoreClock, enables TX and RX
+}
+
+void sifive_uart_init(void) {
+    cbuf_initialize_etc(&uart_rx_buf, RXBUF_SIZE, uart_rx_buf_data); // RX buffer backed by the static array
+
+    // Interrupt-driven RX is not wired up yet; the calls below are placeholders:
+    // uart_enable_rxint();
+    // ECLIC_Register_IRQ();
+//     uart_base[UART_RXCTRL] = 1; // rxen, rxcnt = 0
+//     uart_base[UART_IE] |= (1<<1); // rxwvm
+
+// #if !POLLING_RX
+//     unmask_interrupt(SIFIVE_IRQ_UART0);
+// #endif
+}
+
diff --git a/platform/nuclei-hbird/vectab.c b/platform/nuclei-hbird/vectab.c
new file mode 100644
index 000000000..615f41341
--- /dev/null
+++ b/platform/nuclei-hbird/vectab.c
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2012 Travis Geiselbrecht
+ *
+ * Use of this source code is governed by a MIT-style
+ * license that can be found in the LICENSE file or at
+ * https://opensource.org/licenses/MIT
+ */
+#include <hbird.h>
+
+#include <lk/compiler.h>
+#include <lk/debug.h>
+
+/* fallback for every irq whose weak hbird_irqN_handler alias has not been overridden: panics */
+void hbird_dummy_irq(void) {
+    panic("unhandled irq\n");
+}
+
+/* weak default handlers for irqs 19..50, all aliases of the dummy handler; the macro has no trailing ';' because each use supplies it */
+#define DEFAULT_HANDLER(x) \
+void hbird_irq##x##_handler(void) __WEAK_ALIAS("hbird_dummy_irq")
+
+DEFAULT_HANDLER(19);
+DEFAULT_HANDLER(20);
+DEFAULT_HANDLER(21);
+DEFAULT_HANDLER(22);
+DEFAULT_HANDLER(23);
+DEFAULT_HANDLER(24);
+DEFAULT_HANDLER(25);
+DEFAULT_HANDLER(26);
+DEFAULT_HANDLER(27);
+DEFAULT_HANDLER(28);
+DEFAULT_HANDLER(29);
+DEFAULT_HANDLER(30);
+DEFAULT_HANDLER(31);
+DEFAULT_HANDLER(32);
+DEFAULT_HANDLER(33);
+DEFAULT_HANDLER(34);
+DEFAULT_HANDLER(35);
+DEFAULT_HANDLER(36);
+DEFAULT_HANDLER(37);
+DEFAULT_HANDLER(38);
+DEFAULT_HANDLER(39);
+DEFAULT_HANDLER(40);
+DEFAULT_HANDLER(41);
+DEFAULT_HANDLER(42);
+DEFAULT_HANDLER(43);
+DEFAULT_HANDLER(44);
+DEFAULT_HANDLER(45);
+DEFAULT_HANDLER(46);
+DEFAULT_HANDLER(47);
+DEFAULT_HANDLER(48);
+DEFAULT_HANDLER(49);
+DEFAULT_HANDLER(50);
+
+#define VECTAB_ENTRY(x) [SOC_INT##x##_IRQn] = hbird_irq##x##_handler
+
+/*
+ * Appended to the end of the nuclei riscv vector table.
+ * NOTE(review): entries are indexed by SOC_INTn_IRQn itself, so unless SOC_INT19_IRQn is 0 the array starts with NULL slots - confirm the linker layout expects that.
+ */
+const void *const __SECTION(".text.boot.vectab2") vectab2[] = {
+    VECTAB_ENTRY(19),
+    VECTAB_ENTRY(20),
+    VECTAB_ENTRY(21),
+    VECTAB_ENTRY(22),
+    VECTAB_ENTRY(23),
+    VECTAB_ENTRY(24),
+    VECTAB_ENTRY(25),
+    VECTAB_ENTRY(26),
+    VECTAB_ENTRY(27),
+    VECTAB_ENTRY(28),
+    VECTAB_ENTRY(29),
+    VECTAB_ENTRY(30),
+    VECTAB_ENTRY(31),
+    VECTAB_ENTRY(32),
+    VECTAB_ENTRY(33),
+    VECTAB_ENTRY(34),
+    VECTAB_ENTRY(35),
+    VECTAB_ENTRY(36),
+    VECTAB_ENTRY(37),
+    VECTAB_ENTRY(38),
+    VECTAB_ENTRY(39),
+    VECTAB_ENTRY(40),
+    VECTAB_ENTRY(41),
+    VECTAB_ENTRY(42),
+    VECTAB_ENTRY(43),
+    VECTAB_ENTRY(44),
+    VECTAB_ENTRY(45),
+    VECTAB_ENTRY(46),
+    VECTAB_ENTRY(47),
+    VECTAB_ENTRY(48),
+    VECTAB_ENTRY(49),
+    VECTAB_ENTRY(50),
+};
diff --git a/project/nuclei-hbird.mk b/project/nuclei-hbird.mk
new file mode 100644
index 000000000..76ac12fda
--- /dev/null
+++ b/project/nuclei-hbird.mk
@@ -0,0 +1,8 @@
+# main project for the Nuclei HummingBird (nuclei-hbird) target
+MODULES += \
+	app/shell \
+	app/tests
+SUBARCH := 32
+
+include project/target/nuclei-hbird.mk
+
diff --git a/project/target/nuclei-hbird.mk b/project/target/nuclei-hbird.mk
new file mode 100644
index 000000000..b6d5ecdce
--- /dev/null
+++ b/project/target/nuclei-hbird.mk
@@ -0,0 +1,2 @@
+TARGET := nuclei-hbird
+
diff --git a/target/nuclei-hbird/include/board_hbird_eval.h b/target/nuclei-hbird/include/board_hbird_eval.h
new file mode 100644
index 000000000..0c3840e84
--- /dev/null
+++ b/target/nuclei-hbird/include/board_hbird_eval.h
@@ -0,0 +1,37 @@
+// See LICENSE for license details.
+#ifndef _BOARD_HBIRD_EVAL_H_
+#define _BOARD_HBIRD_EVAL_H_
+
+#ifdef __cplusplus
+ extern "C" {
+#endif
+
+#include "nuclei_sdk_soc.h"
+
+
+// Interrupt Numbers
+#define SOC_BUTTON_1_IRQn           SOC_INT49_IRQn
+#define SOC_BUTTON_2_IRQn           SOC_INT50_IRQn
+// Interrupt Handler Definitions (NOTE(review): vectab2 names its slots hbird_irq<n>_handler - confirm these eclic_* symbols exist)
+#define SOC_BUTTON_1_HANDLER        eclic_irq49_handler
+#define SOC_BUTTON_2_HANDLER        eclic_irq50_handler
+// GPIO Bit Offset
+#define SOC_LED_RED_GPIO_OFS        19
+#define SOC_LED_GREEN_GPIO_OFS      21
+#define SOC_LED_BLUE_GPIO_OFS       22
+#define SOC_BUTTON_1_GPIO_OFS       30
+#define SOC_BUTTON_2_GPIO_OFS       31
+
+// GPIO Bit Mask (unsigned, so that bit 31 is not shifted into the sign bit of an int)
+#define SOC_LED_RED_GPIO_MASK       (1U<<SOC_LED_RED_GPIO_OFS)
+#define SOC_LED_GREEN_GPIO_MASK     (1U<<SOC_LED_GREEN_GPIO_OFS)
+#define SOC_LED_BLUE_GPIO_MASK      (1U<<SOC_LED_BLUE_GPIO_OFS)
+#define SOC_BUTTON_1_GPIO_MASK      (1U<<SOC_BUTTON_1_GPIO_OFS)
+#define SOC_BUTTON_2_GPIO_MASK      (1U<<SOC_BUTTON_2_GPIO_OFS)
+#define SOC_BUTTON_GPIO_MASK        (SOC_BUTTON_1_GPIO_MASK | SOC_BUTTON_2_GPIO_MASK)
+#define SOC_LED_GPIO_MASK           (SOC_LED_RED_GPIO_MASK | SOC_LED_GREEN_GPIO_MASK | SOC_LED_BLUE_GPIO_MASK)
+
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/target/nuclei-hbird/include/nuclei_sdk_hal.h b/target/nuclei-hbird/include/nuclei_sdk_hal.h
new file mode 100644
index 000000000..41e03b6b4
--- /dev/null
+++ b/target/nuclei-hbird/include/nuclei_sdk_hal.h
@@ -0,0 +1,20 @@
+// See LICENSE for license details.
+#ifndef _NUCLEI_SDK_HAL_H
+#define _NUCLEI_SDK_HAL_H
+
+#ifdef __cplusplus
+ extern "C" {
+#endif
+// Board include plus defaults: debug UART selection and NUCLEI_BANNER.
+#include "board_hbird_eval.h"
+
+#define SOC_DEBUG_UART      UART0
+
+#ifndef NUCLEI_BANNER   // defaults to 1 unless already defined before this header
+#define NUCLEI_BANNER       1
+#endif /* NUCLEI_BANNER */
+
+#ifdef __cplusplus
+}
+#endif /* __cplusplus */
+#endif /* _NUCLEI_SDK_HAL_H */
diff --git a/target/nuclei-hbird/include/platform/nuclei-hbird.h b/target/nuclei-hbird/include/platform/nuclei-hbird.h
new file mode 100644
index 000000000..edaf65ce7
--- /dev/null
+++ b/target/nuclei-hbird/include/platform/nuclei-hbird.h
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2018 Travis Geiselbrecht
+ *
+ * Use of this source code is governed by a MIT-style
+ * license that can be found in the LICENSE file or at
+ * https://opensource.org/licenses/MIT
+ */
+#pragma once
+
+#define SIFIVE_IRQ_UART0 3
+#define SIFIVE_IRQ_UART1 4
+// NOTE(review): SIFIVE_*/PLIC/CLINT names on a CLIC-based Nuclei target - confirm these are still used.
+#define SIFIVE_NUM_IRQS 127
+
+#define CLINT_BASE 0x02000000
+#define PLIC_BASE  0x0c000000
+#define PRCI_BASE  0x10008000
+#define GPIO_BASE  0x10012000
+#define UART0_BASE 0x10013000
+// GPIO_REG_* constants (presumably register indices relative to GPIO_BASE - TODO confirm)
+#define GPIO_REG_VALUE      0
+#define GPIO_REG_INPUT_EN   1
+#define GPIO_REG_OUTPUT_EN  2
+#define GPIO_REG_PORT       3
+#define GPIO_REG_IOF_EN     14
+#define GPIO_REG_IOF_SEL    15
+
+#define PLIC_HART_IDX(hart)    0   // ignores its argument; always expands to 0
diff --git a/target/nuclei-hbird/openocd_hbird.cfg b/target/nuclei-hbird/openocd_hbird.cfg
new file mode 100644
index 000000000..e5a2ac7af
--- /dev/null
+++ b/target/nuclei-hbird/openocd_hbird.cfg
@@ -0,0 +1,50 @@
+adapter_khz     1000
+
+interface ftdi
+ftdi_vid_pid 0x0403 0x6010
+ftdi_oscan1_mode off
+
+## If ftdi_device_desc not specified, the device description is ignored during device selection.
+## So if you want to specify a dedicated FTDI device, you can select following device description:
+## "Dual RS232-HS" is for HummingBird Debugger V1
+## "USB <-> JTAG-DEBUGGER" is for HummingBird Debugger V2
+## Uncomment the one that matches your device description
+# ftdi_device_desc "Dual RS232-HS"
+# ftdi_device_desc "USB <-> JTAG-DEBUGGER"
+
+transport select jtag
+
+ftdi_layout_init 0x0008 0x001b
+ftdi_layout_signal nSRST -oe 0x0020 -data 0x0020
+ftdi_layout_signal TCK -data 0x0001
+ftdi_layout_signal TDI -data 0x0002
+ftdi_layout_signal TDO -input 0x0004
+ftdi_layout_signal TMS -data 0x0008
+ftdi_layout_signal JTAG_SEL -data 0x0100 -oe 0x0100
+
+set _CHIPNAME riscv
+jtag newtap $_CHIPNAME cpu -irlen 5
+
+set _TARGETNAME $_CHIPNAME.cpu
+target create $_TARGETNAME riscv -chain-position $_TARGETNAME
+$_TARGETNAME configure -work-area-phys 0x80000000 -work-area-size 10000 -work-area-backup 1
+
+set _FLASHNAME $_CHIPNAME.flash
+flash bank $_FLASHNAME fespi 0x20000000 0 0 0 $_TARGETNAME
+# Also declare the ILM space as flash, so that breakpoints in it are set using hardware triggers
+#flash bank onboard_ilm fespi 0x80000000 0 0 0 $_TARGETNAME
+
+# Expose Nuclei-defined custom CSRs
+# See https://github.com/riscv/riscv-gnu-toolchain/issues/319#issuecomment-358397306
+# The CSR values can then be viewed in gdb, e.g. `info reg csr775` for CSR MTVT (0x307)
+riscv expose_csrs 416-496,770-800,835-850,1227-1231,1483-1486,1984-2032,2064-2070,2370-2380,2490-2500,4032-4040
+
+init
+
+if {[ info exists pulse_srst]} {
+  ftdi_set_signal nSRST 0
+  ftdi_set_signal nSRST z
+}
+halt
+# Flash protection must be turned off here; otherwise debugging from the IDE cannot download the program into flash
+flash protect 0 0 last off
diff --git a/target/nuclei-hbird/rules.mk b/target/nuclei-hbird/rules.mk
new file mode 100644
index 000000000..aaa5a401e
--- /dev/null
+++ b/target/nuclei-hbird/rules.mk
@@ -0,0 +1,34 @@
+LOCAL_DIR := $(GET_LOCAL_DIR)
+
+MODULE := $(LOCAL_DIR)
+
+PLATFORM := nuclei-hbird
+WITH_LINKER_GC ?= 1
+
+MEMSIZE ?= 0x10000     # 64KB
+GLOBAL_DEFINES += TARGET_HAS_DEBUG_LED=1
+
+# SoC frequency define (32000000, presumably Hz, i.e. 32 MHz) and ILM download mode
+GLOBAL_DEFINES += SOC_FREQ=32000000 DOWNLOAD_MODE=DOWNLOAD_MODE_ILM
+
+MODULE_SRCS := $(LOCAL_DIR)/target.c
+
+# set some global defines based on capability
+GLOBAL_DEFINES += CONSOLE_ENABLE_HISTORY=0
+GLOBAL_DEFINES += PLATFORM_HAS_DYNAMIC_TIMER=1
+GLOBAL_DEFINES += ARCH_RISCV_CLINT_BASE=0x02000000
+GLOBAL_DEFINES += ARCH_RISCV_MTIME_RATE=32768
+
+OPENOCD_CFG := $(LOCAL_DIR)/openocd_hbird.cfg
+
+OPENOCD_ARGS += -f $(OPENOCD_CFG)
+
+GDB_UPLOAD_ARGS ?= --batch
+GDB_UPLOAD_CMDS += -ex "monitor halt"
+GDB_UPLOAD_CMDS += -ex "monitor flash protect 0 0 last off"
+GDB_UPLOAD_CMDS += -ex "load"
+GDB_UPLOAD_CMDS += -ex "monitor resume"
+GDB_UPLOAD_CMDS += -ex "quit" 
+
+include make/module.mk
+
diff --git a/target/nuclei-hbird/target.c b/target/nuclei-hbird/target.c
new file mode 100644
index 000000000..522b9d055
--- /dev/null
+++ b/target/nuclei-hbird/target.c
@@ -0,0 +1,24 @@
+/*
+ * Copyright (c) 2019 Travis Geiselbrecht
+ *
+ * Use of this source code is governed by a MIT-style
+ * license that can be found in the LICENSE file or at
+ * https://opensource.org/licenses/MIT
+ */
+
+#include <target.h>
+#include <arch/arch_ops.h>
+#include <platform/nuclei-hbird.h>
+
+void target_early_init(void) {
+    // empty: no early target setup is performed here
+}
+
+void target_set_debug_led(unsigned int led, bool on) {
+    // stub: led/on are ignored (NOTE(review): rules.mk sets TARGET_HAS_DEBUG_LED=1, yet no LED is driven)
+}
+
+void target_init(void) {  // empty: performs no target initialization
+}
+
+

From 2e2678a208030aa967f776cdfc61ea565f9dadfb Mon Sep 17 00:00:00 2001
From: Huaqi Fang <578567190@qq.com>
Date: Wed, 28 Oct 2020 13:05:58 +0800
Subject: [PATCH 2/4] [arch][riscv] remove debug message in idle

Signed-off-by: Huaqi Fang <578567190@qq.com>
---
 arch/riscv/arch.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/arch/riscv/arch.c b/arch/riscv/arch.c
index 913ff2ec1..67892a029 100644
--- a/arch/riscv/arch.c
+++ b/arch/riscv/arch.c
@@ -143,7 +143,6 @@ void arch_init(void) {
 void arch_idle(void) {
     // let the platform/target disable wfi
 #if !RISCV_DISABLE_WFI
-    printf("Idle\n");
     __asm__ volatile("wfi");
 #endif
 }

From 3972af29f95e24010cbe8a7512afa6360cd7cec5 Mon Sep 17 00:00:00 2001
From: Huaqi Fang <578567190@qq.com>
Date: Wed, 28 Oct 2020 13:11:21 +0800
Subject: [PATCH 3/4] [ci][travis] Add travis ci build for nuclei-hbird

Signed-off-by: Huaqi Fang <578567190@qq.com>
---
 .travis.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.travis.yml b/.travis.yml
index 758f5d54a..7bea976a7 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -22,6 +22,7 @@ env:
     - PROJECT=qemu-virt-riscv64-supervisor-test    TOOLCHAIN=riscv64-elf-7.5.0-Linux-x86_64
     - PROJECT=sifive-e-test             TOOLCHAIN=riscv32-elf-7.5.0-Linux-x86_64
     - PROJECT=sifive-unleashed-test     TOOLCHAIN=riscv64-elf-7.5.0-Linux-x86_64
+    - PROJECT=nuclei-hbird              TOOLCHAIN=riscv32-elf-7.5.0-Linux-x86_64
     - PROJECT=pc-x86-test               TOOLCHAIN=i386-elf-7.5.0-Linux-x86_64
     - PROJECT=pc-x86-64-test            TOOLCHAIN=x86_64-elf-7.5.0-Linux-x86_64
 

From 52c107216c89999ab5a50f4987f6aef43bdf632a Mon Sep 17 00:00:00 2001
From: Huaqi Fang <578567190@qq.com>
Date: Mon, 2 Nov 2020 15:03:57 +0800
Subject: [PATCH 4/4] arch: remove libg++ dependency check

Signed-off-by: Huaqi Fang <578567190@qq.com>
---
 arch/riscv/rules.mk | 1 -
 1 file changed, 1 deletion(-)

diff --git a/arch/riscv/rules.mk b/arch/riscv/rules.mk
index 3f93eebde..f755c0a1a 100644
--- a/arch/riscv/rules.mk
+++ b/arch/riscv/rules.mk
@@ -166,7 +166,6 @@ WITH_LINKER_GC ?= 0
 endif
 
 LIBGCC := $(shell $(TOOLCHAIN_PREFIX)gcc $(GLOBAL_COMPILEFLAGS) $(ARCH_COMPILEFLAGS) $(GLOBAL_CFLAGS) -print-libgcc-file-name)
-LIBGCC += $(shell $(TOOLCHAIN_PREFIX)gcc $(GLOBAL_COMPILEFLAGS) $(ARCH_COMPILEFLAGS) $(GLOBAL_CFLAGS) -print-file-name=libstdc++.a)
 $(info LIBGCC = $(LIBGCC))
 
 # potentially generated files that should be cleaned out with clean make rule