From 0b19ad00e68dc935446c1475f1996f6a99e9495b Mon Sep 17 00:00:00 2001 From: stnolting Date: Fri, 10 Jan 2025 21:06:45 +0100 Subject: [PATCH 1/6] [control] separate fence and fence.i instructions --- rtl/core/neorv32_cpu_control.vhd | 7 +++++-- rtl/core/neorv32_package.vhd | 19 ++++++++++--------- 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/rtl/core/neorv32_cpu_control.vhd b/rtl/core/neorv32_cpu_control.vhd index 586da6d80..9c76e88d0 100644 --- a/rtl/core/neorv32_cpu_control.vhd +++ b/rtl/core/neorv32_cpu_control.vhd @@ -372,7 +372,7 @@ begin ibus_req_o.src <= '1'; -- source = instruction fetch ibus_req_o.amo <= '0'; -- cannot be an atomic memory operation ibus_req_o.amoop <= (others => '0'); -- cannot be an atomic memory operation - ibus_req_o.fence <= ctrl.lsu_fence; -- fence operation, valid without STB being set + ibus_req_o.fence <= ctrl.if_fence; -- fence operation, valid without STB being set ibus_req_o.sleep <= sleep_mode; -- sleep mode, valid without STB being set ibus_req_o.debug <= debug_ctrl.run; -- debug mode, valid without STB being set @@ -753,7 +753,8 @@ begin -- memory fence operations (execute even if illegal funct3) -- when opcode_fence_c => - ctrl_nxt.lsu_fence <= '1'; -- [NOTE] fence == fence.i; ignore all ordering bits + ctrl_nxt.if_fence <= exe_engine.ir(instr_funct3_lsb_c); -- fence + ctrl_nxt.lsu_fence <= not exe_engine.ir(instr_funct3_lsb_c); -- fence.i exe_engine_nxt.state <= EX_RESTART; -- reset instruction fetch + IPB (actually only required for fence.i) -- FPU: floating-point operations -- @@ -853,6 +854,8 @@ begin -- CPU Control Bus Output ----------------------------------------------------------------- -- ------------------------------------------------------------------------------------------- + -- instruction fetch -- + ctrl_o.if_fence <= ctrl.if_fence; -- register file -- ctrl_o.rf_wb_en <= ctrl.rf_wb_en and (not trap_ctrl.exc_fire); -- inhibit write-back if exception ctrl_o.rf_rs1 <= 
exe_engine.ir(instr_rs1_msb_c downto instr_rs1_lsb_c); diff --git a/rtl/core/neorv32_package.vhd b/rtl/core/neorv32_package.vhd index 4c29eb752..97c93f21f 100644 --- a/rtl/core/neorv32_package.vhd +++ b/rtl/core/neorv32_package.vhd @@ -29,7 +29,7 @@ package neorv32_package is -- Architecture Constants ----------------------------------------------------------------- -- ------------------------------------------------------------------------------------------- - constant hw_version_c : std_ulogic_vector(31 downto 0) := x"01100902"; -- hardware version + constant hw_version_c : std_ulogic_vector(31 downto 0) := x"01100903"; -- hardware version constant archid_c : natural := 19; -- official RISC-V architecture ID constant XLEN : natural := 32; -- native data path width @@ -335,11 +335,11 @@ package neorv32_package is -- RISC-V Funct12 - SYSTEM ---------------------------------------------------------------- -- ------------------------------------------------------------------------------------------- - constant funct12_ecall_c : std_ulogic_vector(11 downto 0) := x"000"; -- ecall - constant funct12_ebreak_c : std_ulogic_vector(11 downto 0) := x"001"; -- ebreak - constant funct12_wfi_c : std_ulogic_vector(11 downto 0) := x"105"; -- wfi - constant funct12_mret_c : std_ulogic_vector(11 downto 0) := x"302"; -- mret - constant funct12_dret_c : std_ulogic_vector(11 downto 0) := x"7b2"; -- dret + constant funct12_ecall_c : std_ulogic_vector(11 downto 0) := x"000"; + constant funct12_ebreak_c : std_ulogic_vector(11 downto 0) := x"001"; + constant funct12_wfi_c : std_ulogic_vector(11 downto 0) := x"105"; + constant funct12_mret_c : std_ulogic_vector(11 downto 0) := x"302"; + constant funct12_dret_c : std_ulogic_vector(11 downto 0) := x"7b2"; -- RISC-V Floating-Point Stuff ------------------------------------------------------------ -- ------------------------------------------------------------------------------------------- @@ -463,7 +463,6 @@ package neorv32_package is 
constant csr_mhpmcounter13_c : std_ulogic_vector(11 downto 0) := x"b0d"; constant csr_mhpmcounter14_c : std_ulogic_vector(11 downto 0) := x"b0e"; constant csr_mhpmcounter15_c : std_ulogic_vector(11 downto 0) := x"b0f"; - -- constant csr_mcycleh_c : std_ulogic_vector(11 downto 0) := x"b80"; --constant csr_mtimeh_c : std_ulogic_vector(11 downto 0) := x"b81"; constant csr_minstreth_c : std_ulogic_vector(11 downto 0) := x"b82"; @@ -487,7 +486,6 @@ package neorv32_package is constant csr_cycle_c : std_ulogic_vector(11 downto 0) := x"c00"; --constant csr_time_c : std_ulogic_vector(11 downto 0) := x"c01"; constant csr_instret_c : std_ulogic_vector(11 downto 0) := x"c02"; - -- constant csr_cycleh_c : std_ulogic_vector(11 downto 0) := x"c80"; --constant csr_timeh_c : std_ulogic_vector(11 downto 0) := x"c81"; constant csr_instreth_c : std_ulogic_vector(11 downto 0) := x"c82"; @@ -507,6 +505,8 @@ package neorv32_package is -- Main CPU Control Bus ------------------------------------------------------------------- -- ------------------------------------------------------------------------------------------- type ctrl_bus_t is record + -- instruction fetch -- + if_fence : std_ulogic; -- fence.i operation -- register file -- rf_wb_en : std_ulogic; -- write back enable rf_rs1 : std_ulogic_vector(4 downto 0); -- source register 1 address @@ -526,7 +526,7 @@ package neorv32_package is lsu_req : std_ulogic; -- trigger memory access request lsu_rw : std_ulogic; -- 0: read access, 1: write access lsu_mo_we : std_ulogic; -- memory address and data output register write enable - lsu_fence : std_ulogic; -- fence(.i) operation + lsu_fence : std_ulogic; -- fence operation lsu_priv : std_ulogic; -- effective privilege mode for load/store -- instruction word -- ir_funct3 : std_ulogic_vector(2 downto 0); -- funct3 bit field @@ -541,6 +541,7 @@ package neorv32_package is -- control bus reset initializer -- constant ctrl_bus_zero_c : ctrl_bus_t := ( + if_fence => '0', rf_wb_en => '0', rf_rs1 => 
(others => '0'), rf_rs2 => (others => '0'), From 1c55739b6d595d1af932dd66e7929959f15c2694 Mon Sep 17 00:00:00 2001 From: stnolting Date: Fri, 10 Jan 2025 21:18:41 +0100 Subject: [PATCH 2/6] [changelog] add v1.10.9.3 --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index bf271fc48..49118ecb3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -29,6 +29,7 @@ mimpid = 0x01040312 -> Version 01.04.03.12 -> v1.4.3.12 | Date | Version | Comment | Ticket | |:----:|:-------:|:--------|:------:| +| 10.01.2025 | 1.10.9.3 | split functional behavior of `fence` and `fence.i` instructions | [#1149](https://github.com/stnolting/neorv32/pull/1149) | | 10.01.2025 | 1.10.9.2 | clean-up SMP dual-core configuration (HW and SW optimizations) | [#1146](https://github.com/stnolting/neorv32/pull/1146) | | 09.01.2025 | 1.10.9.1 | fix side-effects of CSR read instructions | [#1145](https://github.com/stnolting/neorv32/pull/1145) | | 08.01.2025 | [**:rocket:1.10.9**](https://github.com/stnolting/neorv32/releases/tag/v1.10.9) | **New release** | | From 65c6ffa883d4f19a49c1b6f8293c43f7fbdc0d8f Mon Sep 17 00:00:00 2001 From: stnolting Date: Fri, 10 Jan 2025 21:19:04 +0100 Subject: [PATCH 3/6] [docs] add new section: cache coherency --- docs/datasheet/soc.adoc | 88 ++++++++++++++++++++++++++++++----------- 1 file changed, 64 insertions(+), 24 deletions(-) diff --git a/docs/datasheet/soc.adoc b/docs/datasheet/soc.adoc index 93b31afe2..4fe53863c 100644 --- a/docs/datasheet/soc.adoc +++ b/docs/datasheet/soc.adoc @@ -575,6 +575,25 @@ Accesses that are delegated to the external bus interface have a different maxim explicit specific processor generic. See section <<_processor_external_bus_interface_xbus>> for more information. +:sectnums: +==== IO Switch + +The IO switch further decodes the address when accessing the processor-internal IO/peripheral devices and forwards +the access request to the according module. 
Note that a total address space size of 256 bytes is assigned to each +IO module in order to simplify address decoding. The IO-specific address map is also defined in the main VHDL +package file (`rtl/core/neorv32_package.vhd`). + +.Exemplary Cut-Out from the IO Address Map +[source,vhdl] +---- +-- IO Address Map -- +constant iodev_size_c : natural := 256; -- size of a single IO device (bytes) +constant base_io_cfs_c : std_ulogic_vector(31 downto 0) := x"ffffeb00"; +constant base_io_slink_c : std_ulogic_vector(31 downto 0) := x"ffffec00"; +constant base_io_dma_c : std_ulogic_vector(31 downto 0) := x"ffffed00"; +---- + + :sectnums: ==== Atomic Memory Operations Controller @@ -595,41 +614,62 @@ written to the addressed memory cell. In parallel, the data from the first buffe content of the addresses memory cell) is sent back to the requesting CPU. |======================= -The controller performs two bus transactions: a read operations and a write operation. Only the acknowledge/error -handshake of the last transaction is sent back to the CPU. - -As the AMO controller is the memory-nearest instance (see <<_bus_system>>) the previously described set of operations -cannot be interrupted. Hence, they execute in an atomic way. +.Direct Access +[IMPORTANT] +Atomic operations **always bypass** the CPU's <<_processor_internal_data_cache_dcache, data cache>> +using direct/uncached accesses. Care must be taken to maintain data <<_cache_coherency>>. .Physical Memory Attributes [NOTE] Atomic memory operations can be executed for _any_ address. This also includes cached memory, memory-mapped IO devices and processor-external address spaces. -.Cache Coherency -[IMPORTANT] -Atomic operations **always bypass** the CPU's <<_processor_internal_data_cache_dcache, data cache>> -using direct/uncached accesses. Care must be taken to maintain data cache coherency when accessing -cached memory (e.g. by using the `fence` instruction). 
+The controller performs two bus transactions: a read operation and a write operation. Only the acknowledge/error +handshake of the last transaction is sent back to the CPU. + +As the AMO controller is the memory-nearest instance (see <<_bus_system>>) the previously described set of operations +cannot be interrupted. Hence, they execute in an atomic way. :sectnums: -==== IO Switch +==== Cache Coherency -The IO switch further decodes the address when accessing the processor-internal IO/peripheral devices and forwards -the access request to the according module. Note that a total address space size of 256 bytes is assigned to each -IO module in order to simplify address decoding. The IO-specific address map is also defined in the main VHDL -package file (`rtl/core/neorv323_package.vhd`). +In total the NEORV32 Processor provides up to four optional caches organized in two levels. Level-1 +caches are closer to the CPU while level-2 caches are closer to main memory (however, this highly depends +on the actual cache configurations). -.Exemplary Cut-Out from the IO Address Map -[source,vhdl] ----- --- IO Address Map -- -constant iodev_size_c : natural := 256; -- size of a single IO device (bytes) -constant base_io_cfs_c : std_ulogic_vector(31 downto 0) := x"ffffeb00"; -constant base_io_slink_c : std_ulogic_vector(31 downto 0) := x"ffffec00"; -constant base_io_dma_c : std_ulogic_vector(31 downto 0) := x"ffffed00"; ----- + +* The <<_processor_internal_data_cache_dcache>> (level-1) +* The <<_processor_internal_instruction_cache_icache>> (level-1) +* The cache of the <<_processor_external_bus_interface_xbus>> (level-2) +* The cache of the <<_execute_in_place_module_xip>> (level-2) + +As all caches operate transparently for the software, special attention must therefore be paid to coherence. +Note that coherence and cache _synchronization_ is **not** performed by the hardware itself (there is no +snooping implemented). 
+ +The NEORV32 uses two instructions for manual cache synchronization (both instructions are always available +regardless of the actual CPU/ISA configuration): + +* `fence` (<<_i_isa_extension>> / <<_e_isa_extension>>) +* `fence.i` (<<_zifencei_isa_extension>>) + +By executing the "data" `fence` instruction the CPU's data cache is synchronized in four steps: + +[start=1] +. The CPU data cache is flushed: all local modifications are copied to the next higher memory level; +this can be the XBUS cache or main memory. +. The CPU data cache is cleared invalidating all local entries. +. The synchronization request is sent to the next-higher memory level (for example to the XBUS cache +so it can perform the same synchronization steps). +. The CPU data cache is reloaded with up-to-date data from the next higher memory level. + +By executing the "instruction" `fence.i` instruction the CPU's instruction cache is synchronized in three steps: + +[start=1] +. The synchronization request is sent to the next-higher memory level (for example to the XBUS cache +so it can perform the same synchronization steps). +. The CPU instruction cache is cleared invalidating all local entries. +. The CPU instruction cache is reloaded with up-to-date data from the next higher memory level. 
<<< From fb102bf3c589c110204db349fcf64d4db795c3b7 Mon Sep 17 00:00:00 2001 From: stnolting Date: Fri, 10 Jan 2025 21:19:25 +0100 Subject: [PATCH 4/6] [docs] minor edits and updates --- docs/datasheet/cpu.adoc | 18 ++++++++---------- docs/datasheet/soc_dcache.adoc | 3 ++- docs/datasheet/soc_icache.adoc | 3 ++- docs/datasheet/soc_xbus.adoc | 6 ++++++ docs/datasheet/soc_xip.adoc | 2 +- 5 files changed, 19 insertions(+), 13 deletions(-) diff --git a/docs/datasheet/cpu.adoc b/docs/datasheet/cpu.adoc index 6a93c501b..b9838bb7e 100644 --- a/docs/datasheet/cpu.adoc +++ b/docs/datasheet/cpu.adoc @@ -503,7 +503,7 @@ operation: .Cache Coherency [IMPORTANT] Atomic operations **always bypass** the CPU caches using direct/uncached accesses. Care must be taken -to maintain data cache coherency (e.g. by using the `fence` instruction). +to maintain data <<_cache_coherency>>. <<< @@ -641,10 +641,11 @@ The `I` ISA extensions is the base RISC-V integer ISA that is always enabled. .`fence` Instruction [NOTE] -The `fence` instruction word's _predecessor_ and _successor_ bits (used for memory ordering) are not evaluated -by the hardware at all. For the NEORV32 the `fence` instruction behaves exactly like the `fence.i` instruction -(see <<_zifencei_isa_extension>>). However, software should still use distinct `fence` and `fence.i` to provide -platform-compatibility and to indicate the actual intention of the according fence instruction(s). +Analogous to the `fence.i` instruction (<<_zifencei_isa_extension>>) the `fence` instruction triggers +a data cache synchronization operation. See section <<_cache_coherency>> for more information. +Furthermore, the `fence` instruction word's _predecessor_ and _successor_ bits (used for memory ordering) +are not evaluated by the hardware at all. 
+ .`wfi` Instruction [NOTE] @@ -717,11 +718,8 @@ The instruction word's `aq` and `lr` memory ordering bits are not evaluated by t The `Zifencei` CPU extension allows manual synchronization of the instruction stream. This extension is always enabled. -.NEORV32 Fence Instructions -[NOTE] -The NEORV32 treats both fence instructions (`fence` = data fence, `fence.i` = instruction fence) in exactly the same way. -Both instructions cause a flush of the CPU's instruction prefetch buffer and also send a fence request via the system -bus (see <<_bus_interface>>). This system bus fence operation will, for example, clear/flush all downstream caches. +Analogous to the `fence` instruction the `fence.i` instruction triggers an instruction cache synchronization operation. +See section <<_cache_coherency>> for more information. .Instructions and Timing [cols="<2,<4,<3"] diff --git a/docs/datasheet/soc_dcache.adoc b/docs/datasheet/soc_dcache.adoc index 44f317df2..7256cccdb 100644 --- a/docs/datasheet/soc_dcache.adoc +++ b/docs/datasheet/soc_dcache.adoc @@ -37,7 +37,8 @@ The CPU cache(s) should not be implemented when using only processor-internal da .Manual Cache Flush/Clear/Reload [NOTE] -By executing the `fence(.i)` instruction the cache is flushed, cleared and a reload from main memory is triggered. +By executing the `fence` instruction the data cache is flushed, cleared and reloaded. +See section <<_cache_coherency>> for more information. .Retrieve Cache Configuration from Software [TIP] diff --git a/docs/datasheet/soc_icache.adoc b/docs/datasheet/soc_icache.adoc index 072c16f9e..47f5215b0 100644 --- a/docs/datasheet/soc_icache.adoc +++ b/docs/datasheet/soc_icache.adoc @@ -37,7 +37,8 @@ The CPU cache(s) should not be implemented when using only processor-internal da .Manual Cache Clear/Reload [NOTE] -By executing the `fence(.i)` instruction the cache is cleared and a reload from main memory is triggered. 
+By executing the `fence.i` instruction the instruction cache is cleared and reloaded. +See section <<_cache_coherency>> for more information. .Retrieve Cache Configuration from Software [TIP] diff --git a/docs/datasheet/soc_xbus.adoc b/docs/datasheet/soc_xbus.adoc index 766ee2474..77ada41d8 100644 --- a/docs/datasheet/soc_xbus.adoc +++ b/docs/datasheet/soc_xbus.adoc @@ -133,6 +133,12 @@ The **write-allocate** strategy will fetch the entire referenced block from main a cache write-miss. The **write-back** strategy will gather all writes locally inside the cache until the according cache block is about to be replaced. In this case, the entire modified cache block is written back to main memory. +.Manual Cache Flush/Clear/Reload +[NOTE] +By executing a `fence` **or** `fence.i` instruction the XBUS cache is flushed (local modifications are sent back to +main memory), cleared (all cache entries are invalidated) and reloaded (fetching new data from main memory). +See section <<_cache_coherency>> for more information. + .Cached/Uncached Accesses [NOTE] The data cache provides direct accesses (= uncached) to memory in order to access memory-mapped IO. diff --git a/docs/datasheet/soc_xip.adoc b/docs/datasheet/soc_xip.adoc index d33e2af07..9b93a03ad 100644 --- a/docs/datasheet/soc_xip.adoc +++ b/docs/datasheet/soc_xip.adoc @@ -181,7 +181,7 @@ When the cache is implemented, the XIP module operates in **burst mode** utilizi Thus, several bytes (= `XIP_CACHE_BLOCK_SIZE`) are read consecutively from the flash using a single read command. The XIP cache is cleared when the XIP module is disabled (`XIP_CTRL_EN = 0`), when XIP mode is disabled -(`XIP_CTRL_XIP_EN = 0`) or when the CPU issues a `fence(.i)` instruction. +(`XIP_CTRL_XIP_EN = 0`) or when the CPU issues a `fence[.i]` instruction. 
**Register Map** From 2d00e4fe159dd619d0a1771ef6741674fe5a31e1 Mon Sep 17 00:00:00 2001 From: stnolting Date: Fri, 10 Jan 2025 21:20:32 +0100 Subject: [PATCH 5/6] =?UTF-8?q?=E2=9C=A8=20[cache]=20refine=20fence=20prop?= =?UTF-8?q?agation?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit for read-only caches: send fence/synchronization before reloading. for read-write caches: send fence/synchronization after flushing but before reloading --- rtl/core/neorv32_cache.vhd | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/rtl/core/neorv32_cache.vhd b/rtl/core/neorv32_cache.vhd index b7a0f75b4..36170aed3 100644 --- a/rtl/core/neorv32_cache.vhd +++ b/rtl/core/neorv32_cache.vhd @@ -8,11 +8,6 @@ -- the 4 most significant address bits, well as all atomic (reservation set) -- -- operations will always **bypass** the cache resulting in "direct accesses". -- -- -- --- A fence request will first flush the data cache (write back modified blocks to -- --- main memory before invalidating all cache blocks to force a re-fetch from main -- --- memory. After this, the fence request is forwarded to the downstream memory -- --- system. 
-- --- -- -- Simplified cache architecture ("-->" = direction of access requests): -- -- -- -- Direct Access +----------+ -- @@ -946,9 +941,10 @@ begin when S_FLUSH_START => -- start checking for dirty blocks -- ------------------------------------------------------------ - addr_nxt.idx <= (others => '0'); -- start with index 0 - upret_nxt <= S_FLUSH_CHECK; -- come back to S_FLUSH_CHECK after block upload - state_nxt <= S_FLUSH_READ; + addr_nxt.idx <= (others => '0'); -- start with index 0 + bus_req_o.fence <= bool_to_ulogic_f(READ_ONLY); -- forward fence request + upret_nxt <= S_FLUSH_CHECK; -- come back to S_FLUSH_CHECK after block upload + state_nxt <= S_FLUSH_READ; when S_FLUSH_READ => -- cache read access latency cycle -- ------------------------------------------------------------ @@ -963,7 +959,7 @@ begin else -- move on to next block addr_nxt.idx <= std_ulogic_vector(unsigned(addr.idx) + 1); if (and_reduce_f(addr.idx) = '1') then -- all blocks done? - bus_req_o.fence <= '1'; -- forward fence request to downstream memories + bus_req_o.fence <= not bool_to_ulogic_f(READ_ONLY); -- forward fence request state_nxt <= S_IDLE; else -- go to next block state_nxt <= S_FLUSH_READ; From f2b06ed203d87b9c2ca22ceb527d7eb201139922 Mon Sep 17 00:00:00 2001 From: stnolting Date: Fri, 10 Jan 2025 21:42:27 +0100 Subject: [PATCH 6/6] [rtl] comment typo fix --- rtl/core/neorv32_cpu_control.vhd | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/rtl/core/neorv32_cpu_control.vhd b/rtl/core/neorv32_cpu_control.vhd index 9c76e88d0..381d5b20b 100644 --- a/rtl/core/neorv32_cpu_control.vhd +++ b/rtl/core/neorv32_cpu_control.vhd @@ -753,8 +753,8 @@ begin -- memory fence operations (execute even if illegal funct3) -- when opcode_fence_c => - ctrl_nxt.if_fence <= exe_engine.ir(instr_funct3_lsb_c); -- fence - ctrl_nxt.lsu_fence <= not exe_engine.ir(instr_funct3_lsb_c); -- fence.i + ctrl_nxt.if_fence <= exe_engine.ir(instr_funct3_lsb_c); -- fence.i + ctrl_nxt.lsu_fence <= 
not exe_engine.ir(instr_funct3_lsb_c); -- fence exe_engine_nxt.state <= EX_RESTART; -- reset instruction fetch + IPB (actually only required for fence.i) -- FPU: floating-point operations --