Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Mini core accel #707

Closed
wants to merge 16 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 57 additions & 0 deletions app/defines/mini_core_accel_defines.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
#ifndef MINI_CORE_ACCEL_DEFINES_H
#define MINI_CORE_ACCEL_DEFINES_H

// For the detailed address mapping please refer to `big_core_defines.h`.
// We use the same memory mapping as the big core for a consistent implementation,
// but it's not obligatory.
// For example: we use the same ref model for all the cores, and we don't want to change the addresses of the CRs in that core
// because that would force us to make changes in the ref model.

#define D_MEM_BASE 0x00010000
#define CR_MEM_BASE 0x00FE0000

//=============================================================================
// Offset: 0x0000_0000 | Instruction Memory - 64KB :
// | Also known as the "text" memory.
// |
// Ends: 0x0000_FFFF |
//-----------------------------------------------------------------------------
// Offset: 0x0001_0000 | Data Memory - 64KB : .data -> Initialized data
// | .bss -> Uninitialized data
// | .heap -> grows upwards (towards stack)
// Ends: 0x0001_EFFF | .stack -> grows downwards (towards heap)
//-----------------------------------------------------------------------------
// Offset: 0x0002_0000 | Reserved - 16MB : (Mega Bytes!!!!)
// | This memory is reserved for future use
// | where we will utilize the FPGA off-die memory.
// Ends: 0x00FD_FFFF | Up to 64MB on the DE10-lite FPGA board. (TODO - see if we reserve 64MB instead of 16MB)
//-----------------------------------------------------------------------------
// Offset: 0x00FE_0000 | CR Memory - 64KB :
// | This memory is used as MMIO control registers
// | TODO - there are only handful of registers defined here.
// Ends: 0x00FE_FFFF | should be moved to a different memory section
//-----------------------------------------------------------------------------

/* Control registers addressed */
#define CR_CORE2MUL_0 ((volatile int *) (CR_MEM_BASE + 0xF000))
#define CR_CORE2MUL_1 ((volatile int *) (CR_MEM_BASE + 0xF001))
#define CR_CORE2MUL_2 ((volatile int *) (CR_MEM_BASE + 0xF002))
#define CR_CORE2MUL_3 ((volatile int *) (CR_MEM_BASE + 0xF003))
#define CR_CORE2MUL_4 ((volatile int *) (CR_MEM_BASE + 0xF004))
#define CR_CORE2MUL_5 ((volatile int *) (CR_MEM_BASE + 0xF005))
#define CR_CORE2MUL_6 ((volatile int *) (CR_MEM_BASE + 0xF006))
#define CR_CORE2MUL_7 ((volatile int *) (CR_MEM_BASE + 0xF007))

#define CR_MUL2CORE_0 ((volatile int *) (CR_MEM_BASE + 0xF008))
#define CR_MUL2CORE_1 ((volatile int *) (CR_MEM_BASE + 0xF009))
#define CR_MUL2CORE_2 ((volatile int *) (CR_MEM_BASE + 0xF00A))
#define CR_MUL2CORE_3 ((volatile int *) (CR_MEM_BASE + 0xF00B))
#define CR_MUL2CORE_4 ((volatile int *) (CR_MEM_BASE + 0xF00C))
#define CR_MUL2CORE_5 ((volatile int *) (CR_MEM_BASE + 0xF00D))
#define CR_MUL2CORE_6 ((volatile int *) (CR_MEM_BASE + 0xF00E))
#define CR_MUL2CORE_7 ((volatile int *) (CR_MEM_BASE + 0xF00F))




#endif
100 changes: 100 additions & 0 deletions source/mini_core_accel/accelerators/booth_multiplier.sv
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
//-----------------------------------------------------------------------------
// Title : booth multiplier
// Project : mini_core_accelerator
//-----------------------------------------------------------------------------
// File : booth_multiplier
// Original Author :
// Code Owner :
// Created :
//-----------------------------------------------------------------------------
// Description : 8 bit signed multiplier
//-----------------------------------------------------------------------------
`include "macros.vh"

module booth_multiplier
import mini_core_accel_pkg::*;
import mini_core_pkg::*;
(
input logic clock,
input logic rst,
input var t_mul_input_req input_req,
output var t_mul_output_rsp output_rsp

);

t_booth_states state, next_state;
logic [2*NUM_WIDTH:0] acc_multiplier_lsb, next_acc_multiplier_lsb;
logic [$clog2(NUM_WIDTH):0] itr_num, next_itr_num;
int8 multiplicand, next_multiplicand;


// state machine logic and state transitions
always_comb begin: state_machine
next_state = state;
next_itr_num = itr_num;
next_acc_multiplier_lsb = acc_multiplier_lsb;
next_multiplicand = multiplicand;
case(state)
IDLE: begin
if(input_req.valid) begin
next_state = SUB_OR_ADD_AM;
next_multiplicand = input_req.multiplicand; // store multiplicand
next_acc_multiplier_lsb = {{(NUM_WIDTH){1'b0}}, input_req.multiplier, 1'b0};
end
else begin
next_state = IDLE;
end
end
SUB_OR_ADD_AM: begin
if(acc_multiplier_lsb[1:0] == 2'b01) begin
next_acc_multiplier_lsb[2*NUM_WIDTH:NUM_WIDTH+1] = acc_multiplier_lsb[2*NUM_WIDTH:NUM_WIDTH+1] + multiplicand;
end
else if(acc_multiplier_lsb[1:0] == 2'b10) begin
next_acc_multiplier_lsb[2*NUM_WIDTH:NUM_WIDTH+1] = acc_multiplier_lsb[2*NUM_WIDTH:NUM_WIDTH+1] + ~multiplicand + 1'b1;
end
// in any case we go to the next atate
next_state = ARITHMETIC_SHIFT_RIGHT;
next_itr_num = itr_num - 1;
end
ARITHMETIC_SHIFT_RIGHT: begin
next_acc_multiplier_lsb = $signed(acc_multiplier_lsb) >>> 1;
if(itr_num != 0) begin
next_state = SUB_OR_ADD_AM;
end
else begin
next_state = DONE;
end
end
DONE: begin
if(input_req.valid) begin
next_state = IDLE;
end
else begin
next_state = DONE;
end

end
default: next_state = IDLE;
endcase

end

// output logic
assign output_rsp.valid = (state == DONE) ? 1'b1 : 1'b0;
// in our implementation the accumulator has NUM_WIDTH bits. When the multiplicand equals -128 it causes overflow and the result is incorrect.
// I have added a fix by multiplying it by 1.
// FIXME - consider implementing in differente implementation to avoid that
assign output_rsp.result = ((state == DONE) && (multiplicand == -8'd128)) ? ~acc_multiplier_lsb[2*NUM_WIDTH:1] + 1 :
(state == DONE) ? acc_multiplier_lsb[2*NUM_WIDTH:1] :
1'b0;
assign output_rsp.busy = (state == IDLE || state == DONE) ? 1'b0 : 1'b1;

logic rst_itr_num_en;
assign rst_itr_num_en = rst || (state == IDLE);

`MAFIA_DFF(multiplicand, next_multiplicand, clock)
`MAFIA_RST_VAL_DFF(state, next_state, clock, rst, IDLE)
`MAFIA_RST_VAL_DFF(itr_num, next_itr_num, clock, rst_itr_num_en, NUM_WIDTH)
`MAFIA_RST_DFF(acc_multiplier_lsb, next_acc_multiplier_lsb, clock, rst)

endmodule
30 changes: 30 additions & 0 deletions source/mini_core_accel/accelerators/mini_core_accel_farm.sv
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
`include "macros.vh"

module mini_core_accel_farm
import mini_core_pkg::*;
import mini_core_accel_pkg::*;
#(parameter MUL_NUM = 8)
(
input logic clock,
input logic rst,
input var t_core2mul_req core2mul_req,
output var t_mul2core_rsp mul2core_rsp
);

genvar mul_index;

generate
for(mul_index = 0; mul_index < MUL_NUM; mul_index++) begin : multiplier_inst
booth_multiplier booth_multiplier (
.clock(clock),
.rst(rst),
.input_req(core2mul_req[mul_index]),
.output_rsp(mul2core_rsp[mul_index])
);

end
endgenerate


endmodule

157 changes: 157 additions & 0 deletions source/mini_core_accel/mini_core_accel_cr_mem.sv
Original file line number Diff line number Diff line change
@@ -0,0 +1,157 @@
//-----------------------------------------------------------------------------
// Title : mini_core_accell_cr_mem
// Project : fpga_mafia
//-----------------------------------------------------------------------------
// File : mini_core_accell_cr_mem.sv
// Original Author :
// Code Owner :
// Adviser :
// Created :
//-----------------------------------------------------------------------------
// Description :
// The CR memory module - Control Registers memory.
// contains Flop based memory for the Control Registers.
// The memory is accessed by the core and the accelerator farm.


`include "macros.vh"

module mini_core_accell_cr_mem
import mini_core_accel_pkg::*;
(
input logic Clk,
input logic Rst,

// Core interface
input logic [31:0] data,
input logic [31:0] address,
input logic wren,
input logic rden,
output logic [31:0] q,

// Accelerators interface
input var t_mul2core_rsp mul2core_rsp,
output var t_core2mul_req core2mul_req
);

t_accel_cr accel_cr;
t_accel_cr next_accel_cr;

`MAFIA_DFF(accel_cr, next_accel_cr, Clk)

logic [31:0] pre_q;

// write to accel_cr
always_comb begin :wr_to_accel_cr
next_accel_cr = Rst ? '0 : accel_cr;
if(wren) begin // writing data from core to accelerators.
unique casez (address)
// each CR_CORE2MUL_I concantinated with {start, multiplicand, multiplier}
CR_CORE2MUL_0 : next_accel_cr.cr_core2mul_0 = data[16:0];
CR_CORE2MUL_1 : next_accel_cr.cr_core2mul_1 = data[16:0];
CR_CORE2MUL_2 : next_accel_cr.cr_core2mul_2 = data[16:0];
CR_CORE2MUL_3 : next_accel_cr.cr_core2mul_3 = data[16:0];
CR_CORE2MUL_4 : next_accel_cr.cr_core2mul_4 = data[16:0];
CR_CORE2MUL_5 : next_accel_cr.cr_core2mul_5 = data[16:0];
CR_CORE2MUL_6 : next_accel_cr.cr_core2mul_6 = data[16:0];
CR_CORE2MUL_7 : next_accel_cr.cr_core2mul_7 = data[16:0];
default : ; // do nothing
endcase
// hard wired
next_accel_cr.cr_mul2core_0 = {mul2core_rsp.mul2core[0].busy, mul2core_rsp.mul2core[0].valid, mul2core_rsp.mul2core[0].result};
next_accel_cr.cr_mul2core_1 = {mul2core_rsp.mul2core[1].busy, mul2core_rsp.mul2core[1].valid, mul2core_rsp.mul2core[1].result};
next_accel_cr.cr_mul2core_2 = {mul2core_rsp.mul2core[2].busy, mul2core_rsp.mul2core[2].valid, mul2core_rsp.mul2core[2].result};
next_accel_cr.cr_mul2core_3 = {mul2core_rsp.mul2core[3].busy, mul2core_rsp.mul2core[3].valid, mul2core_rsp.mul2core[3].result};
next_accel_cr.cr_mul2core_4 = {mul2core_rsp.mul2core[4].busy, mul2core_rsp.mul2core[4].valid, mul2core_rsp.mul2core[4].result};
next_accel_cr.cr_mul2core_5 = {mul2core_rsp.mul2core[5].busy, mul2core_rsp.mul2core[5].valid, mul2core_rsp.mul2core[5].result};
next_accel_cr.cr_mul2core_6 = {mul2core_rsp.mul2core[6].busy, mul2core_rsp.mul2core[6].valid, mul2core_rsp.mul2core[6].result};
next_accel_cr.cr_mul2core_7 = {mul2core_rsp.mul2core[7].busy, mul2core_rsp.mul2core[7].valid, mul2core_rsp.mul2core[7].result};

end
end

// reading data
always_comb begin : read_from_accel_cr
pre_q = 0;
if(rden) begin
unique casez(address)
CR_MUL2CORE_0 : pre_q = accel_cr.cr_mul2core_0;
CR_MUL2CORE_1 : pre_q = accel_cr.cr_mul2core_1;
CR_MUL2CORE_2 : pre_q = accel_cr.cr_mul2core_2;
CR_MUL2CORE_3 : pre_q = accel_cr.cr_mul2core_3;
CR_MUL2CORE_4 : pre_q = accel_cr.cr_mul2core_4;
CR_MUL2CORE_5 : pre_q = accel_cr.cr_mul2core_5;
CR_MUL2CORE_6 : pre_q = accel_cr.cr_mul2core_6;
CR_MUL2CORE_7 : pre_q = accel_cr.cr_mul2core_7;
default : ; // do nothing
endcase

// hard wired from cr to multipliers
{core2mul_req.core2mul[0].valid, core2mul_req.core2mul[0].multiplier, core2mul_req.core2mul[0].multiplicand} = {accel_cr.cr_core2mul_0[16],
accel_cr.cr_core2mul_0[7:0],
accel_cr.cr_core2mul_0[15:8]};


{core2mul_req.core2mul[1].valid, core2mul_req.core2mul[1].multiplier, core2mul_req.core2mul[1].multiplicand} = {accel_cr.cr_core2mul_1[16],
accel_cr.cr_core2mul_1[7:0],
accel_cr.cr_core2mul_1[15:8]};

{core2mul_req.core2mul[2].valid, core2mul_req.core2mul[2].multiplier, core2mul_req.core2mul[2].multiplicand} = {accel_cr.cr_core2mul_2[16],
accel_cr.cr_core2mul_2[7:0],
accel_cr.cr_core2mul_2[15:8]};

{core2mul_req.core2mul[3].valid, core2mul_req.core2mul[3].multiplier, core2mul_req.core2mul[3].multiplicand} = {accel_cr.cr_core2mul_3[16],
accel_cr.cr_core2mul_3[7:0],
accel_cr.cr_core2mul_3[15:8]};

{core2mul_req.core2mul[4].valid, core2mul_req.core2mul[4].multiplier, core2mul_req.core2mul[4].multiplicand} = {accel_cr.cr_core2mul_4[16],
accel_cr.cr_core2mul_4[7:0],
accel_cr.cr_core2mul_4[15:8]};

{core2mul_req.core2mul[5].valid, core2mul_req.core2mul[5].multiplier, core2mul_req.core2mul[5].multiplicand} = {accel_cr.cr_core2mul_5[16],
accel_cr.cr_core2mul_5[7:0],
accel_cr.cr_core2mul_5[15:8]};

{core2mul_req.core2mul[6].valid, core2mul_req.core2mul[6].multiplier, core2mul_req.core2mul[6].multiplicand} = {accel_cr.cr_core2mul_6[16],
accel_cr.cr_core2mul_6[7:0],
accel_cr.cr_core2mul_6[15:8]};

{core2mul_req.core2mul[7].valid, core2mul_req.core2mul[7].multiplier, core2mul_req.core2mul[7].multiplicand} = {accel_cr.cr_core2mul_7[16],
accel_cr.cr_core2mul_7[7:0],
accel_cr.cr_core2mul_7[15:8]};
/*
// TODO possible refactor is needed to use less lines
core2mul_req.core2mul0.valid = accel_cr.cr_core2mul_0[16];
core2mul_req.core2mul1.valid = accel_cr.cr_core2mul_1[16];
core2mul_req.core2mul2.valid = accel_cr.cr_core2mul_2[16];
core2mul_req.core2mul3.valid = accel_cr.cr_core2mul_3[16];
core2mul_req.core2mul4.valid = accel_cr.cr_core2mul_4[16];
core2mul_req.core2mul5.valid = accel_cr.cr_core2mul_5[16];
core2mul_req.core2mul6.valid = accel_cr.cr_core2mul_6[16];
core2mul_req.core2mul7.valid = accel_cr.cr_core2mul_7[16];
// multiplier is recieving the multiplier
core2mul_req.core2mul0.multiplier = accel_cr.cr_core2mul_0[7:0];
core2mul_req.core2mul1.multiplier = accel_cr.cr_core2mul_1[7:0];
core2mul_req.core2mul2.multiplier = accel_cr.cr_core2mul_2[7:0];
core2mul_req.core2mul3.multiplier = accel_cr.cr_core2mul_3[7:0];
core2mul_req.core2mul4.multiplier = accel_cr.cr_core2mul_4[7:0];
core2mul_req.core2mul5.multiplier = accel_cr.cr_core2mul_5[7:0];
core2mul_req.core2mul6.multiplier = accel_cr.cr_core2mul_6[7:0];
core2mul_req.core2mul7.multiplier = accel_cr.cr_core2mul_7[7:0];
// multiplier is recieving the multiplicand
core2mul_req.core2mul0.multiplicand = accel_cr.cr_core2mul_0[15:8];
core2mul_req.core2mul1.multiplicand = accel_cr.cr_core2mul_1[15:8];
core2mul_req.core2mul2.multiplicand = accel_cr.cr_core2mul_2[15:8];
core2mul_req.core2mul3.multiplicand = accel_cr.cr_core2mul_3[15:8];
core2mul_req.core2mul4.multiplicand = accel_cr.cr_core2mul_4[15:8];
core2mul_req.core2mul5.multiplicand = accel_cr.cr_core2mul_5[15:8];
core2mul_req.core2mul6.multiplicand = accel_cr.cr_core2mul_6[15:8];
core2mul_req.core2mul7.multiplicand = accel_cr.cr_core2mul_7[15:8];
*/

end
end

`MAFIA_DFF(q, pre_q, Clk)

endmodule // Module
Loading
Loading