Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add neuron_mac unit #727

Merged
merged 12 commits into from
Aug 22, 2024
15 changes: 15 additions & 0 deletions app/defines/mini_core_accel_defines.h
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,10 @@

/* Control registers addressed */
//FIXME refactor to be more compact

//=====================================
// define CR's for INT8 multiplier
//=====================================
#define CR_CORE2MUL_INT8_MULTIPLICAND_0 ((volatile int *) (CR_MEM_BASE + 0xF000))
#define CR_CORE2MUL_INT8_MULTIPLIER_0 ((volatile int *) (CR_MEM_BASE + 0xF001))
#define CR_CORE2MUL_INT8_MULTIPLICAND_1 ((volatile int *) (CR_MEM_BASE + 0xF002))
Expand Down Expand Up @@ -103,7 +107,18 @@
#define CR_MUL2CORE_INT8_DONE_15 ((volatile int *) (CR_MEM_BASE + 0xF06F))


//==================================
// define CR's for neuron_mac
//==================================
#define NEURON_MAC_BIAS0 ((volatile int *) (CR_MEM_BASE + 0xF100))
#define NEURON_MAC_BIAS1 ((volatile int *) (CR_MEM_BASE + 0xF101))
#define NEURON_MAC_RESULT0 ((volatile int *) (CR_MEM_BASE + 0xF102))
#define NEURON_MAC_RESULT1 ((volatile int *) (CR_MEM_BASE + 0xF103))

// used for debug purposes
#define CR_DEBUG_0 ((volatile int *) (CR_MEM_BASE + 0xFF00))
#define CR_DEBUG_1 ((volatile int *) (CR_MEM_BASE + 0xFF01))
#define CR_DEBUG_2 ((volatile int *) (CR_MEM_BASE + 0xFF02))
#define CR_DEBUG_3 ((volatile int *) (CR_MEM_BASE + 0xFF03))

#endif
15 changes: 14 additions & 1 deletion source/mini_core_accel/accelerators/mini_core_accel_farm.sv
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
module mini_core_accel_farm
import mini_core_pkg::*;
import mini_core_accel_pkg::*;
#(parameter MUL_NUM = 16)
#(parameter MUL_NUM = 16, NEURON_MAC_NUM = 2)
(
input logic clock,
input logic rst,
Expand All @@ -25,6 +25,19 @@ generate
end
endgenerate

genvar neuron_mac_index;
generate
for(neuron_mac_index = 0; neuron_mac_index < NEURON_MAC_NUM; neuron_mac_index++) begin : neuron_mac_inst
neuron_mac neuron_mac (
.clk(clock),
.rst(rst),
.neuron_mac_input(accel_farm_input.int8_mul2neuron_mac[neuron_mac_index]),
.neuron_mac_output(accel_farm_output.neuron_mac_result[neuron_mac_index])
);

end
endgenerate


endmodule

32 changes: 32 additions & 0 deletions source/mini_core_accel/accelerators/neuron_mac.sv
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
//==============================================================================
// sum the 8 int8-multiplier results and the bias, then apply ReLU and saturate the result to int8
//==============================================================================

`include "macros.vh"

//------------------------------------------------------------------------------
// neuron_mac
//   Combinational neuron accumulate unit:
//     1. adds the eight 16-bit multiplier results and the 8-bit bias, all
//        interpreted as signed two's-complement values;
//     2. applies ReLU (negative sums are replaced by zero);
//     3. saturates the non-negative value to the int8 maximum (127) and drives
//        it on neuron_mac_output.int8_result.
//
//   Ports:
//     clk, rst          - not referenced by any logic in this module; kept so
//                         the instantiation interface stays unchanged.
//     neuron_mac_input  - .mul_result[7:0] (16 bit each) and .bias (8 bit).
//     neuron_mac_output - .int8_result, always in the range 0..127.
//------------------------------------------------------------------------------
module neuron_mac
import mini_core_accel_pkg::*;
(
    input  logic                   clk,
    input  logic                   rst,
    input  var t_neuron_mac_input  neuron_mac_input,
    output var t_neuron_mac_output neuron_mac_output
);

// Eight signed 16-bit terms need 16 + 3 = 19 bits; one more bit covers the bias.
logic signed [19:0] mac_result;
logic signed [19:0] ReLu_result;

// Every operand is cast with $signed so the whole sum is evaluated as a signed
// expression and each term is sign-extended to the 20-bit width of mac_result.
assign mac_result = $signed(neuron_mac_input.mul_result[0]) + $signed(neuron_mac_input.mul_result[1]) +
                    $signed(neuron_mac_input.mul_result[2]) + $signed(neuron_mac_input.mul_result[3]) +
                    $signed(neuron_mac_input.mul_result[4]) + $signed(neuron_mac_input.mul_result[5]) +
                    $signed(neuron_mac_input.mul_result[6]) + $signed(neuron_mac_input.mul_result[7]) +
                    $signed(neuron_mac_input.bias);

// ReLU: pass positive sums through, force everything else to zero.
assign ReLu_result = (mac_result > 0) ? mac_result : 'b0;

// Saturation: ReLu_result is never negative, so only the upper bound can be
// exceeded. The former "< -128" clamp could never be selected and was removed.
assign neuron_mac_output.int8_result = (ReLu_result > 20'sd127) ? 8'd127 : ReLu_result[7:0];

endmodule
55 changes: 55 additions & 0 deletions source/mini_core_accel/accelerators/pipe_line_mul.sv
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@

// implementation of an 8-stage pipelined Booth multiplier

`include "macros.vh"

// pipe_line_mul: 8x8-bit multiplier built as a chain of eight identical
// add/subtract-and-shift steps (stage 0 is combinational from the inputs,
// stages 1..7 are generated), with a register between consecutive stages.
//   clk          - clock passed to every MAFIA_DFF instance
//   rst          - gates the first stage only (valid bit and initial accumulator);
//                  the MAFIA_DFF instances below take no reset argument
//   start        - marks multiplier/multiplicand as valid for this cycle
//   multiplier   - 8-bit operand loaded into accumulator bits [8:1]
//   multiplicand - 8-bit operand added to / subtracted from accumulator bits [16:9]
//   ready        - valid bit that has travelled through all eight registers
//   result       - 16-bit product; forced to zero while ready is low
// NOTE(review): MAFIA_DFF comes from macros.vh (not visible here); presumably a
// plain clocked register, which would make the latency 8 cycles - confirm.
module pipe_line_mul
import mini_core_accel_pkg::*;
(

input logic clk,
input logic rst,
input logic start,
input logic [7:0] multiplier,
input logic [7:0] multiplicand,
output logic ready,
output logic [15:0] result
);

// acc[s]                 : registered 17-bit accumulator entering stage s (s = 1..8)
// next_acc[s]            : combinational output of stage s (s = 0..7)
// stored_multiplicand[s] : multiplicand travelling alongside the accumulator
// pipe_is_full[s]        : valid bit travelling alongside the accumulator
// init_acc               : accumulator value fed into stage 0
logic [16:0] acc [8:1];
logic [16:0] next_acc [7:0];
logic [7:0] stored_multiplicand [8:0];
logic [8:0] pipe_is_full;
logic [16:0] init_acc;


// a new valid entry is injected only when start is high and rst is low
assign pipe_is_full[0] = (rst || !start) ? 1'b0 : 1'b1;
assign ready = pipe_is_full[8]; // only after 8 stages the first result is ready

// first stage (count from zero)
// without start, a zero multiplicand and a zero accumulator are injected
assign stored_multiplicand[0] = (start) ? multiplicand : 'b0;
// accumulator layout: [16:9] = 0, [8:1] = multiplier, [0] = appended 0 bit
assign init_acc = (rst) ? 'b0 : (start) ? {8'b0, multiplier, 1'b0} : 'b0;
// Booth step on the two LSBs: 01 -> add multiplicand to bits [16:9],
// 10 -> subtract it, otherwise leave unchanged; then arithmetic shift right by 1.
// The add/subtract sits inside a concatenation, so it is evaluated at 8 bits
// and any carry/borrow out of bit 16 is dropped.
assign next_acc[0] = (init_acc[1:0] == 2'b01) ? $signed({init_acc[16:9]+stored_multiplicand[0], init_acc[8:1], init_acc[0]}) >>> 1 :
(init_acc[1:0] == 2'b10) ? $signed({init_acc[16:9]-stored_multiplicand[0], init_acc[8:1], init_acc[0]}) >>> 1 :
$signed(init_acc) >>> 1 ;

// register stage 0 outputs (accumulator, valid bit, multiplicand) for stage 1
`MAFIA_DFF(acc[1], next_acc[0], clk)
`MAFIA_DFF(pipe_is_full[1], pipe_is_full[0], clk)
`MAFIA_DFF(stored_multiplicand[1], stored_multiplicand[0], clk)

// stages 1..7: same step as stage 0, applied to the registered accumulator
// acc[stage_num]; the outputs are registered into index stage_num+1
genvar stage_num;
generate
for(stage_num = 1; stage_num <=7; stage_num++) begin
assign next_acc[stage_num] = (acc[stage_num][1:0] == 2'b01) ? $signed({acc[stage_num][16:9]+stored_multiplicand[stage_num], acc[stage_num][8:1], acc[stage_num][0]}) >>> 1 :
(acc[stage_num][1:0] == 2'b10) ? $signed({acc[stage_num][16:9]-stored_multiplicand[stage_num], acc[stage_num][8:1], acc[stage_num][0]}) >>> 1 :
$signed(acc[stage_num]) >>> 1 ;
`MAFIA_DFF(acc[stage_num+1], next_acc[stage_num], clk)
`MAFIA_DFF(pipe_is_full[stage_num+1], pipe_is_full[stage_num], clk)
`MAFIA_DFF(stored_multiplicand[stage_num+1], stored_multiplicand[stage_num], clk)
end
endgenerate

// The product is acc[8][16:1] (bit 0 is the appended Booth bit and is dropped).
// It is zeroed while ready is low, and two's-complement negated when the
// multiplicand that reached the last stage is 8'h80 (-8'd128 evaluates to 8'h80).
// NOTE(review): the negation looks like a correction for the 8-bit
// add/subtract being unable to represent +128 when the multiplicand is -128 - confirm.
assign result = (!ready) ? 'b0 : (stored_multiplicand[8] == -8'd128) ? ~acc[8][16:1] + 1 : acc[8][16:1];

endmodule

48 changes: 40 additions & 8 deletions source/mini_core_accel/mini_core_accel_cr_mem.sv
Original file line number Diff line number Diff line change
Expand Up @@ -34,20 +34,23 @@ import mini_core_accel_pkg::*;
output var t_accel_farm_input accel_farm_input
);

integer i,j;
integer i,j,k,l;

t_accel_cr_int8_multipliers accel_cr, next_accel_cr; // define a struct of structs for int8 multipliers
t_cr_debug cr_debug, next_cr_debug; // FIXME - remove cr for debug when we will have ref model
t_accel_cr_neuron_mac cr_neuron_mac, next_cr_neuron_mac;
t_cr_debug cr_debug, next_cr_debug; // FIXME - remove cr for debug when we will have ref model

`MAFIA_DFF(accel_cr, next_accel_cr, Clk)
`MAFIA_DFF(cr_neuron_mac, next_cr_neuron_mac, Clk)
`MAFIA_DFF(cr_debug, next_cr_debug, Clk)

logic [31:0] pre_q;

// write to accel_cr
always_comb begin :wr_to_accel_cr
next_accel_cr = Rst ? '0 : accel_cr;
next_cr_debug = Rst ? '0 : cr_debug;
next_accel_cr = Rst ? '0 : accel_cr;
next_cr_neuron_mac = Rst ? '0 : cr_neuron_mac;
next_cr_debug = Rst ? '0 : cr_debug;
if(wren) begin // writing data from core to accelerators.
unique casez (address)
// multiplicand and multiplier data coming from the core to the multiplier
Expand Down Expand Up @@ -101,7 +104,16 @@ always_comb begin :wr_to_accel_cr
CR_CORE2MUL_INT8_MULTIPLICANT_15 : next_accel_cr.cr_int8_multiplier[15].cr_core2mul_multiplicant_int8 = data[7:0];
CR_CORE2MUL_INT8_MULTIPLIER_15 : next_accel_cr.cr_int8_multiplier[15].cr_core2mul_multiplier_int8 = data[7:0];

CR_DEBUG_0 : next_cr_debug.cr_debug_0 = data[31:0];
// neuron_mac0
NEURON_MAC_BIAS0 : next_cr_neuron_mac.neuron_mac_bias0 = data[7:0];
// neuron_mac1
NEURON_MAC_BIAS1 : next_cr_neuron_mac.neuron_mac_bias1 = data[7:0];

// cr's for debug
CR_DEBUG_0 : next_cr_debug.cr_debug_0 = data[31:0];
CR_DEBUG_1 : next_cr_debug.cr_debug_1 = data[31:0];
CR_DEBUG_2 : next_cr_debug.cr_debug_2 = data[31:0];
CR_DEBUG_3 : next_cr_debug.cr_debug_3 = data[31:0];
default : ; // do nothing
endcase
end
Expand All @@ -110,7 +122,10 @@ always_comb begin :wr_to_accel_cr
{next_accel_cr.cr_int8_multiplier[i].cr_mul2core_done, next_accel_cr.cr_int8_multiplier[i].cr_mul2core_result} =
{accel_farm_output.mul2core_int8[i].done, accel_farm_output.mul2core_int8[i].result};
end

// hard wired result from neuron mac
next_cr_neuron_mac.neuron_mac_result0 = accel_farm_output.neuron_mac_result[0].int8_result;
next_cr_neuron_mac.neuron_mac_result1 = accel_farm_output.neuron_mac_result[1].int8_result;

end

// reading data
Expand Down Expand Up @@ -167,7 +182,16 @@ always_comb begin : read_from_accel_cr
CR_MUL2CORE_INT8_15 : pre_q = {16'b0, accel_cr.cr_int8_multiplier[15].cr_mul2core_result};
CR_MUL2CORE_INT8_DONE_15 : pre_q = {31'b0, accel_cr.cr_int8_multiplier[15].cr_mul2core_done};

CR_DEBUG_0 : pre_q = cr_debug.cr_debug_0;
// neuron_mac0
NEURON_MAC_RESULT0 : pre_q = {24'b0, cr_neuron_mac.neuron_mac_result0};
// neuron_mac1
NEURON_MAC_RESULT1 : pre_q = {24'b0, cr_neuron_mac.neuron_mac_result1};

// cr's for debug
CR_DEBUG_0 : pre_q = cr_debug.cr_debug_0;
CR_DEBUG_1 : pre_q = cr_debug.cr_debug_1;
CR_DEBUG_2 : pre_q = cr_debug.cr_debug_2;
CR_DEBUG_3 : pre_q = cr_debug.cr_debug_3;

default : ; // do nothing
endcase
Expand All @@ -177,7 +201,15 @@ always_comb begin : read_from_accel_cr
{accel_farm_input.core2mul_int8[j].multiplicand, accel_farm_input.core2mul_int8[j].multiplier} =
{accel_cr.cr_int8_multiplier[j].cr_core2mul_multiplicant_int8, accel_cr.cr_int8_multiplier[j].cr_core2mul_multiplier_int8};
end

// hard wired neuron_mac results from multipliers
accel_farm_input.int8_mul2neuron_mac[0].bias = cr_neuron_mac.neuron_mac_bias0;
accel_farm_input.int8_mul2neuron_mac[1].bias = cr_neuron_mac.neuron_mac_bias1;

for(int k=0; k<NEURON_MAC_NUM; k++) begin
for(l=0; l< INT8_MULTIPLIER_NUM/2; l++) begin
accel_farm_input.int8_mul2neuron_mac[k].mul_result[l] = accel_cr.cr_int8_multiplier[l+8*k].cr_mul2core_result;
end
end
end

`MAFIA_RST_DFF(q, pre_q, Clk, Rst)
Expand Down
70 changes: 63 additions & 7 deletions source/mini_core_accel/mini_core_accel_pkg.sv
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,15 @@
package mini_core_accel_pkg;

parameter INT8_MULTIPLIER_NUM = 16;
parameter NEURON_MAC_NUM = 2;
parameter NUM_WIDTH_INT8 = 8;
typedef logic [7:0] int8;
typedef logic [15:0] int16;


//-------------------------
// int8 multiplier structs
//-------------------------
typedef enum {
PRE_START,
COMPUTE,
Expand All @@ -25,15 +30,21 @@ typedef struct packed {
int16 result;
}t_mul_int8_output;

typedef struct packed { // {multiplicand, multiplier}
t_mul_int8_input [INT8_MULTIPLIER_NUM-1:0] core2mul_int8;
}t_accel_farm_input;
//-------------------------
// neuron mac structs
//-------------------------
typedef struct packed {
logic [7:0][15:0] mul_result; // result fron int8_multiplier
logic [7:0] bias;
}t_neuron_mac_input;

// response from multiplier
typedef struct packed {
t_mul_int8_output [INT8_MULTIPLIER_NUM-1:0] mul2core_int8;
}t_accel_farm_output;
logic [7:0] int8_result;
}t_neuron_mac_output;

//-------------------------
// cr structs
//-------------------------
typedef struct packed {
logic [7:0] cr_core2mul_multiplicant_int8;
logic [7:0] cr_core2mul_multiplier_int8;
Expand All @@ -45,18 +56,51 @@ typedef struct packed {
t_cr_int8_multiplier [INT8_MULTIPLIER_NUM-1:0] cr_int8_multiplier;
}t_accel_cr_int8_multipliers;

typedef struct packed{
logic [7:0] neuron_mac_bias0;
logic [7:0] neuron_mac_bias1;
logic [7:0] neuron_mac_result0;
logic [7:0] neuron_mac_result1;
}t_accel_cr_neuron_mac;

//-------------------------
// Debug structs
//-------------------------
// FIXME used for debug purposes until we have a dedicated ref model
typedef struct packed {
logic [31:0] cr_debug_0;
logic [31:0] cr_debug_1;
logic [31:0] cr_debug_2;
logic [31:0] cr_debug_3;
} t_cr_debug;

//----------------------------
// acceleration farm structs
//----------------------------
// data connecting CR to dedicated unit
typedef struct packed {
t_mul_int8_input [INT8_MULTIPLIER_NUM-1:0] core2mul_int8; // {multiplicand, multiplier}
t_neuron_mac_input [NEURON_MAC_NUM-1:0] int8_mul2neuron_mac; // {mul_result[7:0][15:0], bias}
}t_accel_farm_input;

// response from multiplier
typedef struct packed {
t_mul_int8_output [INT8_MULTIPLIER_NUM-1:0] mul2core_int8;
t_neuron_mac_output [NEURON_MAC_NUM-1:0] neuron_mac_result;
}t_accel_farm_output;



// define CR's
parameter CR_MEM_OFFSET = 'h00FE_0000;
parameter CR_MEM_REGION_FLOOR = CR_MEM_OFFSET;
parameter CR_MEM_REGION_ROOF = 'h00FF_0000 - 1;

// define CR's for INT8 multiplier

//FIXME refactor to be more compact
//=====================================
// define CR's for INT8 multiplier
//=====================================
parameter CR_CORE2MUL_INT8_MULTIPLICANT_0 = CR_MEM_OFFSET + 'hf000;
parameter CR_CORE2MUL_INT8_MULTIPLIER_0 = CR_MEM_OFFSET + 'hf001;
parameter CR_CORE2MUL_INT8_MULTIPLICANT_1 = CR_MEM_OFFSET + 'hf002;
Expand Down Expand Up @@ -124,7 +168,19 @@ parameter CR_MUL2CORE_INT8_DONE_14 = CR_MEM_OFFSET + 'hf06d;
parameter CR_MUL2CORE_INT8_15 = CR_MEM_OFFSET + 'hf06e;
parameter CR_MUL2CORE_INT8_DONE_15 = CR_MEM_OFFSET + 'hf06f;


//==================================
// define CR's for neuron_mac
//==================================
parameter NEURON_MAC_BIAS0 = CR_MEM_OFFSET + 'hf100;
parameter NEURON_MAC_BIAS1 = CR_MEM_OFFSET + 'hf101;
parameter NEURON_MAC_RESULT0 = CR_MEM_OFFSET + 'hf102;
parameter NEURON_MAC_RESULT1 = CR_MEM_OFFSET + 'hf103;

// used for debug purposes
parameter CR_DEBUG_0 = CR_MEM_OFFSET + 'hff00;
parameter CR_DEBUG_1 = CR_MEM_OFFSET + 'hff01;
parameter CR_DEBUG_2 = CR_MEM_OFFSET + 'hff02;
parameter CR_DEBUG_3 = CR_MEM_OFFSET + 'hff03;

endpackage
2 changes: 2 additions & 0 deletions source/mini_core_accel/mini_core_accel_rtl_list.f
Original file line number Diff line number Diff line change
Expand Up @@ -40,5 +40,7 @@

// Accelerator farm files
../../../source/mini_core_accel/accelerators/multiplier_int8.sv
../../../source/mini_core_accel/accelerators/pipe_line_mul.sv
../../../source/mini_core_accel/accelerators/neuron_mac.sv
../../../source/mini_core_accel/accelerators/mini_core_accel_farm.sv

2 changes: 1 addition & 1 deletion source/mini_core_accel/mini_core_accel_top.sv
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ mini_core_accel_mem_wrap mini_core_accel_mem_wrap(
);

mini_core_accel_farm
#(.MUL_NUM(INT8_MULTIPLIER_NUM)) // FIXME - parametrize
#(.MUL_NUM(INT8_MULTIPLIER_NUM), .NEURON_MAC_NUM(NEURON_MAC_NUM))
mini_core_accel_farm
(
.clock (Clock),
Expand Down
4 changes: 3 additions & 1 deletion verif/mini_core_accel/file_list/mini_core_accel_verif_list.f
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,6 @@
../../../verif/rv32i_ref/tb/rv32i_ref_pkg.sv
../../../verif/rv32i_ref/tb/rv32i_ref.sv
../../../verif/mini_core_accel/tb/mini_core_accel_tb.sv
../../../verif/mini_core_accel/tb/multiplier_int8_tb.sv
../../../verif/mini_core_accel/tb/multiplier_int8_tb.sv
../../../verif/mini_core_accel/tb/pipe_line_mul_tb.sv
../../../verif/mini_core_accel/tb/neuron_mac_tb.sv
Loading
Loading