벡터 코어 (GEMV)¶
4 개의 병렬 μV-core. 각 코어는 자기 L1 캐시 슬라이스에서 활성화를, HP2/HP3 (포트당 32 INT4/clk) 에서 가중치를 읽습니다. 부분곱은 5 단 reduction tree (32→16→8→4→2→1) 로 합산된 뒤 Emax 정렬 BF16 누산기로 들어갑니다. 단일 토큰 디코드 경로가 가중치 대역폭에 의해 제한되는 자기회귀 디코딩에서는 이 엔진이 주역입니다.
더 보기
- pccx: Parallel Compute Core eXecutor
μV-core 위치를 보여주는 v001 블록도.
모듈¶
- GEMV_top.sv — μV-core 래퍼. 4 레인을 인스턴스화하고 가중치 FIFO + L1 캐시 슬라이스에 연결.
- GEMV_generate_lut.sv — 브로드캐스트된 활성화마다 INT4 가중치 값 (-8 ~ 7) 과의 곱을 부호 확장하여 미리 계산하는 LUT 생성기.
- GEMV_Vec_Matrix_MUL.svh — multiply-reduce 단계용 매개변수 (매크로) 헤더.
- GEMV_reduction_branch.sv — 레인 하나의 데이터패스. GEMV_reduction (reduction tree) 과 GEMV_accumulate (누산기) 를 묶음.
- GEMV_reduction.sv — 32 개의 부분곱을 스칼라 하나로 합치는 5 단 파이프라인 reduction tree.
- GEMV_accumulate.sv — 최종 스칼라를 받는 Emax 정렬 BF16 누산기.
소스¶
GEMV_top.sv
`timescale 1ns / 1ps
`include "GEMV_Vec_Matrix_MUL.svh"
`include "GLOBAL_CONST.svh"
// weight size = 4bit
// feature_map size = bf16
module GEMV_top
import vec_core_pkg::*;
#(
parameter gemv_cfg_t param = VecCoreDefaultCfg,
parameter A = 0,
parameter B = 1,
parameter C = 2,
parameter D = 3
) (
input logic clk,
input logic rst_n,
input logic IN_weight_valid_A,
input logic IN_weight_valid_B,
input logic IN_weight_valid_C,
input logic IN_weight_valid_D,
input logic [param.weight_width - 1:0] IN_weight_A[0:param.weight_cnt -1],
input logic [param.weight_width - 1:0] IN_weight_B[0:param.weight_cnt -1],
input logic [param.weight_width - 1:0] IN_weight_C[0:param.weight_cnt -1],
input logic [param.weight_width - 1:0] IN_weight_D[0:param.weight_cnt -1],
output logic OUT_weight_ready_A,
output logic OUT_weight_ready_B,
output logic OUT_weight_ready_C,
output logic OUT_weight_ready_D,
input logic [param.fixed_mant_width-1:0] IN_fmap_broadcast [0:param.fmap_cache_out_cnt-1],
input logic IN_fmap_broadcast_valid,
input logic [16:0] IN_num_recur,
// e_max (from Cache for Normalization alignment)
input logic [dtype_pkg::Bf16ExpWidth-1:0] IN_cached_emax_out[0:param.fmap_cache_out_cnt-1],
input logic IN_activated_lane[0:param.num_gemv_pipeline-1],
output logic [param.fmap_type_mixed_precision - 1:0] OUT_final_fmap_A,
output logic [param.fmap_type_mixed_precision - 1:0] OUT_final_fmap_B,
output logic [param.fmap_type_mixed_precision - 1:0] OUT_final_fmap_C,
output logic [param.fmap_type_mixed_precision - 1:0] OUT_final_fmap_D,
output logic OUT_result_valid_A,
output logic OUT_result_valid_B,
output logic OUT_result_valid_C,
output logic OUT_result_valid_D
);
logic [param.fixed_mant_width+2:0] fmap_LUT_wire[0:param.fmap_cache_out_cnt-1][0:param.weight_width-1];
logic fmap_ready_wire;
GEMV_generate_lut #(
.param(VecCoreDefaultCfg)
) u_GEMV_generate_lut (
.IN_fmap_broadcast(IN_fmap_broadcast),
.IN_fmap_broadcast_valid(IN_fmap_broadcast_valid),
.IN_cached_emax_out(IN_cached_emax_out),
.OUT_fmap_LUT (fmap_LUT_wire),
.OUT_fmap_ready(fmap_ready_wire)
);
GEMV_reduction_branch #(
.param(VecCoreDefaultCfg)
) u_GEMV_reduction_branch_A (
.clk (clk),
.rst_n(rst_n),
.IN_weight_valid(IN_weight_valid_A),
.IN_weight(IN_weight_A),
.fmap_ready(fmap_ready_wire),
.IN_num_recur(IN_num_recur), // shape x * y * z
.IN_activated_lane(IN_activated_lane[A]),
.IN_fmap_LUT(fmap_LUT_wire),
.OUT_GEMV_result_vector(OUT_final_fmap_A),
.OUT_valid(OUT_result_valid_A)
);
GEMV_reduction_branch #(
.param(VecCoreDefaultCfg)
) u_GEMV_reduction_branch_B (
.clk (clk),
.rst_n(rst_n),
.IN_weight_valid(IN_weight_valid_B),
.IN_weight(IN_weight_B),
.fmap_ready(fmap_ready_wire),
.IN_num_recur(IN_num_recur), // shape x * y * z
.IN_activated_lane(IN_activated_lane[B]),
.IN_fmap_LUT(fmap_LUT_wire),
.OUT_GEMV_result_vector(OUT_final_fmap_B),
.OUT_valid(OUT_result_valid_B)
);
GEMV_reduction_branch #(
.param(VecCoreDefaultCfg)
) u_GEMV_reduction_branch_C (
.clk (clk),
.rst_n(rst_n),
.IN_weight_valid(IN_weight_valid_C),
.IN_weight(IN_weight_C),
.fmap_ready(fmap_ready_wire),
.IN_num_recur(IN_num_recur), // shape x * y * z
.IN_activated_lane(IN_activated_lane[C]),
.IN_fmap_LUT(fmap_LUT_wire),
.OUT_GEMV_result_vector(OUT_final_fmap_C),
.OUT_valid(OUT_result_valid_C)
);
GEMV_reduction_branch #(
.param(VecCoreDefaultCfg)
) u_GEMV_reduction_branch_D (
.clk (clk),
.rst_n(rst_n),
.IN_weight_valid(IN_weight_valid_D),
.IN_weight(IN_weight_D),
.fmap_ready(fmap_ready_wire),
.IN_num_recur(IN_num_recur), // shape x * y * z
.IN_activated_lane(IN_activated_lane[D]),
.IN_fmap_LUT(fmap_LUT_wire),
.OUT_GEMV_result_vector(OUT_final_fmap_D),
.OUT_valid(OUT_result_valid_D)
);
endmodule
GEMV_generate_lut.sv
`timescale 1ns / 1ps
`include "GEMV_Vec_Matrix_MUL.svh"
`include "GLOBAL_CONST.svh"
// Descending order
module GEMV_generate_lut
import vec_core_pkg::*;
#(
parameter gemv_cfg_t param = VecCoreDefaultCfg
) (
input logic [param.fixed_mant_width-1:0] IN_fmap_broadcast [0:param.fmap_cache_out_cnt-1],
input logic IN_fmap_broadcast_valid,
// e_max (from Cache for Normalization alignment)
input logic [device_pkg::FmapType-1:0] IN_cached_emax_out[0:param.fmap_cache_out_cnt-1],
output logic [param.param.fixed_mant_width+2:0] OUT_fmap_LUT[0:param.fmap_cache_out_cnt-1][0:param.weight_width-1],
output logic OUT_fmap_ready
);
genvar idx, w;
generate
for (idx = 0; idx < param.fmap_cache_out_cnt; idx++) begin : fmap_lut_pre_cal
wire signed [29:0] F;
assign F = {{3{IN_fmap_broadcast[idx][26]}}, IN_fmap_broadcast[idx]};
for (w = 0; w < 16; w++) begin : lut_entry
// w - 8 = INT4 range (-8 ~ 7)
assign OUT_fmap_low_LUT[idx][w] = F * $signed(5'(w) - 5'd8);
end
end
endgenerate
endmodule
GEMV_Vec_Matrix_MUL.svh
//`define //
`define WEIGHT_HP_PORT_SIZE 512
`define FEATURE_MAP_HPC_PORT_SIZE 256
//`define INPUT_weight
`define IS_NEGATIVE_NUMBER 1
`define WEIGHT_SIZE 4
`define FEAUTRE_MAP_SIZE 16
`define AXI_WEIGHT_PORT_CNT 4
`define GEMV_LOW 0
`define GEMV_HIGH 1
`define GEMV_MAX_RES_VEC 2048
`define GEMV_QUARTER_ONE 0
`define GEMV_QUARTER_TWO 1
`define GEMV_QUARTER_THREE 2
`define GEMV_QUARTER_FOUR 3
GEMV_reduction_branch.sv
`timescale 1ns / 1ps
`include "GEMV_Vec_Matrix_MUL.svh"
`include "GLOBAL_CONST.svh"
module GEMV_reduction_branch
import vec_core_pkg::*;
#(
parameter gemv_cfg_t param = VecCoreDefaultCfg
) (
input logic clk,
input logic rst_n,
input logic IN_weight_valid,
input logic [param.weight_width - 1:0] IN_weight[0:param.weight_cnt-1],
input logic fmap_ready,
input logic [16:0] IN_num_recur,
input logic IN_activated_lane,
input logic [param.fixed_mant_width+2:0] IN_fmap_LUT [0:param.fmap_cache_out_cnt-1][0:param.weight_width-1],
output logic [param.fixed_mant_width+2:0] OUT_GEMV_result_vector[0:param.gemv_batch-1],
output logic OUT_valid
);
logic [param.fixed_mant_width+2:0] reduction_result_wire;
logic reduction_res_valid_wire;
GEMV_reduction #(
.fmap_cache_out_cnt(param.fmap_cache_out_cnt),
.weight_type(param.weight_width),
.line_cnt(line_cnt)
) u_GEMV_reduction (
.clk (clk),
.rst_n(rst_n),
.IN_fmap_LUT(IN_fmap_LUT),
.IN_valid(IN_weight_valid),
.IN_is_lane_active(IN_activated_lane),
.IN_weight(IN_weight),
.OUT_reduction_result(reduction_result_wire),
.OUT_reduction_res_valid(reduction_res_valid_wire)
);
GEMV_accumulate #(
.param(VecCoreDefaultCfg)
) u_GEMV_accumulate (
.clk (clk),
.rst_n(rst_n),
.IN_reduction_result(reduction_result_wire),
.init(fmap_ready),
.IN_valid(reduction_res_valid_wire),
.IN_num_recur(IN_num_recur),
.OUT_GEMV_result_vector(OUT_GEMV_result_vector),
.OUT_acc_valid(OUT_valid)
);
endmodule
GEMV_reduction.sv
`timescale 1ns / 1ps
`include "GEMV_Vec_Matrix_MUL.svh"
`include "GLOBAL_CONST.svh"
module GEMV_reduction
import vec_core_pkg::*;
#(
parameter gemv_cfg_t param = VecCoreDefaultCfg,
parameter int REDUCTION_LATENCY = 5
) (
input logic clk,
input logic rst_n,
input logic IN_is_lane_active,
input logic IN_valid,
input logic [param.fixed_mant_width+2:0] IN_fmap_LUT[0:param.param.fmap_cache_out_cnt-1][0:param.weight_width-1],
input logic [param.weight_width - 1:0] IN_weight[0:param.weight_cnt -1],
output logic [param.fixed_mant_width+2:0] OUT_reduction_result,
output logic OUT_reduction_res_valid
);
// ===| pipeline Valid sync REG |===
// REDUCTION_LATENCY(5) Shift Register
logic [REDUCTION_LATENCY-1:0] valid_pipe;
always_ff @(posedge clk) begin
if (!rst_n) begin
valid_pipe <= '0;
end else begin
// Push the valid status into the LSB and shift every cycle
//(Active only when IN_valid and IN_is_lane_active are both 1).
valid_pipe <= {valid_pipe[REDUCTION_LATENCY-2:0], (IN_valid & IN_is_lane_active)};
end
end
assign OUT_reduction_res_valid = valid_pipe[REDUCTION_LATENCY-1];
//2^5
logic [param.fixed_mant_width+2:0] reduction_32_fmap_wire[0:31];
//2^4
logic [param.fixed_mant_width+2:0] reduction_16_fmap_wire[0:15];
//2^3
logic [param.fixed_mant_width+2:0] reduction_8_fmap_wire [ 0:7];
//2^2
logic [param.fixed_mant_width+2:0] reduction_4_fmap_wire [ 0:3];
//2^1
logic [param.fixed_mant_width+2:0] reduction_2_fmap_wire [ 0:1];
always_ff @(posedge clk) begin
if (!rst_n) begin
for (int lane = 0; lane < param.fmap_cache_out_cnt; lane++) begin
//stage1_emax_q1[i] <= 0
end
end else begin
if (IN_valid & IN_is_lane_active) begin
for (int lane = 0; lane < param.fmap_cache_out_cnt; lane++) begin
reduction_32_fmap_wire[lane] <= IN_fmap_LUT[lane][IN_weight[lane]];
end
end
end
end
// ===| Stage 1: Reduction 32 -> 16 |==========================================
// Instantiates 16 DSP48E2 slices to add adjacent pairs of the 32 input wires
// ============================================================================
generate
genvar i;
for (i = 0; i < 16; i++) begin : gen_dsp_reduce_32_to_16
// --- Internal 48-bit wires for DSP port matching ---
logic [47:0] dsp_in_ab;
logic [47:0] dsp_in_c;
logic [47:0] dsp_out_p;
// Map inputs to 48-bit width (Zero or Sign extension depending on your data)
// Operand 1: Even index (2*i) -> Routed to A:B ports
// Operand 2: Odd index (2*i+1) -> Routed to C port
assign dsp_in_ab = 48'(reduction_32_fmap_wire[2*i]);
assign dsp_in_c = 48'(reduction_32_fmap_wire[2*i+1]);
DSP48E2 #(
// [IMPORTANT] Changed from "TWO24" to "ONE48" for standard addition.
// "TWO24" breaks the carry chain at bit 24. Use "ONE48" for full precision.
.USE_SIMD("ONE48"),
// --- Register Control (0 = Comb, 1 = Registered) ---
.AREG(0),
.BREG(0),
.CREG(0),
.PREG(1) // Enable P register (1 clock delay for sum)
) u_dsp (
// --- Clock and Reset ---
.CLK (clk),
.RSTP(~rst_n), // Reset for P register (Active High inside DSP)
// --- Operation Mode (Fixed for A:B + C) ---
.ALUMODE(4'b0000), // 0000 = ADD
.INMODE (5'b00000), // Default A and B routing
.OPMODE (9'b000_00_11_11), // Z=0, W=0, Y=C, X=A:B => P = A:B + C
// --- Clock Enables (Tie to high for continuous pipeline) ---
.CEP(1'b1), // Enable P register updates
// --- Data Inputs ---
.A(dsp_in_ab[47:18]), // Upper 30 bits go to A port
.B(dsp_in_ab[17:0]), // Lower 18 bits go to B port
.C(dsp_in_c), // 48 bits go to C port
// --- Data Output ---
.P(dsp_out_p) // 48-bit Result
);
// --- Truncate 48-bit result back to parameterized wire width ---
assign reduction_16_fmap_wire[i] = dsp_out_p[param.fmap_cache_out_cnt+2:0];
end
endgenerate
// ============================================================================
// ===| REDUCTION TREE: LUT-based Pipelined Adders (Optimized for 400MHz) |====
// UltraScale+ CARRY8 primitives combined with immediate FDRE (Registers)
// provide better routing and timing than forcing DSPs for simple additions.
// ============================================================================
// ===| Stage 2: Reduction 16 -> 8 |===========================================
// Latency: 1 Clock Cycle
always_ff @(posedge clk) begin
if (!rst_n) begin
for (int i = 0; i < 8; i++) begin
reduction_8_fmap_wire[i] <= '0;
end
end else begin
for (int i = 0; i < 8; i++) begin
// Simple addition. Vivado will infer CARRY8 + FF in the same slice.
reduction_8_fmap_wire[i] <= reduction_16_fmap_wire[2*i] + reduction_16_fmap_wire[2*i+1];
end
end
end
// ===| Stage 3: Reduction 8 -> 4 |============================================
// Latency: 1 Clock Cycle
always_ff @(posedge clk) begin
if (!rst_n) begin
for (int i = 0; i < 4; i++) begin
reduction_4_fmap_wire[i] <= '0;
end
end else begin
for (int i = 0; i < 4; i++) begin
reduction_4_fmap_wire[i] <= reduction_8_fmap_wire[2*i] + reduction_8_fmap_wire[2*i+1];
end
end
end
// ===| Stage 4: Reduction 4 -> 2 |============================================
// Latency: 1 Clock Cycle
always_ff @(posedge clk) begin
if (!rst_n) begin
for (int i = 0; i < 2; i++) begin
reduction_2_fmap_wire[i] <= '0;
end
end else begin
for (int i = 0; i < 2; i++) begin
reduction_2_fmap_wire[i] <= reduction_4_fmap_wire[2*i] + reduction_4_fmap_wire[2*i+1];
end
end
end
// ===| Stage 5: Reduction 2 -> 1 (Final Sum) |================================
// Latency: 1 Clock Cycle
always_ff @(posedge clk) begin
if (!rst_n) begin
OUT_reduction_result <= '0;
end else begin
OUT_reduction_result <= reduction_2_fmap_wire[0] + reduction_2_fmap_wire[1];
end
end
// ============================================================================
// Total Pipeline Latency for Reduction Tree:
// Stage 1 (DSP: 32->16) : 1 Cycle (or more depending on DSP PREG/MREG configs)
// Stage 2 (LUT: 16->8) : 1 Cycle
// Stage 3 (LUT: 8->4) : 1 Cycle
// Stage 4 (LUT: 4->2) : 1 Cycle
// Stage 5 (LUT: 2->1) : 1 Cycle
// ----------------------------------------
// Total latency after FMap/Weight input: 5 Cycles
// ============================================================================
endmodule
GEMV_accumulate.sv
`timescale 1ns / 1ps
`include "GEMV_Vec_Matrix_MUL.svh"
`include "GLOBAL_CONST.svh"
// GEMV Operation
//
// [Shape]
// vector (1, N)
// dot Matrix (N, M)
// result (1, M)
//
// calc/per clk: Vec(1,32) dot Mat(32,32) = 32
// gemv_cycle: 512 clk
// Throughput/per clk: 1
// Throughput/per-GEMV: 512 (1p/clk * 512clk)
// GEMV-PIPE-CNT: 4
// 512 * 4 = 2048
// GEMV Throughput per [cycle]: (1, 2048)
//
// ===| Warning |===
// Before we send result to ACP
// results must type-casted FP32 to BF16(2Byte)
// and find e-Max and align by 32Groups
module GEMV_accumulate
import vec_core_pkg::*;
#(
parameter gemv_cfg_t param = VecCoreDefaultCfg
) (
input logic clk,
input logic rst_n,
input logic [dtype_pkg::FixedMantWidth+2:0] IN_reduction_result,
input logic init,
input logic IN_valid,
input logic [16:0] IN_num_recur,
output logic [dtype_pkg::FixedMantWidth+2:0] OUT_GEMV_result_vector[0:param.gemv_batch - 1],
output logic OUT_acc_valid
);
logic [dtype_pkg::FixedMantWidth+2:0] GEMV_result_vector[0:param.gemv_batch - 1];
// 2^9 == 512
logic [8:0] res_vec_idx;
logic [16:0] num_recur;
logic [11:0] index_of_result;
// Preventing flip-flop replication: Direct-wire the internal accumulation register to the output port
assign OUT_GEMV_result_vector = GEMV_result_vector;
always_ff @(posedge clk) begin
if (!rst_n) begin
OUT_acc_valid <= 0;
res_vec_idx <= 0;
num_recur <= 0;
for (int vec_idx = 0; vec_idx < param.gemv_batch; vec_idx++) begin
GEMV_result_vector[vec_idx] <= '0;
end
end else if (init) begin
// new GEMV Acc start init pipeline
res_vec_idx <= 0;
num_recur <= IN_num_recur;
end else begin
OUT_acc_valid <= 0;
if (IN_valid && ~OUT_acc_valid) begin
GEMV_result_vector[res_vec_idx] <= GEMV_result_vector[res_vec_idx] + IN_reduction_result;
// Modulo-2^N counter": Intended Overflow 509-> 510-> 511-> 0-> 1
res_vec_idx <= res_vec_idx + 1;
num_recur <= num_recur - 1;
end
if (num_recur == 0 && ~OUT_acc_valid) begin
OUT_acc_valid <= 1;
end
end
end
endmodule