벡터 코어 (GEMV)

4 개의 병렬 μV-core. 각 코어는 자기 L1 캐시 슬라이스에서 활성화를, HP2/HP3 (포트당 32 INT4/clk) 에서 가중치를 읽습니다. 부분곱은 5 단 reduction tree (32→16→8→4→2→1) 를 거친 뒤 Emax 정렬 BF16 누산기로 들어갑니다. 단일 토큰 디코드 경로가 가중치 대역폭에 제한되는 자기회귀 디코딩에서 이 엔진이 주역입니다.

더 보기

pccx: Parallel Compute Core eXecutor

μV-core 위치를 보여주는 v001 블록도.

모듈

  • GEMV_top.sv — μV-core 래퍼. 4 레인을 인스턴스화하고 가중치 FIFO + L1 캐시 슬라이스에 연결.

  • GEMV_generate_lut.sv — 레인별 LUT 로 INT4 가중치 디코드 / 부호 확장.

  • GEMV_Vec_Matrix_MUL.svh — multiply-reduce 단계용 매개변수 헤더.

  • GEMV_reduction_branch.sv — 레인 하나의 데이터패스. GEMV_reduction (reduction tree) 과 GEMV_accumulate (누산기) 를 인스턴스화해 서로 연결.

  • GEMV_reduction.sv — 32 개의 부분곱을 5 단 (32→16→8→4→2→1) 으로 합산해 스칼라 하나로 만드는 reduction tree.

  • GEMV_accumulate.sv — 최종 스칼라를 받는 Emax 정렬 BF16 누산기.

소스

GEMV_top.sv
`timescale 1ns / 1ps

`include "GEMV_Vec_Matrix_MUL.svh"
`include "GLOBAL_CONST.svh"

// weight size = 4bit
// feature_map size =  bf16
// GEMV_top: four-lane GEMV wrapper.
//
// Instantiates one GEMV_generate_lut shared by all lanes and four
// GEMV_reduction_branch instances (lanes A..D). Every lane receives the same
// LUT, the same IN_num_recur and the same fmap_ready strobe; weights, weight
// valid, lane-enable and the result ports are per lane.
//
// Parameters
//   param   : configuration struct forwarded unchanged to every sub-instance.
//   A/B/C/D : index into IN_activated_lane used for lane A/B/C/D.
//
// NOTE(review): OUT_weight_ready_A..D are declared but not driven anywhere in
//   this module.
// NOTE(review): OUT_final_fmap_* are packed vectors here, while the port they
//   connect to (GEMV_reduction_branch.OUT_GEMV_result_vector) is declared as an
//   unpacked array of param.gemv_batch elements — confirm the intended shape.
// NOTE(review): the second dimension of fmap_LUT_wire has param.weight_width
//   entries, but GEMV_generate_lut writes entries 0..15 and the reduction
//   indexes it with a param.weight_width-bit weight — confirm whether the
//   depth should be 2**param.weight_width.
module GEMV_top
  import vec_core_pkg::*;
#(
    parameter gemv_cfg_t param = VecCoreDefaultCfg,
    parameter A = 0,
    parameter B = 1,
    parameter C = 2,
    parameter D = 3
) (
    input logic clk,
    input logic rst_n,

    // Per-lane weight valid strobes.
    input logic IN_weight_valid_A,
    input logic IN_weight_valid_B,
    input logic IN_weight_valid_C,
    input logic IN_weight_valid_D,

    // Per-lane weight vectors: param.weight_cnt entries of param.weight_width bits.
    input logic [param.weight_width - 1:0] IN_weight_A[0:param.weight_cnt -1],
    input logic [param.weight_width - 1:0] IN_weight_B[0:param.weight_cnt -1],
    input logic [param.weight_width - 1:0] IN_weight_C[0:param.weight_cnt -1],
    input logic [param.weight_width - 1:0] IN_weight_D[0:param.weight_cnt -1],

    // NOTE(review): not driven in this module.
    output logic OUT_weight_ready_A,
    output logic OUT_weight_ready_B,
    output logic OUT_weight_ready_C,
    output logic OUT_weight_ready_D,

    // Feature-map mantissas broadcast to all lanes, plus their valid.
    input logic [param.fixed_mant_width-1:0] IN_fmap_broadcast      [0:param.fmap_cache_out_cnt-1],
    input logic                              IN_fmap_broadcast_valid,

    // Forwarded to every lane's accumulator.
    input logic [16:0] IN_num_recur,
    // e_max (from Cache for Normalization alignment)
    input logic [dtype_pkg::Bf16ExpWidth-1:0] IN_cached_emax_out[0:param.fmap_cache_out_cnt-1],

    // Per-lane enable, indexed with parameters A..D.
    input logic IN_activated_lane[0:param.num_gemv_pipeline-1],

    output logic [param.fmap_type_mixed_precision - 1:0] OUT_final_fmap_A,
    output logic [param.fmap_type_mixed_precision - 1:0] OUT_final_fmap_B,
    output logic [param.fmap_type_mixed_precision - 1:0] OUT_final_fmap_C,
    output logic [param.fmap_type_mixed_precision - 1:0] OUT_final_fmap_D,

    output logic OUT_result_valid_A,
    output logic OUT_result_valid_B,
    output logic OUT_result_valid_C,
    output logic OUT_result_valid_D
);

  // LUT produced by GEMV_generate_lut and consumed by all four lanes.
  logic [param.fixed_mant_width+2:0] fmap_LUT_wire[0:param.fmap_cache_out_cnt-1][0:param.weight_width-1];

  // Ready strobe from the LUT generator; wired to every lane's fmap_ready.
  logic fmap_ready_wire;

  // ===| Shared LUT generator |===
  // The module's own `param` is forwarded (not VecCoreDefaultCfg) so that a
  // configuration override on GEMV_top reaches every sub-module.
  GEMV_generate_lut #(
      .param(param)
  ) u_GEMV_generate_lut (
      .IN_fmap_broadcast(IN_fmap_broadcast),
      .IN_fmap_broadcast_valid(IN_fmap_broadcast_valid),
      .IN_cached_emax_out(IN_cached_emax_out),

      .OUT_fmap_LUT  (fmap_LUT_wire),
      .OUT_fmap_ready(fmap_ready_wire)
  );

  // ===| Lane A |===
  GEMV_reduction_branch #(
      .param(param)
  ) u_GEMV_reduction_branch_A (
      .clk  (clk),
      .rst_n(rst_n),

      .IN_weight_valid(IN_weight_valid_A),
      .IN_weight(IN_weight_A),

      .fmap_ready(fmap_ready_wire),
      .IN_num_recur(IN_num_recur),  // shape x * y * z

      .IN_activated_lane(IN_activated_lane[A]),
      .IN_fmap_LUT(fmap_LUT_wire),

      .OUT_GEMV_result_vector(OUT_final_fmap_A),
      .OUT_valid(OUT_result_valid_A)
  );

  // ===| Lane B |===
  GEMV_reduction_branch #(
      .param(param)
  ) u_GEMV_reduction_branch_B (
      .clk  (clk),
      .rst_n(rst_n),

      .IN_weight_valid(IN_weight_valid_B),
      .IN_weight(IN_weight_B),

      .fmap_ready(fmap_ready_wire),
      .IN_num_recur(IN_num_recur),  // shape x * y * z

      .IN_activated_lane(IN_activated_lane[B]),
      .IN_fmap_LUT(fmap_LUT_wire),

      .OUT_GEMV_result_vector(OUT_final_fmap_B),
      .OUT_valid(OUT_result_valid_B)
  );

  // ===| Lane C |===
  GEMV_reduction_branch #(
      .param(param)
  ) u_GEMV_reduction_branch_C (
      .clk  (clk),
      .rst_n(rst_n),

      .IN_weight_valid(IN_weight_valid_C),
      .IN_weight(IN_weight_C),

      .fmap_ready(fmap_ready_wire),
      .IN_num_recur(IN_num_recur),  // shape x * y * z

      .IN_activated_lane(IN_activated_lane[C]),
      .IN_fmap_LUT(fmap_LUT_wire),

      .OUT_GEMV_result_vector(OUT_final_fmap_C),
      .OUT_valid(OUT_result_valid_C)
  );

  // ===| Lane D |===
  GEMV_reduction_branch #(
      .param(param)
  ) u_GEMV_reduction_branch_D (
      .clk  (clk),
      .rst_n(rst_n),

      .IN_weight_valid(IN_weight_valid_D),
      .IN_weight(IN_weight_D),

      .fmap_ready(fmap_ready_wire),
      .IN_num_recur(IN_num_recur),  // shape x * y * z

      .IN_activated_lane(IN_activated_lane[D]),
      .IN_fmap_LUT(fmap_LUT_wire),

      .OUT_GEMV_result_vector(OUT_final_fmap_D),
      .OUT_valid(OUT_result_valid_D)
  );

endmodule
GEMV_generate_lut.sv
`timescale 1ns / 1ps

`include "GEMV_Vec_Matrix_MUL.svh"
`include "GLOBAL_CONST.svh"

// Descending order

// GEMV_generate_lut: combinational product look-up table generator.
//
// For every broadcast feature-map element idx and every LUT index w in 0..15
// it drives
//     OUT_fmap_LUT[idx][w] = sign_extend(IN_fmap_broadcast[idx]) * (w - 8)
// so that a consumer can select a product by index instead of multiplying.
//
// Ports
//   IN_fmap_broadcast       : feature-map values, param.fixed_mant_width bits
//                             each; treated as signed (MSB is replicated).
//   IN_fmap_broadcast_valid : forwarded unchanged to OUT_fmap_ready.
//   IN_cached_emax_out      : not used inside this module.
//   OUT_fmap_LUT            : product table, param.fixed_mant_width+3 bits wide.
//   OUT_fmap_ready          : copy of IN_fmap_broadcast_valid.
//
// NOTE(review): the table is indexed as w -> (w - 8), i.e. an offset-by-8
//   code. A raw two's-complement INT4 weight used directly as the index would
//   select the wrong entry — confirm the weight encoding with the producer.
// NOTE(review): the loop below writes entries 0..15, but the second dimension
//   of OUT_fmap_LUT is declared with only param.weight_width entries. The port
//   shape is shared with GEMV_top / GEMV_reduction_branch / GEMV_reduction, so
//   it is left untouched here; all four declarations presumably need
//   2**param.weight_width entries — confirm and change them together.
// NOTE(review): IN_cached_emax_out is sized with device_pkg::FmapType here but
//   with dtype_pkg::Bf16ExpWidth in GEMV_top — confirm they are equal.
module GEMV_generate_lut
  import vec_core_pkg::*;
#(
    parameter gemv_cfg_t param = VecCoreDefaultCfg
) (
    input logic [param.fixed_mant_width-1:0] IN_fmap_broadcast      [0:param.fmap_cache_out_cnt-1],
    input logic                              IN_fmap_broadcast_valid,

    // e_max (from Cache for Normalization alignment)
    input logic [device_pkg::FmapType-1:0] IN_cached_emax_out[0:param.fmap_cache_out_cnt-1],
    output logic [param.fixed_mant_width+2:0] OUT_fmap_LUT[0:param.fmap_cache_out_cnt-1][0:param.weight_width-1],
    output logic OUT_fmap_ready
);
  // Width of one feature-map value and of one LUT product (3 guard bits).
  localparam int MantW = param.fixed_mant_width;
  localparam int ProdW = MantW + 3;

  // The output was previously left undriven. The module is purely
  // combinational, so the table is usable whenever the broadcast is valid.
  // NOTE(review): presumably a plain pass-through is the intended handshake;
  // confirm against the consumer (it is used as the accumulator `init`).
  assign OUT_fmap_ready = IN_fmap_broadcast_valid;

  genvar idx, w;
  generate
    for (idx = 0; idx < param.fmap_cache_out_cnt; idx++) begin : fmap_lut_pre_cal
      // Sign-extend the feature-map value by three bits so the product with
      // a factor in -8..7 fits in ProdW bits.
      wire signed [ProdW-1:0] F;
      assign F = {{3{IN_fmap_broadcast[idx][MantW-1]}}, IN_fmap_broadcast[idx]};

      for (w = 0; w < 16; w++) begin : lut_entry
        // w - 8 = INT4 range (-8 ~ 7)
        // The subtraction is done in 5 bits and reinterpreted as signed, so
        // w = 0 yields -8 and w = 15 yields +7.
        assign OUT_fmap_LUT[idx][w] = F * $signed(5'(w) - 5'd8);
      end
    end
  endgenerate
endmodule
GEMV_Vec_Matrix_MUL.svh
// GEMV_Vec_Matrix_MUL.svh
// Compile-time constants for the GEMV vector core.
//
// The header is `included by several source files; the include guard keeps
// the macros from being re-defined when more than one of those files ends up
// in the same compilation unit.
`ifndef GEMV_VEC_MATRIX_MUL_SVH
`define GEMV_VEC_MATRIX_MUL_SVH

// ===| Port sizes |===
// NOTE(review): presumably widths in bits — confirm.
`define WEIGHT_HP_PORT_SIZE 512
`define FEATURE_MAP_HPC_PORT_SIZE 256

// ===| Data format |===
`define IS_NEGATIVE_NUMBER 1
`define WEIGHT_SIZE 4
// NOTE(review): "FEAUTRE" is a misspelling of "FEATURE". The name is kept
// as-is because other files may already reference it.
`define FEAUTRE_MAP_SIZE 16
`define AXI_WEIGHT_PORT_CNT 4

// ===| Half selectors |===
`define GEMV_LOW 0
`define GEMV_HIGH 1

// ===| Result vector limit |===
`define GEMV_MAX_RES_VEC 2048

// ===| Quarter selectors (0-based) |===
`define GEMV_QUARTER_ONE 0
`define GEMV_QUARTER_TWO 1
`define GEMV_QUARTER_THREE 2
`define GEMV_QUARTER_FOUR 3

`endif  // GEMV_VEC_MATRIX_MUL_SVH
GEMV_reduction_branch.sv
`timescale 1ns / 1ps

`include "GEMV_Vec_Matrix_MUL.svh"
`include "GLOBAL_CONST.svh"

// GEMV_reduction_branch: data path of one GEMV lane.
//
// Chains two sub-modules:
//   1. GEMV_reduction  — selects one LUT product per weight and sums them
//                        into a single value with its own valid flag.
//   2. GEMV_accumulate — adds each valid sum into a result vector and raises
//                        OUT_valid through OUT_acc_valid.
//
// Ports
//   IN_weight_valid / IN_weight : weight vector and its valid strobe.
//   fmap_ready                  : wired to the accumulator's `init` input.
//   IN_num_recur                : forwarded to the accumulator.
//   IN_activated_lane           : lane enable, forwarded to the reduction.
//   IN_fmap_LUT                 : product table from GEMV_generate_lut.
//   OUT_GEMV_result_vector      : accumulator contents.
//   OUT_valid                   : accumulator done flag.
//
// NOTE(review): this port list sizes the result with param.fixed_mant_width
//   while GEMV_accumulate sizes it with dtype_pkg::FixedMantWidth — confirm
//   the two are always equal.
module GEMV_reduction_branch
  import vec_core_pkg::*;
#(
    parameter gemv_cfg_t param = VecCoreDefaultCfg
) (
    input logic clk,
    input logic rst_n,

    input logic IN_weight_valid,
    input logic [param.weight_width - 1:0] IN_weight[0:param.weight_cnt-1],

    input logic fmap_ready,
    input logic [16:0] IN_num_recur,

    input logic IN_activated_lane,
    input logic [param.fixed_mant_width+2:0] IN_fmap_LUT [0:param.fmap_cache_out_cnt-1][0:param.weight_width-1],

    output logic [param.fixed_mant_width+2:0] OUT_GEMV_result_vector[0:param.gemv_batch-1],
    output logic OUT_valid
);

  // Sum produced by the reduction tree and its valid flag.
  logic [param.fixed_mant_width+2:0] reduction_result_wire;

  logic reduction_res_valid_wire;

  // GEMV_reduction is configured through its `param` struct. The previous
  // overrides (fmap_cache_out_cnt / weight_type / line_cnt) named parameters
  // the module does not declare, and `line_cnt` was not defined in this scope.
  GEMV_reduction #(
      .param(param)
  ) u_GEMV_reduction (
      .clk  (clk),
      .rst_n(rst_n),

      .IN_fmap_LUT(IN_fmap_LUT),
      .IN_valid(IN_weight_valid),

      .IN_is_lane_active(IN_activated_lane),
      .IN_weight(IN_weight),

      .OUT_reduction_result(reduction_result_wire),
      .OUT_reduction_res_valid(reduction_res_valid_wire)
  );

  // Forward this module's configuration rather than the package default so a
  // parent override is honoured.
  GEMV_accumulate #(
      .param(param)
  ) u_GEMV_accumulate (
      .clk  (clk),
      .rst_n(rst_n),

      .IN_reduction_result(reduction_result_wire),
      .init(fmap_ready),

      .IN_valid(reduction_res_valid_wire),
      .IN_num_recur(IN_num_recur),
      .OUT_GEMV_result_vector(OUT_GEMV_result_vector),
      .OUT_acc_valid(OUT_valid)
  );

endmodule
GEMV_reduction.sv
`timescale 1ns / 1ps

`include "GEMV_Vec_Matrix_MUL.svh"
`include "GLOBAL_CONST.svh"

// GEMV_reduction: LUT-select followed by a 32 -> 1 pipelined adder tree.
//
// Data path (one register per step):
//   step 0 : reduction_32_fmap_wire[lane] <= IN_fmap_LUT[lane][IN_weight[lane]]
//   step 1 : 16 x DSP48E2 adders, P register          (32 -> 16)
//   step 2 : fabric adders + register                 (16 -> 8)
//   step 3 : fabric adders + register                 ( 8 -> 4)
//   step 4 : fabric adders + register                 ( 4 -> 2)
//   step 5 : fabric adder  + register                 ( 2 -> 1)
// OUT_reduction_result therefore changes six clock edges after the edge that
// samples IN_valid; OUT_reduction_res_valid is delayed by the same amount.
//
// Parameters
//   param             : configuration struct (widths and lane count).
//   REDUCTION_LATENCY : number of registered adder stages (steps 1..5). The
//                       valid pipeline adds one more stage for step 0. The
//                       adder tree itself is fixed; this value only sets the
//                       valid delay.
//
// The tree is hard-wired for 32 inputs; a simulation-only check below reports
// a configuration whose fmap_cache_out_cnt is different.
//
// NOTE(review): IN_fmap_LUT's second dimension has param.weight_width entries
//   but is indexed with a param.weight_width-bit weight — confirm whether the
//   depth should be 2**param.weight_width (shared port shape, not changed here).
module GEMV_reduction
  import vec_core_pkg::*;
#(
    parameter gemv_cfg_t param = VecCoreDefaultCfg,
    parameter int REDUCTION_LATENCY = 5
) (
    input logic clk,
    input logic rst_n,
    input logic IN_is_lane_active,
    input logic IN_valid,
    input logic [param.fixed_mant_width+2:0] IN_fmap_LUT[0:param.fmap_cache_out_cnt-1][0:param.weight_width-1],
    input logic [param.weight_width - 1:0] IN_weight[0:param.weight_cnt -1],

    output logic [param.fixed_mant_width+2:0] OUT_reduction_result,
    output logic OUT_reduction_res_valid
);
  // Width of every partial sum.
  localparam int SumW = param.fixed_mant_width + 3;

  // Depth of the valid shift register: the adder stages plus the step-0
  // operand-select register. With only REDUCTION_LATENCY stages the valid flag
  // was raised one cycle before OUT_reduction_result held the matching sum.
  localparam int ValidDepth = REDUCTION_LATENCY + 1;

  // synthesis translate_off
  initial begin
    assert (param.fmap_cache_out_cnt == 32)
    else $fatal(1, "GEMV_reduction: adder tree is built for 32 inputs, fmap_cache_out_cnt = %0d",
                param.fmap_cache_out_cnt);
  end
  // synthesis translate_on

  // ===| pipeline Valid sync REG |===
  logic [ValidDepth-1:0] valid_pipe;

  always_ff @(posedge clk) begin
    if (!rst_n) begin
      valid_pipe <= '0;
    end else begin
      // Shift in "input accepted" (IN_valid and lane enabled) every cycle.
      valid_pipe <= {valid_pipe[ValidDepth-2:0], (IN_valid & IN_is_lane_active)};
    end
  end

  assign OUT_reduction_res_valid = valid_pipe[ValidDepth-1];

  //2^5
  logic [SumW-1:0] reduction_32_fmap_wire[0:31];
  //2^4
  logic [SumW-1:0] reduction_16_fmap_wire[0:15];
  //2^3
  logic [SumW-1:0] reduction_8_fmap_wire [ 0:7];
  //2^2
  logic [SumW-1:0] reduction_4_fmap_wire [ 0:3];
  //2^1
  logic [SumW-1:0] reduction_2_fmap_wire [ 0:1];

  // ===| Step 0: operand select |===============================================
  // One LUT entry per lane is latched when an input is accepted. These
  // registers have no reset value and hold their contents otherwise.
  always_ff @(posedge clk) begin
    if (rst_n && IN_valid && IN_is_lane_active) begin
      for (int lane = 0; lane < param.fmap_cache_out_cnt; lane++) begin
        reduction_32_fmap_wire[lane] <= IN_fmap_LUT[lane][IN_weight[lane]];
      end
    end
  end

  // ===| Stage 1: Reduction 32 -> 16 |==========================================
  // Instantiates 16 DSP48E2 slices to add adjacent pairs of the 32 input wires
  // ============================================================================
  generate
    genvar i;
    for (i = 0; i < 16; i++) begin : gen_dsp_reduce_32_to_16

      // --- Internal 48-bit wires for DSP port matching ---
      logic [47:0] dsp_in_ab;
      logic [47:0] dsp_in_c;
      logic [47:0] dsp_out_p;

      // Operand 1: Even index (2*i)  -> routed to the A:B ports
      // Operand 2: Odd index (2*i+1) -> routed to the C port
      // The cast zero-fills the upper bits. Only the low SumW bits of P are
      // kept below, and those do not depend on the upper operand bits.
      assign dsp_in_ab = 48'(reduction_32_fmap_wire[2*i]);
      assign dsp_in_c  = 48'(reduction_32_fmap_wire[2*i+1]);

      DSP48E2 #(
          // "TWO24" breaks the carry chain at bit 24. Use "ONE48" for full precision.
          .USE_SIMD("ONE48"),

          // Adder-only use: the multiplier is bypassed (X mux = A:B), so it is
          // switched off and its pipeline register disabled.
          .USE_MULT("NONE"),
          .MREG(0),

          // --- Register Control (0 = Comb, 1 = Registered) ---
          .AREG(0),
          .BREG(0),
          // Cascade register depth may not exceed AREG/BREG.
          .ACASCREG(0),
          .BCASCREG(0),
          .CREG(0),
          .PREG(1)   // Enable P register (1 clock delay for sum)
      ) u_dsp (
          // --- Clock and Reset ---
          .CLK (clk),
          .RSTP(~rst_n), // Reset for P register (Active High inside DSP)

          // --- Operation Mode (Fixed for A:B + C) ---
          .ALUMODE(4'b0000),         // 0000 = ADD
          .INMODE (5'b00000),        // Default A and B routing
          .OPMODE (9'b000_00_11_11), // W=00, Z=000, Y=C, X=A:B  =>  P = A:B + C

          // --- Clock Enables (Tie to high for continuous pipeline) ---
          .CEP(1'b1),  // Enable P register updates

          // --- Data Inputs ---
          .A(dsp_in_ab[47:18]),  // Upper 30 bits go to A port
          .B(dsp_in_ab[17:0]),   // Lower 18 bits go to B port
          .C(dsp_in_c),          // 48 bits go to C port

          // --- Data Output ---
          .P(dsp_out_p)  // 48-bit Result
      );

      // --- Truncate the 48-bit result back to the partial-sum width ---
      // (The slice was previously sized by fmap_cache_out_cnt, which is a
      // lane count, not a bit width.)
      assign reduction_16_fmap_wire[i] = dsp_out_p[SumW-1:0];
    end
  endgenerate

  // ============================================================================
  // ===| REDUCTION TREE: LUT-based Pipelined Adders |===========================
  // Stages 2..5 are plain registered additions left to the synthesis tool.
  // ============================================================================

  // ===| Stage 2: Reduction 16 -> 8 |===========================================
  // Latency: 1 Clock Cycle
  always_ff @(posedge clk) begin
    if (!rst_n) begin
      for (int k = 0; k < 8; k++) begin
        reduction_8_fmap_wire[k] <= '0;
      end
    end else begin
      for (int k = 0; k < 8; k++) begin
        reduction_8_fmap_wire[k] <= reduction_16_fmap_wire[2*k] + reduction_16_fmap_wire[2*k+1];
      end
    end
  end

  // ===| Stage 3: Reduction 8 -> 4 |============================================
  // Latency: 1 Clock Cycle
  always_ff @(posedge clk) begin
    if (!rst_n) begin
      for (int k = 0; k < 4; k++) begin
        reduction_4_fmap_wire[k] <= '0;
      end
    end else begin
      for (int k = 0; k < 4; k++) begin
        reduction_4_fmap_wire[k] <= reduction_8_fmap_wire[2*k] + reduction_8_fmap_wire[2*k+1];
      end
    end
  end

  // ===| Stage 4: Reduction 4 -> 2 |============================================
  // Latency: 1 Clock Cycle
  always_ff @(posedge clk) begin
    if (!rst_n) begin
      for (int k = 0; k < 2; k++) begin
        reduction_2_fmap_wire[k] <= '0;
      end
    end else begin
      for (int k = 0; k < 2; k++) begin
        reduction_2_fmap_wire[k] <= reduction_4_fmap_wire[2*k] + reduction_4_fmap_wire[2*k+1];
      end
    end
  end

  // ===| Stage 5: Reduction 2 -> 1 (Final Sum) |================================
  // Latency: 1 Clock Cycle
  always_ff @(posedge clk) begin
    if (!rst_n) begin
      OUT_reduction_result <= '0;
    end else begin
      OUT_reduction_result <= reduction_2_fmap_wire[0] + reduction_2_fmap_wire[1];
    end
  end

  // ============================================================================
  // Total pipeline latency:
  // Step 0  (operand select) : 1 Cycle
  // Stage 1 (DSP: 32->16)    : 1 Cycle (PREG only)
  // Stage 2 (LUT: 16->8)     : 1 Cycle
  // Stage 3 (LUT: 8->4)      : 1 Cycle
  // Stage 4 (LUT: 4->2)      : 1 Cycle
  // Stage 5 (LUT: 2->1)      : 1 Cycle
  // ----------------------------------------
  // Total latency after FMap/Weight input: 6 Cycles (= ValidDepth)
  // ============================================================================

endmodule
GEMV_accumulate.sv
`timescale 1ns / 1ps

`include "GEMV_Vec_Matrix_MUL.svh"
`include "GLOBAL_CONST.svh"

// GEMV Operation
//
// [Shape]
// vector (1, N)
// dot Matrix (N, M)
// result (1, M)
//
// calc/per clk: Vec(1,32) dot Mat(32,32) = 32
// gemv_cycle: 512 clk
// Throughput/per clk: 1
// Throughput/per-GEMV: 512 (1p/clk * 512clk)
// GEMV-PIPE-CNT: 4
// 512 * 4 = 2048
// GEMV Throughput per [cycle]: (1, 2048)
//
// ===| Warning |===
// Before the results are sent to the ACP they must be
// type-cast from FP32 to BF16 (2 bytes); the e-max must then be
// found and the results aligned in groups of 32.



// GEMV_accumulate: per-lane result accumulator.
//
// Holds param.gemv_batch accumulators. Every accepted input
// (IN_valid while OUT_acc_valid is low) is added to the accumulator selected
// by res_vec_idx; the index then advances and wraps after the last entry, and
// the remaining-count register num_recur is decremented.
//
// Ports
//   IN_reduction_result    : value to add.
//   init                   : starts a run — clears the index, loads num_recur
//                            from IN_num_recur and arms the done flag. Has
//                            priority over accumulation in the same cycle.
//   IN_valid               : qualifies IN_reduction_result.
//   IN_num_recur           : number of inputs to accept in the run.
//   OUT_GEMV_result_vector : accumulator contents (wired directly).
//   OUT_acc_valid          : one-cycle pulse once num_recur has reached zero
//                            after an `init`.
//
// Change from the previous revision: OUT_acc_valid used to toggle every other
// cycle whenever num_recur was zero — including right after reset, before any
// run had been started. It is now gated by `run_armed`, so it pulses exactly
// once per `init`.
//
// NOTE(review): `init` does not clear the accumulators; only reset does. If a
//   new run is meant to start from zero, a clear is missing — confirm intent.
// NOTE(review): inputs arriving while num_recur is already zero are still
//   accumulated and make num_recur wrap around — confirm the producer never
//   sends more than IN_num_recur values per run.
module GEMV_accumulate
  import vec_core_pkg::*;
#(
    parameter gemv_cfg_t param = VecCoreDefaultCfg
) (
    input logic clk,
    input logic rst_n,
    input logic [dtype_pkg::FixedMantWidth+2:0] IN_reduction_result,

    input logic init,
    input logic IN_valid,

    input logic [16:0] IN_num_recur,

    output logic [dtype_pkg::FixedMantWidth+2:0] OUT_GEMV_result_vector[0:param.gemv_batch - 1],
    output logic OUT_acc_valid
);

  // Index width derived from the number of accumulators (9 bits for 512).
  localparam int IdxW = (param.gemv_batch > 1) ? $clog2(param.gemv_batch) : 1;

  logic [dtype_pkg::FixedMantWidth+2:0] GEMV_result_vector[0:param.gemv_batch - 1];

  logic [IdxW-1:0] res_vec_idx;  // accumulator written by the next input
  logic [16:0]     num_recur;    // inputs still expected in this run
  logic            run_armed;    // set by init, cleared when done is pulsed

  // Preventing flip-flop replication: Direct-wire the internal accumulation register to the output port
  assign OUT_GEMV_result_vector = GEMV_result_vector;

  always_ff @(posedge clk) begin
    if (!rst_n) begin
      OUT_acc_valid <= 1'b0;
      res_vec_idx   <= '0;
      num_recur     <= '0;
      run_armed     <= 1'b0;
      for (int vec_idx = 0; vec_idx < param.gemv_batch; vec_idx++) begin
        GEMV_result_vector[vec_idx] <= '0;
      end
    end else if (init) begin
      // New run: restart the index and load the expected input count.
      // OUT_acc_valid is intentionally left unchanged in this branch.
      res_vec_idx <= '0;
      num_recur   <= IN_num_recur;
      run_armed   <= 1'b1;
    end else begin

      // Default: done flag is a single-cycle pulse.
      OUT_acc_valid <= 1'b0;

      if (IN_valid && !OUT_acc_valid) begin
        GEMV_result_vector[res_vec_idx] <= GEMV_result_vector[res_vec_idx] + IN_reduction_result;

        // Explicit wrap after the last accumulator. For a power-of-two
        // gemv_batch this equals the natural counter overflow
        // (... -> 510 -> 511 -> 0 -> 1 for 512 entries).
        if (res_vec_idx == IdxW'(param.gemv_batch - 1)) begin
          res_vec_idx <= '0;
        end else begin
          res_vec_idx <= res_vec_idx + 1'b1;
        end
        num_recur <= num_recur - 1'b1;
      end

      // Report completion once per armed run.
      if (run_armed && (num_recur == '0) && !OUT_acc_valid) begin
        OUT_acc_valid <= 1'b1;
        run_armed     <= 1'b0;
      end
    end
  end

endmodule