전처리¶
전처리 스테이지는 ACP 로 호스트에서 들어오는 원시 BF16 피처맵을 받아 DSP48E2 MAC 이 이해하는 27 비트 고정소수점 표현으로 변환하고, 코어별 L1 캐시에 올립니다.
더 보기
- pccx: Parallel Compute Core eXecutor
  최상위 다이어그램의 preprocess_fmap 박스.
모듈¶
- preprocess_fmap.sv — 행렬·벡터 코어 양쪽에 공급하는 전처리 최상위 래퍼.
- fmap_cache.sv — L1 피처맵 캐시. 소비자당 1 슬라이스.
- preprocess_bf16_fixed_pipeline.sv — BF16 → 27 비트 고정소수점 변환 파이프라인 (Emax 정렬 포함).
소스¶
preprocess_fmap.sv
`include "GLOBAL_CONST.svh"
`timescale 1ns / 1ps
`include "GEMM_Array.svh"
`include "npu_interfaces.svh"
/**
 * Module: preprocess_fmap
 *
 * Role:
 * - Buffers the feature-map AXI4-Stream coming from the ACP port in a FIFO.
 * - Extracts the 8-bit BF16 exponent field of every element and stores the
 *   exponents in a small cache (emax_cache_mem).
 * - Feeds the stream through preprocess_bf16_fixed_pipeline to obtain
 *   fixed-point mantissas.
 * - Writes the mantissas into fmap_cache, which broadcasts them to the
 *   compute engines (branch point).
 *
 * Parameters:
 *   fmap_width - Kept for interface compatibility. The internal data path is
 *                fixed at 256 bits because both the FIFO and the converter
 *                below use a 256-bit tdata.
 *
 * Ports:
 *   clk, rst_n       - Clock and active-low synchronous reset.
 *   i_clear          - Synchronous clear of the write pointers / word toggle.
 *   S_AXIS_ACP_FMAP  - AXI4-Stream slave carrying the raw BF16 feature map.
 *   i_rd_start       - Starts a read sweep of the feature-map cache.
 *   o_fmap_broadcast - Fixed-point word replicated to every lane.
 *   o_fmap_valid     - Qualifies o_fmap_broadcast.
 *   o_cached_emax    - Registered read of the exponent cache.
 */
module preprocess_fmap #(
    parameter fmap_width = `ACP_PORT_IN
) (
    input logic clk,
    input logic rst_n,
    input logic i_clear,

    // AXI4-Stream interface from ACP.
    // NOTE(review): the FIFO below is configured for a 256-bit tdata; confirm
    // that S_AXIS_ACP_FMAP.tdata really is 256 bits wide.
    axis_if.slave S_AXIS_ACP_FMAP,

    // Control from Brain
    input logic i_rd_start,

    // Output to Branch Engines (Systolic / GEMV / CVO)
    output logic [`FIXED_MANT_WIDTH-1:0] o_fmap_broadcast[0:`ARRAY_SIZE_H-1],
    output logic                         o_fmap_valid,
    output logic [ `BF16_EXP_WIDTH-1:0]  o_cached_emax   [0:`ARRAY_SIZE_H-1]
);

    // Width of the FIFO / converter data path: 16 BF16 elements per beat.
    // A single constant keeps the FIFO TDATA_WIDTH and the signal declaration
    // in sync (the signal was previously declared [fmap_width:0], i.e. one bit
    // wider than the parameter and unrelated to the FIFO width).
    localparam int FMAP_FIFO_WIDTH = 256;

    // ===| 256-bit FIFO for FMap |=======
    logic [FMAP_FIFO_WIDTH-1:0] fmap_fifo_data;
    logic                       fmap_fifo_valid;
    logic                       fmap_fifo_ready;

    xpm_fifo_axis #(
        .FIFO_DEPTH(`XPM_FIFO_DEPTH),
        .TDATA_WIDTH(FMAP_FIFO_WIDTH),
        .FIFO_MEMORY_TYPE("block"),
        .CLOCKING_MODE("common_clock")
    ) u_fmap_fifo (
        .s_aclk(clk),
        .m_aclk(clk),
        .s_aresetn(rst_n),
        .s_axis_tdata(S_AXIS_ACP_FMAP.tdata),
        .s_axis_tvalid(S_AXIS_ACP_FMAP.tvalid),
        .s_axis_tready(S_AXIS_ACP_FMAP.tready),
        .m_axis_tdata(fmap_fifo_data),
        .m_axis_tvalid(fmap_fifo_valid),
        .m_axis_tready(fmap_fifo_ready)
    );

    // ===| e_max parsing & cache logic |=======
    // Two consecutive FIFO beats (16 elements each) form one 32-element group.
    // fmap_word_toggle selects which half of active_emax the current beat
    // fills; emax_group_valid pulses for one cycle after the second beat.
    // NOTE(review): the literal 16 / k+16 indexing assumes `ARRAY_SIZE_H >= 32.
    logic [`BF16_EXP_WIDTH-1:0] active_emax[0:`ARRAY_SIZE_H-1];
    logic                       fmap_word_toggle;
    logic                       emax_group_valid;

    always_ff @(posedge clk) begin
        if (!rst_n || i_clear) begin
            fmap_word_toggle <= 1'b0;
            emax_group_valid <= 1'b0;
        end else if (fmap_fifo_valid && fmap_fifo_ready) begin
            fmap_word_toggle <= ~fmap_word_toggle;
            for (int k = 0; k < 16; k++) begin
                // Bits [14:7] of each 16-bit element are the BF16 exponent.
                if (fmap_word_toggle == 1'b0) active_emax[k]    <= fmap_fifo_data[(k*16)+7+:8];
                else                          active_emax[k+16] <= fmap_fifo_data[(k*16)+7+:8];
            end
            emax_group_valid <= (fmap_word_toggle == 1'b1);
        end else begin
            emax_group_valid <= 1'b0;
        end
    end

    logic [`BF16_EXP_WIDTH-1:0] emax_cache_mem[0:1023][0:`ARRAY_SIZE_H-1];
    logic [9:0] emax_wr_addr, emax_rd_addr;

    // Write side: one cache entry per completed 32-element group.
    always_ff @(posedge clk) begin
        if (!rst_n || i_clear) begin
            emax_wr_addr <= 0;
        end else if (emax_group_valid) begin
            for (int i = 0; i < `ARRAY_SIZE_H; i++) begin
                emax_cache_mem[emax_wr_addr][i] <= active_emax[i];
            end
            emax_wr_addr <= emax_wr_addr + 1;
        end
    end

    // Read side: registered read of the entry selected by emax_rd_addr.
    // NOTE(review): emax_rd_addr is only ever cleared and never incremented,
    // so o_cached_emax always reflects entry 0. The intended advance condition
    // is not visible in this module; confirm before relying on other entries.
    always_ff @(posedge clk) begin
        if (!rst_n || i_clear) begin
            emax_rd_addr <= 0;
        end else if (i_rd_start) begin
            emax_rd_addr <= 0;
        end
        for (int i = 0; i < `ARRAY_SIZE_H; i++) begin
            o_cached_emax[i] <= emax_cache_mem[emax_rd_addr][i];
        end
    end

    // ===| Mantissa Shifter & SRAM Cache |=======
    logic [431:0] fixed_fmap;
    logic         fixed_fmap_valid;
    logic         fmap_shifter_ready;

    // NOTE(review): the converter is reset by rst_n only; i_clear does not
    // reset its internal low/high phase.
    preprocess_bf16_fixed_pipeline u_fmap_shifter (
        .clk(clk),
        .rst_n(rst_n),
        .s_axis_tdata(fmap_fifo_data),
        .s_axis_tvalid(fmap_fifo_valid),
        .s_axis_tready(fmap_shifter_ready),
        .m_axis_tdata(fixed_fmap),
        .m_axis_tvalid(fixed_fmap_valid),
        .m_axis_tready(1'b1)
    );

    // The FIFO is drained whenever the converter accepts data.
    assign fmap_fifo_ready = fmap_shifter_ready;

    // Write pointer into the feature-map cache: one 432-bit row per valid beat.
    logic [6:0] sram_wr_addr;

    always_ff @(posedge clk) begin
        if (!rst_n || i_clear) sram_wr_addr <= 0;
        else if (fixed_fmap_valid) sram_wr_addr <= sram_wr_addr + 1;
    end

    fmap_cache #(
        .DATA_WIDTH(`FIXED_MANT_WIDTH),
        .WRITE_LANES(16),
        .CACHE_DEPTH(`FMAP_CACHE_DEPTH),
        .LANES(`ARRAY_SIZE_H)
    ) u_fmap_sram (
        .clk(clk),
        .rst_n(rst_n),
        .wr_data(fixed_fmap),
        .wr_valid(fixed_fmap_valid),
        .wr_addr(sram_wr_addr),
        .wr_en(1'b1),
        .rd_start(i_rd_start),
        .rd_data_broadcast(o_fmap_broadcast),
        .rd_valid(o_fmap_valid)
    );

endmodule
fmap_cache.sv
`timescale 1ns / 1ps
`include "GEMM_Array.svh"
`include "GLOBAL_CONST.svh"
/**
 * Module: fmap_cache
 *
 * Description:
 *   Block-RAM feature-map cache used for the GEMV decode path.
 *   - Holds CACHE_DEPTH fixed-point words of DATA_WIDTH bits each.
 *   - Write port: one row of WRITE_LANES words (DATA_WIDTH*WRITE_LANES bits).
 *   - Read port: one DATA_WIDTH word per clock, replicated onto LANES outputs.
 *
 *   After rd_start the read address sweeps from 0 up to CACHE_DEPTH-1, one
 *   address per clock, and then stops. rd_valid is asserted together with the
 *   corresponding word on rd_data_broadcast.
 */
module fmap_cache #(
    parameter DATA_WIDTH  = 27,    // Fixed-point mantissa width
    parameter WRITE_LANES = 16,    // Words per write
    parameter CACHE_DEPTH = 2048,  // Number of cached words
    parameter LANES       = 32     // Number of output copies
) (
    input logic clk,
    input logic rst_n,

    // ===| Write Interface (From BF16-to-Fixed Shifter) |=======
    input logic [(DATA_WIDTH*WRITE_LANES)-1:0] wr_data,
    input logic                                wr_valid,
    input logic [                         6:0] wr_addr,  // 128 write rows
    input logic                                wr_en,

    // ===| Read Interface (To Staggered Delay Line) |=======
    input  logic                  rd_start,  // Begin a broadcast sweep
    output logic [DATA_WIDTH-1:0] rd_data_broadcast[0:LANES-1],  // identical copies
    output logic                  rd_valid
);

    // ===| Internal signals |=======
    logic [DATA_WIDTH-1:0] bram_word;     // Word returned by the RAM read port
    logic [          10:0] sweep_addr;    // Current read address
    logic                  sweep_active;  // High while the sweep is running
    logic [           1:0] valid_sr;      // Delay line matching the RAM read latency
    logic                  bram_we;

    assign bram_we = wr_valid & wr_en;

    // ===| Storage: XPM simple dual-port RAM with asymmetric ports |=======
    // Port A writes a full row; port B reads a single word. In GEMV one
    // feature-map element is multiplied against a whole weight row, so only
    // one word per clock is fetched and it is fanned out to every lane.
    xpm_memory_sdpram #(
        .ADDR_WIDTH_A           (7),                         // Write: 128 rows
        .ADDR_WIDTH_B           (11),                        // Read: 2048 words
        .AUTO_SLEEP_TIME        (0),
        .BYTE_WRITE_WIDTH_A     (DATA_WIDTH * WRITE_LANES),  // Whole-row write
        .CLOCKING_MODE          ("common_clock"),
        .MEMORY_INIT_FILE       ("none"),
        .MEMORY_INIT_PARAM      ("0"),
        .MEMORY_OPTIMIZATION    ("true"),
        .MEMORY_PRIMITIVE       ("block"),                   // Force BRAM usage
        .MEMORY_SIZE            (DATA_WIDTH * CACHE_DEPTH),  // Total bits
        .MESSAGE_CONTROL        (0),
        .READ_DATA_WIDTH_B      (DATA_WIDTH),
        .READ_LATENCY_B         (2),                         // Two-cycle read
        .USE_EMBEDDED_CONSTRAINT(0),
        .USE_MEM_INIT           (1),
        .WAKEUP_TIME            ("disable_sleep"),
        .WRITE_DATA_WIDTH_A     (DATA_WIDTH * WRITE_LANES),
        .WRITE_MODE_B           ("read_first")
    ) u_fmap_bram (
        .clka          (clk),
        .ena           (1'b1),
        .wea           (bram_we),
        .addra         (wr_addr),
        .dina          (wr_data),
        .injectsbiterra(1'b0),
        .injectdbiterra(1'b0),
        .clkb          (clk),
        .enb           (sweep_active),
        .addrb         (sweep_addr),
        .doutb         (bram_word),
        .sbiterrb      (),
        .dbiterrb      (),
        .sleep         (1'b0),
        .rstb          (~rst_n),
        .regceb        (1'b1)
    );

    // ===| Read address sweep |=======
    // rd_start has priority and (re)starts the sweep at address 0.
    always_ff @(posedge clk) begin
        if (!rst_n) begin
            sweep_active <= 1'b0;
            sweep_addr   <= 11'd0;
        end else if (rd_start) begin
            sweep_active <= 1'b1;
            sweep_addr   <= 11'd0;
        end else if (sweep_active) begin
            if (sweep_addr != CACHE_DEPTH - 1) begin
                sweep_addr <= sweep_addr + 1;
            end else begin
                sweep_active <= 1'b0;  // Last address issued: stop
            end
        end
    end

    // ===| Valid pipeline |=======
    // sweep_active is delayed by the two RAM latency cycles (valid_sr) plus
    // the output register below, so rd_valid lines up with rd_data_broadcast.
    always_ff @(posedge clk) begin
        if (!rst_n) begin
            valid_sr <= 2'b00;
            rd_valid <= 1'b0;
        end else begin
            valid_sr <= {valid_sr[0], sweep_active};
            rd_valid <= valid_sr[1];
        end
    end

    // ===| Broadcast register |=======
    // Captures the RAM output when it is valid and copies it to every lane.
    // Not cleared by reset; it simply holds its value while in reset.
    always_ff @(posedge clk) begin
        if (rst_n && valid_sr[1]) begin
            for (int lane = 0; lane < LANES; lane++) begin
                rd_data_broadcast[lane] <= bram_word;
            end
        end
    end

endmodule
preprocess_bf16_fixed_pipeline.sv
`include "GLOBAL_CONST.svh"
`timescale 1ns / 1ps
`include "GEMM_Array.svh"
/**
 * Module: preprocess_bf16_fixed_pipeline
 *
 * Description:
 *   16-lane pipelined BF16 to fixed-point converter.
 *   - Input : 256 bits (16 BF16 elements) per valid beat.
 *   - Block : 32 elements, i.e. two consecutive valid beats (low, then high).
 *   - Operation:
 *       1. Finds the largest exponent (global e_max) among the 32 elements.
 *       2. Right-shifts every 27-bit mantissa by (e_max - exponent) and
 *          converts it to two's complement according to the sign bit.
 *   - Output: 432 bits (16 x 27-bit) per beat, low half first, then high half.
 *
 * Handshake:
 *   s_axis_tready is tied high; m_axis_tready is not observed, so the
 *   consumer must accept every output beat.
 */
module preprocess_bf16_fixed_pipeline (
    input logic clk,
    input logic rst_n,

    // AXI-Stream Slave (Input from 256-bit FIFO)
    input  logic [255:0] s_axis_tdata,
    input  logic         s_axis_tvalid,
    output logic         s_axis_tready,

    // AXI-Stream Master (Output to SRAM Cache - 16 x 27-bit = 432-bit)
    output logic [431:0] m_axis_tdata,
    output logic         m_axis_tvalid,
    input  logic         m_axis_tready
);

    // ===| Stage 1: Input Buffering & Local Max Exponent |===
    // The first 16 elements are held while waiting for the next 16.
    logic [255:0] buffer_low;
    logic [  7:0] local_max_low;
    logic         first_half_valid;  // Set on the first half; not read in this module
    logic         phase;             // 0: expecting low 16, 1: expecting high 16

    // Returns the largest 8-bit exponent field (bits [14:7] of each 16-bit
    // element) among the 16 BF16 elements packed in `data`.
    function automatic logic [7:0] find_max_e_16(input logic [255:0] data);
        logic [7:0] max_val;
        max_val = 8'd0;
        for (int i = 0; i < 16; i++) begin
            if (data[(i*16)+7+:8] > max_val) begin
                max_val = data[(i*16)+7+:8];
            end
        end
        return max_val;
    endfunction

    assign s_axis_tready = 1'b1;  // Always ready to sink data in this pipeline design

    // Max exponent of the beat currently on the input. Declared at module
    // scope because a declaration may not follow statements inside a
    // procedural begin/end block.
    logic [7:0] incoming_max;
    assign incoming_max = find_max_e_16(s_axis_tdata);

    // Buffer registers for 32 elements
    logic [255:0] block_data_low;
    logic [255:0] block_data_high;
    logic [  7:0] global_emax;
    logic         block_valid;

    always_ff @(posedge clk) begin
        if (!rst_n) begin
            phase            <= 1'b0;
            first_half_valid <= 1'b0;
            block_valid      <= 1'b0;
        end else if (s_axis_tvalid) begin
            if (phase == 1'b0) begin
                // Store the first 16 elements and their max exponent.
                buffer_low       <= s_axis_tdata;
                local_max_low    <= incoming_max;
                first_half_valid <= 1'b1;
                block_valid      <= 1'b0;
                phase            <= 1'b1;
            end else begin
                // Second 16 elements arrived: form the 32-element block.
                block_data_low  <= buffer_low;
                block_data_high <= s_axis_tdata;
                // Global e_max is the larger of the two half-block maxima.
                global_emax     <= (local_max_low > incoming_max) ? local_max_low : incoming_max;
                block_valid     <= 1'b1;
                phase           <= 1'b0;  // Ready for the next block
            end
        end else begin
            block_valid <= 1'b0;
        end
    end

    // ===| Stage 2: Parallel Shifting (16 Lanes at a time) |===
    // The 32 elements share one set of 16 shifters over two clock cycles:
    //   cycle 1 shifts block_data_low, cycle 2 shifts block_data_high.
    logic         shift_phase;  // 0: shifting low, 1: shifting high
    logic [255:0] shift_target_data;
    logic [  7:0] shift_target_emax;
    logic         shift_trigger;

    always_ff @(posedge clk) begin
        if (!rst_n) begin
            shift_phase   <= 1'b0;
            shift_trigger <= 1'b0;
        end else begin
            if (block_valid) begin
                // Start with the low half and latch the block's e_max.
                shift_phase       <= 1'b0;
                shift_target_data <= block_data_low;
                shift_target_emax <= global_emax;
                shift_trigger     <= 1'b1;
            end else if (shift_trigger && shift_phase == 1'b0) begin
                // Next cycle: the high half, same e_max.
                shift_phase       <= 1'b1;
                shift_target_data <= block_data_high;
                shift_trigger     <= 1'b1;
            end else begin
                shift_trigger <= 1'b0;
            end
        end
    end

    // The 16 parallel shifters (with sign / two's-complement handling)
    logic [431:0] shifted_mantissas;  // 16 * 27-bit

    genvar i;
    generate
        for (i = 0; i < 16; i++) begin : gen_shifters
            logic [15:0] word;
            logic        sign;
            logic [ 7:0] e_val;
            logic [ 6:0] m_val;
            logic        hidden_bit;
            logic [26:0] base_mant;
            logic [26:0] shifted_mant;
            logic [26:0] final_fixed;
            logic [ 7:0] delta_e;

            assign word  = shift_target_data[(i*16)+:16];
            assign sign  = word[15];
            assign e_val = word[14:7];
            assign m_val = word[6:0];

            // 1. Build the magnitude in a 27-bit container:
            //    [26:20] zero headroom, [19] hidden bit, [18:12] mantissa,
            //    [11:0] zero padding. The hidden bit is 0 when the exponent
            //    field is zero. The concatenation is exactly 27 bits wide.
            assign hidden_bit = (e_val != 8'd0);
            assign base_mant  = {7'b0, hidden_bit, m_val, 12'b0};

            assign delta_e = shift_target_emax - e_val;

            // 2. Align to the block e_max by shifting right; anything shifted
            //    by 27 or more positions is flushed to zero.
            assign shifted_mant = (delta_e >= 27) ? 27'd0 : (base_mant >> delta_e);

            // 3. Negative values are converted to two's complement so the
            //    downstream engines can use signed multiply/accumulate.
            assign final_fixed = sign ? (~shifted_mant + 1'b1) : shifted_mant;

            assign shifted_mantissas[(i*27)+:27] = final_fixed;
        end
    endgenerate

    // ===| Stage 3: Output Register |===
    always_ff @(posedge clk) begin
        if (!rst_n) begin
            m_axis_tvalid <= 1'b0;
            m_axis_tdata  <= 0;
        end else begin
            m_axis_tvalid <= shift_trigger;
            if (shift_trigger) begin
                m_axis_tdata <= shifted_mantissas;
            end
        end
    end

endmodule