메모리 제어¶
호스트 DDR ↔ L2 URAM 캐시 ↔ 코어별 L1 캐시 ↔ CVO 스트림 인제스트 사이의
모든 경로를 맡습니다. 전용 디스패처가 ISA §5 에 열거된 8 개 data_route_e
경로를 중재하고, 두 개의 상수 메모리 어레이가 행렬/벡터 코어에 shape 와
size 디스크립터를 공급합니다.
더 보기
- pccx ISA 사양
OP_MEMCPY · OP_MEMSET 인코딩과 라우팅 enum.
최상위 플러밍¶
- mem_dispatcher.sv — 중앙 중재기. 사이클마다 8 개 경로 중 하나를 선택해 대응하는 source/destination 쌍을 구동.
- mem_L2_cache_fmap.sv — 피처맵 전용 L2 URAM 캐시 (114,688 × 128 비트).
- mem_HP_buffer.sv — HP-AXI 슬레이브와 컴퓨트 코어 가중치 FIFO 사이의 더블 버퍼 큐.
- mem_CVO_stream_bridge.sv — L2 캐시에서 CVO 코어의 스트리밍 입력으로 가는 브리지.
mem_dispatcher.sv
`timescale 1ns / 1ps
`include "GEMM_Array.svh"
`include "GLOBAL_CONST.svh"
`include "mem_IO.svh"
import isa_pkg::*;
// ===| Memory Dispatcher |=======================================================
// Translates engine uops into L2 cache commands and routes data streams.
//
// Responsibilities:
// - Shape constant RAM (MEMSET: fmap / weight array shapes)
// - ACP DMA path : host DDR4 ↔ L2 (MEMCPY host↔NPU)
// - NPU burst path : L2 → GEMM fmap / GEMV fmap broadcast
// - CVO stream : L2 → CVO engine input; CVO output → L2
// (via mem_CVO_stream_bridge)
//
// Address convention for L2: 128-bit word units (word N = bytes [16N..16N+15]).
// ===============================================================================
// mem_dispatcher
//
// Decodes MEMSET / LOAD / CVO uops every core clock and turns them into
//   - writes to the two shape-constant RAMs (fmap / weight),
//   - ACP or NPU range commands pushed into mem_u_operation_queue,
//   - a CVO streaming transaction handled by mem_CVO_stream_bridge.
//
// Ports:
//   clk_core / rst_n_core   core clock and active-low synchronous reset
//   clk_axi  / rst_axi_n    AXI clock and reset, forwarded to the L2 wrapper
//   S_AXIS_ACP_FMAP         AXI-Stream input, forwarded to mem_GLOBAL_cache
//   M_AXIS_ACP_RESULT       AXI-Stream output, forwarded from mem_GLOBAL_cache
//   IN_LOAD_uop             memory copy uop; data_dest selects the route
//   IN_mem_set_uop          shape-constant write uop
//   IN_CVO_uop(+_valid)     CVO uop handed unchanged to the stream bridge
//   OUT_cvo_* / IN_cvo_*    16-bit element streams to / from the CVO engine
//   OUT_fifo_full           OR of both command-queue full flags
//   OUT_cvo_busy            stream bridge is not idle
module mem_dispatcher #() (
    input logic clk_core,
    input logic rst_n_core,
    input logic clk_axi,
    input logic rst_axi_n,

    // ===| AXI-Stream ACP (external) |==========================================
    axis_if.slave  S_AXIS_ACP_FMAP,
    axis_if.master M_AXIS_ACP_RESULT,

    // ===| Engine uop inputs |===================================================
    input memory_control_uop_t IN_LOAD_uop,
    input memory_set_uop_t     IN_mem_set_uop,
    input cvo_control_uop_t    IN_CVO_uop,
    input logic                IN_cvo_uop_valid,

    // ===| CVO streaming ports (to/from CVO_top) |==============================
    output logic [15:0] OUT_cvo_data,
    output logic        OUT_cvo_valid,
    input  logic        IN_cvo_data_ready,
    input  logic [15:0] IN_cvo_result,
    input  logic        IN_cvo_result_valid,
    output logic        OUT_cvo_result_ready,

    // ===| Status |=============================================================
    output logic OUT_fifo_full,
    output logic OUT_cvo_busy
);

  // ===| FIFO full aggregation |=================================================
  logic acp_cmd_fifo_full;
  logic npu_cmd_fifo_full;
  logic cvo_bridge_busy;

  assign OUT_fifo_full = acp_cmd_fifo_full | npu_cmd_fifo_full;
  assign OUT_cvo_busy  = cvo_bridge_busy;

  // ===| Shape Constant RAM — FMap |=============================================
  // The write address has its own register. Previously the MEMSET handler and
  // the LOAD handler both assigned fmap_shape_read_address; a variable written
  // in one always_ff must not be written by any other process.
  logic        fmap_write_enable;
  logic [ 5:0] fmap_shape_write_address;  // driven only by the MEMSET handler
  logic [ 5:0] fmap_shape_read_address;   // driven only by the LOAD handler
  logic [16:0] fmap_arr_shape_X;
  logic [16:0] fmap_arr_shape_Y;
  logic [16:0] fmap_arr_shape_Z;
  logic [16:0] fmap_read_arr_shape_X;
  logic [16:0] fmap_read_arr_shape_Y;
  logic [16:0] fmap_read_arr_shape_Z;

  // ===| Shape Constant RAM — Weight |===========================================
  // Only the MEMSET handler drives weight_shape_read_address, so it keeps
  // serving as both write and read address exactly as before.
  logic        weight_write_enable;
  logic [ 5:0] weight_shape_read_address;
  logic [16:0] weight_arr_shape_X;
  logic [16:0] weight_arr_shape_Y;
  logic [16:0] weight_arr_shape_Z;
  logic [16:0] weight_read_arr_shape_X;
  logic [16:0] weight_read_arr_shape_Y;
  logic [16:0] weight_read_arr_shape_Z;

  // ===| MEMSET handler |========================================================
  // Registers address + X/Y/Z for one cycle and pulses the matching write
  // enable; the shape RAM commits the entry on the following clock edge.
  always_ff @(posedge clk_core) begin
    if (!rst_n_core) begin
      fmap_write_enable   <= 1'b0;
      weight_write_enable <= 1'b0;
    end else begin
      // Default: single-cycle pulses.
      fmap_write_enable   <= 1'b0;
      weight_write_enable <= 1'b0;

      case (IN_mem_set_uop.dest_cache)
        data_to_fmap_shape: begin
          fmap_shape_write_address <= IN_mem_set_uop.dest_addr;
          fmap_arr_shape_X         <= IN_mem_set_uop.a_value;
          fmap_arr_shape_Y         <= IN_mem_set_uop.b_value;
          fmap_arr_shape_Z         <= IN_mem_set_uop.c_value;
          fmap_write_enable        <= 1'b1;
        end

        data_to_weight_shape: begin
          weight_shape_read_address <= IN_mem_set_uop.dest_addr;
          weight_arr_shape_X        <= IN_mem_set_uop.a_value;
          weight_arr_shape_Y        <= IN_mem_set_uop.b_value;
          weight_arr_shape_Z        <= IN_mem_set_uop.c_value;
          weight_write_enable       <= 1'b1;
        end

        default: ;
      endcase
    end
  end

  fmap_array_shape u_fmap_shape (
      .clk    (clk_core),
      .rst_n  (rst_n_core),
      .wr_en  (fmap_write_enable),
      .wr_addr(fmap_shape_write_address),
      .wr_val0(fmap_arr_shape_X),
      .wr_val1(fmap_arr_shape_Y),
      .wr_val2(fmap_arr_shape_Z),
      .rd_addr(fmap_shape_read_address),
      .rd_val0(fmap_read_arr_shape_X),
      .rd_val1(fmap_read_arr_shape_Y),
      .rd_val2(fmap_read_arr_shape_Z)
  );

  weight_array_shape u_weight_shape (
      .clk    (clk_core),
      .rst_n  (rst_n_core),
      .wr_en  (weight_write_enable),
      .wr_addr(weight_shape_read_address),
      .wr_val0(weight_arr_shape_X),
      .wr_val1(weight_arr_shape_Y),
      .wr_val2(weight_arr_shape_Z),
      .rd_addr(weight_shape_read_address),
      .rd_val0(weight_read_arr_shape_X),
      .rd_val1(weight_read_arr_shape_Y),
      .rd_val2(weight_read_arr_shape_Z)
  );

  // ===| Shape totals (word counts for DMA) |====================================
  logic [16:0] fmap_word_total;
  logic [16:0] weight_word_total;  // not consumed anywhere in this module

  // Total BF16 elements → 128-bit words: ceil(X*Y*Z / 8).
  // The result is truncated to 17 bits on assignment.
  assign fmap_word_total =
      (fmap_read_arr_shape_X * fmap_read_arr_shape_Y * fmap_read_arr_shape_Z + 7) >> 3;
  assign weight_word_total =
      (weight_read_arr_shape_X * weight_read_arr_shape_Y * weight_read_arr_shape_Z + 7) >> 3;

  // ===| LOAD uop → ACP / NPU command translation |==============================
  logic     IN_acp_rdy;
  acp_uop_t acp_uop;
  logic     acp_rx_start;  // pulsed with IN_acp_rdy; has no reader in this module
  logic     IN_npu_rdy;
  npu_uop_t npu_uop;
  logic     npu_rx_start;  // pulsed with IN_npu_rdy; has no reader in this module

  // NOTE(review): end_addr is built from fmap_word_total, which reflects the
  // shape entry selected by the *currently registered* fmap_shape_read_address.
  // The same branch updates that register from IN_LOAD_uop.shape_ptr_addr with a
  // non-blocking assignment, so the word count used for this uop belongs to the
  // previously selected shape entry. Confirm whether the scheduler guarantees
  // the pointer is already set, or whether the lookup should be combinational.
  always_ff @(posedge clk_core) begin
    if (!rst_n_core) begin
      acp_rx_start            <= 1'b0;
      npu_rx_start            <= 1'b0;
      IN_acp_rdy              <= 1'b0;
      IN_npu_rdy              <= 1'b0;
      fmap_shape_read_address <= '0;  // keeps the shape lookup X-free after reset
    end else begin
      // Default: push / start strobes last exactly one cycle.
      acp_rx_start <= 1'b0;
      npu_rx_start <= 1'b0;
      IN_acp_rdy   <= 1'b0;
      IN_npu_rdy   <= 1'b0;

      case (IN_LOAD_uop.data_dest)
        // Host DDR4 → L2 (feature map DMA in)
        from_host_to_L2: begin
          acp_uop <= '{
              write_en : `PORT_MOD_E_WRITE,
              base_addr : IN_LOAD_uop.dest_addr,
              end_addr : IN_LOAD_uop.dest_addr + 17'(fmap_word_total)
          };
          acp_rx_start            <= 1'b1;
          IN_acp_rdy              <= 1'b1;
          fmap_shape_read_address <= IN_LOAD_uop.shape_ptr_addr;
        end

        // L2 → host DDR4 (result DMA out)
        from_L2_to_host: begin
          acp_uop <= '{
              write_en : `PORT_MOD_E_READ,
              base_addr : IN_LOAD_uop.src_addr,
              end_addr : IN_LOAD_uop.src_addr + 17'(fmap_word_total)
          };
          acp_rx_start            <= 1'b1;
          IN_acp_rdy              <= 1'b1;
          fmap_shape_read_address <= IN_LOAD_uop.shape_ptr_addr;
        end

        // L2 → GEMM fmap broadcast
        from_L2_to_L1_GEMM: begin
          npu_uop <= '{
              write_en : `PORT_MOD_E_READ,
              base_addr : IN_LOAD_uop.src_addr,
              end_addr : IN_LOAD_uop.src_addr + 17'(fmap_word_total)
          };
          npu_rx_start            <= 1'b1;
          IN_npu_rdy              <= 1'b1;
          fmap_shape_read_address <= IN_LOAD_uop.shape_ptr_addr;
        end

        // L2 → GEMV fmap broadcast
        from_L2_to_L1_GEMV: begin
          npu_uop <= '{
              write_en : `PORT_MOD_E_READ,
              base_addr : IN_LOAD_uop.src_addr,
              end_addr : IN_LOAD_uop.src_addr + 17'(fmap_word_total)
          };
          npu_rx_start            <= 1'b1;
          IN_npu_rdy              <= 1'b1;
          fmap_shape_read_address <= IN_LOAD_uop.shape_ptr_addr;
        end

        // L2 → CVO input: nothing to do here, the bridge watches IN_CVO_uop.
        from_L2_to_CVO: ;

        default: ;
      endcase
    end
  end

  // ===| Operation queues |======================================================
  acp_uop_t OUT_acp_cmd;
  npu_uop_t OUT_npu_cmd;
  logic     OUT_acp_cmd_valid;
  logic     OUT_npu_cmd_valid;
  logic     acp_is_busy_wire;
  logic     npu_is_busy_wire;

  mem_u_operation_queue #() u_op_queue (
      .clk_core             (clk_core),
      .rst_n_core           (rst_n_core),
      .IN_acp_rdy           (IN_acp_rdy),
      .IN_acp_cmd           (acp_uop),
      .OUT_acp_cmd          (OUT_acp_cmd),
      .OUT_acp_cmd_valid    (OUT_acp_cmd_valid),
      .OUT_acp_cmd_fifo_full(acp_cmd_fifo_full),
      .IN_acp_is_busy       (acp_is_busy_wire),
      .IN_npu_rdy           (IN_npu_rdy),
      .IN_npu_cmd           (npu_uop),
      .OUT_npu_cmd          (OUT_npu_cmd),
      .OUT_npu_cmd_valid    (OUT_npu_cmd_valid),
      .OUT_npu_cmd_fifo_full(npu_cmd_fifo_full),
      .IN_npu_is_busy       (npu_is_busy_wire)
  );

  // ===| L2 cache controller |===================================================
  logic         cvo_l2_we;
  logic [ 16:0] cvo_l2_addr;
  logic [127:0] cvo_l2_wdata;
  logic [127:0] cvo_l2_rdata;

  // npu_l2_we / npu_l2_addr / npu_l2_wdata have no driver in this module.
  logic         npu_l2_we;
  logic [ 16:0] npu_l2_addr;
  logic [127:0] npu_l2_wdata;
  logic [127:0] npu_l2_rdata;

  // Port B arbitration: the CVO bridge wins while it is busy.
  // NOTE(review): only final_npu_wdata reaches mem_GLOBAL_cache. That module
  // exposes no external write-enable / address inputs for port B, so
  // final_npu_we and final_npu_addr are computed but never used, and the
  // bridge's addresses do not reach the URAM. Completing this path requires an
  // interface change in mem_GLOBAL_cache.
  logic         final_npu_we;
  logic [ 16:0] final_npu_addr;
  logic [127:0] final_npu_wdata;

  always_comb begin
    if (cvo_bridge_busy) begin
      final_npu_we    = cvo_l2_we;
      final_npu_addr  = cvo_l2_addr;
      final_npu_wdata = cvo_l2_wdata;
    end else begin
      final_npu_we    = npu_l2_we;
      final_npu_addr  = npu_l2_addr;
      final_npu_wdata = npu_l2_wdata;
    end
  end

  // Shared read bus: the bridge sees whatever port B returns.
  assign cvo_l2_rdata = npu_l2_rdata;

  mem_GLOBAL_cache #() u_l2_cache (
      .clk_core         (clk_core),
      .rst_n_core       (rst_n_core),
      .clk_axi          (clk_axi),
      .rst_axi_n        (rst_axi_n),
      .S_AXIS_ACP_FMAP  (S_AXIS_ACP_FMAP),
      .M_AXIS_ACP_RESULT(M_AXIS_ACP_RESULT),
      // ACP control
      .IN_acp_write_en  (OUT_acp_cmd.write_en),
      .IN_acp_base_addr (OUT_acp_cmd.base_addr),
      .IN_acp_end_addr  (OUT_acp_cmd.end_addr),
      .IN_acp_rx_start  (OUT_acp_cmd_valid),
      .OUT_acp_is_busy  (acp_is_busy_wire),
      // NPU port B
      .IN_npu_write_en  (OUT_npu_cmd.write_en),
      .IN_npu_base_addr (OUT_npu_cmd.base_addr),
      .IN_npu_end_addr  (OUT_npu_cmd.end_addr),
      .IN_npu_rx_start  (OUT_npu_cmd_valid),
      .OUT_npu_is_busy  (npu_is_busy_wire),
      .IN_npu_wdata     (final_npu_wdata),
      .OUT_npu_rdata    (npu_l2_rdata)
  );

  // ===| CVO Stream Bridge |=====================================================
  logic cvo_bridge_done;  // completion pulse; has no reader in this module

  mem_CVO_stream_bridge u_cvo_bridge (
      .clk                 (clk_core),
      .rst_n               (rst_n_core),
      .IN_cvo_uop          (IN_CVO_uop),
      .IN_cvo_uop_valid    (IN_cvo_uop_valid),
      .OUT_busy            (cvo_bridge_busy),
      .OUT_done            (cvo_bridge_done),
      // L2 port B direct access
      .OUT_l2_we           (cvo_l2_we),
      .OUT_l2_addr         (cvo_l2_addr),
      .OUT_l2_wdata        (cvo_l2_wdata),
      .IN_l2_rdata         (cvo_l2_rdata),
      // CVO data stream
      .OUT_cvo_data        (OUT_cvo_data),
      .OUT_cvo_valid       (OUT_cvo_valid),
      .IN_cvo_data_ready   (IN_cvo_data_ready),
      // CVO result stream
      .IN_cvo_result       (IN_cvo_result),
      .IN_cvo_result_valid (IN_cvo_result_valid),
      .OUT_cvo_result_ready(OUT_cvo_result_ready)
  );

endmodule
mem_L2_cache_fmap.sv
`timescale 1ns / 1ps
`include "GLOBAL_CONST.svh"
// ===| L2 Feature Map & KV Cache (URAM) |========================================
// True dual-port URAM: Depth x 128-bit wide.
// Default Depth = 114688 entries = 1.75 MB
//
// Port A — ACP DMA path (host DDR4 ↔ L2 via ACP)
// Port B — NPU compute (GEMM / GEMV / CVO streaming R/W)
//
// READ_LATENCY = 3 (URAM registered output, meets 400 MHz timing)
// WRITE_MODE = write_first (read-before-write on same address is undefined)
// ===============================================================================
// mem_L2_cache_fmap
//
// Thin wrapper around one xpm_memory_tdpram instance mapped to UltraRAM.
// Both ports are 128 bits wide, word addressed with 17 bits, share clk_core and
// have a read latency of three clocks.
//
// Parameters:
//   Depth          number of 128-bit entries; MEMORY_SIZE is 128 * Depth bits.
//                  The 17-bit address ports cover up to 131072 entries.
// Ports:
//   clk_core       clock for both memory ports
//   rst_n_core     active-low reset, inverted onto rsta / rstb
//   IN_acp_*       port A write enable / address / write data
//   OUT_acp_rdata  port A read data
//   IN_npu_*       port B write enable / address / write data
//   OUT_npu_rdata  port B read data
module mem_L2_cache_fmap #(
    parameter int Depth = 114688  // 128-bit word entries (1.75 MB)
) (
    input logic clk_core,
    input logic rst_n_core,

    // ===| Port A — ACP host DMA |================================================
    input  logic         IN_acp_we,
    input  logic [ 16:0] IN_acp_addr,
    input  logic [127:0] IN_acp_wdata,
    output logic [127:0] OUT_acp_rdata,

    // ===| Port B — NPU compute engines |=========================================
    input  logic         IN_npu_we,
    input  logic [ 16:0] IN_npu_addr,
    input  logic [127:0] IN_npu_wdata,
    output logic [127:0] OUT_npu_rdata
);

  localparam int WordWidth = 128;  // bits per L2 entry
  localparam int AddrWidth = 17;   // matches the width of the address ports

  // xpm_memory_tdpram has no DATA_WIDTH_A/B parameters. Each port's width is
  // set through WRITE_DATA_WIDTH_* and READ_DATA_WIDTH_*; using the wrong names
  // fails elaboration.
  // BYTE_WRITE_WIDTH equal to the data width makes wea / web a single bit
  // (whole-word write enable).
  // NOTE(review): confirm that the targeted Vivado release accepts
  // "write_first" for an UltraRAM-mapped true-dual-port memory.
  xpm_memory_tdpram #(
      // ===| Geometry |===
      .ADDR_WIDTH_A           (AddrWidth),
      .ADDR_WIDTH_B           (AddrWidth),
      .WRITE_DATA_WIDTH_A     (WordWidth),
      .READ_DATA_WIDTH_A      (WordWidth),
      .BYTE_WRITE_WIDTH_A     (WordWidth),
      .WRITE_DATA_WIDTH_B     (WordWidth),
      .READ_DATA_WIDTH_B      (WordWidth),
      .BYTE_WRITE_WIDTH_B     (WordWidth),
      .MEMORY_SIZE            (WordWidth * Depth),
      // ===| Implementation |===
      .MEMORY_PRIMITIVE       ("ultra"),  // Force URAM on UltraScale+
      .CLOCKING_MODE          ("common_clock"),
      .READ_LATENCY_A         (3),
      .READ_LATENCY_B         (3),
      .WRITE_MODE_A           ("write_first"),
      .WRITE_MODE_B           ("write_first"),
      // ===| Init / Misc |===
      .MEMORY_INIT_FILE       ("none"),
      .MEMORY_INIT_PARAM      ("0"),
      .USE_MEM_INIT           (0),
      .AUTO_SLEEP_TIME        (0),
      .WAKEUP_TIME            ("disable_sleep"),
      .ECC_MODE               ("no_ecc"),
      .USE_EMBEDDED_CONSTRAINT(0)
  ) u_l2_uram (
      .sleep         (1'b0),  // never enter the dynamic power-saving mode
      // Port A
      .clka          (clk_core),
      .rsta          (~rst_n_core),
      .ena           (1'b1),
      .wea           (IN_acp_we),
      .addra         (IN_acp_addr),
      .dina          (IN_acp_wdata),
      .douta         (OUT_acp_rdata),
      .regcea        (1'b1),
      .injectsbiterra(1'b0),
      .injectdbiterra(1'b0),
      .sbiterra      (),
      .dbiterra      (),
      // Port B (clkb is connected for completeness under common_clock)
      .clkb          (clk_core),
      .rstb          (~rst_n_core),
      .enb           (1'b1),
      .web           (IN_npu_we),
      .addrb         (IN_npu_addr),
      .dinb          (IN_npu_wdata),
      .doutb         (OUT_npu_rdata),
      .regceb        (1'b1),
      .injectsbiterrb(1'b0),
      .injectdbiterrb(1'b0),
      .sbiterrb      (),
      .dbiterrb      ()
  );

endmodule
mem_HP_buffer.sv
`timescale 1ns / 1ps
`include "GEMM_Array.svh"
`include "GLOBAL_CONST.svh"
// mem_HP_buffer
//
// Moves four 128-bit weight streams from the AXI clock domain into the core
// clock domain and buffers them deeply in UltraRAM.
//
// UltraRAM has a single clock, so it cannot back an independent-clock FIFO.
// Each channel is therefore built from two stages:
//   1. u_hpN_cdc_fifo     small independent-clock FIFO (clk_axi → clk_core)
//   2. u_hpN_weight_fifo  deep common-clock URAM FIFO running on clk_core
// The external stream behaviour is unchanged apart from the extra latency of
// the first stage.
//
// Ports:
//   clk_core / rst_n_core     core clock and active-low reset
//   clk_axi  / rst_axi_n      AXI clock and active-low reset
//   S_AXI_HP{0..3}_WEIGHT     weight streams in (AXI clock domain)
//   M_CORE_HP{0..3}_WEIGHT    weight streams out (core clock domain)
module mem_HP_buffer #(
) (
    // ===| Clock & Reset |======================================
    input logic clk_core,   // 400MHz
    input logic rst_n_core,
    input logic clk_axi,    // 250MHz
    input logic rst_axi_n,

    // ===| HP Ports (Weight) - AXI Side |=======================
    axis_if.slave S_AXI_HP0_WEIGHT,
    axis_if.slave S_AXI_HP1_WEIGHT,
    axis_if.slave S_AXI_HP2_WEIGHT,
    axis_if.slave S_AXI_HP3_WEIGHT,

    // ===| Weight Stream - Core Side (To L1 or Dispatcher) |====
    axis_if.master M_CORE_HP0_WEIGHT,
    axis_if.master M_CORE_HP1_WEIGHT,
    axis_if.master M_CORE_HP2_WEIGHT,
    axis_if.master M_CORE_HP3_WEIGHT
);

  // Define large depth for URAM (4096 x 128 bit uses 2 URAM blocks per FIFO).
  localparam int URAM_FIFO_DEPTH = 4096;
  // The clock-crossing stage only has to absorb synchroniser latency.
  localparam int CDC_FIFO_DEPTH  = 32;
  localparam int WEIGHT_WIDTH    = 128;

  // Core-domain links between the CDC stage and the URAM stage, one per channel.
  logic [WEIGHT_WIDTH-1:0] cdc_tdata [4];
  logic                    cdc_tvalid[4];
  logic                    cdc_tready[4];

  // ===| [1] HP0 |=============================================================
  xpm_fifo_axis #(
      .FIFO_DEPTH      (CDC_FIFO_DEPTH),
      .TDATA_WIDTH     (WEIGHT_WIDTH),
      .FIFO_MEMORY_TYPE("distributed"),
      .CLOCKING_MODE   ("independent_clock")
  ) u_hp0_cdc_fifo (
      .s_aclk       (clk_axi),
      .s_aresetn    (rst_axi_n),
      .s_axis_tdata (S_AXI_HP0_WEIGHT.tdata),
      .s_axis_tvalid(S_AXI_HP0_WEIGHT.tvalid),
      .s_axis_tready(S_AXI_HP0_WEIGHT.tready),
      .m_aclk       (clk_core),
      .m_axis_tdata (cdc_tdata[0]),
      .m_axis_tvalid(cdc_tvalid[0]),
      .m_axis_tready(cdc_tready[0])
  );

  xpm_fifo_axis #(
      .FIFO_DEPTH      (URAM_FIFO_DEPTH),
      .TDATA_WIDTH     (WEIGHT_WIDTH),
      .FIFO_MEMORY_TYPE("ultra"),        // Forces UltraRAM
      .CLOCKING_MODE   ("common_clock")  // URAM supports a single clock only
  ) u_hp0_weight_fifo (
      .s_aclk       (clk_core),
      .s_aresetn    (rst_n_core),
      .s_axis_tdata (cdc_tdata[0]),
      .s_axis_tvalid(cdc_tvalid[0]),
      .s_axis_tready(cdc_tready[0]),
      .m_aclk       (clk_core),
      .m_axis_tdata (M_CORE_HP0_WEIGHT.tdata),
      .m_axis_tvalid(M_CORE_HP0_WEIGHT.tvalid),
      .m_axis_tready(M_CORE_HP0_WEIGHT.tready)
  );

  // ===| [2] HP1 |=============================================================
  xpm_fifo_axis #(
      .FIFO_DEPTH      (CDC_FIFO_DEPTH),
      .TDATA_WIDTH     (WEIGHT_WIDTH),
      .FIFO_MEMORY_TYPE("distributed"),
      .CLOCKING_MODE   ("independent_clock")
  ) u_hp1_cdc_fifo (
      .s_aclk       (clk_axi),
      .s_aresetn    (rst_axi_n),
      .s_axis_tdata (S_AXI_HP1_WEIGHT.tdata),
      .s_axis_tvalid(S_AXI_HP1_WEIGHT.tvalid),
      .s_axis_tready(S_AXI_HP1_WEIGHT.tready),
      .m_aclk       (clk_core),
      .m_axis_tdata (cdc_tdata[1]),
      .m_axis_tvalid(cdc_tvalid[1]),
      .m_axis_tready(cdc_tready[1])
  );

  xpm_fifo_axis #(
      .FIFO_DEPTH      (URAM_FIFO_DEPTH),
      .TDATA_WIDTH     (WEIGHT_WIDTH),
      .FIFO_MEMORY_TYPE("ultra"),
      .CLOCKING_MODE   ("common_clock")
  ) u_hp1_weight_fifo (
      .s_aclk       (clk_core),
      .s_aresetn    (rst_n_core),
      .s_axis_tdata (cdc_tdata[1]),
      .s_axis_tvalid(cdc_tvalid[1]),
      .s_axis_tready(cdc_tready[1]),
      .m_aclk       (clk_core),
      .m_axis_tdata (M_CORE_HP1_WEIGHT.tdata),
      .m_axis_tvalid(M_CORE_HP1_WEIGHT.tvalid),
      .m_axis_tready(M_CORE_HP1_WEIGHT.tready)
  );

  // ===| [3] HP2 |=============================================================
  xpm_fifo_axis #(
      .FIFO_DEPTH      (CDC_FIFO_DEPTH),
      .TDATA_WIDTH     (WEIGHT_WIDTH),
      .FIFO_MEMORY_TYPE("distributed"),
      .CLOCKING_MODE   ("independent_clock")
  ) u_hp2_cdc_fifo (
      .s_aclk       (clk_axi),
      .s_aresetn    (rst_axi_n),
      .s_axis_tdata (S_AXI_HP2_WEIGHT.tdata),
      .s_axis_tvalid(S_AXI_HP2_WEIGHT.tvalid),
      .s_axis_tready(S_AXI_HP2_WEIGHT.tready),
      .m_aclk       (clk_core),
      .m_axis_tdata (cdc_tdata[2]),
      .m_axis_tvalid(cdc_tvalid[2]),
      .m_axis_tready(cdc_tready[2])
  );

  xpm_fifo_axis #(
      .FIFO_DEPTH      (URAM_FIFO_DEPTH),
      .TDATA_WIDTH     (WEIGHT_WIDTH),
      .FIFO_MEMORY_TYPE("ultra"),
      .CLOCKING_MODE   ("common_clock")
  ) u_hp2_weight_fifo (
      .s_aclk       (clk_core),
      .s_aresetn    (rst_n_core),
      .s_axis_tdata (cdc_tdata[2]),
      .s_axis_tvalid(cdc_tvalid[2]),
      .s_axis_tready(cdc_tready[2]),
      .m_aclk       (clk_core),
      .m_axis_tdata (M_CORE_HP2_WEIGHT.tdata),
      .m_axis_tvalid(M_CORE_HP2_WEIGHT.tvalid),
      .m_axis_tready(M_CORE_HP2_WEIGHT.tready)
  );

  // ===| [4] HP3 |=============================================================
  xpm_fifo_axis #(
      .FIFO_DEPTH      (CDC_FIFO_DEPTH),
      .TDATA_WIDTH     (WEIGHT_WIDTH),
      .FIFO_MEMORY_TYPE("distributed"),
      .CLOCKING_MODE   ("independent_clock")
  ) u_hp3_cdc_fifo (
      .s_aclk       (clk_axi),
      .s_aresetn    (rst_axi_n),
      .s_axis_tdata (S_AXI_HP3_WEIGHT.tdata),
      .s_axis_tvalid(S_AXI_HP3_WEIGHT.tvalid),
      .s_axis_tready(S_AXI_HP3_WEIGHT.tready),
      .m_aclk       (clk_core),
      .m_axis_tdata (cdc_tdata[3]),
      .m_axis_tvalid(cdc_tvalid[3]),
      .m_axis_tready(cdc_tready[3])
  );

  xpm_fifo_axis #(
      .FIFO_DEPTH      (URAM_FIFO_DEPTH),
      .TDATA_WIDTH     (WEIGHT_WIDTH),
      .FIFO_MEMORY_TYPE("ultra"),
      .CLOCKING_MODE   ("common_clock")
  ) u_hp3_weight_fifo (
      .s_aclk       (clk_core),
      .s_aresetn    (rst_n_core),
      .s_axis_tdata (cdc_tdata[3]),
      .s_axis_tvalid(cdc_tvalid[3]),
      .s_axis_tready(cdc_tready[3]),
      .m_aclk       (clk_core),
      .m_axis_tdata (M_CORE_HP3_WEIGHT.tdata),
      .m_axis_tvalid(M_CORE_HP3_WEIGHT.tvalid),
      .m_axis_tready(M_CORE_HP3_WEIGHT.tready)
  );

endmodule
mem_CVO_stream_bridge.sv
`timescale 1ns / 1ps
`include "GLOBAL_CONST.svh"
import isa_pkg::*;
// ===| CVO L2 Stream Bridge |====================================================
// Bridges the 128-bit L2 port B to the 16-bit BF16 streaming interface of CVO_top.
//
// Operation flow:
// Phase 1 — READ : sequential 128-bit bursts from L2[src_addr..src_addr+N_words-1]
// → deserialise 8 x 16-bit per word → stream to CVO engine.
// CVO results are buffered in an internal XPM FIFO.
// Phase 2 — WRITE: drain FIFO → serialise 8 x 16-bit → 128-bit bursts
// → write to L2[dst_addr..dst_addr+N_words-1].
//
// L2 address unit : 128-bit words. base_addr N ↔ bytes [16*N .. 16*N+15].
// L2 read latency : 3 clocks (URAM READ_LATENCY_B = 3).
// Max vector length: 2048 elements (16-bit each = 32 KB → fits in 1 BRAM36).
// ===============================================================================
// mem_CVO_stream_bridge
//
// Converts between 128-bit L2 words and the 16-bit element streams of the CVO
// engine. One transaction runs IDLE → READ → WRITE → DONE:
//   READ   fetch ceil(length/8) words starting at src_addr/8, hand out one
//          16-bit element per accepted cycle (lane 0 first), and buffer the
//          returned results in a FIFO.
//   WRITE  pack buffered results, eight per word (lane 0 first), and write
//          them starting at dst_addr/8. Results that arrive late are still
//          accepted in this state.
//   DONE   one-cycle OUT_done pulse, then back to IDLE.
// The bridge expects exactly `length` results per transaction.
//
// Ports:
//   clk / rst_n             clock and active-low synchronous reset
//   IN_cvo_uop(+_valid)     uop sampled in IDLE (src_addr, dst_addr, length)
//   OUT_busy                high whenever the FSM is not in IDLE
//   OUT_done                one-cycle completion pulse
//   OUT_l2_we/addr/wdata    L2 port B request (write has priority over read)
//   IN_l2_rdata             L2 read data, L2_RD_LATENCY clocks after the address
//   OUT_cvo_data/valid      element stream to the CVO engine
//   IN_cvo_data_ready       CVO engine accepts the current element
//   IN_cvo_result(+_valid)  result stream from the CVO engine
//   OUT_cvo_result_ready    bridge can accept a result this cycle
module mem_CVO_stream_bridge (
    input logic clk,
    input logic rst_n,

    // ===| Dispatch from mem_dispatcher |========================================
    input  cvo_control_uop_t IN_cvo_uop,
    input  logic             IN_cvo_uop_valid,
    output logic             OUT_busy,
    output logic             OUT_done,

    // ===| L2 port B direct interface (128-bit) |================================
    // Single-address mux: write takes priority over read.
    output logic         OUT_l2_we,
    output logic [ 16:0] OUT_l2_addr,
    output logic [127:0] OUT_l2_wdata,
    input  logic [127:0] IN_l2_rdata,

    // ===| CVO data stream (to CVO_top.IN_data) |=================================
    output logic [15:0] OUT_cvo_data,
    output logic        OUT_cvo_valid,
    input  logic        IN_cvo_data_ready,

    // ===| CVO result stream (from CVO_top.OUT_result) |==========================
    input  logic [15:0] IN_cvo_result,
    input  logic        IN_cvo_result_valid,
    output logic        OUT_cvo_result_ready
);

  // Clocks between presenting a read address and valid data on IN_l2_rdata
  // (URAM READ_LATENCY_B = 3).
  localparam int L2_RD_LATENCY = 3;

  // ===| State Machine |=========================================================
  typedef enum logic [1:0] {
    ST_IDLE  = 2'b00,
    ST_READ  = 2'b01,  // reading L2 → CVO (buffering outputs)
    ST_WRITE = 2'b10,  // draining buffer → L2
    ST_DONE  = 2'b11
  } bridge_state_e;

  bridge_state_e state;

  // ===| Latched UOP |===========================================================
  logic [16:0] rd_base;      // L2 word address of src
  logic [16:0] wr_base;      // L2 word address of dst
  logic [15:0] total_elems;  // CVO length (elements)
  logic [12:0] total_words;  // ceil(total_elems / 8)

  always_comb begin
    total_words = 13'((total_elems + 16'd7) >> 3);
  end

  // ===| Read-side state |=======================================================
  logic [ 12:0] rd_word_cnt;   // words requested so far
  logic [  2:0] rd_elem_idx;   // lane currently offered to the CVO engine
  logic [127:0] rd_deser_buf;  // latched 128-bit L2 word
  logic         rd_buf_valid;  // deser buffer holds undelivered elements
  logic [ 15:0] elems_fed;     // elements delivered to CVO

  // Read tracker. Bit 0 is high in the cycle the address is driven; bit
  // L2_RD_LATENCY is high in the cycle the matching data is on IN_l2_rdata.
  logic [L2_RD_LATENCY:0] rd_lat_pipe;

  // ===| Write-side state |======================================================
  logic [  2:0] wr_elem_idx;    // next lane to fill, 0..7
  logic [127:0] wr_ser_buf;     // word being assembled
  logic [ 12:0] wr_word_cnt;    // words completed so far
  logic [ 15:0] elems_result;   // results drained from FIFO
  logic         wr_word_ready;  // wr_ser_buf holds a finished word to write now

  // ===| Output FIFO (CVO results → write buffer) |==============================
  logic        fifo_wr_en;
  logic        fifo_rd_en;
  logic [15:0] fifo_dout;
  logic        fifo_empty;
  logic        fifo_full;

  // Results are accepted while reading AND while draining. Gating on ST_READ
  // alone would stall or drop results that arrive after the last input element.
  assign OUT_cvo_result_ready = ~fifo_full && (state == ST_READ || state == ST_WRITE);
  assign fifo_wr_en           = IN_cvo_result_valid && OUT_cvo_result_ready;

  // Pop only while results are still owed to the current transaction.
  assign fifo_rd_en = (state == ST_WRITE) && !fifo_empty && (elems_result != total_elems);

  // First-word-fall-through: fifo_dout is valid whenever fifo_empty is low, so
  // the element can be consumed in the same cycle rd_en is asserted.
  // xpm_fifo_sync has a single clock input (wr_clk); it has no rd_clk port.
  xpm_fifo_sync #(
      .FIFO_DEPTH       (2048),
      .WRITE_DATA_WIDTH (16),
      .READ_DATA_WIDTH  (16),
      .FIFO_MEMORY_TYPE ("block"),
      .READ_MODE        ("fwft"),
      .FIFO_READ_LATENCY(0),  // required value for fwft
      .FULL_RESET_VALUE (0)
  ) u_result_fifo (
      .sleep (1'b0),
      .rst   (~rst_n),
      .wr_clk(clk),
      .wr_en (fifo_wr_en),
      .din   (IN_cvo_result),
      .rd_en (fifo_rd_en),
      .dout  (fifo_dout),
      .empty (fifo_empty),
      .full  (fifo_full)
  );

  // ===| Main FSM |==============================================================
  always_ff @(posedge clk) begin
    if (!rst_n) begin
      state         <= ST_IDLE;
      rd_base       <= '0;
      wr_base       <= '0;
      total_elems   <= '0;
      rd_word_cnt   <= '0;
      rd_elem_idx   <= '0;
      rd_deser_buf  <= '0;
      rd_buf_valid  <= 1'b0;
      rd_lat_pipe   <= '0;
      elems_fed     <= '0;
      wr_elem_idx   <= '0;
      wr_ser_buf    <= '0;
      wr_word_cnt   <= '0;
      elems_result  <= '0;
      wr_word_ready <= 1'b0;
      OUT_done      <= 1'b0;
    end else begin
      OUT_done <= 1'b0;

      case (state)
        // ===| IDLE: latch uop, convert element addresses to word addresses |===
        ST_IDLE: begin
          if (IN_cvo_uop_valid) begin
            // src/dst are element (16-bit) addresses; divide by 8 for 128-bit words
            rd_base       <= 17'(IN_cvo_uop.src_addr >> 3);
            wr_base       <= 17'(IN_cvo_uop.dst_addr >> 3);
            total_elems   <= IN_cvo_uop.length;
            rd_word_cnt   <= '0;
            rd_elem_idx   <= '0;
            rd_buf_valid  <= 1'b0;
            rd_lat_pipe   <= '0;
            elems_fed     <= '0;
            wr_elem_idx   <= '0;
            wr_ser_buf    <= '0;
            wr_word_cnt   <= '0;
            elems_result  <= '0;
            wr_word_ready <= 1'b0;
            // A zero-length operation completes at once; no result would ever
            // arrive to move the FSM out of ST_READ.
            state <= (IN_cvo_uop.length == '0) ? ST_DONE : ST_READ;
          end
        end

        // ===| READ: stream L2 → CVO; capture results in FIFO |================
        ST_READ: begin
          rd_lat_pipe <= {rd_lat_pipe[L2_RD_LATENCY-1:0], 1'b0};

          // Request the next word only when the buffer is drained AND no read
          // is in flight; otherwise one request would be issued per cycle of
          // read latency and words would be skipped.
          if (!rd_buf_valid && (rd_lat_pipe == '0) && (rd_word_cnt < total_words)) begin
            rd_lat_pipe[0] <= 1'b1;  // address is driven in the next cycle
            rd_word_cnt    <= rd_word_cnt + 13'd1;
          end

          // Data is on IN_l2_rdata L2_RD_LATENCY cycles after the address cycle.
          if (rd_lat_pipe[L2_RD_LATENCY]) begin
            rd_deser_buf <= IN_l2_rdata;
            rd_buf_valid <= 1'b1;
            rd_elem_idx  <= 3'd0;
          end

          // Hand one element per accepted cycle to the CVO engine.
          if (rd_buf_valid && IN_cvo_data_ready) begin
            rd_elem_idx <= rd_elem_idx + 3'd1;
            elems_fed   <= elems_fed + 16'd1;
            if (rd_elem_idx == 3'd7 || elems_fed + 16'd1 == total_elems) begin
              rd_buf_valid <= 1'b0;
            end
          end

          // All inputs delivered and at least one result buffered: start
          // draining. Remaining results are still captured in ST_WRITE.
          if (elems_fed == total_elems && !fifo_empty) begin
            state <= ST_WRITE;
          end
        end

        // ===| WRITE: drain FIFO → L2 |=========================================
        ST_WRITE: begin
          wr_word_ready <= 1'b0;

          if (fifo_rd_en) begin
            // Lane 0 starts a fresh word; clearing the upper lanes keeps a
            // final partial word zero-padded.
            if (wr_elem_idx == 3'd0) wr_ser_buf <= {112'd0, fifo_dout};
            else wr_ser_buf[wr_elem_idx*16+:16] <= fifo_dout;

            elems_result <= elems_result + 16'd1;

            // Word complete: eighth lane filled, or final element received.
            if (wr_elem_idx == 3'd7 || elems_result + 16'd1 == total_elems) begin
              wr_elem_idx   <= 3'd0;
              wr_word_cnt   <= wr_word_cnt + 13'd1;
              wr_word_ready <= 1'b1;
            end else begin
              wr_elem_idx <= wr_elem_idx + 3'd1;
            end
          end

          // Leave only after the last word has been presented to L2.
          if (elems_result == total_elems && !wr_word_ready) begin
            state <= ST_DONE;
          end
        end

        // ===| DONE: pulse, return to IDLE |====================================
        ST_DONE: begin
          OUT_done <= 1'b1;
          state    <= ST_IDLE;
        end

        default: state <= ST_IDLE;
      endcase
    end
  end

  // ===| L2 port B output mux |==================================================
  // Priority: write (WRITE phase) > read (READ phase)
  always_comb begin
    OUT_l2_we    = 1'b0;
    OUT_l2_addr  = '0;
    OUT_l2_wdata = '0;

    if (state == ST_WRITE && wr_word_ready) begin
      // wr_word_cnt was already incremented for this word, hence the -1.
      OUT_l2_we    = 1'b1;
      OUT_l2_addr  = 17'(wr_base + (wr_word_cnt - 13'd1));
      OUT_l2_wdata = wr_ser_buf;
    end else if (state == ST_READ && rd_lat_pipe[0]) begin
      // rd_word_cnt was already incremented for this request, hence the -1.
      OUT_l2_we   = 1'b0;
      OUT_l2_addr = 17'(rd_base + (rd_word_cnt - 13'd1));
    end
  end

  // ===| CVO data output |=======================================================
  // Select the current 16-bit lane of the deserialisation buffer.
  always_comb begin
    OUT_cvo_data  = rd_deser_buf[rd_elem_idx*16+:16];
    OUT_cvo_valid = rd_buf_valid && (state == ST_READ);
  end

  // ===| Status |================================================================
  assign OUT_busy = (state != ST_IDLE);

endmodule
메모리 모듈¶
- mem_GLOBAL_cache.sv — L2 URAM 의 물리적 백킹으로 쓰이는 매개변수화된 글로벌 캐시 블록.
- mem_BUFFER.sv — HP 버퍼와 CVO 브리지가 사용하는 범용 ping-pong 버퍼.
mem_GLOBAL_cache.sv
`timescale 1ns / 1ps
`include "GEMM_Array.svh"
`include "GLOBAL_CONST.svh"
`include "mem_IO.svh"
import isa_pkg::*;
// ===| Global L2 Cache Controller |==============================================
// Wraps mem_L2_cache_fmap (URAM) and provides two access controllers:
//
// Port A — ACP DMA : host DDR4 ↔ L2 via AXI-Stream (with CDC FIFO via mem_BUFFER)
// Port B — NPU : compute engines (GEMM / GEMV / CVO) streaming R/W
//
// Arbitration: Port B is driven externally via IN_npu_* signals.
// Address unit: 128-bit words (address 0 = first 128-bit line).
// ===============================================================================
// mem_GLOBAL_cache
//
// Wraps the L2 URAM (mem_L2_cache_fmap) with two address-range sequencers:
//   Port A  walks [base_addr, end_addr) for ACP transfers. Write mode pulls
//           words from the AXI→core CDC FIFO; read mode pushes words into the
//           core→AXI CDC FIFO.
//   Port B  walks [base_addr, end_addr) one word per clock for the compute side.
// A transfer starts when IN_*_rx_start is high while that sequencer is idle.
// Addresses are in 128-bit words.
//
// Ports:
//   clk_core / rst_n_core      core clock and active-low synchronous reset
//   clk_axi  / rst_axi_n       AXI clock and reset for the CDC FIFOs
//   S_AXIS_ACP_FMAP            stream in (AXI clock domain)
//   M_AXIS_ACP_RESULT          stream out (AXI clock domain)
//   IN_acp_write_en            direction latched at start: 1 = write L2, 0 = read L2
//   IN_acp_base_addr/end_addr  first word / one past the last word
//   IN_acp_rx_start            start request, honoured only while idle
//   OUT_acp_is_busy            port A sequencer is running
//   IN_npu_* / OUT_npu_*       same meaning for port B, plus write/read data
module mem_GLOBAL_cache (
    input logic clk_core,
    input logic rst_n_core,
    input logic clk_axi,
    input logic rst_axi_n,

    // ===| AXI-Stream ACP (external, AXI clock domain) |========================
    axis_if.slave  S_AXIS_ACP_FMAP,    // feature map in  (128-bit, from PS/DDR4)
    axis_if.master M_AXIS_ACP_RESULT,  // result out      (128-bit, to PS/DDR4)

    // ===| Port A — ACP DMA control |============================================
    input  logic        IN_acp_write_en,   // 1=write (DDR→L2), 0=read (L2→DDR)
    input  logic [16:0] IN_acp_base_addr,
    input  logic        IN_acp_rx_start,   // start ACP transfer
    input  logic [16:0] IN_acp_end_addr,
    output logic        OUT_acp_is_busy,

    // ===| Port B — NPU compute direct access |==================================
    input  logic         IN_npu_write_en,
    input  logic [ 16:0] IN_npu_base_addr,
    input  logic         IN_npu_rx_start,
    input  logic [ 16:0] IN_npu_end_addr,
    output logic         OUT_npu_is_busy,
    input  logic [127:0] IN_npu_wdata,
    output logic [127:0] OUT_npu_rdata
);

  // ===| ACP CDC FIFO (AXI ↔ Core clock) |=======================================
  axis_if #(.DATA_WIDTH(128)) core_acp_rx_bus ();
  axis_if #(.DATA_WIDTH(128)) core_acp_tx_bus ();

  mem_BUFFER u_acp_cdc (
      .clk_core         (clk_core),
      .rst_n_core       (rst_n_core),
      .clk_axi          (clk_axi),
      .rst_axi_n        (rst_axi_n),
      .S_AXIS_ACP_FMAP  (S_AXIS_ACP_FMAP),
      .M_AXIS_ACP_RESULT(M_AXIS_ACP_RESULT),
      .M_CORE_ACP_RX    (core_acp_rx_bus),
      .S_CORE_ACP_TX    (core_acp_tx_bus)
  );

  // ===| Port A — ACP state machine (core clock domain) |=======================
  logic [16:0] acp_ptr;
  logic        acp_write_en;
  logic        acp_is_busy;
  logic [16:0] acp_end_addr;

  assign OUT_acp_is_busy = acp_is_busy;

  // A write beat is a completed tvalid/tready handshake on the RX stream.
  logic acp_wr_beat;
  assign core_acp_rx_bus.tready = acp_is_busy & acp_write_en;
  assign acp_wr_beat            = acp_is_busy & acp_write_en & core_acp_rx_bus.tvalid;

  // A read is issued only in cycles where the pointer also advances. Tracking
  // busy alone would re-issue the same address while the TX FIFO stalls and
  // emit duplicate beats.
  // NOTE(review): reads already in the URAM pipeline when tready drops are
  // still presented with tvalid and can be lost. A skid buffer or an
  // almost-full threshold on the TX FIFO is needed for lossless back-pressure.
  logic acp_rd_issue;
  assign acp_rd_issue = acp_is_busy & ~acp_write_en & core_acp_tx_bus.tready;

  // ACP read pipeline: URAM READ_LATENCY=3
  logic [2:0] acp_rd_valid_pipe;

  always_ff @(posedge clk_core) begin
    if (!rst_n_core) begin
      acp_rd_valid_pipe <= 3'b000;
    end else begin
      acp_rd_valid_pipe <= {acp_rd_valid_pipe[1:0], acp_rd_issue};
    end
  end

  assign core_acp_tx_bus.tvalid = acp_rd_valid_pipe[2];
  assign core_acp_tx_bus.tkeep  = '1;
  assign core_acp_tx_bus.tlast  = 1'b0;

  always_ff @(posedge clk_core) begin
    if (!rst_n_core) begin
      acp_ptr      <= '0;
      acp_end_addr <= '0;
      acp_is_busy  <= 1'b0;
      acp_write_en <= 1'b0;
    end else begin
      if (acp_is_busy) begin
        // Advance one word per accepted beat; stop once end_addr is reached.
        if (acp_wr_beat || acp_rd_issue) begin
          acp_ptr <= acp_ptr + 17'd1;
          if (acp_ptr + 17'd1 >= acp_end_addr) acp_is_busy <= 1'b0;
        end
      end else if (IN_acp_rx_start) begin
        acp_ptr      <= IN_acp_base_addr;
        acp_end_addr <= IN_acp_end_addr;
        acp_is_busy  <= 1'b1;
        acp_write_en <= IN_acp_write_en;
      end
    end
  end

  // ===| Port B — NPU state machine |============================================
  logic [16:0] npu_ptr;
  logic        npu_write_en;
  logic        npu_is_busy;
  logic [16:0] npu_end_addr;

  assign OUT_npu_is_busy = npu_is_busy;

  always_ff @(posedge clk_core) begin
    if (!rst_n_core) begin
      npu_ptr      <= '0;
      npu_end_addr <= '0;
      npu_is_busy  <= 1'b0;
      npu_write_en <= 1'b0;
    end else begin
      if (npu_is_busy) begin
        // Free-running: one word per clock, no flow control on this port.
        npu_ptr <= npu_ptr + 17'd1;
        if (npu_ptr + 17'd1 >= npu_end_addr) npu_is_busy <= 1'b0;
      end else if (IN_npu_rx_start) begin
        npu_ptr      <= IN_npu_base_addr;
        npu_end_addr <= IN_npu_end_addr;
        npu_is_busy  <= 1'b1;
        npu_write_en <= IN_npu_write_en;
      end
    end
  end

  // ===| L2 URAM |===============================================================
  // Port A → ACP DMA (write when host→L2, read when L2→host)
  // Port B → NPU compute
  mem_L2_cache_fmap #(
      .Depth(114688)
  ) u_l2_uram (
      .clk_core     (clk_core),
      .rst_n_core   (rst_n_core),
      // Port A — ACP. acp_write_en keeps its last value after a transfer ends,
      // so the write strobe must also require acp_is_busy; otherwise a waiting
      // RX word would be written at the idle pointer.
      .IN_acp_we    (acp_wr_beat),
      .IN_acp_addr  (acp_ptr),
      .IN_acp_wdata (core_acp_rx_bus.tdata),
      .OUT_acp_rdata(core_acp_tx_bus.tdata),
      // Port B — NPU compute
      .IN_npu_we    (npu_write_en & npu_is_busy),
      .IN_npu_addr  (npu_ptr),
      .IN_npu_wdata (IN_npu_wdata),
      .OUT_npu_rdata(OUT_npu_rdata)
  );

endmodule
mem_BUFFER.sv
`timescale 1ns / 1ps
`include "GEMM_Array.svh"
`include "GLOBAL_CONST.svh"
// mem_BUFFER
//
// Pair of shallow independent-clock AXI-Stream FIFOs that carry the ACP
// streams across the AXI ↔ core clock boundary:
//   u_acp_rx_fifo : S_AXIS_ACP_FMAP (clk_axi)  → M_CORE_ACP_RX     (clk_core)
//   u_acp_tx_fifo : S_CORE_ACP_TX   (clk_core) → M_AXIS_ACP_RESULT (clk_axi)
// Only tdata / tvalid / tready are connected on either FIFO.
module mem_BUFFER (
    // ===| Clock & Reset |======================================
    input logic clk_core,   // 400MHz
    input logic rst_n_core,
    input logic clk_axi,    // 250MHz
    input logic rst_axi_n,

    // ===| ACP Ports (FMAP/KV) |================================
    axis_if.slave  S_AXIS_ACP_FMAP,    // [RX] Data from DDR4 to NPU
    axis_if.master M_AXIS_ACP_RESULT,  // [TX] Data from NPU to DDR4
    axis_if.master M_CORE_ACP_RX,      // [RX] Converted to 400MHz Core
    axis_if.slave  S_CORE_ACP_TX       // [TX] Coming from 400MHz Core
);

  // Define tiny depth for the BRAM CDC FIFOs. Bulk storage lives in the L2
  // URAM cache, so these only need to cover the clock-crossing latency.
  localparam int BRAM_FIFO_DEPTH = 32;

  // ---------------------------------------------------------------------------
  // [1] ACP RX: AXI domain → core domain
  // ---------------------------------------------------------------------------
  xpm_fifo_axis #(
      .CLOCKING_MODE   ("independent_clock"),
      .FIFO_MEMORY_TYPE("block"),  // BRAM is enough for CDC
      .TDATA_WIDTH     (128),
      .FIFO_DEPTH      (BRAM_FIFO_DEPTH)
  ) u_acp_rx_fifo (
      // write side (AXI clock)
      .s_aclk       (clk_axi),
      .s_aresetn    (rst_axi_n),
      .s_axis_tvalid(S_AXIS_ACP_FMAP.tvalid),
      .s_axis_tready(S_AXIS_ACP_FMAP.tready),
      .s_axis_tdata (S_AXIS_ACP_FMAP.tdata),
      // read side (core clock)
      .m_aclk       (clk_core),
      .m_axis_tvalid(M_CORE_ACP_RX.tvalid),
      .m_axis_tready(M_CORE_ACP_RX.tready),
      .m_axis_tdata (M_CORE_ACP_RX.tdata)
  );

  // ---------------------------------------------------------------------------
  // [2] ACP TX: core domain → AXI domain
  // ---------------------------------------------------------------------------
  xpm_fifo_axis #(
      .CLOCKING_MODE   ("independent_clock"),
      .FIFO_MEMORY_TYPE("block"),
      .TDATA_WIDTH     (128),
      .FIFO_DEPTH      (BRAM_FIFO_DEPTH)
  ) u_acp_tx_fifo (
      // Data flows FROM the core domain ...
      .s_aclk       (clk_core),
      .s_aresetn    (rst_n_core),
      .s_axis_tvalid(S_CORE_ACP_TX.tvalid),
      .s_axis_tready(S_CORE_ACP_TX.tready),
      .s_axis_tdata (S_CORE_ACP_TX.tdata),
      // ... TO the AXI domain (DDR4)
      .m_aclk       (clk_axi),
      .m_axis_tvalid(M_AXIS_ACP_RESULT.tvalid),
      .m_axis_tready(M_AXIS_ACP_RESULT.tready),
      .m_axis_tdata (M_AXIS_ACP_RESULT.tdata)
  );

endmodule
상수 메모리 (shape · size)¶
- fmap_array_shape.sv — shape_ptr_addr 가 참조하는 피처맵 shape 디스크립터 상수 메모리.
- weight_array_shape.sv — 가중치 shape 용으로 같은 구조.
fmap_array_shape.sv
`timescale 1ns / 1ps
// ============================================================
// shape_ram
// - Depth : 64 entries (6-bit address)
// - Width : 51 bits (17-bit × 3 fields)
//
// [write] wr_en=1, wr_addr, wr_val{0,1,2} → next clk
// [ read] rd_addr → at same clk rd_val{0,1,2} out (comb logic)
// ============================================================
// fmap_array_shape
//
// 64-entry register file holding one {z, y, x} shape triple (3 x 17 bit) per
// entry. Writes are registered on clk; reads are combinational on rd_addr.
// A low rst_n at a clock edge clears every entry.
//
// NOTE(review): ptr_addr_t is neither declared nor imported in this file; it
// must be visible through the compilation unit. Confirm it is 6 bits wide so
// that rd_addr stays within mem[0:63].
module fmap_array_shape (
    input logic clk,
    input logic rst_n,

    // ===| write port |===
    input logic        wr_en,
    input logic [ 5:0] wr_addr,
    input logic [16:0] wr_val0,  // shape: x
    input logic [16:0] wr_val1,  // shape: y
    input logic [16:0] wr_val2,  // shape: z

    // ===| read port |===
    input  ptr_addr_t   rd_addr,
    output logic [16:0] rd_val0,  // shape: x
    output logic [16:0] rd_val1,  // shape: y
    output logic [16:0] rd_val2   // shape: z
);

  // Entry layout: [50:34] = val2 (z) / [33:17] = val1 (y) / [16:0] = val0 (x)
  logic [50:0] mem[0:63];

  // Synchronous clear and synchronous write.
  always_ff @(posedge clk) begin
    if (!rst_n) begin
      foreach (mem[entry]) mem[entry] <= '0;
    end else if (wr_en) begin
      mem[wr_addr] <= {wr_val2, wr_val1, wr_val0};
    end
  end

  // Zero-latency read: unpack the addressed entry straight onto the outputs.
  assign {rd_val2, rd_val1, rd_val0} = mem[rd_addr];

endmodule
weight_array_shape.sv
`timescale 1ns / 1ps
// ============================================================
// shape_ram
// - Depth : 64 entries (6-bit address)
// - Width : 51 bits (17-bit × 3 fields)
//
// [write] wr_en=1, wr_addr, wr_val{0,1,2} → next clk
// [ read] rd_addr → at same clk rd_val{0,1,2} out (comb logic)
// ============================================================
// weight_array_shape
//
// Weight-side counterpart of the shape constant RAM: 64 entries, each packing
// three 17-bit fields as {z, y, x}. Registered write, combinational read,
// synchronous clear of all entries while rst_n is low.
//
// NOTE(review): ptr_addr_t is neither declared nor imported in this file; it
// must be visible through the compilation unit. Confirm it is 6 bits wide so
// that rd_addr stays within mem[0:63].
module weight_array_shape (
    input logic clk,
    input logic rst_n,

    // ===| write port |===
    input logic        wr_en,
    input logic [ 5:0] wr_addr,
    input logic [16:0] wr_val0,  // shape: x
    input logic [16:0] wr_val1,  // shape: y
    input logic [16:0] wr_val2,  // shape: z

    // ===| read port |===
    input  ptr_addr_t   rd_addr,
    output logic [16:0] rd_val0,  // shape: x
    output logic [16:0] rd_val1,  // shape: y
    output logic [16:0] rd_val2   // shape: z
);

  // Entry layout: [50:34] = val2 (z) / [33:17] = val1 (y) / [16:0] = val0 (x)
  logic [50:0] mem[0:63];

  // Synchronous clear and synchronous write.
  always_ff @(posedge clk) begin
    if (!rst_n) begin
      foreach (mem[entry]) mem[entry] <= '0;
    end else if (wr_en) begin
      mem[wr_addr] <= {wr_val2, wr_val1, wr_val0};
    end
  end

  // Zero-latency read: unpack the addressed entry straight onto the outputs.
  assign {rd_val2, rd_val1, rd_val0} = mem[rd_addr];

endmodule
IO¶
- mem_IO.svh — AXI / ACP 핀 단위 타입과 매개변수.
- mem_u_operation_queue.sv — 컨트롤러와 메모리 디스패처 사이의 마이크로 옵 큐.
mem_IO.svh
// ===| Memory Port Mode Constants |==============================================
// One-bit direction flags for L2 port commands. mem_dispatcher places them in
// the write_en field of the ACP / NPU uops it builds.
// The include guard makes repeated `include of this header harmless.
// ===============================================================================
`ifndef MEM_IO_SVH
`define MEM_IO_SVH

`define PORT_MOD_E_READ 1'b0  // Port is in read (source) mode
`define PORT_MOD_E_WRITE 1'b1  // Port is in write (sink) mode

`endif  // MEM_IO_SVH
mem_u_operation_queue.sv
`timescale 1ns / 1ps
`include "GEMM_Array.svh"
`include "GLOBAL_CONST.svh"
import isa_pkg::*;
// ===| Memory Operation Queue |==================================================
// Decouples the scheduler from the L2 cache controller.
// Two independent FIFO channels: ACP (host DMA) and NPU (internal compute).
//
// acp_uop_t / npu_uop_t are both 35-bit packed structs:
// {write_en[0], base_addr[16:0], end_addr[16:0]} = 1+17+17 = 35 bits
// ===============================================================================
// mem_u_operation_queue
//
// Two independent 128-deep command FIFOs (ACP and NPU) between the uop decoder
// and the L2 controller.
//
// Push: IN_*_rdy writes IN_*_cmd unless the channel reports full. The full
//       flag is the programmable-full threshold (100 entries), so a push made
//       while it is high is dropped; upstream must honour OUT_*_cmd_fifo_full.
// Pop:  when the consumer is idle (IN_*_is_busy low) and the FIFO is not empty,
//       OUT_*_cmd_valid is high for that cycle and the entry is popped.
//       The FIFOs run in first-word-fall-through mode, so OUT_*_cmd already
//       holds the entry being popped in the same cycle OUT_*_cmd_valid is high.
//
// Ports:
//   clk_core / rst_n_core    clock and active-low reset (inverted onto rst)
//   IN_acp_rdy / IN_acp_cmd  push strobe and command, ACP channel
//   OUT_acp_cmd(+_valid)     head command and its one-cycle valid strobe
//   OUT_acp_cmd_fifo_full    programmable-full flag, ACP channel
//   IN_acp_is_busy           consumer busy; blocks pops while high
//   *_npu_*                  same set for the NPU channel
module mem_u_operation_queue #() (
    input logic clk_core,
    input logic rst_n_core,

    // ===| ACP channel |=========================================================
    input  logic     IN_acp_rdy,
    input  acp_uop_t IN_acp_cmd,
    output acp_uop_t OUT_acp_cmd,
    output logic     OUT_acp_cmd_valid,
    output logic     OUT_acp_cmd_fifo_full,
    input  logic     IN_acp_is_busy,

    // ===| NPU internal channel |================================================
    input  logic     IN_npu_rdy,
    input  npu_uop_t IN_npu_cmd,
    output npu_uop_t OUT_npu_cmd,
    output logic     OUT_npu_cmd_valid,
    output logic     OUT_npu_cmd_fifo_full,
    input  logic     IN_npu_is_busy
);

  localparam int UopWidth = 35;  // 1 + 17 + 17 = write_en + base_addr + end_addr

  logic acp_fifo_empty;
  logic acp_fifo_full;
  logic npu_fifo_empty;
  logic npu_fifo_full;

  assign OUT_acp_cmd_fifo_full = acp_fifo_full;
  assign OUT_npu_cmd_fifo_full = npu_fifo_full;

  // Pop strobes double as the valid flags seen by the consumer.
  always_comb begin
    OUT_acp_cmd_valid = ~IN_acp_is_busy & ~acp_fifo_empty;
    OUT_npu_cmd_valid = ~IN_npu_is_busy & ~npu_fifo_empty;
  end

  // xpm_fifo_sync notes:
  //  - it has a single clock input (wr_clk) and no rd_clk port;
  //  - READ_MODE "std" updates dout one clock after rd_en, which made
  //    OUT_*_cmd lag OUT_*_cmd_valid by a cycle. "fwft" with
  //    FIFO_READ_LATENCY = 0 keeps the head entry on dout while empty is low.

  // ===| ACP FIFO |==============================================================
  xpm_fifo_sync #(
      .FIFO_DEPTH       (128),
      .WRITE_DATA_WIDTH (UopWidth),
      .READ_DATA_WIDTH  (UopWidth),
      .FIFO_MEMORY_TYPE ("block"),
      .READ_MODE        ("fwft"),
      .FIFO_READ_LATENCY(0),
      .FULL_RESET_VALUE (0),
      .PROG_FULL_THRESH (100)
  ) u_acp_uop_fifo (
      .sleep    (1'b0),
      .rst      (~rst_n_core),
      .wr_clk   (clk_core),
      .wr_en    (IN_acp_rdy & ~acp_fifo_full),
      .din      (IN_acp_cmd),
      .prog_full(acp_fifo_full),
      .rd_en    (OUT_acp_cmd_valid),
      .dout     (OUT_acp_cmd),
      .empty    (acp_fifo_empty)
  );

  // ===| NPU FIFO |==============================================================
  xpm_fifo_sync #(
      .FIFO_DEPTH       (128),
      .WRITE_DATA_WIDTH (UopWidth),
      .READ_DATA_WIDTH  (UopWidth),
      .FIFO_MEMORY_TYPE ("block"),
      .READ_MODE        ("fwft"),
      .FIFO_READ_LATENCY(0),
      .FULL_RESET_VALUE (0),
      .PROG_FULL_THRESH (100)
  ) u_npu_uop_fifo (
      .sleep    (1'b0),
      .rst      (~rst_n_core),
      .wr_clk   (clk_core),
      .wr_en    (IN_npu_rdy & ~npu_fifo_full),
      .din      (IN_npu_cmd),
      .prog_full(npu_fifo_full),
      .rd_en    (OUT_npu_cmd_valid),
      .dout     (OUT_npu_cmd),
      .empty    (npu_fifo_empty)
  );

endmodule