NPU Controller Modules¶
RTL source on GitHub
SystemVerilog sources documented on this page:
- hw/rtl/NPU_Controller/npu_controller_top.sv — View on GitHub
- hw/rtl/NPU_Controller/NPU_Control_Unit/ctrl_npu_decoder.sv — View on GitHub
- hw/rtl/NPU_Controller/NPU_Control_Unit/ctrl_npu_dispatcher.sv — View on GitHub
- hw/rtl/NPU_Controller/Global_Scheduler.sv — View on GitHub
1. Controller Top¶
npu_controller_top.sv integrates the AXI-Lite frontend and the instruction
decoder into a single unit; the dispatcher and global scheduler are covered
in the sections below.
`timescale 1ns / 1ps
`include "GLOBAL_CONST.svh"
import isa_pkg::*;
// ===| NPU Controller Top |======================================================
// Structural wrapper: AXI-Lite frontend feeding the opcode decoder.
// Emits one single-cycle valid pulse per instruction class together with the
// raw 60-bit instruction body (opcode already stripped by the decoder).
// ===============================================================================
module npu_controller_top #() (
    input logic clk,
    input logic rst_n,
    input logic i_clear,
    // ===| AXI4-Lite Slave : PS <-> NPU control plane |=========================
    axil_if.slave S_AXIL_CTRL,
    // ===| Decoded Instruction Valids |=========================================
    output logic OUT_GEMV_op_x64_valid,
    output logic OUT_GEMM_op_x64_valid,
    output logic OUT_memcpy_op_x64_valid,
    output logic OUT_memset_op_x64_valid,
    output logic OUT_cvo_op_x64_valid,
    // ===| Raw Instruction Body (60-bit, opcode stripped) |=====================
    output instruction_op_x64_t OUT_op_x64
);
    // ---- internal nets -------------------------------------------------------
    logic [`ISA_WIDTH-1:0] w_instr_word;   // raw 64-bit instruction from frontend
    logic                  w_instr_kick;   // pop/valid pulse qualifying w_instr_word
    logic                  w_fetch_rdy;    // decoder backpressure toward frontend

    // ---- frontend : AXI-Lite CMD/STAT registers ------------------------------
    // Encoder status inputs are tied off (unused in this configuration).
    ctrl_npu_frontend #() u_npu_frontend (
        .clk                 (clk),
        .rst_n               (rst_n),
        .IN_clear            (i_clear),
        .S_AXIL_CTRL         (S_AXIL_CTRL),
        .OUT_RAW_instruction (w_instr_word),
        .OUT_kick            (w_instr_kick),
        .IN_enc_stat         ('0),
        .IN_enc_valid        (1'b0),
        .IN_fetch_ready      (w_fetch_rdy)
    );

    // ---- decoder : opcode -> per-engine valid pulses -------------------------
    ctrl_npu_decoder u_decoder (
        .clk                      (clk),
        .rst_n                    (rst_n),
        .IN_raw_instruction       (w_instr_word),
        .raw_instruction_pop_valid(w_instr_kick),
        .OUT_fetch_PC_ready       (w_fetch_rdy),
        .OUT_GEMV_op_x64_valid    (OUT_GEMV_op_x64_valid),
        .OUT_GEMM_op_x64_valid    (OUT_GEMM_op_x64_valid),
        .OUT_memcpy_op_x64_valid  (OUT_memcpy_op_x64_valid),
        .OUT_memset_op_x64_valid  (OUT_memset_op_x64_valid),
        .OUT_cvo_op_x64_valid     (OUT_cvo_op_x64_valid),
        .OUT_op_x64               (OUT_op_x64)
    );
endmodule
2. Instruction Decoder¶
ctrl_npu_decoder.sv parses the 64-bit VLIW instruction word: strips
the 4-bit opcode and routes the 60-bit body into the appropriate
typed struct (GEMV_op_x64_t, memcpy_op_x64_t, etc.).
`timescale 1ns / 1ps
`include "GLOBAL_CONST.svh"
import isa_pkg::*;
// ===| NPU Opcode Decoder |======================================================
// Receives raw 64-bit VLIW instructions from the frontend FIFO.
// Strips the 4-bit opcode (top bits), asserts the matching valid pulse for
// exactly one cycle, and forwards the 60-bit body to the Global Scheduler.
//
// Fix: OUT_fetch_PC_ready was driven both from the reset branch of the
// always_ff block and by a continuous assign at the bottom of the module — a
// multiple-driver conflict on a variable, which is illegal in SystemVerilog
// (IEEE 1800-2017 §10.3.2). The signal is constant 1'b1 in both drivers, so
// the procedural drive is removed and the continuous assign is the sole driver.
// ===============================================================================
module ctrl_npu_decoder (
    input logic clk,
    input logic rst_n,
    // ===| From Frontend |=======================================================
    input logic [`ISA_WIDTH-1:0] IN_raw_instruction,
    input logic raw_instruction_pop_valid,
    // ===| Flow Control |========================================================
    output logic OUT_fetch_PC_ready,
    // ===| Decoded Valid Pulses (one-hot, one cycle) |===========================
    output logic OUT_GEMV_op_x64_valid,
    output logic OUT_GEMM_op_x64_valid,
    output logic OUT_memcpy_op_x64_valid,
    output logic OUT_memset_op_x64_valid,
    output logic OUT_cvo_op_x64_valid,
    // ===| Instruction Body (60-bit, opcode stripped) |=========================
    output instruction_op_x64_t OUT_op_x64
);
    // ===| Internal |==============================================================
    // 4-bit one-hot register for the first four opcodes.
    logic [3:0] OUT_valid;
    assign OUT_GEMV_op_x64_valid   = OUT_valid[0];
    assign OUT_GEMM_op_x64_valid   = OUT_valid[1];
    assign OUT_memcpy_op_x64_valid = OUT_valid[2];
    assign OUT_memset_op_x64_valid = OUT_valid[3];
    // CVO valid uses a separate FF (5th opcode does not fit the 4-bit one-hot).
    logic cvo_valid_ff;
    assign OUT_cvo_op_x64_valid = cvo_valid_ff;

    // ===| Opcode Decoder |========================================================
    // Top `ISA_OPCODE_WIDTH bits are the opcode; bottom `ISA_BODY_WIDTH bits are
    // the instruction body. Valids default to 0 every cycle so each decoded
    // instruction produces a single-cycle pulse. Reset is synchronous, active-low.
    always_ff @(posedge clk) begin
        if (!rst_n) begin
            OUT_valid    <= 4'b0000;
            cvo_valid_ff <= 1'b0;
            OUT_op_x64   <= '0;
        end else begin
            OUT_valid    <= 4'b0000;  // default: deassert (pulse semantics)
            cvo_valid_ff <= 1'b0;
            if (raw_instruction_pop_valid) begin
                // Body: opcode bits are stripped simply by slicing the low bits.
                OUT_op_x64.instruction <= IN_raw_instruction[`ISA_BODY_WIDTH-1:0];
                case (IN_raw_instruction[`ISA_WIDTH-1:`ISA_WIDTH-`ISA_OPCODE_WIDTH])
                    OP_GEMV:   OUT_valid <= 4'b0001;
                    OP_GEMM:   OUT_valid <= 4'b0010;
                    OP_MEMCPY: OUT_valid <= 4'b0100;
                    OP_MEMSET: OUT_valid <= 4'b1000;
                    OP_CVO:    cvo_valid_ff <= 1'b1;
                    default: ; // unknown opcode: drop silently
                endcase
            end
        end
    end

    // ===| Backpressure |==========================================================
    // Always ready — the frontend FIFO provides buffering; decode is single-cycle.
    // Sole driver of OUT_fetch_PC_ready (see fix note in the module header).
    assign OUT_fetch_PC_ready = 1'b1;
endmodule
3. Instruction Dispatcher¶
ctrl_npu_dispatcher.sv resolves Constant Cache pointer lookups
(shape / size / scale), checks for address and resource hazards, and
issues per-core control μops to GEMM, GEMV, CVO, and mem_dispatcher.
The RTL below is currently fully commented out (work in progress) and is
reproduced for reference only.
// `timescale 1ns / 1ps
// `include "GEMM_Array.svh"
// `include "npu_interfaces.svh"
// `include "GLOBAL_CONST.svh"
// import isa_pkg::*;
// module cu_npu_dispatcher (
// input logic clk,
// input logic rst_n,
// input instruction_t IN_inst,
// input logic IN_valid,
// output logic o_valid,
// // GEMV / GEMM controls
// output logic [3:0] OUT_activate_top,
// output logic [3:0] OUT_activate_lane,
// output logic OUT_result_emax_align,
// output logic OUT_result_accm,
// output logic OUT_result_scale,
// // memcpy
// output memory_uop_t OUT_memcpy_cmd,
// // if INT group size?
// );
// /*─────────────────────────────────────────────
// Lane activation bitmask
// bit[0]=lane1, bit[1]=lane2 ...
// ─────────────────────────────────────────────*/
// localparam logic [3:0] LANE_1 = 4'b0001;
// localparam logic [3:0] LANE_2 = 4'b0010;
// localparam logic [3:0] LANE_3 = 4'b0100;
// localparam logic [3:0] LANE_4 = 4'b1000;
// always_ff @(posedge clk) begin
// if (!rst_n) begin
// o_valid <= 1'b0;
// OUT_activate_lane <= '0;
// OUT_result_emax_align <= 1'b0;
// OUT_result_accm <= 1'b0;
// OUT_result_scale <= 1'b0;
// OUT_memcpy_destination_queue <= '0;
// for (int i = 0; i < `MAX_MATRIX_DIM; i++) OUT_memcpy_matrix_shape[i] <= '0;
// end else begin
// o_valid <= 1'b0; // default : deassert every cycle
// if (IN_valid) begin
// case (IN_inst.opcode)
// OP_GEMV: begin
// o_valid <= 1'b1;
// if (IN_inst.cmd_chaining) begin
// // TODO: chaining logic
// end
// if (IN_inst.override) begin
// // TODO: override logic
// end
// // lane activation (OR mask, cumulative)
// case (IN_inst.payload.dotm.lane_idq)
// 2'b00: OUT_activate_lane <= LANE_1;
// 2'b01: OUT_activate_lane <= LANE_1 | LANE_2;
// 2'b10: OUT_activate_lane <= LANE_1 | LANE_2 | LANE_3;
// 2'b11: OUT_activate_lane <= LANE_1 | LANE_2 | LANE_3 | LANE_4;
// default: begin
// o_valid <= 1'b0; // unknown → drop + TODO: interrupt
// end
// endcase
// OUT_result_emax_align <= IN_inst.payload.dotm.find_emax_align;
// OUT_result_accm <= IN_inst.payload.dotm.OUT_result_accm;
// // activate when added to ISA
// // OUT_result_scale <= IN_inst.payload.dotm.OUT_result_scale;
// OUT_activate_top[`TOP_GEMV] <= `TRUE;
// end
// OP_GEMM: begin
// if (IN_inst.override) begin
// if (IN_inst.cmd_chaining) begin
// // TODO
// end else begin
// // TODO
// end
// end else begin
// if (IN_inst.cmd_chaining) begin
// // TODO
// end else begin
// o_valid <= 1'b1;
// OUT_result_emax_align <= IN_inst.payload.dotm.align;
// OUT_result_accm <= IN_inst.payload.dotm.OUT_result_accm;
// OUT_activate_top[`TOP_GEMV] <= `TRUE;
// end
// end
// end
// OP_MEMCPY: begin
// if (IN_inst.override) begin
// if (IN_inst.cmd_chaining) begin
// // accumulate matrix shape across chained instructions
// OUT_memcpy_matrix_shape[IN_inst.payload.memcpy.dim_xyz]
// <= IN_inst.payload.memcpy.dim_x;
// end else begin
// // chaining end → dispatch memcpy
// o_valid <= 1'b1;
// OUT_memcpy_destination_queue <= IN_inst.payload.memcpy.dest_queue;
// case (IN_inst.payload.memcpy.dest_queue[3:2])
// `MASKING_WEIGHT: begin
// // TODO: → weight buffer
// end
// `MASKING_OUT_scale: begin
// // TODO: ACP → OUT_result_scale cache
// end
// `MASKING_FMAP: begin
// // TODO: ACP → find emax & align → cache
// end
// default: o_valid <= 1'b0; // undefined
// endcase
// end
// end else begin
// // non-override memcpy
// // TODO
// end
// // Determine logic based on datatype and mask IN_inst.payload.memcpy.option_flags using bitwise AND (&)
// if (IN_inst.payload.memcpy.datatype == `BF16) begin
// // Example: BF16 processing mode
// // Check if the 4th bit (ALIGN) is set to 1
// if ((IN_inst.payload.memcpy.option_flags & `MEMCPY_FLAG_BF16_ALIGN) != 4'b0000) begin
// OUT_align <= `TRUE;
// // Determine the alignment direction
// if ((IN_inst.payload.memcpy.option_flags & `MEMCPY_FLAG_BF16_ALIGN_V) != 4'b0000) begin
// OUT_align_dir <= `ALIGN_VERTICAL;
// end else if ((IN_inst.payload.memcpy.option_flags & `MEMCPY_FLAG_BF16_ALIGN_H) != 4'b0000) begin
// OUT_align_dir <= `ALIGN_HORIZONTAL;
// end else begin
// // Default direction if neither V nor H is specified
// end
// end else begin
// // If ALIGN flag is missing
// OUT_align <= `FALSE;
// end
// end else begin
// // Example: INT processing mode
// if ((IN_inst.payload.memcpy.option_flags & `MEMCPY_OPT_INT_IS_SCALED) != 4'b0000) begin
// // Logic for scaled INT
// OUT_align <= `TRUE; // (Adjust according to your actual spec)
// end else begin
// OUT_align <= `FALSE;
// end
// end
// OUT_datatype <= IN_inst.payload.memcpy.datatype;
// end
// default: o_valid <= 1'b0; // unknown opcode → drop
// endcase
// end
// end
// end
// endmodule
4. Global Scheduler¶
Global_Scheduler.sv tracks in-flight async instructions, maintains
the dependency scoreboard, and gates new dispatches when a hazard is
detected.
`timescale 1ns / 1ps
`include "GEMM_Array.svh"
`include "GLOBAL_CONST.svh"
import isa_pkg::*;
// ===| Global Scheduler |========================================================
// Translates decoded VLIW instructions into engine micro-ops.
//
// Single always_ff drives each output to avoid multiple-driver conflicts.
// Priority for OUT_LOAD_uop: GEMM > GEMV > MEMCPY > CVO (one active per cycle).
//
// OUT_STORE_uop : registered at issue time; mem_dispatcher uses it to initiate
// result writeback after the engine signals completion.
// OUT_sram_rd_start : one-cycle pulse when a GEMM or GEMV load is dispatched,
// triggering preprocess_fmap to begin broadcasting from cache.
//
// Reset style: synchronous, active-low (rst_n_core sampled inside the
// posedge clk_core blocks) — consistent with the rest of the controller.
// ===============================================================================
module Global_Scheduler #() (
input logic clk_core,
input logic rst_n_core,
// ===| From ctrl_npu_decoder |===============================================
// One single-cycle valid pulse per instruction class; `instruction` carries
// the 60-bit opcode-stripped body and is only meaningful while a pulse is high.
input logic IN_GEMV_op_x64_valid,
input logic IN_GEMM_op_x64_valid,
input logic IN_memcpy_op_x64_valid,
input logic IN_memset_op_x64_valid,
input logic IN_cvo_op_x64_valid,
input instruction_op_x64_t instruction,
// ===| Engine micro-ops |====================================================
output gemm_control_uop_t OUT_GEMM_uop,
output GEMV_control_uop_t OUT_GEMV_uop,
output memory_control_uop_t OUT_LOAD_uop,
output memory_control_uop_t OUT_STORE_uop,
output memory_set_uop_t OUT_mem_set_uop,
output cvo_control_uop_t OUT_CVO_uop,
// ===| Datapath control |====================================================
output logic OUT_sram_rd_start // pulse: start fmap cache broadcast
);
// ===| Combinational instruction body casts |==================================
// All five views alias the same 60-bit body; only the view matching the
// asserted valid pulse carries meaningful fields. Field layouts are defined
// in isa_pkg (not visible in this file).
GEMV_op_x64_t GEMV_op_x64;
GEMM_op_x64_t GEMM_op_x64;
memcpy_op_x64_t memcpy_op_x64;
memset_op_x64_t memset_op_x64;
cvo_op_x64_t cvo_op_x64;
always_comb begin
GEMV_op_x64 = GEMV_op_x64_t'(instruction.instruction);
GEMM_op_x64 = GEMM_op_x64_t'(instruction.instruction);
memcpy_op_x64 = memcpy_op_x64_t'(instruction.instruction);
memset_op_x64 = memset_op_x64_t'(instruction.instruction);
cvo_op_x64 = cvo_op_x64_t'(instruction.instruction);
end
// ===| MEMSET uop |============================================================
// Latched on the decoder pulse and then held (not auto-cleared) until the next
// memset instruction overwrites it. NOTE(review): no separate valid strobe is
// exported here — presumably the consumer observes the decoder pulse directly;
// confirm against mem_dispatcher.
always_ff @(posedge clk_core) begin
if (!rst_n_core) begin
OUT_mem_set_uop <= '0;
end else if (IN_memset_op_x64_valid) begin
OUT_mem_set_uop <= '{
dest_cache : dest_cache_e'(memset_op_x64.dest_cache),
dest_addr : memset_op_x64.dest_addr,
a_value : memset_op_x64.a_value,
b_value : memset_op_x64.b_value,
c_value : memset_op_x64.c_value
};
end
end
// ===| MEMCPY route translation ===============================================
// from_device/to_device (1-bit each) → data_route_e (8-bit enum)
// Only host→NPU maps to from_host_to_L2; every other from/to combination
// falls through to from_L2_to_host (the else branch).
data_route_e memcpy_route;
always_comb begin
if (memcpy_op_x64.from_device == FROM_HOST && memcpy_op_x64.to_device == TO_NPU)
memcpy_route = from_host_to_L2;
else
memcpy_route = from_L2_to_host;
end
// ===| LOAD uop — single driver (priority: GEMM > GEMV > MEMCPY > CVO) |======
// GEMM/GEMV loads also fire the one-cycle OUT_sram_rd_start pulse; MEMCPY and
// CVO loads do not. GEMM/GEMV issue synchronously (SYNC_OP); MEMCPY/CVO carry
// the async flag from the instruction body.
always_ff @(posedge clk_core) begin
if (!rst_n_core) begin
OUT_LOAD_uop <= '0;
OUT_sram_rd_start <= 1'b0;
end else begin
OUT_sram_rd_start <= 1'b0; // default: no pulse
if (IN_GEMM_op_x64_valid) begin
OUT_LOAD_uop <= '{
data_dest : from_L2_to_L1_GEMM,
dest_addr : '0,
src_addr : GEMM_op_x64.src_addr,
shape_ptr_addr : GEMM_op_x64.shape_ptr_addr,
async : SYNC_OP
};
OUT_sram_rd_start <= 1'b1;
end else if (IN_GEMV_op_x64_valid) begin
OUT_LOAD_uop <= '{
data_dest : from_L2_to_L1_GEMV,
dest_addr : '0,
src_addr : GEMV_op_x64.src_addr,
shape_ptr_addr : GEMV_op_x64.shape_ptr_addr,
async : SYNC_OP
};
OUT_sram_rd_start <= 1'b1;
end else if (IN_memcpy_op_x64_valid) begin
OUT_LOAD_uop <= '{
data_dest : memcpy_route,
dest_addr : memcpy_op_x64.dest_addr,
src_addr : memcpy_op_x64.src_addr,
shape_ptr_addr : memcpy_op_x64.shape_ptr_addr,
async : memcpy_op_x64.async
};
end else if (IN_cvo_op_x64_valid) begin
OUT_LOAD_uop <= '{
data_dest : from_L2_to_CVO,
dest_addr : '0,
src_addr : cvo_op_x64.src_addr,
shape_ptr_addr : '0,
async : cvo_op_x64.async
};
end
end
end
// ===| STORE uop — latched at issue time |=====================================
// Held until the engine signals completion (external handshake, not shown here).
// MEMCPY and MEMSET generate no store uop in this block — presumably their
// writeback is handled by the load path / mem_dispatcher; confirm upstream.
always_ff @(posedge clk_core) begin
if (!rst_n_core) begin
OUT_STORE_uop <= '0;
end else if (IN_GEMM_op_x64_valid) begin
OUT_STORE_uop <= '{
data_dest : from_GEMM_res_to_L2,
dest_addr : GEMM_op_x64.dest_reg,
src_addr : '0,
shape_ptr_addr : GEMM_op_x64.shape_ptr_addr,
async : SYNC_OP
};
end else if (IN_GEMV_op_x64_valid) begin
OUT_STORE_uop <= '{
data_dest : from_GEMV_res_to_L2,
dest_addr : GEMV_op_x64.dest_reg,
src_addr : '0,
shape_ptr_addr : GEMV_op_x64.shape_ptr_addr,
async : SYNC_OP
};
end else if (IN_cvo_op_x64_valid) begin
OUT_STORE_uop <= '{
data_dest : from_CVO_res_to_L2,
dest_addr : cvo_op_x64.dst_addr,
src_addr : '0,
shape_ptr_addr : '0,
async : cvo_op_x64.async
};
end
end
// ===| GEMM uop |==============================================================
// Latched on the GEMM pulse; held until the next GEMM instruction.
always_ff @(posedge clk_core) begin
if (!rst_n_core) begin
OUT_GEMM_uop <= '0;
end else if (IN_GEMM_op_x64_valid) begin
OUT_GEMM_uop <= '{
flags : GEMM_op_x64.flags,
size_ptr_addr : GEMM_op_x64.size_ptr_addr,
parallel_lane : GEMM_op_x64.parallel_lane
};
end
end
// ===| GEMV uop |==============================================================
// Latched on the GEMV pulse; held until the next GEMV instruction.
always_ff @(posedge clk_core) begin
if (!rst_n_core) begin
OUT_GEMV_uop <= '0;
end else if (IN_GEMV_op_x64_valid) begin
OUT_GEMV_uop <= '{
flags : GEMV_op_x64.flags,
size_ptr_addr : GEMV_op_x64.size_ptr_addr,
parallel_lane : GEMV_op_x64.parallel_lane
};
end
end
// ===| CVO uop |===============================================================
// Latched on the CVO pulse; enum/flag fields are explicitly cast from the raw
// body bits to their typed representations.
always_ff @(posedge clk_core) begin
if (!rst_n_core) begin
OUT_CVO_uop <= '0;
end else if (IN_cvo_op_x64_valid) begin
OUT_CVO_uop <= '{
cvo_func : cvo_func_e'(cvo_op_x64.cvo_func),
src_addr : cvo_op_x64.src_addr,
dst_addr : cvo_op_x64.dst_addr,
length : cvo_op_x64.length,
flags : cvo_flags_t'(cvo_op_x64.flags),
async : cvo_op_x64.async
};
end
end
endmodule
See also
Per-Instruction Dataflow — dependency and completion tracking.