패키지와 상수¶
컴파일 우선순위로 정렬된 SystemVerilog 패키지와 .svh 헤더들.
전체 RTL 이 사용하는 글로벌 타입 시스템, 디바이스 상수, 파이프라인
설정, ISA 레이아웃, SystemVerilog interface 객체를 정의합니다.
더 보기
- pccx ISA 사양 — 아래 isa_pkg 를 뒷받침하는 ISA 사양.
Tier A — 원시 상수 헤더 (.svh)¶
Constants/compilePriority_Order/A_const_svh/ — 모든 하위 패키지가
소비하는 기본 define 모음.
- GLOBAL_CONST.svh — 교차 공통 parameter.
- NUMBERS.svh — 숫자 포맷 매개변수화.
- DEVICE_INFO.svh — 디바이스 패밀리 플래그 추상화.
- kv260_device.svh — KV260 전용 리소스 카운트 (DSP/BRAM/URAM).
- npu_arch.svh — NPU 아키텍처 상위 knob (레인 수, 시스톨릭 치수, FIFO 깊이).
GLOBAL_CONST.svh
// ===| GLOBAL_CONST.svh — DEPRECATED compatibility shim |========================
// New code should include npu_arch.svh and/or kv260_device.svh directly.
// This header only survives so that sources which still contain
// `include "GLOBAL_CONST.svh" keep compiling during the migration period.
// Every macro below is a pure alias: it forwards to a macro defined by one of
// the three headers included here (or is a plain literal).
// Do NOT add new constants here — add them to npu_arch.svh or kv260_device.svh.
// ===============================================================================
`ifndef GLOBAL_CONST_SVH
`define GLOBAL_CONST_SVH

`include "NUMBERS.svh"
`include "kv260_device.svh"
`include "npu_arch.svh"

// ---| Boolean literals |---------------------------------------------------------
`define FALSE 1'b0
`define TRUE  1'b1

// ---| HP weight-bus aliases (port declarations of MAT_CORE) |--------------------
`define HP_PORT_CNT          `DEVICE_HP_PORT_CNT
`define HP_PORT_SINGLE_WIDTH `HP_SINGLE_WIDTH
`define HP_PORT_MAX_WIDTH    `HP_TOTAL_WIDTH

// ---| DSP48E2 port-size aliases (GEMM_dsp_unit port declarations) |--------------
`define DSP48E2_A_WIDTH   `DEVICE_DSP_A_WIDTH
`define DSP48E2_B_WIDTH   `DEVICE_DSP_B_WIDTH
`define DSP48E2_POUT_SIZE `DSP_P_OUT_WIDTH
`define PREG_SIZE         `DSP_P_OUT_WIDTH

// ---| MAC unit input widths (GEMM_systolic parameter defaults) |-----------------
//   H : INT4 weight, 4 bits, fed to the DSP B-port
//   V : fixed-point mantissa (FIXED_MANT_WIDTH bits), fed to the DSP A-port
`define GEMM_MAC_UNIT_IN_H 4
`define GEMM_MAC_UNIT_IN_V `FIXED_MANT_WIDTH

`endif // GLOBAL_CONST_SVH
NUMBERS.svh
`ifndef NUMBERS_SVH
`define NUMBERS_SVH

// ===| Primitive Type Widths |===================================================
// Bit widths of the three numeric formats, as plain integers (no units attached,
// no semantics implied). device_pkg.sv reads these to pick algorithm-level types.
// ===============================================================================

// FP32 accumulation width (bits)
`define N_FP32_SIZE 32

// BF16 activation width (bits)
`define N_BF16_SIZE 16

// INT4 weight width (bits)
`define N_SIZEOF_INT4 4

`endif // NUMBERS_SVH
DEVICE_INFO.svh
// ===| DEVICE_INFO.svh — DEPRECATED compatibility shim |=========================
// Superseded by kv260_device.svh, which this header simply pulls in.
// Only the two legacy macro names below are provided on top of it.
// Do NOT add new constants here.
// ===============================================================================
`ifndef DEVICE_INFO_SVH
`define DEVICE_INFO_SVH

`include "kv260_device.svh"

// Legacy names forwarded to their kv260_device.svh equivalents.
// The macro bodies deliberately carry no trailing semicolon: an earlier version
// had one, and that was a bug.
`define DEVICE_HP_CNT                    `DEVICE_HP_PORT_CNT
`define DEVICE_HP_SINGLE_LANE_MAX_IN_BIT `DEVICE_HP_SINGLE_WIDTH_BIT

`endif // DEVICE_INFO_SVH
kv260_device.svh
// ===| KV260 Device-Specific Hardware Parameters |==============================
// Physical hardware constants for the Xilinx Kria KV260 — nothing else belongs
// in this file. Porting to another board means replacing this file only.
//
//   Board : Xilinx Kria KV260 (Zynq UltraScale+ MPSoC)
//   Target: Bare-metal, 400MHz core clock
// ===============================================================================
`ifndef KV260_DEVICE_SVH
`define KV260_DEVICE_SVH

// ===| AXI HP Ports |============================================================
// 4 HP ports on the KV260, 128 bits each.
`define DEVICE_HP_SINGLE_WIDTH_BIT 128
`define DEVICE_HP_PORT_CNT         4

// ===| AXI HPC / ACP Ports |=====================================================
// ACP (Accelerator Coherency Port): FMap in / Result out, 128 bits.
`define DEVICE_ACP_WIDTH_BIT 128

// ===| DSP48E2 Resource |=========================================================
// Xilinx UltraScale+ DSP48E2 port widths (all signed):
//   A : 30 bits — carries the 27-bit BF16 fixed-point mantissa
//   B : 18 bits — carries the packed 4-bit INT4 weight
//   P : 48 bits — accumulator output
`define DEVICE_DSP_P_WIDTH 48
`define DEVICE_DSP_B_WIDTH 18
`define DEVICE_DSP_A_WIDTH 30

// ===| XPM FIFO Macros |=========================================================
// Default depths for Xilinx Parameterized Macro (XPM) FIFOs, used as elastic
// buffers on the AXI stream ports to absorb jitter at 400MHz.
`define DEVICE_XPM_FIFO_DEPTH_TINY 16
`define DEVICE_XPM_FIFO_DEPTH      512

// ===| BRAM / URAM |=============================================================
// Capacity figures. They are design-level choices, but bounded by what the
// KV260 provides: 144 BRAMs (36Kb each) and 64 URAMs (288Kb each).
// Despite the "_WIDTH" suffix, both macros hold the capacity of ONE primitive
// in bits (36 * 1024 and 288 * 1024), not a data-bus width.
`define DEVICE_URAM_WIDTH 294912 // bits per URAM
`define DEVICE_BRAM_WIDTH 36864 // bits per BRAM36

`endif // KV260_DEVICE_SVH
npu_arch.svh
// ===| NPU Architecture Macros |=================================================
// NPU-level architectural constants that must be `define (used in port
// declarations and generate ranges, where localparams cannot be used).
//
// These are design choices — they change when the NPU architecture changes,
// not when the board changes. Board-specific values live in kv260_device.svh.
//
// This header includes nothing: every value is a literal. Where a trailing
// comment names another macro (e.g. "= DEVICE_DSP_P_WIDTH"), the two values
// have to be kept in sync by hand.
// ===============================================================================
`ifndef NPU_ARCH_SVH
`define NPU_ARCH_SVH

// ===| ISA |=====================================================================
`define ISA_WIDTH 64 // VLIW instruction word width (bits)
`define ISA_OPCODE_WIDTH 4 // Top 4 bits of every instruction
`define ISA_BODY_WIDTH 60 // Instruction body after opcode is stripped (64 - 4)

// ISA compilation mode selectors (used by ctrl_npu_decoder and test benches)
`define MOD_X64 1 // 64-bit VLIW mode (active)
`define MOD_X32 0 // 32-bit mode (legacy, unused)
`define U_OPERATION_WIDTH 59 // Usable body bits (ISA_BODY_WIDTH - 1 header)
`define INST_HEAD_ARCH_MOD_BIT 1 // Architecture mode selector bit position

// ===| Systolic Array (Matrix Core) |============================================
`define ARRAY_SIZE_H 32 // Horizontal: number of PE columns
`define ARRAY_SIZE_V 32 // Vertical : number of PE rows
// Total pipeline latency (H + V + overhead), used for e_max delay pipe
// NOTE(review): 64 equals ARRAY_SIZE_H + ARRAY_SIZE_V exactly, i.e. the
// "overhead" term is currently 0 — confirm that is intended.
`define SYSTOLIC_TOTAL_LATENCY 64

// ===| Data Type Widths (used in port declarations) |===========================
// These must remain `define because they appear in port width expressions.
// Semantic definitions live in dtype_pkg.
// BF16
`define BF16_WIDTH 16
`define BF16_EXP_WIDTH 8
`define BF16_MANT_WIDTH 7
// INT4 (Weight)
`define INT4_WIDTH 4
// Fixed-point mantissa width after BF16 emax alignment
// = BF16_MANT_WIDTH(7) + leading-1 + sign + 18-bit integer headroom = 27
`define FIXED_MANT_WIDTH 27
// FP32 (used for mixed-precision output path)
`define FP32_WIDTH 32
// DSP48E2 accumulator output used as port width
`define DSP_P_OUT_WIDTH 48 // = DEVICE_DSP_P_WIDTH (kv260_device.svh)

// ===| AXI Stream Port Widths |=================================================
`define AXI_STREAM_WIDTH 128 // Standard AXI-S data width used throughout
// HP port aggregated width (all 4 lanes combined for GEMM weight bus)
`define HP_TOTAL_WIDTH 512 // = DEVICE_HP_PORT_CNT * DEVICE_HP_SINGLE_WIDTH_BIT
`define HP_SINGLE_WIDTH 128 // = DEVICE_HP_SINGLE_WIDTH_BIT

// ===| Memory / Cache Sizes |===================================================
// FMap L1 cache (SRAM) capacity — depth in units of AXI_STREAM_WIDTH words
`define FMAP_CACHE_DEPTH 2048
`define FMAP_ADDR_WIDTH 11 // log2(FMAP_CACHE_DEPTH)

// ===| DSP48E2 Instruction Modes |==============================================
// Control codes sent along with fmap in the systolic array V-path
`define DSP_IDLE_MOD 2'b00
`define DSP_SYSTOLIC_MOD_P 2'b01 // Normal systolic MAC
`define DSP_GEMV_STATIONARY_MOD 2'b10 // Weight-stationary GEMV
`define DSP_SHIFT_RESULT_MOD 2'b11 // Shift output (for normalization)

// ===| Pipeline Type Selector |=================================================
`define PIPELINE_GEMV 0
`define PIPELINE_GEMM 1

// ===| INT4 Range Helpers |=====================================================
// INT4_MIN_VAL is parenthesised on purpose. Macro expansion is textual, so a
// bare -8 placed directly after a '-' (x-`INT4_MIN_VAL) would produce "x--8",
// which lexes as the decrement operator instead of "x - (-8)".
`define INT4_MAX_VAL 7
`define INT4_MIN_VAL (-8)
`define INT4_RANGE 16 // INT4_MAX_VAL - INT4_MIN_VAL + 1

`endif // NPU_ARCH_SVH
Tier B — 디바이스 패키지¶
- device_pkg.sv — Tier A define 을 `import device_pkg::*;` 로 쓸 수 있는 타입 뷰로 감쌈.
device_pkg.sv
// ===| Device Configuration Package |==========================================
// Algorithm-level choices for this target design: which numeric precision each
// data stream uses and how many compute pipelines are instantiated.
//
// Compilation order: B — depends on A_const_svh. The N_* width macros come from
// NUMBERS.svh; this file does not include it, so that header must already have
// been processed in the same compilation unit.
// Naming convention: localparam uses PascalCase (linter: parameter-name-style).
// ===============================================================================
package device_pkg;

  // ===| Pipeline Instance Counts |==============================================
  localparam int MatPipelineCnt = 1;  // 1 x Matrix Core (32x32 systolic)
  localparam int VecPipelineCnt = 4;  // 4 x muV-Core (Vector Core)

  // ===| Legacy aliases — keep until all RTL refs updated |======================
  // Older Gemv/Gemm names that forward to the Vec/Mat counts above.
  localparam int GemmPipelineCnt = MatPipelineCnt;
  localparam int GemvPipelineCnt = VecPipelineCnt;

  // ===| Weight Type |===========================================================
  // INT4: 4-bit quantized weight, streamed from HP ports
  localparam int WeightType = `N_SIZEOF_INT4;

  // ===| Feature Map (Activation) Type |=========================================
  //   FmapType               : port-level precision    — BF16 (16-bit)
  //   FmapTypeMixedPrecision : internal accumulation   — FP32 (32-bit)
  localparam int FmapType               = `N_BF16_SIZE;
  localparam int FmapTypeMixedPrecision = `N_FP32_SIZE;

endpackage : device_pkg
Tier C — 타입 패키지¶
- dtype_pkg.sv — 스칼라 데이터 타입 (BF16, INT4/INT8, flags) typedef.
- mem_pkg.sv — 메모리 인터페이스 타입 (주소, 포인터 enum).
dtype_pkg.sv
// ===| Data Type Package |=======================================================
// Bit-width and range constants for every numeric format used in uXC.
// Merges and replaces: float_pkg.sv, float_emax_align_pkg.sv
//
// Compilation order: C — depends on A_const_svh, B_device_pkg.
// (Every value below is a literal; no macro or other package is referenced
// inside this package body.)
// Naming convention: localparam uses PascalCase.
// ===============================================================================
package dtype_pkg;

  // ===| Integer formats |=======================================================
  // INT4 — weight
  localparam int Int4Width = 4;
  localparam int Int4Min   = -8;
  localparam int Int4Max   = 7;
  localparam int Int4Range = 16;   // number of representable codes
  // INT8 — activation on the W4A8 path
  localparam int Int8Width = 8;

  // ===| Floating-point formats |================================================
  // BF16 (Brain Float 16): 1 sign + 8 exponent + 7 stored mantissa bits
  localparam int Bf16Width     = 16;
  localparam int Bf16ExpWidth  = 8;
  localparam int Bf16MantWidth = 7;
  // FP32
  localparam int Fp32Width = 32;

  // ===| Fixed-Point Mantissa (post-emax-alignment) |============================
  // Width of a BF16 value once it has been aligned to e_max:
  //   1 (sign) + 1 (implicit leading-1) + 7 (mantissa) + 18 (integer headroom) = 27
  // That leaves 3 spare bits in the 30-bit signed DSP48E2 A-port.
  localparam int FixedMantWidth = 27;

  // ===| DSP48E2 Accumulator |===================================================
  // P-register output: 48-bit signed integer
  localparam int DspPWidth = 48;

endpackage : dtype_pkg
mem_pkg.sv
// ===| Memory Architecture Package |============================================
// Derived memory parameters for the uXC NPU.
// Nothing here is a bare literal: each value is either a macro read from the
// Tier A headers (DEVICE_* , ARRAY_SIZE_H, FMAP_*), a device_pkg constant, or
// arithmetic on those.
//
// Compilation order: C — depends on A_const_svh, B_device_pkg.
// Naming convention: localparam uses PascalCase.
// ===============================================================================
package mem_pkg;

  // ===| XPM FIFO Depths |=======================================================
  localparam int XpmFifoDepthTiny = `DEVICE_XPM_FIFO_DEPTH_TINY;  // 16
  localparam int XpmFifoDepth     = `DEVICE_XPM_FIFO_DEPTH;       // 512

  // ===| FMap L1 SRAM Cache |====================================================
  localparam int FmapAddrWidth  = `FMAP_ADDR_WIDTH;   // 11
  localparam int FmapCacheDepth = `FMAP_CACHE_DEPTH;  // 2048

  // ===| L2 Cache / FMap Cache Output Width |====================================
  // Fixed-point mantissa values broadcast to the compute array each cycle:
  // one per PE column, i.e. ARRAY_SIZE_H.
  localparam int FmapL2CacheOutCnt = `ARRAY_SIZE_H;  // 32

  // ===| HP Port Configuration |=================================================
  // HP ports deliver weights to Vector Core (HP0/1/2) and Matrix Core (HP3).
  localparam int HpSingleWidthBit = `DEVICE_HP_SINGLE_WIDTH_BIT;  // 128
  localparam int HpPortCnt        = `DEVICE_HP_PORT_CNT;          // 4
  // Aggregated weight bus: every HP lane side by side.
  localparam int HpTotalWidthBit  = HpPortCnt * HpSingleWidthBit; // 512

  // ===| Weight Count per HP Port Burst |========================================
  // Number of INT4 weights that fit in one beat of a single port / of all ports.
  localparam int WeightBitWidth    = device_pkg::WeightType;             // 4
  localparam int HpTotalWeightCnt  = HpTotalWidthBit / WeightBitWidth;   // 128
  localparam int HpSingleWeightCnt = HpSingleWidthBit / WeightBitWidth;  // 32

endpackage : mem_pkg
Tier D — 파이프라인 패키지¶
- vec_core_pkg.sv — GEMV 파이프라인 stage count 와 struct.
vec_core_pkg.sv
// ===| Vector Core (muV-Core) Configuration Package |===========================
// Configuration struct plus default parameter set for the Vector Core:
// 4 parallel muV-Cores, each doing GEMV (vector x matrix) with INT4 weights and
// BF16 feature maps.
//
// Replaces: GEMV_const_pkg.sv
// Compilation order: D — depends on A_, B_device_pkg, C_type_pkg.
// Naming convention: localparam uses PascalCase; struct fields use snake_case.
// ===============================================================================
package vec_core_pkg;

  // ===| Throughput / Batch Constants |==========================================
  // GEMV processes one fmap row (2048-dim) against one weight matrix column.
  // Batch size = number of weight rows consumed per invocation.
  localparam int GemvLineCnt = mem_pkg::FmapL2CacheOutCnt;  // = ARRAY_SIZE_H = 32
  localparam int GemvCycle   = 512;  // clock cycles per GEMV call
  localparam int GemvBatch   = 512;  // weight rows processed per call
  localparam int Throughput  = 1;    // output elements per cycle per lane

  // ===| Vector Core Configuration Struct |======================================
  // Handed as a parameter to GEMV_top and its sub-modules so instantiations are
  // self-documenting instead of a list of raw integers.
  // Packed struct of nine `int` fields; field order defines the bit layout.
  typedef struct packed {
    // -- pipeline topology --
    int num_gemv_pipeline;          // parallel muV-Core lanes (= VecPipelineCnt)
    // -- throughput --
    int throughput;                 // output elements per cycle
    int gemv_batch;                 // weight rows per call
    int gemv_cycle;                 // clocks per call
    // -- data widths --
    int fixed_mant_width;           // fixed-point mantissa width after emax alignment
    int weight_width;               // INT4 = 4
    int weight_cnt;                 // weights per HP port per clock (= HpSingleWeightCnt)
    // -- cache geometry --
    int fmap_cache_out_cnt;         // FMap values broadcast per cycle (= ARRAY_SIZE_H)
    int fmap_type_mixed_precision;  // output precision (FP32 = 32)
  } vec_cfg_t;

  // ===| Default Configuration (KV260 / Gemma 3N E4B target) |===================
  // Every field is filled from this package or from device_pkg / dtype_pkg /
  // mem_pkg, keyed by field name.
  localparam vec_cfg_t VecCoreDefaultCfg = '{
    num_gemv_pipeline         : device_pkg::VecPipelineCnt,
    throughput                : Throughput,
    gemv_batch                : GemvBatch,
    gemv_cycle                : GemvCycle,
    fixed_mant_width          : dtype_pkg::FixedMantWidth,
    weight_width              : mem_pkg::WeightBitWidth,
    weight_cnt                : mem_pkg::HpSingleWeightCnt,
    fmap_cache_out_cnt        : mem_pkg::FmapL2CacheOutCnt,
    fmap_type_mixed_precision : device_pkg::FmapTypeMixedPrecision
  };

  // ===| Legacy type alias |=====================================================
  // gemv_cfg_t is the former name of vec_cfg_t. It stays so existing port
  // declarations compile during migration; drop it once all GEMV_*.sv files
  // use vec_cfg_t.
  typedef vec_cfg_t gemv_cfg_t;

endpackage : vec_core_pkg
ISA 패키지¶
모든 오피코드 · 마이크로 옵 · 명령어 레이아웃의 권위 있는 정의. 모든 컨트롤러 모듈 상단에서 import 합니다.
isa_pkg.sv
// ===| ISA Package |=============================================================
// Master type package for the uCA (micro Compute Architecture) ISA.
// All consumers do `import isa_pkg::*;` — no `include` needed downstream.
//
// Contents, in declaration order:
//   1. scalar field typedefs (addresses, pointers, MEMSET values, CVO length)
//   2. 1-bit direction / async enums
//   3. 6-bit GEMV/GEMM flag struct and the 4-bit opcode enum
//   4. 60-bit instruction-body layouts (one packed struct per instruction)
//   5. CVO function codes and CVO flag struct
//   6. memory-routing enums
//   7. decoded micro-op structs handed to the execution units
//
// In every packed struct the FIRST field occupies the most-significant bits;
// the [hi:lo] annotations on the instruction layouts follow from that.
//
// Rules:
// - No `include inside this package (Vivado compilation-order constraint).
// - All `define macros that are also needed as port widths live in npu_arch.svh.
// - isa_x32.svh / isa_memctrl.svh / isa_x64.svh are LEGACY — types here supersede them.
//
// Compilation order: after A_const_svh (npu_arch.svh must be included first).
// NOTE(review): no npu_arch.svh macro is referenced inside this package body;
// all widths below are literals — confirm the ordering requirement still applies.
// ===============================================================================
package isa_pkg;
// ===| Basic Address & Control Types |=========================================
// All three address types are 17 bits wide.
typedef logic [16:0] dest_addr_t;
typedef logic [16:0] src_addr_t;
typedef logic [16:0] addr_t;
typedef logic [ 5:0] ptr_addr_t; // shape / size pointer (6-bit index)
typedef logic [ 4:0] parallel_lane_t; // number of active parallel lanes
// MEMSET value fields (16-bit each, per ISA §3.3)
typedef logic [15:0] a_value_t;
typedef logic [15:0] b_value_t;
typedef logic [15:0] c_value_t;
// CVO length (16-bit element count)
typedef logic [15:0] length_t;
// ===| Device Direction Enums |=================================================
// Single-bit enums used by the MEMCPY layout (from_device / to_device) and by
// the async bit of MEMCPY and CVO.
typedef enum logic {
FROM_NPU = 1'b0,
FROM_HOST = 1'b1
} from_device_e;
typedef enum logic {
TO_NPU = 1'b0,
TO_HOST = 1'b1
} to_device_e;
typedef enum logic {
SYNC_OP = 1'b0,
ASYNC_OP = 1'b1
} async_e;
// ===| GEMV / GEMM Flags (6-bit, ISA §4) |=====================================
// 1 + 1 + 1 + 3 = 6 bits; bit indices in the comments are within this struct.
typedef struct packed {
logic findemax; // [5] find & register e_max for output normalisation
logic accm; // [4] accumulate into destination (do not overwrite)
logic w_scale; // [3] apply weight scale factor during MAC
logic [2:0] reserved;
} flags_t;
// ===| Opcode Table (4-bit, ISA §2) |==========================================
// Five of the sixteen 4-bit codes are assigned (4'h0–4'h4).
typedef enum logic [3:0] {
OP_GEMV = 4'h0,
OP_GEMM = 4'h1,
OP_MEMCPY = 4'h2,
OP_MEMSET = 4'h3,
OP_CVO = 4'h4
} opcode_e;
// ===| Instruction Body (60-bit, opcode already stripped) |====================
// Raw, untyped views of the 60-bit body: a plain vector and a one-field struct.
typedef logic [59:0] VLIW_instruction_x64;
typedef struct packed {
logic [59:0] instruction;
} instruction_op_x64_t;
// ===| Instruction Encodings (ISA §3) |========================================
// Each layout below totals exactly 60 bits, so it is the same width as
// VLIW_instruction_x64.
// GEMV / GEMM (identical layout, ISA §3.1) — 60 bits
// 17 + 17 + 6 + 6 + 6 + 5 + 3 = 60
typedef struct packed {
dest_addr_t dest_reg; // [59:43] 17-bit
src_addr_t src_addr; // [42:26] 17-bit
flags_t flags; // [25:20] 6-bit
ptr_addr_t size_ptr_addr; // [19:14] 6-bit
ptr_addr_t shape_ptr_addr; // [13: 8] 6-bit
parallel_lane_t parallel_lane; // [ 7: 3] 5-bit
logic [2:0] reserved; // [ 2: 0] 3-bit
} GEMV_op_x64_t;
typedef GEMV_op_x64_t GEMM_op_x64_t; // same layout
// MEMCPY (ISA §3.2) — 60 bits
// 1 + 1 + 17 + 17 + 17 + 6 + 1 = 60
typedef struct packed {
from_device_e from_device; // [59] 1-bit
to_device_e to_device; // [58] 1-bit
dest_addr_t dest_addr; // [57:41] 17-bit
src_addr_t src_addr; // [40:24] 17-bit
addr_t aux_addr; // [23: 7] 17-bit
ptr_addr_t shape_ptr_addr; // [ 6: 1] 6-bit
async_e async; // [ 0] 1-bit
} memcpy_op_x64_t;
// MEMSET (ISA §3.3) — 60 bits
// 2 + 6 + 16 + 16 + 16 + 4 = 60
// dest_cache is kept as raw bits here; the decoded micro-op (memory_set_uop_t)
// carries the same field typed as dest_cache_e.
typedef struct packed {
logic [1:0] dest_cache; // [59:58] 2-bit
ptr_addr_t dest_addr; // [57:52] 6-bit
a_value_t a_value; // [51:36] 16-bit
b_value_t b_value; // [35:20] 16-bit
c_value_t c_value; // [19: 4] 16-bit
logic [3:0] reserved; // [ 3: 0] 4-bit
} memset_op_x64_t;
// CVO (ISA §3.4) — 60 bits
// 4 + 17 + 17 + 16 + 5 + 1 = 60
// cvo_func and flags are raw bits here; cvo_control_uop_t carries them typed as
// cvo_func_e and cvo_flags_t.
typedef struct packed {
logic [ 3:0] cvo_func; // [59:56] 4-bit
src_addr_t src_addr; // [55:39] 17-bit
addr_t dst_addr; // [38:22] 17-bit
length_t length; // [21: 6] 16-bit
logic [ 4:0] flags; // [ 5: 1] 5-bit
async_e async; // [ 0] 1-bit
} cvo_op_x64_t;
// ===| CVO Function Codes (ISA §3.4.1) |=======================================
// Eight of the sixteen 4-bit codes are assigned (4'h0–4'h7).
typedef enum logic [3:0] {
CVO_EXP = 4'h0,
CVO_SQRT = 4'h1,
CVO_GELU = 4'h2,
CVO_SIN = 4'h3,
CVO_COS = 4'h4,
CVO_REDUCE_SUM = 4'h5,
CVO_SCALE = 4'h6,
CVO_RECIP = 4'h7
} cvo_func_e;
// ===| CVO Flags (5-bit, ISA §3.4.2) |=========================================
// 1 + 1 + 1 + 2 = 5 bits; bit indices in the comments are within this struct.
typedef struct packed {
logic sub_emax; // [4] subtract e_max before operation
logic recip_scale; // [3] use reciprocal of scalar (divide instead of multiply)
logic accm; // [2] accumulate into dst
logic [1:0] reserved;
} cvo_flags_t;
// ===| Memory Routing (ISA §5) |================================================
// Each route encodes source[7:4] | dest[3:0] as an 8-bit enum.
// data_dest_e and data_source_e are the two 4-bit halves; data_route_e below
// concatenates one enumerator of each, source first.
typedef enum logic [3:0] {
data_to_host = 4'h0,
data_to_GLOBAL_cache = 4'h1,
data_to_L1_cache_GEMM_in = 4'h2,
data_to_L1_cache_GEMV_in = 4'h3,
data_to_CVO_in = 4'h4
} data_dest_e;
typedef enum logic [3:0] {
data_from_host = 4'h0,
data_from_GLOBAL_cache = 4'h1,
data_from_L1_cache_GEMM_res = 4'h2,
data_from_L1_cache_GEMV_res = 4'h3,
data_from_CVO_res = 4'h4
} data_source_e;
// Only the eight source/destination pairs listed here are named routes; every
// one of them has GLOBAL_cache ("L2" in the enumerator names) on one side.
typedef enum logic [7:0] {
from_host_to_L2 = {data_from_host, data_to_GLOBAL_cache },
from_L2_to_host = {data_from_GLOBAL_cache, data_to_host },
from_L2_to_L1_GEMM = {data_from_GLOBAL_cache, data_to_L1_cache_GEMM_in},
from_L2_to_L1_GEMV = {data_from_GLOBAL_cache, data_to_L1_cache_GEMV_in},
from_L2_to_CVO = {data_from_GLOBAL_cache, data_to_CVO_in },
from_GEMV_res_to_L2 = {data_from_L1_cache_GEMV_res, data_to_GLOBAL_cache },
from_GEMM_res_to_L2 = {data_from_L1_cache_GEMM_res, data_to_GLOBAL_cache },
from_CVO_res_to_L2 = {data_from_CVO_res, data_to_GLOBAL_cache }
} data_route_e;
// MEMSET destination selector; two of the four 2-bit codes are assigned.
typedef enum logic [1:0] {
data_to_fmap_shape = 2'h0,
data_to_weight_shape = 2'h1
} dest_cache_e;
// ===| Micro-Op Structures (ISA §6) |==========================================
// Width of memory_control_uop_t, written out as the sum of its field widths.
localparam int MemoryUopWidth = 49; // 8+17+17+6+1
// GEMM control uop (ISA §6.1)
// 6 + 6 + 5 = 17 bits.
typedef struct packed {
flags_t flags;
ptr_addr_t size_ptr_addr;
parallel_lane_t parallel_lane;
} gemm_control_uop_t;
// GEMV control uop (same layout as GEMM)
// Declared as its own struct (not a typedef alias of gemm_control_uop_t), with
// identical fields in identical order.
typedef struct packed {
flags_t flags;
ptr_addr_t size_ptr_addr;
parallel_lane_t parallel_lane;
} GEMV_control_uop_t;
// Memory control uop (ISA §6.2)
// 8 + 17 + 17 + 6 + 1 = 49 bits (= MemoryUopWidth).
// Note the field named data_dest holds a full data_route_e (source and dest).
typedef struct packed {
data_route_e data_dest; // 8-bit
dest_addr_t dest_addr; // 17-bit
src_addr_t src_addr; // 17-bit
ptr_addr_t shape_ptr_addr; // 6-bit
async_e async; // 1-bit
} memory_control_uop_t;
// Memory set uop (ISA §6.3)
// 2 + 6 + 16 + 16 + 16 = 56 bits: the MEMSET body without its 4 reserved bits.
typedef struct packed {
dest_cache_e dest_cache; // 2-bit
ptr_addr_t dest_addr; // 6-bit
a_value_t a_value; // 16-bit
b_value_t b_value; // 16-bit
c_value_t c_value; // 16-bit
} memory_set_uop_t;
// CVO control uop (ISA §6.4)
// 4 + 17 + 17 + 16 + 5 + 1 = 60 bits: same field order and widths as
// cvo_op_x64_t, with cvo_func and flags given their enum / struct types.
typedef struct packed {
cvo_func_e cvo_func; // 4-bit
src_addr_t src_addr; // 17-bit
addr_t dst_addr; // 17-bit
length_t length; // 16-bit
cvo_flags_t flags; // 5-bit
async_e async; // 1-bit
} cvo_control_uop_t;
// ===| ACP / NPU Transfer uops (used by mem_dispatcher) |======================
// Two separately declared structs with identical fields: 1 + 17 + 17 = 35 bits.
typedef struct packed {
logic write_en;
logic [16:0] base_addr;
logic [16:0] end_addr;
} acp_uop_t; // 35-bit
typedef struct packed {
logic write_en;
logic [16:0] base_addr;
logic [16:0] end_addr;
} npu_uop_t; // 35-bit
endpackage : isa_pkg
isa_memctrl.svh
// ===| isa_memctrl — LEGACY memory-control ISA types |===========================
// Older declarations of the memory routing enums, the memory control / memory
// set micro-ops and the ACP / NPU transfer micro-ops. isa_pkg.sv declares newer
// types under the same names; prefer isa_pkg in new code.
//
// Fixes relative to the previous revision of this file (which could not
// compile):
//   - data_dest_e had a trailing comma after its last enumerator.
//   - data_route_e referenced *_GEMM_* enumerators that were declared with a
//     lower-case "gemm"; the declarations now use the upper-case spelling,
//     matching both the references here and the names used by isa_pkg.
//   - The uop widths were declared as a 1-bit enum holding the value 33 twice
//     (out of range for the base type, and duplicate values). They are now
//     integer localparams derived from the structs with $bits, which yields
//     1 + 17 + 17 = 35.
//   - dest_addr_t, src_addr_t, ptr_addr_t, async_e and a/b/c_value_t were used
//     without being declared or imported.
// ===============================================================================
package isa_memctrl;

  // Field types shared with the master ISA package. They are imported by name
  // (not with a wildcard) so that isa_pkg's own data_dest_e, data_route_e, ...
  // do not collide with the legacy declarations below.
  // NOTE(review): assumes the legacy field widths equal the isa_pkg ones
  // (17-bit addresses, 6-bit pointers, 16-bit values) — confirm.
  import isa_pkg::dest_addr_t;
  import isa_pkg::src_addr_t;
  import isa_pkg::ptr_addr_t;
  import isa_pkg::async_e;
  import isa_pkg::a_value_t;
  import isa_pkg::b_value_t;
  import isa_pkg::c_value_t;

  // Port direction codes. `define is global: these macros are visible outside
  // the package as well.
  `define PORT_MOD_E_WRITE 1
  `define PORT_MOD_E_READ 0

  // ===| Routing: 4-bit destination and source codes |===========================
  typedef enum logic [3:0] {
    data_to_host             = 4'h0,
    data_to_GLOBAL_cache     = 4'h1,
    data_to_L1_cache_GEMM_in = 4'h2,
    data_to_L1_cache_GEMV_in = 4'h3
  } data_dest_e;

  typedef enum logic [3:0] {
    data_from_host              = 4'h0,
    data_from_GLOBAL_cache      = 4'h1,
    data_from_L1_cache_GEMM_res = 4'h2,
    data_from_L1_cache_GEMV_res = 4'h3
  } data_source_e;

  // 8-bit route = {source code, destination code}.
  typedef enum logic [7:0] {
    from_host_to_L2     = {data_from_host,              data_to_GLOBAL_cache},
    from_L2_to_host     = {data_from_GLOBAL_cache,      data_to_host},
    from_L2_to_L1_GEMM  = {data_from_GLOBAL_cache,      data_to_L1_cache_GEMM_in},
    from_L2_to_L1_GEMV  = {data_from_GLOBAL_cache,      data_to_L1_cache_GEMV_in},
    from_GEMV_res_to_L2 = {data_from_L1_cache_GEMV_res, data_to_GLOBAL_cache},
    from_GEMM_res_to_L2 = {data_from_L1_cache_GEMM_res, data_to_GLOBAL_cache}
  } data_route_e;

  // ===| Memory control micro-op |===============================================
  // The data_dest field holds a full route (source and destination).
  typedef struct packed {
    data_route_e data_dest;
    dest_addr_t  dest_addr;
    src_addr_t   src_addr;
    ptr_addr_t   shape_ptr_addr;
    async_e      async;
  } memory_control_uop_t;

  // ===| Memory set micro-op |===================================================
  typedef enum logic [1:0] {
    data_to_fmap_shape   = 2'h0,
    data_to_weight_shape = 2'h1
  } dest_cache_e;

  typedef struct packed {
    dest_cache_e dest_cache;
    ptr_addr_t   dest_addr;
    a_value_t    a_value;
    b_value_t    b_value;
    c_value_t    c_value;
  } memory_set_uop_t;

  // ===| ACP / NPU transfer micro-ops (mem dispatcher.sv) |======================
  typedef struct packed {
    logic        acp_write_en_wire;
    logic [16:0] acp_base_addr_wire;
    logic [16:0] acp_end_addr;
  } acp_uop_t;  // 35 bit == [34:0]

  typedef struct packed {
    logic        npu_write_en_wire;
    logic [16:0] npu_base_addr_wire;
    logic [16:0] npu_end_addr;
  } npu_uop_t;  // 35 bit == [34:0]

  // Widths of the two transfer micro-ops, taken from the structs themselves so
  // they cannot drift from the field list again.
  localparam int NPU_U_OP_WIDTH = $bits(npu_uop_t);
  localparam int ACP_U_OP_WIDTH = $bits(acp_uop_t);

endpackage : isa_memctrl
isa_x32.svh
// ===| isa_x32 — LEGACY 32-bit instruction format |==============================
// Types for the older 32-bit instruction word:
//   2-bit cmd_chaining + 4-bit opcode (= 6-bit header) + 26-bit payload = 32 bits.
//
// Fix relative to the previous revision: opcode_t was declared on a 5-bit base
// while its enumerators are 4-bit sized literals (4'h0 ...). SystemVerilog
// rejects a sized enum literal whose size differs from the base type, and a
// 5-bit opcode would also make the header 7 bits and the word 33 bits. The base
// type is now 4 bits, which agrees with X32_HEADSIZE and the layout comment.
//
// NOTE(review): payload_dotm_t, override_memcpy_t and override_chain_memcpy_t
// are used inside instruction_x32_t but are not declared or imported anywhere
// in this package, so it still cannot elaborate until they are supplied. Every
// member of a packed union must have the same width, so each has to be 26 bits.
// ===============================================================================
package isa_x32;

  // Header width in bits (cmd_chaining + opcode). `define is global, so this
  // macro is visible outside the package too.
  `define X32_HEADSIZE 6

  typedef logic [16:0] dest_addr_t;
  typedef logic [7:0]  loop_cnt_t;

  // 32 data bits plus one enable bit per byte.
  typedef struct packed {
    logic [31:0] data;
    logic [3:0]  byte_en;
  } x32_payload_t;

  // ---| Opcode table (4-bit) |---------------------------------------------------
  typedef enum logic [3:0] {
    OP_GEMV   = 4'h0,
    OP_GEMM   = 4'h1,
    OP_MEMCPY = 4'h2
  } opcode_t;

  // ---| MEMCPY payload: 1 + 17 + 8 = 26 bits |-----------------------------------
  // "to_divice" is a misspelling of "to_device"; the field name is part of the
  // interface and is kept as is.
  typedef struct packed {
    logic       to_divice;
    dest_addr_t dest_addr;
    loop_cnt_t  loop_cnt;
  } payload_memcpy_t;

  // ---| Full 32-bit instruction word |-------------------------------------------
  // Fixed header (6b) + union payload (26b).
  typedef struct packed {
    logic [1:0] cmd_chaining;
    opcode_t    opcode;
    union packed {
      payload_dotm_t          dotm;    // V dot M / M dot M
      payload_memcpy_t        memcpy;  // memcpy
      override_memcpy_t       override_memcpy;
      override_chain_memcpy_t override_chain_memcpy;
      logic [25:0]            raw;
    } payload;  // [25:0]
  } instruction_x32_t;

  // Deprecated. Same fields and widths as payload_memcpy_t, with loop_cnt
  // spelled out as a raw 8-bit vector.
  typedef struct packed {
    logic        to_divice;
    dest_addr_t  dest_addr;
    logic [7:0]  loop_cnt;
  } memory_uop_x32_t;

endpackage : isa_x32
isa_x64.svh
// ===| isa_x64 — LEGACY 64-bit instruction types |===============================
// Older declarations of the 64-bit VLIW instruction layouts and the GEMM/GEMV
// control micro-ops. isa_pkg.sv declares newer types under the same names.
// In each packed struct the first field occupies the most-significant bits.
//
// NOTE(review): a_value_t, b_value_t and c_value_t are used by memset_op_x64_t
// but are not declared or imported in this package as shown, so the package
// cannot elaborate on its own.
// ===============================================================================
package isa_x64;
// Basic Types
// All three address types are 17 bits wide.
typedef logic [16:0] dest_addr_t;
typedef logic [16:0] src_addr_t;
typedef logic [16:0] addr_t;
typedef logic [5:0] ptr_addr_t; // For size and shape pointers
typedef logic [4:0] parallel_lane_t;
// 3-bit filler used as the trailing reserved field of the GEMV/GEMM layouts.
typedef logic [2:0] reserved_dot;
// 64 data bits plus one enable bit per byte (72 bits total).
typedef struct packed {
logic [63:0] data;
logic [7:0] byte_en;
} x64_payload_t;
// npu -> host
// host -> npu
typedef enum logic {
FROM_NPU = 1'b0,
FROM_HOST = 1'b1
} from_device_e;
typedef enum logic {
TO_NPU = 1'b0,
TO_HOST = 1'b1
} to_device_e;
// Enumerators here are named sync / async (isa_pkg uses SYNC_OP / ASYNC_OP).
typedef enum logic {
sync = 1'b0,
async = 1'b1
} async_e;
// Flags (6-bit as per PDF spec)
// 1 + 1 + 1 + 3 = 6 bits.
typedef struct packed {
logic findemax;
logic accm; // Accumulate
logic w_scale;
logic [2:0] reserved;
} flags_t;
// Instruction format
// instruction = x64(64bit) - head(opcode)
typedef logic [59:0] VLIW_instruction_x64;
// Opcode table (4-bit)
typedef enum logic [3:0] {
OP_GEMV = 4'h0,
OP_GEMM = 4'h1,
OP_MEMCPY = 4'h2,
OP_MEMSET = 4'h3
} opcode_e;
// GEMV body: 17 + 17 + 6 + 6 + 6 + 5 + 3 = 60 bits.
typedef struct packed {
dest_addr_t dest_reg;
src_addr_t src_addr;
flags_t flags;
ptr_addr_t size_ptr_addr;
ptr_addr_t shape_ptr_addr;
parallel_lane_t parallel_lane;
reserved_dot reserved;
} GEMV_op_x64_t;
// GEMM body: declared separately, with the same fields in the same order as
// GEMV_op_x64_t.
typedef struct packed {
dest_addr_t dest_reg;
src_addr_t src_addr;
flags_t flags;
ptr_addr_t size_ptr_addr;
ptr_addr_t shape_ptr_addr;
parallel_lane_t parallel_lane;
reserved_dot reserved;
} GEMM_op_x64_t;
// MEMCPY body: 1 + 1 + 17 + 17 + 17 + 6 + 1 = 60 bits.
// The third address field is named _addr here (isa_pkg calls it aux_addr).
typedef struct packed {
from_device_e from_device;
to_device_e to_device;
dest_addr_t dest_addr;
src_addr_t src_addr;
addr_t _addr;
ptr_addr_t shape_ptr_addr;
async_e async;
} memcpy_op_x64_t;
// MEMSET body: 2 + 6 + 3 * W + 1 bits, where W is the width of a/b/c_value_t.
// NOTE(review): this reaches 60 bits only if W is 17. With the 16-bit value
// types that isa_pkg declares it would be 57 bits (isa_pkg uses a 4-bit
// reserved field instead) — confirm which width this legacy layout intended.
typedef struct packed {
logic [1:0] dest_cache;
ptr_addr_t dest_addr;
a_value_t a_value;
b_value_t b_value;
c_value_t c_value;
logic reserved;
} memset_op_x64_t;
// Raw 60-bit body wrapped in a one-field struct.
typedef struct packed {
// head(opcode) removed
logic [59:0] instruction;
} instruction_op_x64_t;
// --------------------------------------------------------
// ===| Compute Micro-Op |=================================
// `define is global: this macro is visible outside the package as well.
`define MEMORY_UOP_WIDTH 49
// GEMM / GEMV control micro-ops: 6 + 6 + 5 = 17 bits each; two separately
// declared structs with identical fields.
typedef struct packed {
flags_t flags;
ptr_addr_t size_ptr_addr;
parallel_lane_t parallel_lane;
} gemm_control_uop_t;
typedef struct packed {
flags_t flags;
ptr_addr_t size_ptr_addr;
parallel_lane_t parallel_lane;
} GEMV_control_uop_t;
// ===| Compute Micro-Op |====================================
// -----------------------------------------------------------
endpackage
인터페이스 정의¶
- npu_interfaces.svh — 블록 간 typed handle 로 쓰이는 SystemVerilog interface 블록들.
npu_interfaces.svh
`include "GLOBAL_CONST.svh"
`ifndef NPU_INTERFACES_SVH
`define NPU_INTERFACES_SVH
// AXI-Stream signal bundle.
//   DATA_WIDTH : width of tdata in bits (default 128); tkeep carries one bit
//                per byte of tdata, i.e. DATA_WIDTH/8 bits.
// The interface has no clock or reset port; it only groups the stream signals.
interface axis_if #(
    parameter DATA_WIDTH = 128
) ();

  // Handshake
  logic tvalid;
  logic tready;

  // Payload and sideband
  logic [DATA_WIDTH-1:0]     tdata;
  logic [(DATA_WIDTH/8)-1:0] tkeep;
  logic                      tlast;

  // Slave side (NPU perspective: input) — drives tready only.
  modport slave(
      input  tdata, tvalid, tlast, tkeep,
      output tready
  );

  // Master side (NPU perspective: output) — drives everything except tready.
  modport master(
      output tdata, tvalid, tlast, tkeep,
      input  tready
  );

endinterface : axis_if
// axil_if.sv
// AXI4-Lite signal bundle with the five channels AW / W / B / AR / R.
//   ADDR_W : address width in bits (default 12)
//   DATA_W : data width in bits (default 64); wstrb has one bit per data byte
// clk and rst_n are interface ports; they are not listed in either modport.
interface axil_if #(
    parameter int ADDR_W = 12,
    parameter int DATA_W = 64
) (
    input logic clk,
    input logic rst_n
);

  // ---| Write address (AW) |---
  logic [ADDR_W-1:0] awaddr;
  logic [2:0]        awprot;
  logic              awvalid;
  logic              awready;

  // ---| Write data (W) |---
  logic [DATA_W-1:0]     wdata;
  logic [(DATA_W/8)-1:0] wstrb;
  logic                  wvalid;
  logic                  wready;

  // ---| Write response (B) |---
  logic [1:0] bresp;
  logic       bvalid;
  logic       bready;

  // ---| Read address (AR) |---
  logic [ADDR_W-1:0] araddr;
  logic [2:0]        arprot;
  logic              arvalid;
  logic              arready;

  // ---| Read data (R) |---
  logic [DATA_W-1:0] rdata;
  logic [1:0]        rresp;
  logic              rvalid;
  logic              rready;

  // Slave view: receives the request-side signals, drives ready/response/data.
  modport slave(
      input  awaddr, awprot, awvalid,
      input  wdata, wstrb, wvalid,
      input  bready,
      input  araddr, arprot, arvalid,
      input  rready,
      output awready, wready,
      output bresp, bvalid,
      output arready,
      output rdata, rresp, rvalid
  );

  // Master view: exact mirror of the slave modport.
  modport master(
      output awaddr, awprot, awvalid,
      output wdata, wstrb, wvalid,
      output bready,
      output araddr, arprot, arvalid,
      output rready,
      input  awready, wready,
      input  bresp, bvalid,
      input  arready,
      input  rdata, rresp, rvalid
  );

endinterface : axil_if
`endif // NPU_INTERFACES_SVH