Packages and Constants

SystemVerilog packages and .svh headers, presented in compile order, that establish the global type system, device-specific constants, pipeline configuration, ISA layout, and the SystemVerilog interface objects used across the rest of the RTL.

See also

pccx ISA Specification

Human-readable ISA specification backing isa_pkg below.

Tier A — Raw constant headers (.svh)

The Constants/compilePriority_Order/A_const_svh/ directory — primitive defines consumed by every downstream package.

  • GLOBAL_CONST.svh — deprecated compatibility shim: includes NUMBERS.svh, kv260_device.svh and npu_arch.svh, and defines legacy alias macros.

  • NUMBERS.svh — numeric-format parameterization.

  • DEVICE_INFO.svh — deprecated compatibility shim: includes kv260_device.svh and defines two legacy HP-port alias macros.

  • kv260_device.svh — KV260-specific hardware constants (AXI HP / ACP port widths, DSP48E2 port widths, XPM FIFO depths, BRAM / URAM sizes).

  • npu_arch.svh — top-level NPU architecture knobs (ISA widths, systolic-array dimensions, data-type and bus widths, FMap cache depth, DSP mode codes).

GLOBAL_CONST.svh
// ===| GLOBAL_CONST.svh — DEPRECATED compatibility shim |========================
// Superseded by npu_arch.svh (architecture) and kv260_device.svh (board).
// It survives only so that legacy `include "GLOBAL_CONST.svh" lines keep
// compiling while the migration is in progress.
// New constants belong in npu_arch.svh or kv260_device.svh — never here.
// ===============================================================================

`ifndef GLOBAL_CONST_SVH
`define GLOBAL_CONST_SVH

`include "NUMBERS.svh"
`include "kv260_device.svh"
`include "npu_arch.svh"

// ===| Backward-compatibility aliases |==========================================

// ---- Boolean literals ----
`define FALSE 1'b0
`define TRUE  1'b1

// ---- HP weight bus (port declarations of MAT_CORE) ----
`define HP_PORT_CNT          `DEVICE_HP_PORT_CNT
`define HP_PORT_SINGLE_WIDTH `HP_SINGLE_WIDTH
`define HP_PORT_MAX_WIDTH    `HP_TOTAL_WIDTH

// ---- DSP48E2 port sizes (GEMM_dsp_unit port declarations) ----
`define DSP48E2_A_WIDTH      `DEVICE_DSP_A_WIDTH
`define DSP48E2_B_WIDTH      `DEVICE_DSP_B_WIDTH
`define DSP48E2_POUT_SIZE    `DSP_P_OUT_WIDTH
`define PREG_SIZE            `DSP_P_OUT_WIDTH

// ---- MAC unit operand widths (GEMM_systolic parameter defaults) ----
//   IN_H : INT4 weight           —  4-bit, B-port
//   IN_V : fixed-point mantissa  — 27-bit, A-port
`define GEMM_MAC_UNIT_IN_H   4
`define GEMM_MAC_UNIT_IN_V   `FIXED_MANT_WIDTH

`endif // GLOBAL_CONST_SVH
NUMBERS.svh
`ifndef NUMBERS_SVH
`define NUMBERS_SVH

// ===| Primitive bit widths |====================================================
// Bare integer widths; device_pkg.sv selects its algorithm-level types from them.
// The values carry no units or semantics of their own.
// ===============================================================================

`define N_FP32_SIZE    32   // FP32 accumulation : 32 bits
`define N_BF16_SIZE    16   // BF16 activation   : 16 bits
`define N_SIZEOF_INT4   4   // INT4 weight       :  4 bits

`endif // NUMBERS_SVH
DEVICE_INFO.svh
// ===| DEVICE_INFO.svh — DEPRECATED compatibility shim |=========================
// Superseded by kv260_device.svh; retained only so old includes still resolve.
// New constants must not be added to this file.
// ===============================================================================

`ifndef DEVICE_INFO_SVH
`define DEVICE_INFO_SVH

`include "kv260_device.svh"

// Old names mapped onto the kv260_device.svh macros.
// The macro bodies deliberately carry no trailing semicolon (an earlier
// version had one, which was a bug).
`define DEVICE_HP_CNT                     `DEVICE_HP_PORT_CNT
`define DEVICE_HP_SINGLE_LANE_MAX_IN_BIT  `DEVICE_HP_SINGLE_WIDTH_BIT

`endif // DEVICE_INFO_SVH
kv260_device.svh
// ===| KV260 Device-Specific Hardware Parameters |==============================
// Physical hardware constants of the Xilinx Kria KV260 — and nothing else.
// Porting to another board means swapping out this one file.
//
// Board : Xilinx Kria KV260 (Zynq UltraScale+ MPSoC)
// Target: bare-metal, 400MHz core clock
// ===============================================================================

`ifndef KV260_DEVICE_SVH
`define KV260_DEVICE_SVH

// ===| AXI HP Ports |============================================================
// Four HP ports, 128 bits wide each.
`define DEVICE_HP_SINGLE_WIDTH_BIT  128
`define DEVICE_HP_PORT_CNT          4

// ===| AXI HPC / ACP Ports |=====================================================
// ACP (Accelerator Coherency Port), 128-bit: carries FMap in / Result out.
`define DEVICE_ACP_WIDTH_BIT        128

// ===| DSP48E2 Resource |=========================================================
// Port widths of the Xilinx UltraScale+ DSP48E2 slice (all signed):
//   A : 30-bit — carries the 27-bit BF16 fixed-point mantissa
//   B : 18-bit — carries the packed 4-bit INT4 weight
//   P : 48-bit — accumulator output
`define DEVICE_DSP_A_WIDTH          30
`define DEVICE_DSP_B_WIDTH          18
`define DEVICE_DSP_P_WIDTH          48

// ===| XPM FIFO Macros |=========================================================
// Default depths for Xilinx Parameterized Macro (XPM) FIFOs, which act as
// elastic buffers on every AXI stream port to absorb jitter at 400MHz.
`define DEVICE_XPM_FIFO_DEPTH_TINY  16
`define DEVICE_XPM_FIFO_DEPTH       512

// ===| BRAM / URAM |=============================================================
// Capacity is a design-level choice, yet it is bounded by what the KV260
// provides: 144 BRAMs of 36Kb and 64 URAMs of 288Kb. Kept here so that ports
// to other boards stay aware of the limit.
// Despite the *_WIDTH names, both values are per-primitive capacities in bits.
`define DEVICE_BRAM_WIDTH           36864   // bits per BRAM36
`define DEVICE_URAM_WIDTH           294912  // bits per URAM

`endif // KV260_DEVICE_SVH
npu_arch.svh
// ===| NPU Architecture Macros |=================================================
// NPU-level architectural constants that must be `define (used in port
// declarations and generate ranges, where localparams cannot be used).
//
// These are design choices — they change when the NPU architecture changes,
// not when the board changes. Board-specific values live in kv260_device.svh.
//
// Several values below repeat a kv260_device.svh constant as a literal (see the
// trailing comments). This header does not include kv260_device.svh, so those
// pairs must be kept in sync by hand.
// ===============================================================================

`ifndef NPU_ARCH_SVH
`define NPU_ARCH_SVH

// ===| ISA |=====================================================================
`define ISA_WIDTH                   64   // VLIW instruction word width (bits)
`define ISA_OPCODE_WIDTH             4   // Top 4 bits of every instruction
`define ISA_BODY_WIDTH              60   // Instruction body after opcode is stripped (64 - 4)

// ISA compilation mode selectors (used by ctrl_npu_decoder and test benches)
`define MOD_X64                      1   // 64-bit VLIW mode (active)
`define MOD_X32                      0   // 32-bit mode (legacy, unused)
`define U_OPERATION_WIDTH           59   // Usable body bits (ISA_BODY_WIDTH - 1 header)
`define INST_HEAD_ARCH_MOD_BIT       1   // Architecture mode selector bit position

// ===| Systolic Array (Matrix Core) |============================================
`define ARRAY_SIZE_H                32   // Horizontal: number of PE columns
`define ARRAY_SIZE_V                32   // Vertical  : number of PE rows
// Total pipeline latency (H + V + overhead), used for e_max delay pipe.
// NOTE(review): 64 equals ARRAY_SIZE_H + ARRAY_SIZE_V exactly, i.e. the
// "overhead" term is currently 0 — confirm this is intended.
`define SYSTOLIC_TOTAL_LATENCY      64

// ===| Data Type Widths (used in port declarations) |===========================
// These must remain `define because they appear in port width expressions.
// Semantic definitions live in dtype_pkg.

// BF16
`define BF16_WIDTH                  16
`define BF16_EXP_WIDTH              8
`define BF16_MANT_WIDTH             7

// INT4 (Weight)
`define INT4_WIDTH                  4

// Fixed-point mantissa width after BF16 emax alignment
// = BF16_MANT_WIDTH(7) + leading-1 + sign + 18-bit integer headroom = 27
`define FIXED_MANT_WIDTH            27

// FP32 (used for mixed-precision output path)
`define FP32_WIDTH                  32

// DSP48E2 accumulator output used as port width
`define DSP_P_OUT_WIDTH             48   // = `DEVICE_DSP_P_WIDTH

// ===| AXI Stream Port Widths |=================================================
`define AXI_STREAM_WIDTH            128  // Standard AXI-S data width used throughout

// HP port aggregated width (all 4 lanes combined for GEMM weight bus)
`define HP_TOTAL_WIDTH              512  // `DEVICE_HP_PORT_CNT * `DEVICE_HP_SINGLE_WIDTH_BIT
`define HP_SINGLE_WIDTH             128  // = `DEVICE_HP_SINGLE_WIDTH_BIT

// ===| Memory / Cache Sizes |===================================================
// FMap L1 cache (SRAM) capacity — depth in units of AXI_STREAM_WIDTH words
`define FMAP_CACHE_DEPTH            2048
`define FMAP_ADDR_WIDTH             11   // log2(FMAP_CACHE_DEPTH)

// ===| DSP48E2 Instruction Modes |==============================================
// Control codes sent along with fmap in the systolic array V-path
`define DSP_IDLE_MOD                2'b00
`define DSP_SYSTOLIC_MOD_P          2'b01   // Normal systolic MAC
`define DSP_GEMV_STATIONARY_MOD     2'b10   // Weight-stationary GEMV
`define DSP_SHIFT_RESULT_MOD        2'b11   // Shift output (for normalization)

// ===| Pipeline Type Selector |=================================================
`define PIPELINE_GEMV               0
`define PIPELINE_GEMM               1

// ===| INT4 Range Helpers |=====================================================
// The negative bound is parenthesised so the macro stays a single operand when
// it is expanded inside a larger expression (macro expansion is textual).
`define INT4_MAX_VAL                7
`define INT4_MIN_VAL                (-8)
`define INT4_RANGE                  16

`endif // NPU_ARCH_SVH

Tier B — Device package

  • device_pkg.sv — typed views of the Tier A defines suitable for import device_pkg::*;.

device_pkg.sv
// ===| Device Configuration Package |==========================================
// Central place for the algorithm-level choices of this target design:
// which numeric precision each data path uses and how many pipelines exist.
//
// Compilation order: B — depends on A_const_svh (NUMBERS.svh).
// Naming convention: localparam uses PascalCase (linter: parameter-name-style).
// ===============================================================================

package device_pkg;

  // ===| Weight Type |===========================================================
  // 4-bit quantized (INT4) weights, streamed in over the HP ports.
  localparam int WeightType               = `N_SIZEOF_INT4;

  // ===| Feature Map (Activation) Type |========================================
  // Two precisions are in play:
  //   FmapType               — BF16 (16-bit), the precision seen at ports
  //   FmapTypeMixedPrecision — FP32 (32-bit), used for internal accumulation
  localparam int FmapType                 = `N_BF16_SIZE;
  localparam int FmapTypeMixedPrecision   = `N_FP32_SIZE;

  // ===| Pipeline Instance Counts |==============================================
  localparam int MatPipelineCnt           = 1;  // one Matrix Core (32x32 systolic)
  localparam int VecPipelineCnt           = 4;  // four muV-Cores (Vector Core)

  // ===| Legacy aliases |========================================================
  // Older GEMM / GEMV names for the two counts above; keep them until every
  // RTL reference has moved to the Mat* / Vec* names.
  localparam int GemmPipelineCnt          = MatPipelineCnt;
  localparam int GemvPipelineCnt          = VecPipelineCnt;

endpackage : device_pkg

Tier C — Type packages

  • dtype_pkg.sv — bit-width and range constants for the numeric formats (BF16, FP32, INT4, INT8, fixed-point mantissa, DSP48E2 accumulator).

  • mem_pkg.sv — derived memory parameters (HP port widths, weights per HP transfer, XPM FIFO depths, FMap cache geometry).

dtype_pkg.sv
// ===| Data Type Package |=======================================================
// Bit-width and range constants for every numeric format used in uXC.
// Merges and replaces: float_pkg.sv, float_emax_align_pkg.sv
//
// Compilation order: C — depends on A_const_svh, B_device_pkg.
// Naming convention: localparam uses PascalCase.
// ===============================================================================

package dtype_pkg;

  // ===| BF16 (Brain Float 16) |=================================================
  localparam int Bf16MantWidth     = 7;   // stored mantissa bits
  localparam int Bf16ExpWidth      = 8;   // exponent bits
  localparam int Bf16Width         = 16;  // whole word

  // ===| FP32 |==================================================================
  localparam int Fp32Width         = 32;

  // ===| Fixed-Point Mantissa (post-emax-alignment) |============================
  // Layout after BF16 emax alignment:
  //   sign (1) + implicit leading-1 (1) + mantissa (7) + integer headroom (18)
  // The resulting 27 bits leave 3 spare bits in the 30-bit signed DSP48E2 A-port.
  localparam int FixedMantWidth    = 1 + 1 + Bf16MantWidth + 18;  // 27

  // ===| INT4 (Weight) |=========================================================
  // Limits of a signed two's-complement value that is Int4Width bits wide.
  localparam int Int4Width         = 4;
  localparam int Int4Max           = (1 << (Int4Width - 1)) - 1;  //  7
  localparam int Int4Min           = -(1 << (Int4Width - 1));     // -8
  localparam int Int4Range         = 1 << Int4Width;              // 16

  // ===| INT8 (Activation — W4A8 path) |=========================================
  localparam int Int8Width         = 8;

  // ===| DSP48E2 Accumulator |===================================================
  // The P register delivers a 48-bit signed integer.
  localparam int DspPWidth         = 48;

endpackage : dtype_pkg
mem_pkg.sv
// ===| Memory Architecture Package |============================================
// Memory-side parameters of the uXC NPU. Each value is either lifted from a
// Tier A macro / device_pkg or derived from those — no literals are written here.
//
// Compilation order: C — depends on A_const_svh, B_device_pkg.
// Naming convention: localparam uses PascalCase.
// ===============================================================================

package mem_pkg;

  // ===| Values taken directly from macros / device_pkg |========================

  // HP ports: weights reach the Vector Core over HP0/1/2 and the Matrix Core
  // over HP3.
  localparam int HpPortCnt            = `DEVICE_HP_PORT_CNT;          // 4
  localparam int HpSingleWidthBit     = `DEVICE_HP_SINGLE_WIDTH_BIT;  // 128

  // Width of one weight element (INT4).
  localparam int WeightBitWidth       = device_pkg::WeightType;       // 4

  // Fixed-point mantissa values broadcast to the compute array each cycle:
  // one per PE column, i.e. ARRAY_SIZE_H.
  localparam int FmapL2CacheOutCnt    = `ARRAY_SIZE_H;                // 32

  // XPM FIFO depths.
  localparam int XpmFifoDepth         = `DEVICE_XPM_FIFO_DEPTH;       // 512
  localparam int XpmFifoDepthTiny     = `DEVICE_XPM_FIFO_DEPTH_TINY;  // 16

  // FMap L1 SRAM cache geometry.
  localparam int FmapCacheDepth       = `FMAP_CACHE_DEPTH;            // 2048
  localparam int FmapAddrWidth        = `FMAP_ADDR_WIDTH;             // 11

  // ===| Derived values |========================================================

  // Weight bus with all HP lanes aggregated.
  localparam int HpTotalWidthBit      = HpPortCnt * HpSingleWidthBit;       // 512

  // INT4 weights delivered per clock: by one HP port, and by all of them.
  localparam int HpSingleWeightCnt    = HpSingleWidthBit / WeightBitWidth;  // 32
  localparam int HpTotalWeightCnt     = HpTotalWidthBit  / WeightBitWidth;  // 128

endpackage : mem_pkg

Tier D — Pipeline package

  • vec_core_pkg.sv — GEMV throughput / batch constants and the vec_cfg_t configuration struct with its default value.

vec_core_pkg.sv
// ===| Vector Core (muV-Core) Configuration Package |===========================
// Configuration struct plus default parameter set for the Vector Core:
// 4 parallel muV-Cores, each running GEMV (vector x matrix) on INT4 weights
// and BF16 feature maps.
//
// Replaces: GEMV_const_pkg.sv
// Compilation order: D — depends on A_, B_device_pkg, C_type_pkg.
// Naming convention: localparam uses PascalCase; struct fields use snake_case.
// ===============================================================================

package vec_core_pkg;

  // ===| Vector Core Configuration Struct |======================================
  // Handed to GEMV_top and every sub-module as a parameter, so instantiations
  // name their settings instead of passing anonymous integers.
  typedef struct packed {
    // -- pipeline topology --
    int num_gemv_pipeline;          // parallel muV-Core lanes (= VecPipelineCnt)

    // -- throughput --
    int throughput;                 // output elements per cycle
    int gemv_batch;                 // weight rows per call
    int gemv_cycle;                 // clocks per call

    // -- data widths --
    int fixed_mant_width;           // fixed-point mantissa width after emax alignment
    int weight_width;               // INT4 = 4
    int weight_cnt;                 // weights per HP port per clock (= HpSingleWeightCnt)

    // -- cache geometry --
    int fmap_cache_out_cnt;         // FMap values broadcast per cycle (= ARRAY_SIZE_H)
    int fmap_type_mixed_precision;  // output precision (FP32 = 32)
  } vec_cfg_t;

  // Former type name, kept so existing port declarations still compile during
  // the migration period. Drop it once all GEMV_*.sv files use vec_cfg_t.
  typedef vec_cfg_t gemv_cfg_t;

  // ===| Throughput / Batch Constants |==========================================
  // One GEMV call multiplies a single fmap row (2048-dim) with one weight
  // matrix column; the batch size is the number of weight rows it consumes.
  localparam int Throughput       = 1;    // output elements per cycle per lane
  localparam int GemvBatch        = 512;  // weight rows processed per call
  localparam int GemvCycle        = 512;  // clock cycles per GEMV call
  localparam int GemvLineCnt      = mem_pkg::FmapL2CacheOutCnt;  // = ARRAY_SIZE_H = 32

  // ===| Default Configuration (KV260 / Gemma 3N E4B target) |==================
  localparam vec_cfg_t VecCoreDefaultCfg = '{
    num_gemv_pipeline         : device_pkg::VecPipelineCnt,
    throughput                : Throughput,
    gemv_batch                : GemvBatch,
    gemv_cycle                : GemvCycle,
    fixed_mant_width          : dtype_pkg::FixedMantWidth,
    weight_width              : mem_pkg::WeightBitWidth,
    weight_cnt                : mem_pkg::HpSingleWeightCnt,
    fmap_cache_out_cnt        : mem_pkg::FmapL2CacheOutCnt,
    fmap_type_mixed_precision : device_pkg::FmapTypeMixedPrecision
  };

endpackage : vec_core_pkg

ISA package

The authoritative definition of every opcode, micro-op, and instruction layout. Imported at the top of every controller module.

isa_pkg.sv
// ===| ISA Package |=============================================================
// Master type package for the uCA (micro Compute Architecture) ISA.
// All consumers do `import isa_pkg::*;` — no `include` needed downstream.
//
// Rules:
//   - No `include inside this package (Vivado compilation-order constraint).
//   - All `define macros that are also needed as port widths live in npu_arch.svh.
//   - isa_x32.svh / isa_memctrl.svh / isa_x64.svh are LEGACY — types here supersede them.
//
// Compilation order: after A_const_svh (npu_arch.svh must be included first).
// ===============================================================================

package isa_pkg;

  // Every instruction struct below is `packed`: the first member occupies the
  // most-significant bits, so member order and width ARE the bit layout.
  // Do not reorder or resize members without updating the ISA specification.

  // ===| Basic Address & Control Types |=========================================
  // All three address flavours are 17 bits wide; the distinct names only
  // document the role a field plays.
  typedef logic [16:0] dest_addr_t;
  typedef logic [16:0] src_addr_t;
  typedef logic [16:0] addr_t;
  typedef logic [ 5:0] ptr_addr_t;       // shape / size pointer (6-bit index)
  typedef logic [ 4:0] parallel_lane_t;  // number of active parallel lanes

  // MEMSET value fields (16-bit each, per ISA §3.3)
  typedef logic [15:0] a_value_t;
  typedef logic [15:0] b_value_t;
  typedef logic [15:0] c_value_t;

  // CVO length (16-bit element count)
  typedef logic [15:0] length_t;

  // ===| Device Direction Enums |=================================================
  // Single-bit selectors used by MEMCPY (source / destination side) and by the
  // instructions that carry an async bit.
  typedef enum logic {
    FROM_NPU  = 1'b0,
    FROM_HOST = 1'b1
  } from_device_e;

  typedef enum logic {
    TO_NPU  = 1'b0,
    TO_HOST = 1'b1
  } to_device_e;

  typedef enum logic {
    SYNC_OP  = 1'b0,
    ASYNC_OP = 1'b1
  } async_e;

  // ===| GEMV / GEMM Flags (6-bit, ISA §4) |=====================================
  // Three flag bits in [5:3]; [2:0] are reserved.
  typedef struct packed {
    logic findemax;   // [5] find & register e_max for output normalisation
    logic accm;       // [4] accumulate into destination (do not overwrite)
    logic w_scale;    // [3] apply weight scale factor during MAC
    logic [2:0] reserved;
  } flags_t;

  // ===| Opcode Table (4-bit, ISA §2) |==========================================
  // Five of the sixteen 4-bit codes are assigned.
  typedef enum logic [3:0] {
    OP_GEMV   = 4'h0,
    OP_GEMM   = 4'h1,
    OP_MEMCPY = 4'h2,
    OP_MEMSET = 4'h3,
    OP_CVO    = 4'h4
  } opcode_e;

  // ===| Instruction Body (60-bit, opcode already stripped) |====================
  // Untyped view of the body; the *_op_x64_t structs below are the typed
  // 60-bit views of the same bits.
  typedef logic [59:0] VLIW_instruction_x64;

  typedef struct packed {
    logic [59:0] instruction;
  } instruction_op_x64_t;

  // ===| Instruction Encodings (ISA §3) |========================================

  // GEMV / GEMM  (identical layout, ISA §3.1)  — 60 bits
  // 17 + 17 + 6 + 6 + 6 + 5 + 3 = 60
  typedef struct packed {
    dest_addr_t     dest_reg;        // [59:43] 17-bit
    src_addr_t      src_addr;        // [42:26] 17-bit
    flags_t         flags;           // [25:20]  6-bit
    ptr_addr_t      size_ptr_addr;   // [19:14]  6-bit
    ptr_addr_t      shape_ptr_addr;  // [13: 8]  6-bit
    parallel_lane_t parallel_lane;   // [ 7: 3]  5-bit
    logic [2:0]     reserved;        // [ 2: 0]  3-bit
  } GEMV_op_x64_t;

  typedef GEMV_op_x64_t GEMM_op_x64_t;  // same layout

  // MEMCPY  (ISA §3.2)  — 60 bits
  // 1 + 1 + 17 + 17 + 17 + 6 + 1 = 60
  typedef struct packed {
    from_device_e from_device;    // [59]      1-bit
    to_device_e   to_device;      // [58]      1-bit
    dest_addr_t   dest_addr;      // [57:41]  17-bit
    src_addr_t    src_addr;       // [40:24]  17-bit
    addr_t        aux_addr;       // [23: 7]  17-bit
    ptr_addr_t    shape_ptr_addr; // [ 6: 1]   6-bit
    async_e       async;          // [ 0]      1-bit
  } memcpy_op_x64_t;

  // MEMSET  (ISA §3.3)  — 60 bits
  // 2 + 6 + 16 + 16 + 16 + 4 = 60
  // dest_cache is a raw 2-bit field here; memory_set_uop_t types the same
  // width as dest_cache_e.
  typedef struct packed {
    logic [1:0] dest_cache;  // [59:58]  2-bit
    ptr_addr_t  dest_addr;   // [57:52]  6-bit
    a_value_t   a_value;     // [51:36] 16-bit
    b_value_t   b_value;     // [35:20] 16-bit
    c_value_t   c_value;     // [19: 4] 16-bit
    logic [3:0] reserved;    // [ 3: 0]  4-bit
  } memset_op_x64_t;

  // CVO  (ISA §3.4)  — 60 bits
  // 4 + 17 + 17 + 16 + 5 + 1 = 60
  // cvo_func and flags are raw vectors here; cvo_control_uop_t types the same
  // widths as cvo_func_e (4-bit) and cvo_flags_t (5-bit).
  typedef struct packed {
    logic [ 3:0] cvo_func;   // [59:56]  4-bit
    src_addr_t   src_addr;   // [55:39] 17-bit
    addr_t       dst_addr;   // [38:22] 17-bit
    length_t     length;     // [21: 6] 16-bit
    logic [ 4:0] flags;      // [ 5: 1]  5-bit
    async_e      async;      // [ 0]     1-bit
  } cvo_op_x64_t;

  // ===| CVO Function Codes (ISA §3.4.1) |=======================================
  // Eight of the sixteen 4-bit codes are assigned.
  typedef enum logic [3:0] {
    CVO_EXP        = 4'h0,
    CVO_SQRT       = 4'h1,
    CVO_GELU       = 4'h2,
    CVO_SIN        = 4'h3,
    CVO_COS        = 4'h4,
    CVO_REDUCE_SUM = 4'h5,
    CVO_SCALE      = 4'h6,
    CVO_RECIP      = 4'h7
  } cvo_func_e;

  // ===| CVO Flags (5-bit, ISA §3.4.2) |=========================================
  // Three flag bits in [4:2]; [1:0] are reserved.
  typedef struct packed {
    logic sub_emax;     // [4] subtract e_max before operation
    logic recip_scale;  // [3] use reciprocal of scalar (divide instead of multiply)
    logic accm;         // [2] accumulate into dst
    logic [1:0] reserved;
  } cvo_flags_t;

  // ===| Memory Routing (ISA §5) |================================================
  // Each route encodes source[7:4] | dest[3:0] as an 8-bit enum.

  // Destination endpoint — low nibble of a route.
  typedef enum logic [3:0] {
    data_to_host             = 4'h0,
    data_to_GLOBAL_cache     = 4'h1,
    data_to_L1_cache_GEMM_in = 4'h2,
    data_to_L1_cache_GEMV_in = 4'h3,
    data_to_CVO_in           = 4'h4
  } data_dest_e;

  // Source endpoint — high nibble of a route.
  typedef enum logic [3:0] {
    data_from_host              = 4'h0,
    data_from_GLOBAL_cache      = 4'h1,
    data_from_L1_cache_GEMM_res = 4'h2,
    data_from_L1_cache_GEMV_res = 4'h3,
    data_from_CVO_res           = 4'h4
  } data_source_e;

  // Legal {source, dest} pairs. The resulting 8-bit value of each route is
  // given on the right.
  typedef enum logic [7:0] {
    from_host_to_L2     = {data_from_host,              data_to_GLOBAL_cache    },  // 8'h01
    from_L2_to_host     = {data_from_GLOBAL_cache,      data_to_host            },  // 8'h10
    from_L2_to_L1_GEMM  = {data_from_GLOBAL_cache,      data_to_L1_cache_GEMM_in},  // 8'h12
    from_L2_to_L1_GEMV  = {data_from_GLOBAL_cache,      data_to_L1_cache_GEMV_in},  // 8'h13
    from_L2_to_CVO      = {data_from_GLOBAL_cache,      data_to_CVO_in          },  // 8'h14
    from_GEMV_res_to_L2 = {data_from_L1_cache_GEMV_res, data_to_GLOBAL_cache    },  // 8'h31
    from_GEMM_res_to_L2 = {data_from_L1_cache_GEMM_res, data_to_GLOBAL_cache    },  // 8'h21
    from_CVO_res_to_L2  = {data_from_CVO_res,           data_to_GLOBAL_cache    }   // 8'h41
  } data_route_e;

  // MEMSET target selector; two of the four 2-bit codes are assigned.
  typedef enum logic [1:0] {
    data_to_fmap_shape   = 2'h0,
    data_to_weight_shape = 2'h1
  } dest_cache_e;

  // ===| Micro-Op Structures (ISA §6) |==========================================

  // Bit width of memory_control_uop_t, kept as a plain constant.
  localparam int MemoryUopWidth = 49;  // 8+17+17+6+1

  // GEMM control uop  (ISA §6.1)  — 6 + 6 + 5 = 17 bits
  typedef struct packed {
    flags_t         flags;
    ptr_addr_t      size_ptr_addr;
    parallel_lane_t parallel_lane;
  } gemm_control_uop_t;

  // GEMV control uop  (same layout as GEMM)
  // Declared as a separate struct, not as an alias of gemm_control_uop_t.
  typedef struct packed {
    flags_t         flags;
    ptr_addr_t      size_ptr_addr;
    parallel_lane_t parallel_lane;
  } GEMV_control_uop_t;

  // Memory control uop  (ISA §6.2)  — 49 bits (= MemoryUopWidth)
  // Note: the member is called data_dest but holds a full route (source + dest).
  typedef struct packed {
    data_route_e data_dest;       //  8-bit
    dest_addr_t  dest_addr;       // 17-bit
    src_addr_t   src_addr;        // 17-bit
    ptr_addr_t   shape_ptr_addr;  //  6-bit
    async_e      async;           //  1-bit
  } memory_control_uop_t;

  // Memory set uop  (ISA §6.3)  — 2 + 6 + 16 + 16 + 16 = 56 bits
  typedef struct packed {
    dest_cache_e dest_cache;  //  2-bit
    ptr_addr_t   dest_addr;   //  6-bit
    a_value_t    a_value;     // 16-bit
    b_value_t    b_value;     // 16-bit
    c_value_t    c_value;     // 16-bit
  } memory_set_uop_t;

  // CVO control uop  (ISA §6.4)  — 60 bits, member-for-member the same widths
  // as cvo_op_x64_t.
  typedef struct packed {
    cvo_func_e  cvo_func;   //  4-bit
    src_addr_t  src_addr;   // 17-bit
    addr_t      dst_addr;   // 17-bit
    length_t    length;     // 16-bit
    cvo_flags_t flags;      //  5-bit
    async_e     async;      //  1-bit
  } cvo_control_uop_t;

  // ===| ACP / NPU Transfer uops (used by mem_dispatcher) |======================
  // Two identically shaped descriptors: a write-enable bit plus a
  // [base_addr, end_addr] pair of 17-bit addresses (1 + 17 + 17 = 35 bits).
  typedef struct packed {
    logic        write_en;
    logic [16:0] base_addr;
    logic [16:0] end_addr;
  } acp_uop_t;  // 35-bit

  typedef struct packed {
    logic        write_en;
    logic [16:0] base_addr;
    logic [16:0] end_addr;
  } npu_uop_t;  // 35-bit

endpackage : isa_pkg
isa_memctrl.svh
package isa_memctrl;

  // LEGACY memory-control ISA types. The header of isa_pkg.sv states that the
  // types declared in isa_pkg supersede this file.
  //
  // NOTE(review): dest_addr_t, src_addr_t, ptr_addr_t, async_e, a_value_t,
  // b_value_t and c_value_t are used below but are neither declared nor
  // imported in this package — confirm how they are meant to come into scope.

  // Port direction selectors. `define macros are global to the compilation
  // unit; the enclosing package does not scope them.
  `define PORT_MOD_E_WRITE 1
  `define PORT_MOD_E_READ 0

  // Destination endpoint — low nibble of a route.
  typedef enum logic [3:0] {
    data_to_host             = 4'h0,
    data_to_GLOBAL_cache     = 4'h1,
    data_to_L1_cache_gemm_in = 4'h2,
    data_to_L1_cache_GEMV_in = 4'h3
  } data_dest_e;

  // Source endpoint — high nibble of a route.
  typedef enum logic [3:0] {
    data_from_host              = 4'h0,
    data_from_GLOBAL_cache      = 4'h1,
    data_from_L1_cache_gemm_res = 4'h2,
    data_from_L1_cache_GEMV_res = 4'h3
  } data_source_e;

  // Legal routes, encoded as {source[7:4], dest[3:0]}.
  // The GEMM entries refer to the *_gemm_* members exactly as they are
  // declared above (lower-case "gemm").
  typedef enum logic [7:0] {
    from_host_to_L2 = {data_from_host, data_to_GLOBAL_cache},

    from_L2_to_host = {data_from_GLOBAL_cache, data_to_host},

    from_L2_to_L1_GEMM = {data_from_GLOBAL_cache, data_to_L1_cache_gemm_in},
    from_L2_to_L1_GEMV = {data_from_GLOBAL_cache, data_to_L1_cache_GEMV_in},

    from_GEMV_res_to_L2 = {data_from_L1_cache_GEMV_res, data_to_GLOBAL_cache},
    from_GEMM_res_to_L2 = {data_from_L1_cache_gemm_res, data_to_GLOBAL_cache}
  } data_route_e;


  // Memory control uop. The member is named data_dest but holds a full route.
  typedef struct packed {
    data_route_e data_dest;

    dest_addr_t dest_addr;
    src_addr_t  src_addr;

    ptr_addr_t shape_ptr_addr;

    async_e async;
  } memory_control_uop_t;

  // MEMSET target selector; two of the four 2-bit codes are assigned.
  typedef enum logic [1:0] {
    data_to_fmap_shape   = 2'h0,
    data_to_weight_shape = 2'h1
  } dest_cache_e;

  // Memory set uop.
  typedef struct packed {
    dest_cache_e dest_cache;
    ptr_addr_t   dest_addr;
    a_value_t    a_value;
    b_value_t    b_value;
    c_value_t    c_value;
  } memory_set_uop_t;

  // mem dispatcher.sv
  // Transfer descriptors: write-enable bit + two 17-bit addresses = 35 bits.
  typedef struct packed {
    logic        acp_write_en_wire;
    logic [16:0] acp_base_addr_wire;
    logic [16:0] acp_end_addr;
  } acp_uop_t;  // 35 bit == [34:0]

  typedef struct packed {
    logic        npu_write_en_wire;
    logic [16:0] npu_base_addr_wire;
    logic [16:0] npu_end_addr;
  } npu_uop_t;  // 35 bit == [34:0]

  // Descriptor widths. These were previously members of a 1-bit enum that both
  // carried the value 33: that cannot be represented in one bit, enum members
  // must have distinct values, and the structs are 35 bits wide. They are now
  // plain constants derived from the structs themselves.
  localparam int NPU_U_OP_WIDTH = $bits(npu_uop_t);
  localparam int ACP_U_OP_WIDTH = $bits(acp_uop_t);

endpackage
isa_x32.svh
package isa_x32;

  // LEGACY 32-bit instruction format. The header of isa_pkg.sv states that the
  // types declared in isa_pkg supersede this file.
  //
  // NOTE(review): payload_dotm_t, override_memcpy_t and
  // override_chain_memcpy_t are used in instruction_x32_t but are not declared
  // in this package — confirm where they are defined.

  // Header size in bits: cmd_chaining (2) + opcode (4).
  // `define macros are global; the enclosing package does not scope them.
  `define X32_HEADSIZE 6
  typedef logic [16:0] dest_addr_t;
  typedef logic [7:0] loop_cnt_t;


  // 32-bit data word with one byte-enable bit per byte.
  typedef struct packed {
    logic [31:0] data;
    logic [3:0]  byte_en;
  } x32_payload_t;

  /*─────────────────────────────────────────────
  Opcode table
  4-bit base type: it matches the 4'h literals below, and together with the
  2-bit cmd_chaining field it gives the 6-bit header that X32_HEADSIZE and the
  instruction word comment describe. (A 5-bit base type made the header 7 bits
  and the instruction word 33 bits.)
  ─────────────────────────────────────────────*/
  typedef enum logic [3:0] {
    OP_GEMV   = 4'h0,
    OP_GEMM   = 4'h1,
    OP_MEMCPY = 4'h2
  } opcode_t;


  // MEMCPY payload: 1 + 17 + 8 = 26 bits, the same width as payload.raw.
  // "to_divice" is kept as spelled because it is part of the field interface.
  typedef struct packed {
    logic       to_divice;
    dest_addr_t dest_addr;
    loop_cnt_t  loop_cnt;
  } payload_memcpy_t;

  /*─────────────────────────────────────────────
  Full 32-bit instruction word
  Fixed header (6b) + union payload (26b)
  ─────────────────────────────────────────────*/
  typedef struct packed {
    logic [1:0] cmd_chaining;
    opcode_t    opcode;

    // Every member of a packed union must have the same width (26 bits here).
    union packed {
      payload_dotm_t  dotm;   // V dot M / M dot M
      payload_memcpy_t memcpy; // memcpy
      override_memcpy_t override_memcpy;
      override_chain_memcpy_t override_chain_memcpy;
      logic [25:0]    raw;
    } payload;  // [25:0]

  } instruction_x32_t;


  //Deprecated
  // Same 26-bit layout as payload_memcpy_t, with loop_cnt as a raw vector.
  typedef struct packed {
    logic       to_divice;
    dest_addr_t dest_addr;
    logic [7:0] loop_cnt;
  } memory_uop_x32_t;


endpackage
isa_x64.svh
package isa_x64;

  // LEGACY 64-bit instruction format. The header of isa_pkg.sv states that the
  // types declared in isa_pkg supersede this file.
  // All instruction structs are packed: the first member sits in the
  // most-significant bits of the 60-bit body.

  //  Basic Types
  typedef logic [16:0] dest_addr_t;
  typedef logic [16:0] src_addr_t;
  typedef logic [16:0] addr_t;

  // MEMSET value fields. They were used by memset_op_x64_t without being
  // declared in this package; 16 bits each, the same width isa_pkg gives them.
  typedef logic [15:0] a_value_t;
  typedef logic [15:0] b_value_t;
  typedef logic [15:0] c_value_t;

  typedef logic [5:0] ptr_addr_t;  // For size and shape pointers
  typedef logic [4:0] parallel_lane_t;

  typedef logic [2:0] reserved_dot;  // reserved tail of the GEMV / GEMM body

  // 64-bit data word with one byte-enable bit per byte.
  typedef struct packed {
    logic [63:0] data;
    logic [7:0]  byte_en;
  } x64_payload_t;


  // npu -> host
  // host -> npu
  typedef enum logic {
    FROM_NPU  = 1'b0,
    FROM_HOST = 1'b1
  } from_device_e;

  typedef enum logic {
    TO_NPU  = 1'b0,
    TO_HOST = 1'b1
  } to_device_e;

  typedef enum logic {
    sync  = 1'b0,
    async = 1'b1
  } async_e;


  //  Flags (6-bit as per PDF spec)
  typedef struct packed {
    logic findemax;
    logic accm;     // Accumulate
    logic w_scale;
    logic [2:0] reserved;
  } flags_t;

  // Instruction format
  // instruction = x64(64bit) - head(opcode)
  typedef logic [59:0] VLIW_instruction_x64;

  //  Opcode table (4-bit)
  typedef enum logic [3:0] {
    OP_GEMV   = 4'h0,
    OP_GEMM   = 4'h1,
    OP_MEMCPY = 4'h2,
    OP_MEMSET = 4'h3
  } opcode_e;


  // GEMV body: 17 + 17 + 6 + 6 + 6 + 5 + 3 = 60 bits
  typedef struct packed {
    dest_addr_t     dest_reg;
    src_addr_t      src_addr;
    flags_t         flags;
    ptr_addr_t      size_ptr_addr;
    ptr_addr_t      shape_ptr_addr;
    parallel_lane_t parallel_lane;
    reserved_dot    reserved;
  } GEMV_op_x64_t;

  // GEMM body: same members and widths as GEMV_op_x64_t
  typedef struct packed {
    dest_addr_t     dest_reg;
    src_addr_t      src_addr;
    flags_t         flags;
    ptr_addr_t      size_ptr_addr;
    ptr_addr_t      shape_ptr_addr;
    parallel_lane_t parallel_lane;
    reserved_dot    reserved;
  } GEMM_op_x64_t;

  // MEMCPY body: 1 + 1 + 17 + 17 + 17 + 6 + 1 = 60 bits
  typedef struct packed {
    from_device_e from_device;
    to_device_e   to_device;
    dest_addr_t   dest_addr;
    src_addr_t    src_addr;
    addr_t        _addr;
    ptr_addr_t    shape_ptr_addr;
    async_e       async;
  } memcpy_op_x64_t;

  // MEMSET body: 2 + 6 + 16 + 16 + 16 + 4 = 60 bits.
  // `reserved` is 4 bits wide so the struct fills the whole 60-bit body; with
  // a 1-bit reserved field it was only 57 bits and every member sat 3 bits
  // lower than in the isa_pkg layout.
  typedef struct packed {
    logic [1:0] dest_cache;
    ptr_addr_t  dest_addr;
    a_value_t   a_value;
    b_value_t   b_value;
    c_value_t   c_value;
    logic [3:0] reserved;
  } memset_op_x64_t;


  typedef struct packed {
    // head(opcode) removed
    logic [59:0] instruction;
  } instruction_op_x64_t;


  // --------------------------------------------------------
  // ===| Compute Micro-Op |=================================
  // `define macros are global; the enclosing package does not scope them.
  `define MEMORY_UOP_WIDTH 49

  // GEMM control uop: 6 + 6 + 5 = 17 bits
  typedef struct packed {
    flags_t    flags;
    ptr_addr_t size_ptr_addr;
    parallel_lane_t parallel_lane;
  } gemm_control_uop_t;


  // GEMV control uop: same members and widths as gemm_control_uop_t
  typedef struct packed {
    flags_t    flags;
    ptr_addr_t size_ptr_addr;
    parallel_lane_t parallel_lane;
  } GEMV_control_uop_t;

  // ===| Compute Micro-Op |====================================
  // -----------------------------------------------------------

endpackage

Interface definitions

  • npu_interfaces.svh — SystemVerilog interface blocks used as typed handles between blocks.

npu_interfaces.svh
`include "GLOBAL_CONST.svh"

`ifndef NPU_INTERFACES_SVH
`define NPU_INTERFACES_SVH

// AXI-Stream style bundle: tdata / tkeep / tlast payload plus a
// tvalid / tready handshake pair. tkeep carries one bit per byte of tdata.
interface axis_if #(
    parameter DATA_WIDTH = 128
) ();

  // ---- payload ----
  logic [DATA_WIDTH-1:0]     tdata;
  logic [(DATA_WIDTH/8)-1:0] tkeep;
  logic                      tlast;

  // ---- handshake ----
  logic                      tvalid;
  logic                      tready;

  // Receiving side (an input as seen from the NPU): drives only tready.
  modport slave (
      input  tdata, tkeep, tlast, tvalid,
      output tready
  );

  // Sending side (an output as seen from the NPU): drives everything but tready.
  modport master (
      output tdata, tkeep, tlast, tvalid,
      input  tready
  );

endinterface

// axil_if.sv
// AXI-Lite style bundle with the five channels AW / W / B / AR / R.
// clk and rst_n are interface ports; neither modport lists them.
interface axil_if #(
    parameter int ADDR_W = 12,
    parameter int DATA_W = 64
) (
    input logic clk,
    input logic rst_n
);

  // ---- AW: write address ----
  logic [ADDR_W-1:0]     awaddr;
  logic [2:0]            awprot;
  logic                  awvalid;
  logic                  awready;

  // ---- W: write data (one strobe bit per data byte) ----
  logic [DATA_W-1:0]     wdata;
  logic [(DATA_W/8)-1:0] wstrb;
  logic                  wvalid;
  logic                  wready;

  // ---- B: write response ----
  logic [1:0]            bresp;
  logic                  bvalid;
  logic                  bready;

  // ---- AR: read address ----
  logic [ADDR_W-1:0]     araddr;
  logic [2:0]            arprot;
  logic                  arvalid;
  logic                  arready;

  // ---- R: read data ----
  logic [DATA_W-1:0]     rdata;
  logic [1:0]            rresp;
  logic                  rvalid;
  logic                  rready;

  // Slave view: request signals are inputs, ready / response signals outputs.
  modport slave (
      input  awaddr, awprot, awvalid,
      input  wdata, wstrb, wvalid,
      input  bready,
      input  araddr, arprot, arvalid,
      input  rready,
      output awready, wready,
      output bresp, bvalid,
      output arready,
      output rdata, rresp, rvalid
  );

  // Master view: exact mirror of the slave modport.
  modport master (
      output awaddr, awprot, awvalid,
      output wdata, wstrb, wvalid,
      output bready,
      output araddr, arprot, arvalid,
      output rready,
      input  awready, wready,
      input  bresp, bvalid,
      input  arready,
      input  rdata, rresp, rvalid
  );

endinterface



`endif  // NPU_INTERFACES_SVH