NPU Controller

The controller is the front-end + scheduler half of the NPU. It accepts 64-bit VLIW instructions over AXI-Lite, decodes them, pushes the resulting micro-ops into per-engine FIFOs, and exposes a status register back to the host. All compute cores operate strictly downstream of the controller’s FIFOs.

Topology

Host (AXI-Lite) ──► AXIL_CMD_IN ──► ctrl_npu_decoder ─┐
                                                     ▼
┌── GEMV FIFO ── GEMM FIFO ── CVO FIFO ── MEM FIFO ── MEMSET FIFO ──┐
│                                                                    │
│            ctrl_npu_dispatcher (per-engine pop)                     │
│                                                                    │
└────────────► Global_Scheduler ◄────────────────────────────────────┘
                                                                     │
                                     NPU_fsm_out_Logic ──► AXIL_STAT_OUT

Frontend (AXI-Lite surface)

ctrl_npu_frontend.sv — container for the AXIL surface; hosts the interface slaves.
AXIL_CMD_IN.sv — AXI-Lite write slave. Accepts one 64-bit instruction word per write at 0x000 (a write to 0x008 enqueues a KICK marker instead) and buffers it in a command FIFO for the decoder.
AXIL_STAT_OUT.sv — AXI-Lite read slave that exposes the BUSY/DONE status register at 0x08.
ctrl_npu_interface.sv — internal handshake glue between the frontend and the Control Unit.

ctrl_npu_frontend.sv

`timescale 1ns / 1ps
`include "GEMM_Array.svh"
`include "npu_interfaces.svh"
`include "GLOBAL_CONST.svh"

// ctrl_npu_frontend
// Container for the AXI4-Lite control surface of the NPU.
//   - u_cmd_in   (AXIL_CMD_IN)   : write channels (AW/W/B) -> command queue
//   - u_stat_out (AXIL_STAT_OUT) : status FIFO -> read channels (AR/R)
// The head of the command queue is presented on OUT_RAW_instruction, and
// OUT_kick marks the cycles in which that head is valid and the consumer
// signals IN_fetch_ready (the same condition AXIL_CMD_IN uses to pop).
module ctrl_npu_frontend (
    input logic clk,
    input logic rst_n,
    input logic IN_clear,

    // AXI4-Lite Slave : PS <-> NPU control plane
    axil_if.slave S_AXIL_CTRL,

    // Control from Brain
    //input logic IN_rd_start,

    // Decoded command -> Dispatcher / FSM
    output logic [`ISA_WIDTH-1:0] OUT_RAW_instruction,
    output logic                  OUT_kick,

    // Status <- Encoder / FSM
    // NOTE(review): this port is ISA_WIDTH+1 bits wide, but the status FIFO
    // it feeds is only ISA_WIDTH bits wide. The extra MSB is never forwarded.
    // Presumably the intended range is [`ISA_WIDTH-1:0] — confirm with the
    // encoder before narrowing the port.
    input logic [`ISA_WIDTH:0] IN_enc_stat,
    input logic                IN_enc_valid,

    // Consumer (decoder) can accept an instruction this cycle
    input logic IN_fetch_ready
);

  /*─────────────────────────────────────────────
  Internal wires : AXIL_CMD_IN <-> upper logic
  ───────────────────────────────────────────────*/
  logic [`ISA_WIDTH-1:0] cmd_data;   // head of the command queue
  logic                  cmd_valid;  // command queue is not empty

  assign OUT_RAW_instruction = cmd_data;
  // Qualify the queue-valid flag with the consumer's ready, so OUT_kick is
  // high exactly in the cycles where AXIL_CMD_IN pops its queue.
  assign OUT_kick            = cmd_valid & IN_fetch_ready;

  /*─────────────────────────────────────────────
  [1-2] Communication IN : CPU -> NPU (Using Write Channels)
  ───────────────────────────────────────────────*/
  // NOTE(review): s_awprot and s_wstrb of AXIL_CMD_IN are left unconnected.
  // AXIL_CMD_IN does not read either input, so this is functionally benign,
  // but tools will report unconnected ports.
  AXIL_CMD_IN #(
      .FIFO_DEPTH(8)
  ) u_cmd_in (
      .clk     (clk),
      .rst_n   (rst_n),
      .IN_clear(IN_clear),

      // AXI4-Lite Write channels directly routed from the interface
      .s_awaddr (S_AXIL_CTRL.awaddr),
      .s_awvalid(S_AXIL_CTRL.awvalid),
      .s_awready(S_AXIL_CTRL.awready),
      .s_wdata  (S_AXIL_CTRL.wdata),
      .s_wvalid (S_AXIL_CTRL.wvalid),
      .s_wready (S_AXIL_CTRL.wready),
      .s_bresp  (S_AXIL_CTRL.bresp),
      .s_bvalid (S_AXIL_CTRL.bvalid),
      .s_bready (S_AXIL_CTRL.bready),

      .OUT_data        (cmd_data),
      .OUT_valid       (cmd_valid),
      .IN_decoder_ready(IN_fetch_ready)
  );

  /*─────────────────────────────────────────────
  [1-2] Communication OUT : NPU -> CPU (Using Read Channels)
  ───────────────────────────────────────────────*/
  AXIL_STAT_OUT #(
      .FIFO_DEPTH(8)
  ) u_stat_out (
      .clk     (clk),
      .rst_n   (rst_n),
      .IN_clear(IN_clear),

      // Explicit slice: IN_data is ISA_WIDTH bits wide, so only the low
      // ISA_WIDTH bits of IN_enc_stat can be forwarded. This matches the
      // implicit port-connection truncation but makes it visible.
      .IN_data (IN_enc_stat[`ISA_WIDTH-1:0]),
      .IN_valid(IN_enc_valid),

      // AXI4-Lite Read channels directly routed from the interface
      .s_araddr (S_AXIL_CTRL.araddr),
      .s_arvalid(S_AXIL_CTRL.arvalid),
      .s_arready(S_AXIL_CTRL.arready),
      .s_rdata  (S_AXIL_CTRL.rdata),
      .s_rresp  (S_AXIL_CTRL.rresp),
      .s_rvalid (S_AXIL_CTRL.rvalid),
      .s_rready (S_AXIL_CTRL.rready)
  );

endmodule

AXIL_CMD_IN.sv

`timescale 1ns / 1ps

`include "Algorithms.svh"
`include "GLOBAL_CONST.svh"

// AXIL_CMD_IN
// AXI4-Lite Write path : CPU → NPU
// Stores incoming commands into a FIFO.
// Drains FIFO to upper module when IN_decoder_ready is asserted.

// AXIL_CMD_IN
// AXI4-Lite write slave that turns host register writes into command-queue
// entries.
//
// Write protocol implemented here:
//   1. AW is accepted first (address latched, aw_pending set).
//   2. W is accepted only while an address is pending.
//   3. A write to ADDR_INST enqueues s_wdata; a write to ADDR_KICK enqueues a
//      marker word with only bit 63 set; any other address is acknowledged
//      but enqueues nothing.
//   4. BVALID is raised after the W handshake and HELD until the master
//      asserts BREADY. No new AW is accepted while a response is outstanding,
//      so every write gets exactly one response.
//
// The queue head is presented on OUT_data/OUT_valid and popped in any cycle
// where OUT_valid && IN_decoder_ready.
module AXIL_CMD_IN #(
    parameter FIFO_DEPTH = 8  // number of commands to buffer
) (
    input logic clk,
    input logic rst_n,
    input logic IN_clear,

    // AXI4-Lite Write channels (slave)
    // AW
    input logic [11:0] s_awaddr,
    input logic [2:0] s_awprot,
    input logic s_awvalid,
    output logic s_awready,
    // W
    input logic [`ISA_WIDTH-1:0] s_wdata,
    input logic [(`ISA_WIDTH/8)-1:0] s_wstrb,
    input logic s_wvalid,
    output logic s_wready,
    // B
    output logic [1:0] s_bresp,
    output logic s_bvalid,
    input logic s_bready,

    // To upper module (NPU_interface)
    output logic [`ISA_WIDTH-1:0] OUT_data,         // command word
    output logic                  OUT_valid,        // FIFO has data
    input  logic                  IN_decoder_ready  // upper module is ready to consume
);

  /*─────────────────────────────────────────────
  Register Address Map
  ───────────────────────────────────────────────*/
  localparam ADDR_INST = 12'h000;
  localparam ADDR_KICK = 12'h008;

  /*─────────────────────────────────────────────
  AXI4-Lite Write Path
  Latch AW first, write register when W arrives.
  ───────────────────────────────────────────────*/
  logic [          11:0] aw_addr_latch;  // address captured at AW handshake
  logic                  aw_pending;     // an address is waiting for its data beat
  logic                  bvalid_r;       // write response outstanding
  logic                  fifo_wen;       // one-cycle push strobe (registered)
  logic [`ISA_WIDTH-1:0] fifo_wdata;     // word to push (registered)

  // Accept a new address only when:
  //   - no address is already waiting for data,
  //   - the previous write response has been taken by the master, and
  //   - the command queue is not full.
  assign s_awready = ~aw_pending && ~bvalid_r && ~cmd_q.full;
  assign s_wready  = aw_pending;
  assign s_bresp   = 2'b00;  // always OKAY
  assign s_bvalid  = bvalid_r;

  // AW latch
  always_ff @(posedge clk) begin
    if (!rst_n || IN_clear) begin
      aw_addr_latch <= '0;
      aw_pending    <= 1'b0;
    end else begin
      if (s_awvalid && s_awready) begin
        aw_addr_latch <= s_awaddr;
        aw_pending    <= 1'b1;
      end
      if (s_wvalid && s_wready) aw_pending <= 1'b0;
    end
  end

  // W : push into FIFO + B response
  always_ff @(posedge clk) begin
    fifo_wen <= 1'b0;  // default: push strobe lasts a single cycle

    if (!rst_n || IN_clear) begin
      fifo_wdata <= '0;
      bvalid_r   <= 1'b0;
    end else begin
      if (s_wvalid && s_wready) begin
        case (aw_addr_latch)
          // push instruction word into FIFO
          ADDR_INST: begin
            fifo_wdata <= s_wdata;
            fifo_wen   <= 1'b1;
          end
          // KICK : push a special marker (bit63 = 1 as kick flag)
          ADDR_KICK: begin
            fifo_wdata <= 64'h8000_0000_0000_0000;
            fifo_wen   <= 1'b1;
          end
          default: ;  // unmapped address: acknowledged, nothing enqueued
        endcase
        bvalid_r <= 1'b1;
      end else if (bvalid_r && s_bready) begin
        // B handshake completed. BVALID must stay asserted until this point;
        // clearing it unconditionally every cycle would drop the response
        // whenever the master is not ready in the first cycle.
        bvalid_r <= 1'b0;
      end
    end
  end

  /*─────────────────────────────────────────────
  Command Queue  (simple synchronous, FIFO_DEPTH entries)
  Push : fifo_wen
  Pop  : OUT_valid && IN_decoder_ready
  ───────────────────────────────────────────────*/
  import algorithms_pkg::*;

  IF_queue #(
      .DATA_WIDTH(`ISA_WIDTH),
      .DEPTH(FIFO_DEPTH)
  ) cmd_q (
      .clk  (clk),
      .rst_n(rst_n)
  );
  QUEUE u_cmd_q (.q(cmd_q.owner));

  always_ff @(posedge clk) begin
    if (!rst_n || IN_clear) begin
      cmd_q.clear();
    end else begin
      if (fifo_wen) cmd_q.push(fifo_wdata);  // push when AXI write done
      if (OUT_valid && IN_decoder_ready) cmd_q.pop();  // consumer took the head
    end
  end

  assign OUT_valid = ~cmd_q.empty;
  assign OUT_data  = cmd_q.pop_data;

endmodule

AXIL_STAT_OUT.sv

`timescale 1ns / 1ps

// AXIL_STAT_OUT
// AXI4-Lite Read path : NPU → CPU
// Upper module pushes status into FIFO continuously.
// Drains FIFO to CPU when AXI4-Lite read handshake happens.

// AXIL_STAT_OUT
// AXI4-Lite read slave backed by a small synchronous status FIFO.
//   Push side : IN_valid enqueues IN_data unless the FIFO is full
//               (a push attempted while full is dropped).
//   Pop side  : an AR handshake dequeues the oldest entry into the R-channel
//               data register; RVALID is then held until RREADY.
// ARREADY is withheld while the FIFO is empty or a read response is still
// outstanding. s_araddr is not decoded.
module AXIL_STAT_OUT #(
    parameter FIFO_DEPTH = 8
) (
    input logic clk,
    input logic rst_n,
    input logic IN_clear,

    // From upper module (NPU_interface → here)
    input logic [`ISA_WIDTH-1:0] IN_data,  // status word to send to CPU
    input logic                  IN_valid, // upper module has data to push

    // AXI4-Lite Read channels (slave)
    // AR
    input  logic [          11:0] s_araddr,
    input  logic                  s_arvalid,
    output logic                  s_arready,
    // R
    output logic [`ISA_WIDTH-1:0] s_rdata,
    output logic [           1:0] s_rresp,
    output logic                  s_rvalid,
    input  logic                  s_rready
);

  /*─────────────────────────────────────────────
  Status FIFO storage and pointers
  Pointers carry one extra wrap bit above the index bits so that
  "full" and "empty" can be told apart.
  ───────────────────────────────────────────────*/
  localparam IDX_W = $clog2(FIFO_DEPTH);

  logic [`ISA_WIDTH-1:0] stat_mem[0:FIFO_DEPTH-1];
  logic [IDX_W:0] push_ptr, pop_ptr;
  logic q_empty, q_full;

  // R-channel holding registers
  logic [`ISA_WIDTH-1:0] rdata_q;
  logic                  rvalid_q;

  // Handshake strobes
  logic ar_fire;  // AR channel handshake this cycle
  logic r_fire;   // R channel handshake this cycle
  logic do_push;  // a status word is enqueued this cycle
  logic do_pop;   // the oldest status word is dequeued this cycle

  // Equal pointers (including wrap bit) → empty;
  // equal indices with opposite wrap bits → full.
  assign q_empty = (push_ptr == pop_ptr);
  assign q_full  = (push_ptr[IDX_W] != pop_ptr[IDX_W]) &&
                   (push_ptr[IDX_W-1:0] == pop_ptr[IDX_W-1:0]);

  /*─────────────────────────────────────────────
  AXI4-Lite read-side outputs
  ───────────────────────────────────────────────*/
  assign s_rresp   = 2'b00;
  assign s_rdata   = rdata_q;
  assign s_rvalid  = rvalid_q;
  assign s_arready = !(rvalid_q || q_empty);  // need data and a free R register

  assign ar_fire = s_arvalid && s_arready;
  assign r_fire  = rvalid_q && s_rready;
  assign do_push = IN_valid && !q_full;
  assign do_pop  = ar_fire && !q_empty;

  /*─────────────────────────────────────────────
  FIFO state
  ───────────────────────────────────────────────*/
  always_ff @(posedge clk) begin
    if (!rst_n || IN_clear) begin
      push_ptr <= '0;
      pop_ptr  <= '0;
    end else begin
      // enqueue: status word from the upper module
      if (do_push) begin
        stat_mem[push_ptr[IDX_W-1:0]] <= IN_data;
        push_ptr <= push_ptr + 1'b1;
      end
      // dequeue: the CPU's read address was accepted
      if (do_pop) pop_ptr <= pop_ptr + 1'b1;
    end
  end

  /*─────────────────────────────────────────────
  R-channel register
  Capture the FIFO head on the AR handshake, then hold RVALID until
  the CPU acknowledges with RREADY.
  ───────────────────────────────────────────────*/
  always_ff @(posedge clk) begin
    if (!rst_n || IN_clear) begin
      rdata_q  <= '0;
      rvalid_q <= 1'b0;
    end else begin
      if (ar_fire) begin
        rdata_q  <= stat_mem[pop_ptr[IDX_W-1:0]];
        rvalid_q <= 1'b1;
      end
      if (r_fire) rvalid_q <= 1'b0;
    end
  end

endmodule

Control Unit (decode + dispatch)

ctrl_decode_const.svh — decode-stage constants (field widths, masks).
ctrl_npu_decoder.sv — strips the 4-bit opcode from a 64-bit VLIW instruction and routes the 60-bit body to the correct FIFO.
ctrl_npu_dispatcher.sv — per-engine local dispatcher: pops a queued micro-op and fires the engine when operands are ready.

ctrl_npu_decoder.sv

`timescale 1ns / 1ps
`include "GLOBAL_CONST.svh"

import isa_pkg::*;

// ===| NPU Opcode Decoder |======================================================
// Receives raw 64-bit VLIW instructions from the frontend FIFO.
// Strips the 4-bit opcode, asserts the matching valid pulse for one cycle,
// and forwards the 60-bit body to the Global Scheduler.
// ===============================================================================

// ctrl_npu_decoder
// Registers one decoded instruction per accepted input word.
//   - When raw_instruction_pop_valid is high, the low `ISA_BODY_WIDTH bits of
//     IN_raw_instruction are captured into OUT_op_x64.instruction and the top
//     `ISA_OPCODE_WIDTH bits select which valid output pulses.
//   - Every valid output is cleared by default each cycle, so a pulse lasts
//     exactly one clock.
//   - An opcode that matches none of the listed values raises no valid pulse.
//   - OUT_fetch_PC_ready is tied high (no backpressure from this stage).
module ctrl_npu_decoder (
    input logic clk,
    input logic rst_n,

    // ===| From Frontend |=======================================================
    input logic [`ISA_WIDTH-1:0] IN_raw_instruction,
    input logic                  raw_instruction_pop_valid,

    // ===| Flow Control |========================================================
    output logic OUT_fetch_PC_ready,

    // ===| Decoded Valid Pulses (one-hot, one cycle) |===========================
    output logic OUT_GEMV_op_x64_valid,
    output logic OUT_GEMM_op_x64_valid,
    output logic OUT_memcpy_op_x64_valid,
    output logic OUT_memset_op_x64_valid,
    output logic OUT_cvo_op_x64_valid,

    // ===| Instruction Body (60-bit, opcode stripped) |=========================
    output instruction_op_x64_t OUT_op_x64
);

  // ===| Internal |==============================================================
  // Bit order: [0]=GEMV, [1]=GEMM, [2]=MEMCPY, [3]=MEMSET
  logic [3:0] OUT_valid;
  assign OUT_GEMV_op_x64_valid   = OUT_valid[0];
  assign OUT_GEMM_op_x64_valid   = OUT_valid[1];
  assign OUT_memcpy_op_x64_valid = OUT_valid[2];
  assign OUT_memset_op_x64_valid = OUT_valid[3];
  // CVO valid uses a separate FF (5th opcode)
  logic cvo_valid_ff;
  assign OUT_cvo_op_x64_valid = cvo_valid_ff;

  // ===| Opcode Decoder |========================================================
  // Top 4 bits are the opcode; bottom 60 bits are the instruction body.
  // OUT_fetch_PC_ready is intentionally NOT assigned in this block: it is
  // driven by the continuous assignment at the bottom of the module, and a
  // variable may not have both a procedural and a continuous driver.
  always_ff @(posedge clk) begin
    if (!rst_n) begin
      OUT_valid    <= 4'b0000;
      cvo_valid_ff <= 1'b0;
      OUT_op_x64   <= '0;
    end else begin
      // default: deassert all valid pulses every cycle
      OUT_valid    <= 4'b0000;
      cvo_valid_ff <= 1'b0;

      if (raw_instruction_pop_valid) begin
        // Body: bits [59:0] (opcode at [63:60] already stripped by slicing)
        OUT_op_x64.instruction <= IN_raw_instruction[`ISA_BODY_WIDTH-1:0];

        case (IN_raw_instruction[`ISA_WIDTH-1:`ISA_WIDTH-`ISA_OPCODE_WIDTH])
          OP_GEMV:   OUT_valid    <= 4'b0001;
          OP_GEMM:   OUT_valid    <= 4'b0010;
          OP_MEMCPY: OUT_valid    <= 4'b0100;
          OP_MEMSET: OUT_valid    <= 4'b1000;
          OP_CVO:    cvo_valid_ff <= 1'b1;
          default:   ;  // unknown opcode: drop silently
        endcase
      end
    end
  end

  // ===| Backpressure |==========================================================
  // Always ready — the frontend FIFO provides buffering; the decoder is single-cycle.
  assign OUT_fetch_PC_ready = 1'b1;

endmodule

ctrl_npu_dispatcher.sv

// `timescale 1ns / 1ps
// `include "GEMM_Array.svh"
// `include "npu_interfaces.svh"
// `include "GLOBAL_CONST.svh"

// import isa_pkg::*;

// module cu_npu_dispatcher (
//     input  logic         clk,
//     input  logic         rst_n,
//     input  instruction_t IN_inst,
//     input  logic         IN_valid,
//     output logic         o_valid,


//     // GEMV / GEMM controls
//     output logic [3:0] OUT_activate_top,
//     output logic [3:0] OUT_activate_lane,
//     output logic       OUT_result_emax_align,
//     output logic       OUT_result_accm,
//     output logic       OUT_result_scale,


//     // memcpy
//     output memory_uop_t OUT_memcpy_cmd,

//     // if INT group size?
// );

//   /*─────────────────────────────────────────────
//   Lane activation bitmask
//   bit[0]=lane1, bit[1]=lane2 ...
//   ─────────────────────────────────────────────*/
//   localparam logic [3:0] LANE_1 = 4'b0001;
//   localparam logic [3:0] LANE_2 = 4'b0010;
//   localparam logic [3:0] LANE_3 = 4'b0100;
//   localparam logic [3:0] LANE_4 = 4'b1000;

//   always_ff @(posedge clk) begin
//     if (!rst_n) begin
//       o_valid           <= 1'b0;
//       OUT_activate_lane     <= '0;
//       OUT_result_emax_align        <= 1'b0;
//       OUT_result_accm              <= 1'b0;
//       OUT_result_scale             <= 1'b0;
//       OUT_memcpy_destination_queue <= '0;
//       for (int i = 0; i < `MAX_MATRIX_DIM; i++) OUT_memcpy_matrix_shape[i] <= '0;
//     end else begin

//       o_valid <= 1'b0;  // default : deassert every cycle

//       if (IN_valid) begin
//         case (IN_inst.opcode)

//           OP_GEMV: begin
//             o_valid <= 1'b1;

//             if (IN_inst.cmd_chaining) begin
//               // TODO: chaining logic
//             end
//             if (IN_inst.override) begin
//               // TODO: override logic
//             end

//             // lane activation (OR mask, cumulative)
//             case (IN_inst.payload.dotm.lane_idq)
//               2'b00: OUT_activate_lane <= LANE_1;
//               2'b01: OUT_activate_lane <= LANE_1 | LANE_2;
//               2'b10: OUT_activate_lane <= LANE_1 | LANE_2 | LANE_3;
//               2'b11: OUT_activate_lane <= LANE_1 | LANE_2 | LANE_3 | LANE_4;
//               default: begin
//                 o_valid <= 1'b0;  // unknown → drop + TODO: interrupt
//               end
//             endcase

//             OUT_result_emax_align <= IN_inst.payload.dotm.find_emax_align;
//             OUT_result_accm       <= IN_inst.payload.dotm.OUT_result_accm;
//             // activate when added to ISA
//             // OUT_result_scale      <= IN_inst.payload.dotm.OUT_result_scale;
//             OUT_activate_top[`TOP_GEMV] <= `TRUE;
//           end

//           OP_GEMM: begin
//             if (IN_inst.override) begin
//               if (IN_inst.cmd_chaining) begin
//                 // TODO
//               end else begin
//                 // TODO
//               end
//             end else begin
//               if (IN_inst.cmd_chaining) begin
//                 // TODO
//               end else begin
//                 o_valid <= 1'b1;

//                 OUT_result_emax_align <= IN_inst.payload.dotm.align;
//                 OUT_result_accm       <= IN_inst.payload.dotm.OUT_result_accm;
//                 OUT_activate_top[`TOP_GEMV] <= `TRUE;
//               end
//             end

//           end

//           OP_MEMCPY: begin
//             if (IN_inst.override) begin
//               if (IN_inst.cmd_chaining) begin
//                 // accumulate matrix shape across chained instructions
//                 OUT_memcpy_matrix_shape[IN_inst.payload.memcpy.dim_xyz]
//                     <= IN_inst.payload.memcpy.dim_x;


//               end else begin
//                 // chaining end → dispatch memcpy
//                 o_valid           <= 1'b1;
//                 OUT_memcpy_destination_queue <= IN_inst.payload.memcpy.dest_queue;

//                 case (IN_inst.payload.memcpy.dest_queue[3:2])
//                   `MASKING_WEIGHT: begin
//                     // TODO: → weight buffer
//                   end
//                   `MASKING_OUT_scale: begin
//                     // TODO: ACP → OUT_result_scale cache
//                   end
//                   `MASKING_FMAP: begin
//                     // TODO: ACP → find emax & align → cache
//                   end
//                   default: o_valid <= 1'b0;  // undefined
//                 endcase
//               end

//             end else begin
//               // non-override memcpy
//               // TODO
//             end

//             // Determine logic based on datatype and mask IN_inst.payload.memcpy.option_flags using bitwise AND (&)
//             if (IN_inst.payload.memcpy.datatype == `BF16) begin
//                 // Example: BF16 processing mode
//                 // Check if the 4th bit (ALIGN) is set to 1
//                 if ((IN_inst.payload.memcpy.option_flags & `MEMCPY_FLAG_BF16_ALIGN) != 4'b0000) begin
//                     OUT_align <= `TRUE;

//                     // Determine the alignment direction
//                     if ((IN_inst.payload.memcpy.option_flags & `MEMCPY_FLAG_BF16_ALIGN_V) != 4'b0000) begin
//                         OUT_align_dir <= `ALIGN_VERTICAL;
//                     end else if ((IN_inst.payload.memcpy.option_flags & `MEMCPY_FLAG_BF16_ALIGN_H) != 4'b0000) begin
//                         OUT_align_dir <= `ALIGN_HORIZONTAL;
//                     end else begin
//                         // Default direction if neither V nor H is specified
//                     end

//                 end else begin
//                     // If ALIGN flag is missing
//                     OUT_align <= `FALSE;
//                 end
//             end else begin
//                 // Example: INT processing mode
//                 if ((IN_inst.payload.memcpy.option_flags & `MEMCPY_OPT_INT_IS_SCALED) != 4'b0000) begin
//                     // Logic for scaled INT
//                     OUT_align <= `TRUE; // (Adjust according to your actual spec)
//                 end else begin
//                     OUT_align <= `FALSE;
//                 end
//             end

//             OUT_datatype <= IN_inst.payload.memcpy.datatype;

//           end
//           default: o_valid <= 1'b0;  // unknown opcode → drop
//         endcase
//       end
//     end
//   end

// endmodule

Global Scheduler + controller top

Global_Scheduler.sv — cross-engine arbitration: orders memory transfers vs. compute, handles ACC / FINDEMAX hazards.
npu_controller_top.sv — controller top-level wrapper; instantiates frontend + decode/dispatch + scheduler and wires them to the npu_if bundle.

Global_Scheduler.sv

`timescale 1ns / 1ps
`include "GEMM_Array.svh"
`include "GLOBAL_CONST.svh"

import isa_pkg::*;

// ===| Global Scheduler |========================================================
// Translates decoded VLIW instructions into engine micro-ops.
//
// Single always_ff drives each output to avoid multiple-driver conflicts.
// Priority for OUT_LOAD_uop: GEMM > GEMV > MEMCPY > CVO (one active per cycle).
//
// OUT_STORE_uop  : registered at issue time; mem_dispatcher uses it to initiate
//                  result writeback after the engine signals completion.
// OUT_sram_rd_start : one-cycle pulse when a GEMM or GEMV load is dispatched,
//                     triggering preprocess_fmap to begin broadcasting from cache.
// ===============================================================================

// Global_Scheduler
// Converts the decoder's (valid pulse, instruction body) pair into registered
// per-engine micro-ops. Each output register has exactly one driver and keeps
// its previous value in cycles where none of its triggering valids is high.
//   OUT_LOAD_uop      : updated on GEMM / GEMV / MEMCPY / CVO valid (in that priority)
//   OUT_STORE_uop     : updated on GEMM / GEMV / CVO valid (in that priority)
//   OUT_sram_rd_start : high for one cycle after a GEMM or GEMV valid
//   OUT_GEMM_uop, OUT_GEMV_uop, OUT_CVO_uop, OUT_mem_set_uop :
//                       updated on their own valid only
module Global_Scheduler #() (
    input logic clk_core,
    input logic rst_n_core,

    // ===| From ctrl_npu_decoder |===============================================
    input logic IN_GEMV_op_x64_valid,
    input logic IN_GEMM_op_x64_valid,
    input logic IN_memcpy_op_x64_valid,
    input logic IN_memset_op_x64_valid,
    input logic IN_cvo_op_x64_valid,

    input instruction_op_x64_t instruction,

    // ===| Engine micro-ops |====================================================
    output gemm_control_uop_t   OUT_GEMM_uop,
    output GEMV_control_uop_t   OUT_GEMV_uop,
    output memory_control_uop_t OUT_LOAD_uop,
    output memory_control_uop_t OUT_STORE_uop,
    output memory_set_uop_t     OUT_mem_set_uop,
    output cvo_control_uop_t    OUT_CVO_uop,

    // ===| Datapath control |====================================================
    output logic OUT_sram_rd_start   // pulse: start fmap cache broadcast
);

  // ===| Typed views of the instruction body |===================================
  // The same body bits are reinterpreted once per opcode format; the valid
  // inputs decide which view is actually consumed.
  GEMV_op_x64_t   gemv_body;
  GEMM_op_x64_t   gemm_body;
  memcpy_op_x64_t memcpy_body;
  memset_op_x64_t memset_body;
  cvo_op_x64_t    cvo_body;

  assign gemv_body   = GEMV_op_x64_t'(instruction.instruction);
  assign gemm_body   = GEMM_op_x64_t'(instruction.instruction);
  assign memcpy_body = memcpy_op_x64_t'(instruction.instruction);
  assign memset_body = memset_op_x64_t'(instruction.instruction);
  assign cvo_body    = cvo_op_x64_t'(instruction.instruction);

  // ===| MEMCPY route translation |==============================================
  // Only the host→NPU pairing selects from_host_to_L2; every other
  // from_device/to_device combination resolves to from_L2_to_host.
  data_route_e memcpy_route;
  always_comb begin
    memcpy_route = from_L2_to_host;
    if (memcpy_body.from_device == FROM_HOST && memcpy_body.to_device == TO_NPU) begin
      memcpy_route = from_host_to_L2;
    end
  end

  // ===| LOAD uop + sram_rd_start : next-state logic |===========================
  // Priority: GEMM > GEMV > MEMCPY > CVO. With no valid asserted the LOAD uop
  // holds its value and the start pulse stays low.
  memory_control_uop_t load_uop_next;
  logic                sram_rd_start_next;

  always_comb begin
    load_uop_next      = OUT_LOAD_uop;
    sram_rd_start_next = 1'b0;

    if (IN_GEMM_op_x64_valid) begin
      load_uop_next = '{
          data_dest      : from_L2_to_L1_GEMM,
          dest_addr      : '0,
          src_addr       : gemm_body.src_addr,
          shape_ptr_addr : gemm_body.shape_ptr_addr,
          async          : SYNC_OP
      };
      sram_rd_start_next = 1'b1;
    end else if (IN_GEMV_op_x64_valid) begin
      load_uop_next = '{
          data_dest      : from_L2_to_L1_GEMV,
          dest_addr      : '0,
          src_addr       : gemv_body.src_addr,
          shape_ptr_addr : gemv_body.shape_ptr_addr,
          async          : SYNC_OP
      };
      sram_rd_start_next = 1'b1;
    end else if (IN_memcpy_op_x64_valid) begin
      load_uop_next = '{
          data_dest      : memcpy_route,
          dest_addr      : memcpy_body.dest_addr,
          src_addr       : memcpy_body.src_addr,
          shape_ptr_addr : memcpy_body.shape_ptr_addr,
          async          : memcpy_body.async
      };
    end else if (IN_cvo_op_x64_valid) begin
      load_uop_next = '{
          data_dest      : from_L2_to_CVO,
          dest_addr      : '0,
          src_addr       : cvo_body.src_addr,
          shape_ptr_addr : '0,
          async          : cvo_body.async
      };
    end
  end

  // ===| STORE uop : next-state logic |==========================================
  // Priority: GEMM > GEMV > CVO. MEMCPY and MEMSET never touch the STORE uop.
  memory_control_uop_t store_uop_next;

  always_comb begin
    store_uop_next = OUT_STORE_uop;

    if (IN_GEMM_op_x64_valid) begin
      store_uop_next = '{
          data_dest      : from_GEMM_res_to_L2,
          dest_addr      : gemm_body.dest_reg,
          src_addr       : '0,
          shape_ptr_addr : gemm_body.shape_ptr_addr,
          async          : SYNC_OP
      };
    end else if (IN_GEMV_op_x64_valid) begin
      store_uop_next = '{
          data_dest      : from_GEMV_res_to_L2,
          dest_addr      : gemv_body.dest_reg,
          src_addr       : '0,
          shape_ptr_addr : gemv_body.shape_ptr_addr,
          async          : SYNC_OP
      };
    end else if (IN_cvo_op_x64_valid) begin
      store_uop_next = '{
          data_dest      : from_CVO_res_to_L2,
          dest_addr      : cvo_body.dst_addr,
          src_addr       : '0,
          shape_ptr_addr : '0,
          async          : cvo_body.async
      };
    end
  end

  // ===| LOAD / STORE registers |================================================
  always_ff @(posedge clk_core) begin
    if (!rst_n_core) begin
      OUT_LOAD_uop      <= '0;
      OUT_STORE_uop     <= '0;
      OUT_sram_rd_start <= 1'b0;
    end else begin
      OUT_LOAD_uop      <= load_uop_next;
      OUT_STORE_uop     <= store_uop_next;
      OUT_sram_rd_start <= sram_rd_start_next;
    end
  end

  // ===| Per-engine uop registers |==============================================
  // Each register is written only on its own valid pulse; the four updates
  // are independent of one another.
  always_ff @(posedge clk_core) begin
    if (!rst_n_core) begin
      OUT_mem_set_uop <= '0;
      OUT_GEMM_uop    <= '0;
      OUT_GEMV_uop    <= '0;
      OUT_CVO_uop     <= '0;
    end else begin
      if (IN_memset_op_x64_valid) begin
        OUT_mem_set_uop <= '{
            dest_cache : dest_cache_e'(memset_body.dest_cache),
            dest_addr  : memset_body.dest_addr,
            a_value    : memset_body.a_value,
            b_value    : memset_body.b_value,
            c_value    : memset_body.c_value
        };
      end

      if (IN_GEMM_op_x64_valid) begin
        OUT_GEMM_uop <= '{
            flags         : gemm_body.flags,
            size_ptr_addr : gemm_body.size_ptr_addr,
            parallel_lane : gemm_body.parallel_lane
        };
      end

      if (IN_GEMV_op_x64_valid) begin
        OUT_GEMV_uop <= '{
            flags         : gemv_body.flags,
            size_ptr_addr : gemv_body.size_ptr_addr,
            parallel_lane : gemv_body.parallel_lane
        };
      end

      if (IN_cvo_op_x64_valid) begin
        OUT_CVO_uop <= '{
            cvo_func : cvo_func_e'(cvo_body.cvo_func),
            src_addr : cvo_body.src_addr,
            dst_addr : cvo_body.dst_addr,
            length   : cvo_body.length,
            flags    : cvo_flags_t'(cvo_body.flags),
            async    : cvo_body.async
        };
      end
    end
  end

endmodule

FSM Out Logic (status aggregation)

fsmout_npu_stat_collector.sv — samples per-engine busy/done flags.
fsmout_npu_stat_encoder.sv — encodes collected flags into the 32-bit status register exposed by AXIL_STAT_OUT.