NPU Controller

The controller is the front-end + scheduler half of the NPU. It accepts 64-bit VLIW instructions over AXI-Lite, decodes them, pushes the resulting micro-ops into per-engine FIFOs, and exposes a status register back to the host. All compute cores operate strictly downstream of the controller’s FIFOs.

See also

pccx ISA Specification

Instruction layout and opcode table consumed by the decoder below.

Topology

Host (AXI-Lite) ──► AXIL_CMD_IN ──► ctrl_npu_decoder ─┐
                                                     ▼
┌── GEMV FIFO ── GEMM FIFO ── CVO FIFO ── MEM FIFO ── MEMSET FIFO ──┐
│                                                                    │
│            ctrl_npu_dispatcher (per-engine pop)                     │
│                                                                    │
└────────────► Global_Scheduler ◄────────────────────────────────────┘
                                                                     │
                                     NPU_fsm_out_Logic ──► AXIL_STAT_OUT

Frontend (AXI-Lite surface)

  • ctrl_npu_frontend.sv — container for the AXIL surface; hosts the interface slaves.

  • AXIL_CMD_IN.sv — AXI-Lite write slave. Latches 64-bit instructions from 32-bit-at-a-time writes at 0x00 / 0x04.

  • AXIL_STAT_OUT.sv — AXI-Lite read slave that exposes the BUSY/DONE status register at 0x08.

  • ctrl_npu_interface.sv — internal handshake glue between the frontend and the Control Unit.

ctrl_npu_frontend.sv
`timescale 1ns / 1ps
`include "GEMM_Array.svh"
`include "npu_interfaces.svh"
`include "GLOBAL_CONST.svh"

// ctrl_npu_frontend
// Container for the AXI4-Lite control surface of the NPU.
//   * Write channels (AW/W/B) of S_AXIL_CTRL feed AXIL_CMD_IN, whose queue head
//     is exposed on OUT_RAW_instruction.
//   * Read channels (AR/R) of S_AXIL_CTRL are served by AXIL_STAT_OUT, which is
//     fed from IN_enc_stat / IN_enc_valid.
module ctrl_npu_frontend (
    input logic clk,
    input logic rst_n,     // active-low reset, forwarded to both AXI-Lite slaves
    input logic IN_clear,  // forwarded to the IN_clear input of both slaves

    // AXI4-Lite Slave : PS <-> NPU control plane
    axil_if.slave S_AXIL_CTRL,

    // Control from Brain
    //input logic IN_rd_start,

    // Decoded command -> Dispatcher / FSM
    output logic [`ISA_WIDTH-1:0] OUT_RAW_instruction,  // head of the command queue
    output logic                  OUT_kick,             // head is valid AND consumer is ready

    // Status <- Encoder / FSM
    // NOTE(review): this port is ISA_WIDTH+1 bits wide while AXIL_STAT_OUT.IN_data is
    // ISA_WIDTH bits. The width is kept for interface compatibility; only the low
    // ISA_WIDTH bits are forwarded (see the explicit slice below).
    input logic [`ISA_WIDTH:0] IN_enc_stat,
    input logic                IN_enc_valid,

    // Downstream consumer can accept an instruction this cycle
    input logic IN_fetch_ready
);

  /*─────────────────────────────────────────────
  Internal wires : AXIL_CMD_IN <-> upper logic
  ───────────────────────────────────────────────*/
  logic [`ISA_WIDTH-1:0] cmd_data;
  logic                  cmd_valid;

  assign OUT_RAW_instruction = cmd_data;
  // Same condition AXIL_CMD_IN uses to pop its queue (OUT_valid && IN_decoder_ready),
  // so OUT_kick is high exactly in the cycles an entry is consumed.
  assign OUT_kick            = cmd_valid & IN_fetch_ready;

  /*─────────────────────────────────────────────
  [1-2] Communication IN : CPU -> NPU (Using Write Channels)
  ───────────────────────────────────────────────*/
  AXIL_CMD_IN #(
      .FIFO_DEPTH(8)
  ) u_cmd_in (
      .clk     (clk),
      .rst_n   (rst_n),
      .IN_clear(IN_clear),

      // AXI4-Lite Write channels directly routed from the interface
      .s_awaddr (S_AXIL_CTRL.awaddr),
      // AXIL_CMD_IN declares s_awprot / s_wstrb but does not read them; tie them
      // off explicitly instead of leaving the inputs floating.
      .s_awprot (3'b000),
      .s_awvalid(S_AXIL_CTRL.awvalid),
      .s_awready(S_AXIL_CTRL.awready),
      .s_wdata  (S_AXIL_CTRL.wdata),
      .s_wstrb  ('1),
      .s_wvalid (S_AXIL_CTRL.wvalid),
      .s_wready (S_AXIL_CTRL.wready),
      .s_bresp  (S_AXIL_CTRL.bresp),
      .s_bvalid (S_AXIL_CTRL.bvalid),
      .s_bready (S_AXIL_CTRL.bready),

      .OUT_data        (cmd_data),
      .OUT_valid       (cmd_valid),
      .IN_decoder_ready(IN_fetch_ready)
  );

  /*─────────────────────────────────────────────
  [1-2] Communication OUT : NPU -> CPU (Using Read Channels)
  ───────────────────────────────────────────────*/
  AXIL_STAT_OUT #(
      .FIFO_DEPTH(8)
  ) u_stat_out (
      .clk     (clk),
      .rst_n   (rst_n),
      .IN_clear(IN_clear),

      // Explicit slice: makes the (previously implicit) truncation of the extra
      // MSB visible and removes the port-width mismatch.
      .IN_data (IN_enc_stat[`ISA_WIDTH-1:0]),
      .IN_valid(IN_enc_valid),

      // AXI4-Lite Read channels directly routed from the interface
      .s_araddr (S_AXIL_CTRL.araddr),
      .s_arvalid(S_AXIL_CTRL.arvalid),
      .s_arready(S_AXIL_CTRL.arready),
      .s_rdata  (S_AXIL_CTRL.rdata),
      .s_rresp  (S_AXIL_CTRL.rresp),
      .s_rvalid (S_AXIL_CTRL.rvalid),
      .s_rready (S_AXIL_CTRL.rready)
  );

endmodule
AXIL_CMD_IN.sv
`timescale 1ns / 1ps

`include "Algorithms.svh"
`include "GLOBAL_CONST.svh"

// AXIL_CMD_IN
// AXI4-Lite Write path : CPU → NPU
// Stores incoming commands into a FIFO.
// Drains FIFO to upper module when IN_decoder_ready is asserted.

// AXIL_CMD_IN
// AXI4-Lite write slave (CPU -> NPU command path).
//   * A write to ADDR_INST pushes the write-data word into the command queue.
//   * A write to ADDR_KICK pushes a fixed marker word with bit 63 set.
//   * Writes to any other address are acknowledged (OKAY) but push nothing.
// The queue head is presented on OUT_data / OUT_valid and is popped in every
// cycle where OUT_valid && IN_decoder_ready.
module AXIL_CMD_IN #(
    parameter FIFO_DEPTH = 8  // number of commands to buffer
) (
    input logic clk,
    input logic rst_n,     // active-low, sampled synchronously
    input logic IN_clear,  // synchronous clear; same effect as reset in this module

    // AXI4-Lite Write channels (slave)
    // AW
    input logic [11:0] s_awaddr,
    input logic [2:0] s_awprot,  // accepted for interface completeness; not used
    input logic s_awvalid,
    output logic s_awready,
    // W
    input logic [`ISA_WIDTH-1:0] s_wdata,
    input logic [(`ISA_WIDTH/8)-1:0] s_wstrb,  // not used: every write is treated as full-width
    input logic s_wvalid,
    output logic s_wready,
    // B
    output logic [1:0] s_bresp,
    output logic s_bvalid,
    input logic s_bready,

    // To upper module (NPU_interface)
    output logic [`ISA_WIDTH-1:0] OUT_data,         // command word
    output logic                  OUT_valid,        // FIFO has data
    input  logic                  IN_decoder_ready  // upper module is ready to consume
);

  /*─────────────────────────────────────────────
  Register Address Map
  ───────────────────────────────────────────────*/
  localparam logic [11:0] ADDR_INST = 12'h000;
  localparam logic [11:0] ADDR_KICK = 12'h008;

  /*─────────────────────────────────────────────
  AXI4-Lite Write Path
  Latch AW first, write register when W arrives.
  ───────────────────────────────────────────────*/
  logic [          11:0] aw_addr_latch;
  logic                  aw_pending;
  logic                  bvalid_r;
  logic                  fifo_wen;
  logic [`ISA_WIDTH-1:0] fifo_wdata;

  // A new address is accepted only when
  //   - no address is already waiting for its data beat,
  //   - no write response is still waiting for BREADY (single outstanding write),
  //   - the command queue is not full.
  // Gating on bvalid_r also means the push issued by the previous write
  // (fifo_wen rises in the same cycle as bvalid_r) is already under way before
  // the next AW is accepted.
  // NOTE(review): whether cmd_q.full reflects that push by then depends on
  // IF_queue, which is not visible here — confirm.
  assign s_awready = !aw_pending && !bvalid_r && !cmd_q.full;
  assign s_wready  = aw_pending;
  assign s_bresp   = 2'b00;  // always OKAY
  assign s_bvalid  = bvalid_r;

  // AW latch
  always_ff @(posedge clk) begin
    if (!rst_n || IN_clear) begin
      aw_addr_latch <= '0;
      aw_pending    <= 1'b0;
    end else begin
      if (s_awvalid && s_awready) begin
        aw_addr_latch <= s_awaddr;
        aw_pending    <= 1'b1;
      end
      if (s_wvalid && s_wready) aw_pending <= 1'b0;
    end
  end

  // W : push into FIFO + B response
  always_ff @(posedge clk) begin
    fifo_wen <= 1'b0;  // one-cycle push strobe by default

    if (!rst_n || IN_clear) begin
      fifo_wdata <= '0;
      bvalid_r   <= 1'b0;
    end else begin
      // BVALID must stay asserted until the master accepts it with BREADY.
      // (Previously bvalid_r was cleared unconditionally every cycle, so the
      // response was lost whenever BREADY was low in the cycle it was raised.)
      if (bvalid_r && s_bready) bvalid_r <= 1'b0;

      if (s_wvalid && s_wready) begin
        case (aw_addr_latch)
          // push instruction word into FIFO
          ADDR_INST: begin
            fifo_wdata <= s_wdata;
            fifo_wen   <= 1'b1;
          end
          // KICK : push a special marker (bit63 = 1 as kick flag)
          ADDR_KICK: begin
            fifo_wdata <= 64'h8000_0000_0000_0000;
            fifo_wen   <= 1'b1;
          end
          default: ;  // unmapped address: acknowledge, push nothing
        endcase
        // A W handshake cannot coincide with a pending response (AW is held off
        // while bvalid_r is set), so this never overwrites an un-acknowledged B.
        bvalid_r <= 1'b1;
      end
    end
  end

  /*─────────────────────────────────────────────
  Command Queue  (simple synchronous, FIFO_DEPTH entries)
  Push : fifo_wen
  Pop  : OUT_valid && IN_decoder_ready
  ───────────────────────────────────────────────*/
  import algorithms_pkg::*;

  IF_queue #(
      .DATA_WIDTH(`ISA_WIDTH),
      .DEPTH(FIFO_DEPTH)
  ) cmd_q (
      .clk  (clk),
      .rst_n(rst_n)
  );
  QUEUE u_cmd_q (.q(cmd_q.owner));

  always_ff @(posedge clk) begin
    if (!rst_n || IN_clear) begin
      cmd_q.clear();
    end else begin
      if (fifo_wen) cmd_q.push(fifo_wdata);  // push one cycle after the W handshake
      if (OUT_valid && IN_decoder_ready) cmd_q.pop();  // consumer took the head entry
    end
  end

  assign OUT_valid = ~cmd_q.empty;
  assign OUT_data  = cmd_q.pop_data;

endmodule
AXIL_STAT_OUT.sv
`timescale 1ns / 1ps

// AXIL_STAT_OUT
// AXI4-Lite Read path : NPU → CPU
// Upper module pushes status into FIFO continuously.
// Drains FIFO to CPU when AXI4-Lite read handshake happens.

// AXIL_STAT_OUT
// AXI4-Lite read slave (NPU -> CPU status path).
// Status words pushed by the upper module are buffered in a small synchronous
// FIFO; each AXI read (AR handshake) pops one entry and returns it on R.
//
// NOTE(review): s_arready is held low while the FIFO is empty, so a host read
// issued with no status queued stalls until a word is pushed. s_araddr is not
// decoded: every read address returns the FIFO head.
module AXIL_STAT_OUT #(
    parameter FIFO_DEPTH = 8  // number of status words buffered; must be a power of two >= 2
) (
    input logic clk,
    input logic rst_n,     // active-low, sampled synchronously
    input logic IN_clear,  // synchronous clear; same effect as reset in this module

    // From upper module (NPU_interface → here)
    input logic [`ISA_WIDTH-1:0] IN_data,  // status word to send to CPU
    input logic                  IN_valid, // upper module has data to push

    // AXI4-Lite Read channels (slave)
    // AR
    input  logic [          11:0] s_araddr,
    input  logic                  s_arvalid,
    output logic                  s_arready,
    // R
    output logic [`ISA_WIDTH-1:0] s_rdata,
    output logic [           1:0] s_rresp,
    output logic                  s_rvalid,
    input  logic                  s_rready
);

  /*─────────────────────────────────────────────
  FIFO  (simple synchronous, FIFO_DEPTH entries)
  Push : IN_valid from upper module
  Pop  : AXI4-Lite read handshake with CPU
  ───────────────────────────────────────────────*/
  localparam PTR_W = $clog2(FIFO_DEPTH);

  // Elaboration-time guard. The pointers are PTR_W+1 bits wide and rely on
  // natural binary wrap-around, which is only correct for power-of-two depths;
  // FIFO_DEPTH = 1 would additionally make the [PTR_W-1:0] slices illegal.
  if (FIFO_DEPTH < 2 || (FIFO_DEPTH & (FIFO_DEPTH - 1)) != 0) begin : g_fifo_depth_check
    $error("AXIL_STAT_OUT: FIFO_DEPTH (%0d) must be a power of two >= 2", FIFO_DEPTH);
  end

  logic [`ISA_WIDTH-1:0] mem[0:FIFO_DEPTH-1];
  logic [PTR_W:0] wr_ptr, rd_ptr;  // extra MSB distinguishes full from empty
  logic fifo_empty, fifo_full;

  assign fifo_empty = (wr_ptr == rd_ptr);
  assign fifo_full  = (wr_ptr[PTR_W] != rd_ptr[PTR_W]) && (wr_ptr[PTR_W-1:0] == rd_ptr[PTR_W-1:0]);

  logic fifo_ren;

  always_ff @(posedge clk) begin
    if (!rst_n || IN_clear) begin
      wr_ptr <= '0;
      rd_ptr <= '0;
    end else begin
      // push : a word offered while the FIFO is full is dropped
      if (IN_valid && !fifo_full) begin
        mem[wr_ptr[PTR_W-1:0]] <= IN_data;
        wr_ptr <= wr_ptr + 1'b1;
      end
      // pop : CPU consumed the data
      if (fifo_ren && !fifo_empty) rd_ptr <= rd_ptr + 1'b1;
    end
  end

  /*─────────────────────────────────────────────
  AXI4-Lite Read Path
  Wait for AR, then pop one entry from FIFO and return it.
  Hold rvalid until CPU acknowledges with rready.
  ───────────────────────────────────────────────*/
  logic [`ISA_WIDTH-1:0] rdata_r;
  logic                  rvalid_r;

  assign s_rdata   = rdata_r;
  assign s_rresp   = 2'b00;  // always OKAY
  assign s_rvalid  = rvalid_r;
  assign s_arready = ~rvalid_r && ~fifo_empty;  // ready only when FIFO has data and no R beat is pending
  assign fifo_ren  = s_arvalid && s_arready;  // pop on AR handshake

  always_ff @(posedge clk) begin
    if (!rst_n || IN_clear) begin
      rdata_r  <= '0;
      rvalid_r <= 1'b0;
    end else begin
      // AR handshake → latch FIFO head and assert rvalid
      if (s_arvalid && s_arready) begin
        rdata_r  <= mem[rd_ptr[PTR_W-1:0]];
        rvalid_r <= 1'b1;
      end
      // R handshake → CPU consumed data, release
      if (rvalid_r && s_rready) rvalid_r <= 1'b0;
    end
  end

endmodule
ctrl_npu_interface.sv

Control Unit (decode + dispatch)

  • ctrl_decode_const.svh — decode-stage constants (field widths, masks).

  • ctrl_npu_decoder.sv — strips the 4-bit opcode from a 64-bit VLIW instruction and routes the 60-bit body to the correct FIFO.

  • ctrl_npu_dispatcher.sv — per-engine local dispatcher: pops a queued micro-op and fires the engine when operands are ready.

ctrl_decode_const.svh
// Decode-stage constants: memcpy datatype / option encodings, matrix dimension
// selectors, GEMV lane masks and destination-queue encodings.

`define MAX_CMD_CHAIN 4

// NPU Architecture
//`define ISA_WIDTH 32

// memcpy option flags

// memcpy datatype selector (2-bit)
`define MEMCPY_INT4 2'b00
`define MEMCPY_INT8 2'b01
`define MEMCPY_INT16 2'b10
`define MEMCPY_BF16 2'b11


// Option flag bit for INT datatypes (4-bit mask)
`define MEMCPY_OPT_INT_IS_SCALED 4'b1000


// Option flag bits for BF16 (4-bit masks).
// MEMCPY_FLAG_BF16_ALIGN has the same encoding as MEMCPY_OPT_INT_IS_SCALED.
`define MEMCPY_FLAG_BF16_ALIGN 4'b1000
`define MEMCPY_FLAG_BF16_ALIGN_H 4'b0100
`define MEMCPY_FLAG_BF16_ALIGN_V 4'b0010
//`define MEMCPY_OPT_BF16_ 4'b0



// Matrix dimension selectors. All four use a 2-bit literal so the declared
// width matches the number of digits (the previous 1'b00 / 1'b01 / 3'b11 forms
// were over-long or inconsistently sized). Numeric values are unchanged: 0..3.
`define DIM_X 2'b00
`define DIM_Y 2'b01
`define DIM_Z 2'b10
`define DIM_W 2'b11

`define MAX_MATRIX_DIM 4
`define MAX_MATRIX_WIDTH 32

// GEMV lane masks (one bit per lane)
//`define GEMV_LANE_0 4'b0000
`define GEMV_LANE_1 4'b0001
`define GEMV_LANE_2 4'b0010
`define GEMV_LANE_3 4'b0100
`define GEMV_LANE_4 4'b1000

// Destination encodings (4-bit). In every entry below the upper two bits equal
// the MASKING_* value of its group.
`define MASKING_WEIGHT 2'b00
`define BUFFER_WEIGHT_A1 4'b0000
`define BUFFER_WEIGHT_A2 4'b0001
`define BUFFER_WEIGHT_A3 4'b0010
`define BUFFER_WEIGHT_A4 4'b0011

`define MASKING_SCALE 2'b01
`define BUFFER_SCALE 4'b0100
`define CACHE_SCALE 4'b0101

`define MASKING_FMAP 2'b10
`define BUFFER_FMAP_C 4'b1000
`define CACHE_FMAP_C1 4'b1001
`define CACHE_FMAP_C2 4'b1010
ctrl_npu_decoder.sv
`timescale 1ns / 1ps
`include "GLOBAL_CONST.svh"

import isa_pkg::*;

// ===| NPU Opcode Decoder |======================================================
// Receives raw 64-bit VLIW instructions from the frontend FIFO.
// Strips the 4-bit opcode, asserts the matching valid pulse for one cycle,
// and forwards the 60-bit body to the Global Scheduler.
// ===============================================================================

// ctrl_npu_decoder
// Registers the instruction body and raises exactly one of the five *_valid
// outputs for one clock after raw_instruction_pop_valid, selected by the
// opcode field in the top `ISA_OPCODE_WIDTH bits of IN_raw_instruction.
// Unknown opcodes raise no valid output.
module ctrl_npu_decoder (
    input logic clk,
    input logic rst_n,  // active-low, sampled synchronously

    // ===| From Frontend |=======================================================
    input logic [`ISA_WIDTH-1:0] IN_raw_instruction,
    input logic                  raw_instruction_pop_valid,  // IN_raw_instruction is valid this cycle

    // ===| Flow Control |========================================================
    output logic OUT_fetch_PC_ready,  // constant 1: the decoder never back-pressures

    // ===| Decoded Valid Pulses (one-hot, one cycle) |===========================
    output logic OUT_GEMV_op_x64_valid,
    output logic OUT_GEMM_op_x64_valid,
    output logic OUT_memcpy_op_x64_valid,
    output logic OUT_memset_op_x64_valid,
    output logic OUT_cvo_op_x64_valid,

    // ===| Instruction Body (60-bit, opcode stripped) |=========================
    output instruction_op_x64_t OUT_op_x64
);

  // ===| Internal |==============================================================
  // One flop per engine valid; bit order: {MEMSET, MEMCPY, GEMM, GEMV}.
  logic [3:0] OUT_valid;
  assign OUT_GEMV_op_x64_valid   = OUT_valid[0];
  assign OUT_GEMM_op_x64_valid   = OUT_valid[1];
  assign OUT_memcpy_op_x64_valid = OUT_valid[2];
  assign OUT_memset_op_x64_valid = OUT_valid[3];
  // CVO valid uses a separate FF (5th opcode)
  logic cvo_valid_ff;
  assign OUT_cvo_op_x64_valid = cvo_valid_ff;

  // ===| Opcode Decoder |========================================================
  // Top 4 bits are the opcode; bottom 60 bits are the instruction body.
  // OUT_fetch_PC_ready is intentionally NOT assigned here: it is driven by the
  // continuous assignment below, and a variable may not have both a procedural
  // and a continuous driver.
  always_ff @(posedge clk) begin
    if (!rst_n) begin
      OUT_valid    <= 4'b0000;
      cvo_valid_ff <= 1'b0;
      OUT_op_x64   <= '0;
    end else begin
      // Default: valids are single-cycle pulses.
      OUT_valid    <= 4'b0000;
      cvo_valid_ff <= 1'b0;

      if (raw_instruction_pop_valid) begin
        // Body: low `ISA_BODY_WIDTH bits (opcode stripped by slicing).
        // The body register keeps its last value between instructions.
        OUT_op_x64.instruction <= IN_raw_instruction[`ISA_BODY_WIDTH-1:0];

        case (IN_raw_instruction[`ISA_WIDTH-1:`ISA_WIDTH-`ISA_OPCODE_WIDTH])
          OP_GEMV:   OUT_valid <= 4'b0001;
          OP_GEMM:   OUT_valid <= 4'b0010;
          OP_MEMCPY: OUT_valid <= 4'b0100;
          OP_MEMSET: OUT_valid <= 4'b1000;
          OP_CVO:    cvo_valid_ff <= 1'b1;
          default:   ;  // unknown opcode: drop silently
        endcase
      end
    end
  end

  // ===| Backpressure |==========================================================
  // Always ready — the frontend FIFO provides buffering; the decoder is single-cycle.
  assign OUT_fetch_PC_ready = 1'b1;

endmodule
ctrl_npu_dispatcher.sv
// `timescale 1ns / 1ps
// `include "GEMM_Array.svh"
// `include "npu_interfaces.svh"
// `include "GLOBAL_CONST.svh"

// import isa_pkg::*;

// module cu_npu_dispatcher (
//     input  logic         clk,
//     input  logic         rst_n,
//     input  instruction_t IN_inst,
//     input  logic         IN_valid,
//     output logic         o_valid,


//     // GEMV / GEMM controls
//     output logic [3:0] OUT_activate_top,
//     output logic [3:0] OUT_activate_lane,
//     output logic       OUT_result_emax_align,
//     output logic       OUT_result_accm,
//     output logic       OUT_result_scale,


//     // memcpy
//     output memory_uop_t OUT_memcpy_cmd,

//     // if INT group size?
// );

//   /*─────────────────────────────────────────────
//   Lane activation bitmask
//   bit[0]=lane1, bit[1]=lane2 ...
//   ─────────────────────────────────────────────*/
//   localparam logic [3:0] LANE_1 = 4'b0001;
//   localparam logic [3:0] LANE_2 = 4'b0010;
//   localparam logic [3:0] LANE_3 = 4'b0100;
//   localparam logic [3:0] LANE_4 = 4'b1000;

//   always_ff @(posedge clk) begin
//     if (!rst_n) begin
//       o_valid           <= 1'b0;
//       OUT_activate_lane     <= '0;
//       OUT_result_emax_align        <= 1'b0;
//       OUT_result_accm              <= 1'b0;
//       OUT_result_scale             <= 1'b0;
//       OUT_memcpy_destination_queue <= '0;
//       for (int i = 0; i < `MAX_MATRIX_DIM; i++) OUT_memcpy_matrix_shape[i] <= '0;
//     end else begin

//       o_valid <= 1'b0;  // default : deassert every cycle

//       if (IN_valid) begin
//         case (IN_inst.opcode)

//           OP_GEMV: begin
//             o_valid <= 1'b1;

//             if (IN_inst.cmd_chaining) begin
//               // TODO: chaining logic
//             end
//             if (IN_inst.override) begin
//               // TODO: override logic
//             end

//             // lane activation (OR mask, cumulative)
//             case (IN_inst.payload.dotm.lane_idq)
//               2'b00: OUT_activate_lane <= LANE_1;
//               2'b01: OUT_activate_lane <= LANE_1 | LANE_2;
//               2'b10: OUT_activate_lane <= LANE_1 | LANE_2 | LANE_3;
//               2'b11: OUT_activate_lane <= LANE_1 | LANE_2 | LANE_3 | LANE_4;
//               default: begin
//                 o_valid <= 1'b0;  // unknown → drop + TODO: interrupt
//               end
//             endcase

//             OUT_result_emax_align <= IN_inst.payload.dotm.find_emax_align;
//             OUT_result_accm       <= IN_inst.payload.dotm.OUT_result_accm;
//             // activate when added to ISA
//             // OUT_result_scale      <= IN_inst.payload.dotm.OUT_result_scale;
//             OUT_activate_top[`TOP_GEMV] <= `TRUE;
//           end

//           OP_GEMM: begin
//             if (IN_inst.override) begin
//               if (IN_inst.cmd_chaining) begin
//                 // TODO
//               end else begin
//                 // TODO
//               end
//             end else begin
//               if (IN_inst.cmd_chaining) begin
//                 // TODO
//               end else begin
//                 o_valid <= 1'b1;

//                 OUT_result_emax_align <= IN_inst.payload.dotm.align;
//                 OUT_result_accm       <= IN_inst.payload.dotm.OUT_result_accm;
//                 OUT_activate_top[`TOP_GEMV] <= `TRUE;
//               end
//             end

//           end

//           OP_MEMCPY: begin
//             if (IN_inst.override) begin
//               if (IN_inst.cmd_chaining) begin
//                 // accumulate matrix shape across chained instructions
//                 OUT_memcpy_matrix_shape[IN_inst.payload.memcpy.dim_xyz]
//                     <= IN_inst.payload.memcpy.dim_x;


//               end else begin
//                 // chaining end → dispatch memcpy
//                 o_valid           <= 1'b1;
//                 OUT_memcpy_destination_queue <= IN_inst.payload.memcpy.dest_queue;

//                 case (IN_inst.payload.memcpy.dest_queue[3:2])
//                   `MASKING_WEIGHT: begin
//                     // TODO: → weight buffer
//                   end
//                   `MASKING_SCALE: begin
//                     // TODO: ACP → OUT_result_scale cache
//                   end
//                   `MASKING_FMAP: begin
//                     // TODO: ACP → find emax & align → cache
//                   end
//                   default: o_valid <= 1'b0;  // undefined
//                 endcase
//               end

//             end else begin
//               // non-override memcpy
//               // TODO
//             end

//             // Determine logic based on datatype and mask IN_inst.payload.memcpy.option_flags using bitwise AND (&)
//             if (IN_inst.payload.memcpy.datatype == `BF16) begin
//                 // Example: BF16 processing mode
//                 // Check if the 4th bit (ALIGN) is set to 1
//                 if ((IN_inst.payload.memcpy.option_flags & `MEMCPY_FLAG_BF16_ALIGN) != 4'b0000) begin
//                     OUT_align <= `TRUE;

//                     // Determine the alignment direction
//                     if ((IN_inst.payload.memcpy.option_flags & `MEMCPY_FLAG_BF16_ALIGN_V) != 4'b0000) begin
//                         OUT_align_dir <= `ALIGN_VERTICAL;
//                     end else if ((IN_inst.payload.memcpy.option_flags & `MEMCPY_FLAG_BF16_ALIGN_H) != 4'b0000) begin
//                         OUT_align_dir <= `ALIGN_HORIZONTAL;
//                     end else begin
//                         // Default direction if neither V nor H is specified
//                     end

//                 end else begin
//                     // If ALIGN flag is missing
//                     OUT_align <= `FALSE;
//                 end
//             end else begin
//                 // Example: INT processing mode
//                 if ((IN_inst.payload.memcpy.option_flags & `MEMCPY_OPT_INT_IS_SCALED) != 4'b0000) begin
//                     // Logic for scaled INT
//                     OUT_align <= `TRUE; // (Adjust according to your actual spec)
//                 end else begin
//                     OUT_align <= `FALSE;
//                 end
//             end

//             OUT_datatype <= IN_inst.payload.memcpy.datatype;

//           end
//           default: o_valid <= 1'b0;  // unknown opcode → drop
//         endcase
//       end
//     end
//   end

// endmodule

Global Scheduler + controller top

  • Global_Scheduler.sv — cross-engine arbitration: orders memory transfers vs. compute, handles ACC / FINDEMAX hazards.

  • npu_controller_top.sv — controller top-level wrapper; instantiates frontend + decode/dispatch + scheduler and wires them to the npu_if bundle.

Global_Scheduler.sv
`timescale 1ns / 1ps
`include "GEMM_Array.svh"
`include "GLOBAL_CONST.svh"

import isa_pkg::*;

// ===| Global Scheduler |========================================================
// Translates decoded VLIW instructions into engine micro-ops.
//
// Single always_ff drives each output to avoid multiple-driver conflicts.
// Priority for OUT_LOAD_uop: GEMM > GEMV > MEMCPY > CVO (one active per cycle).
//
// OUT_STORE_uop  : registered at issue time; mem_dispatcher uses it to initiate
//                  result writeback after the engine signals completion.
// OUT_sram_rd_start : one-cycle pulse when a GEMM or GEMV load is dispatched,
//                     triggering preprocess_fmap to begin broadcasting from cache.
// ===============================================================================

module Global_Scheduler #() (
    input logic clk_core,
    input logic rst_n_core,

    // ===| From ctrl_npu_decoder |===============================================
    input logic IN_GEMV_op_x64_valid,
    input logic IN_GEMM_op_x64_valid,
    input logic IN_memcpy_op_x64_valid,
    input logic IN_memset_op_x64_valid,
    input logic IN_cvo_op_x64_valid,

    input instruction_op_x64_t instruction,

    // ===| Engine micro-ops |====================================================
    output gemm_control_uop_t   OUT_GEMM_uop,
    output GEMV_control_uop_t   OUT_GEMV_uop,
    output memory_control_uop_t OUT_LOAD_uop,
    output memory_control_uop_t OUT_STORE_uop,
    output memory_set_uop_t     OUT_mem_set_uop,
    output cvo_control_uop_t    OUT_CVO_uop,

    // ===| Datapath control |====================================================
    output logic OUT_sram_rd_start   // pulse: start fmap cache broadcast
);

  // ===| Typed views of the instruction body |===================================
  // The same body bits are reinterpreted once per instruction format; the
  // *_valid inputs decide which view is actually consumed.
  GEMV_op_x64_t   GEMV_op_x64;
  GEMM_op_x64_t   GEMM_op_x64;
  memcpy_op_x64_t memcpy_op_x64;
  memset_op_x64_t memset_op_x64;
  cvo_op_x64_t    cvo_op_x64;

  assign GEMV_op_x64   = GEMV_op_x64_t'(instruction.instruction);
  assign GEMM_op_x64   = GEMM_op_x64_t'(instruction.instruction);
  assign memcpy_op_x64 = memcpy_op_x64_t'(instruction.instruction);
  assign memset_op_x64 = memset_op_x64_t'(instruction.instruction);
  assign cvo_op_x64    = cvo_op_x64_t'(instruction.instruction);

  // ===| MEMCPY route |==========================================================
  // Host -> NPU copies map to from_host_to_L2; every other from/to combination
  // maps to from_L2_to_host.
  data_route_e memcpy_route;
  always_comb begin
    memcpy_route = from_L2_to_host;
    if (memcpy_op_x64.from_device == FROM_HOST && memcpy_op_x64.to_device == TO_NPU) begin
      memcpy_route = from_host_to_L2;
    end
  end

  // ===| LOAD / STORE next-state (combinational) |===============================
  // Each *_next defaults to the currently registered value, so the register
  // holds when no matching instruction is issued this cycle.
  memory_control_uop_t load_uop_next;
  memory_control_uop_t store_uop_next;
  logic                sram_rd_start_next;

  // LOAD: priority GEMM > GEMV > MEMCPY > CVO. The read-start pulse accompanies
  // GEMM and GEMV loads only.
  always_comb begin
    load_uop_next      = OUT_LOAD_uop;
    sram_rd_start_next = 1'b0;

    if (IN_GEMM_op_x64_valid) begin
      load_uop_next = '{
          data_dest      : from_L2_to_L1_GEMM,
          dest_addr      : '0,
          src_addr       : GEMM_op_x64.src_addr,
          shape_ptr_addr : GEMM_op_x64.shape_ptr_addr,
          async          : SYNC_OP
      };
      sram_rd_start_next = 1'b1;
    end else if (IN_GEMV_op_x64_valid) begin
      load_uop_next = '{
          data_dest      : from_L2_to_L1_GEMV,
          dest_addr      : '0,
          src_addr       : GEMV_op_x64.src_addr,
          shape_ptr_addr : GEMV_op_x64.shape_ptr_addr,
          async          : SYNC_OP
      };
      sram_rd_start_next = 1'b1;
    end else if (IN_memcpy_op_x64_valid) begin
      load_uop_next = '{
          data_dest      : memcpy_route,
          dest_addr      : memcpy_op_x64.dest_addr,
          src_addr       : memcpy_op_x64.src_addr,
          shape_ptr_addr : memcpy_op_x64.shape_ptr_addr,
          async          : memcpy_op_x64.async
      };
    end else if (IN_cvo_op_x64_valid) begin
      load_uop_next = '{
          data_dest      : from_L2_to_CVO,
          dest_addr      : '0,
          src_addr       : cvo_op_x64.src_addr,
          shape_ptr_addr : '0,
          async          : cvo_op_x64.async
      };
    end
  end

  // STORE: captured at issue time, priority GEMM > GEMV > CVO. MEMCPY does not
  // update the store descriptor.
  always_comb begin
    store_uop_next = OUT_STORE_uop;

    if (IN_GEMM_op_x64_valid) begin
      store_uop_next = '{
          data_dest      : from_GEMM_res_to_L2,
          dest_addr      : GEMM_op_x64.dest_reg,
          src_addr       : '0,
          shape_ptr_addr : GEMM_op_x64.shape_ptr_addr,
          async          : SYNC_OP
      };
    end else if (IN_GEMV_op_x64_valid) begin
      store_uop_next = '{
          data_dest      : from_GEMV_res_to_L2,
          dest_addr      : GEMV_op_x64.dest_reg,
          src_addr       : '0,
          shape_ptr_addr : GEMV_op_x64.shape_ptr_addr,
          async          : SYNC_OP
      };
    end else if (IN_cvo_op_x64_valid) begin
      store_uop_next = '{
          data_dest      : from_CVO_res_to_L2,
          dest_addr      : cvo_op_x64.dst_addr,
          src_addr       : '0,
          shape_ptr_addr : '0,
          async          : cvo_op_x64.async
      };
    end
  end

  // ===| LOAD / STORE registers |================================================
  always_ff @(posedge clk_core) begin
    if (!rst_n_core) begin
      OUT_LOAD_uop      <= '0;
      OUT_STORE_uop     <= '0;
      OUT_sram_rd_start <= 1'b0;
    end else begin
      OUT_LOAD_uop      <= load_uop_next;
      OUT_STORE_uop     <= store_uop_next;
      OUT_sram_rd_start <= sram_rd_start_next;
    end
  end

  // ===| Per-engine uop registers |==============================================
  // Each register is written only by its own valid strobe and otherwise holds
  // its value; the four updates are independent of one another.
  always_ff @(posedge clk_core) begin
    if (!rst_n_core) begin
      OUT_mem_set_uop <= '0;
      OUT_GEMM_uop    <= '0;
      OUT_GEMV_uop    <= '0;
      OUT_CVO_uop     <= '0;
    end else begin
      if (IN_memset_op_x64_valid) begin
        OUT_mem_set_uop <= '{
            dest_cache : dest_cache_e'(memset_op_x64.dest_cache),
            dest_addr  : memset_op_x64.dest_addr,
            a_value    : memset_op_x64.a_value,
            b_value    : memset_op_x64.b_value,
            c_value    : memset_op_x64.c_value
        };
      end

      if (IN_GEMM_op_x64_valid) begin
        OUT_GEMM_uop <= '{
            flags         : GEMM_op_x64.flags,
            size_ptr_addr : GEMM_op_x64.size_ptr_addr,
            parallel_lane : GEMM_op_x64.parallel_lane
        };
      end

      if (IN_GEMV_op_x64_valid) begin
        OUT_GEMV_uop <= '{
            flags         : GEMV_op_x64.flags,
            size_ptr_addr : GEMV_op_x64.size_ptr_addr,
            parallel_lane : GEMV_op_x64.parallel_lane
        };
      end

      if (IN_cvo_op_x64_valid) begin
        OUT_CVO_uop <= '{
            cvo_func : cvo_func_e'(cvo_op_x64.cvo_func),
            src_addr : cvo_op_x64.src_addr,
            dst_addr : cvo_op_x64.dst_addr,
            length   : cvo_op_x64.length,
            flags    : cvo_flags_t'(cvo_op_x64.flags),
            async    : cvo_op_x64.async
        };
      end
    end
  end

endmodule
npu_controller_top.sv
`timescale 1ns / 1ps

`include "GLOBAL_CONST.svh"

import isa_pkg::*;

// ===| NPU Controller Top |======================================================
// Wraps the AXI-Lite frontend and the opcode decoder.
// Outputs one valid pulse per instruction type along with the raw 60-bit body.
// ===============================================================================

module npu_controller_top #() (
    input logic clk,
    input logic rst_n,
    input logic i_clear,

    // ===| AXI4-Lite Slave : PS <-> NPU control plane |=========================
    axil_if.slave S_AXIL_CTRL,

    // ===| Decoded Instruction Valids |=========================================
    output logic OUT_GEMV_op_x64_valid,
    output logic OUT_GEMM_op_x64_valid,
    output logic OUT_memcpy_op_x64_valid,
    output logic OUT_memset_op_x64_valid,
    output logic OUT_cvo_op_x64_valid,

    // ===| Raw Instruction Body (60-bit, opcode stripped) |=====================
    output instruction_op_x64_t OUT_op_x64
);

  // ---------------------------------------------------------------------------
  // Frontend <-> decoder link
  //   cmd_word        : instruction word at the head of the frontend queue
  //   cmd_word_strobe : frontend's OUT_kick, qualifies cmd_word for the decoder
  //   decoder_ready   : decoder's ready flag, fed back to the frontend
  // ---------------------------------------------------------------------------
  logic [`ISA_WIDTH-1:0] cmd_word;
  logic                  cmd_word_strobe;
  logic                  decoder_ready;

  // ---------------------------------------------------------------------------
  // AXI-Lite frontend (command in / status out).
  // The status inputs are tied off here: no status words are pushed from this
  // wrapper.
  // ---------------------------------------------------------------------------
  ctrl_npu_frontend #() u_npu_frontend (
      .clk                (clk),
      .rst_n              (rst_n),
      .IN_clear           (i_clear),
      .S_AXIL_CTRL        (S_AXIL_CTRL),
      .OUT_RAW_instruction(cmd_word),
      .OUT_kick           (cmd_word_strobe),
      .IN_enc_stat        ('0),
      .IN_enc_valid       (1'b0),
      .IN_fetch_ready     (decoder_ready)
  );

  // ---------------------------------------------------------------------------
  // Opcode decoder: turns each accepted word into one per-engine valid pulse
  // plus the shared instruction body.
  // ---------------------------------------------------------------------------
  ctrl_npu_decoder u_decoder (
      .clk                      (clk),
      .rst_n                    (rst_n),
      .IN_raw_instruction       (cmd_word),
      .raw_instruction_pop_valid(cmd_word_strobe),
      .OUT_fetch_PC_ready       (decoder_ready),
      .OUT_GEMV_op_x64_valid    (OUT_GEMV_op_x64_valid),
      .OUT_GEMM_op_x64_valid    (OUT_GEMM_op_x64_valid),
      .OUT_memcpy_op_x64_valid  (OUT_memcpy_op_x64_valid),
      .OUT_memset_op_x64_valid  (OUT_memset_op_x64_valid),
      .OUT_cvo_op_x64_valid     (OUT_cvo_op_x64_valid),
      .OUT_op_x64               (OUT_op_x64)
  );

endmodule

FSM Out Logic (status aggregation)

  • fsmout_npu_stat_collector.sv — samples per-engine busy/done flags.

  • fsmout_npu_stat_encoder.sv — encodes collected flags into the 32-bit status register exposed by AXIL_STAT_OUT.

fsmout_npu_stat_collector.sv

fsmout_npu_stat_encoder.sv