NPU Controller Modules

RTL source on GitHub

SystemVerilog sources documented on this page:

  • hw/rtl/NPU_Controller/npu_controller_top.sv — View on GitHub

  • hw/rtl/NPU_Controller/NPU_Control_Unit/ctrl_npu_decoder.sv — View on GitHub

  • hw/rtl/NPU_Controller/NPU_Control_Unit/ctrl_npu_dispatcher.sv — View on GitHub

  • hw/rtl/NPU_Controller/Global_Scheduler.sv — View on GitHub

1. Controller Top

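npu_controller_top.sv is the top level of the NPU controller. In the listing below it instantiates the AXI-Lite frontend (ctrl_npu_frontend) and the instruction decoder (ctrl_npu_decoder) and exposes the decoded valid pulses and instruction body as outputs; the dispatcher and global scheduler are not instantiated inside this module.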

Listing 7 hw/rtl/NPU_Controller/npu_controller_top.sv
`timescale 1ns / 1ps

`include "GLOBAL_CONST.svh"

import isa_pkg::*;

// ===| NPU Controller Top |======================================================
// Wraps the AXI-Lite frontend and the opcode decoder.
// Outputs one valid pulse per instruction type along with the raw 60-bit body.
// ===============================================================================

module npu_controller_top #() (
    input logic clk,
    input logic rst_n,
    input logic i_clear,

    // ===| AXI4-Lite Slave : PS <-> NPU control plane |=========================
    axil_if.slave S_AXIL_CTRL,

    // ===| Decoded Instruction Valids |=========================================
    output logic OUT_GEMV_op_x64_valid,
    output logic OUT_GEMM_op_x64_valid,
    output logic OUT_memcpy_op_x64_valid,
    output logic OUT_memset_op_x64_valid,
    output logic OUT_cvo_op_x64_valid,

    // ===| Raw Instruction Body (60-bit, opcode stripped) |=====================
    output instruction_op_x64_t OUT_op_x64
);

  // ===| Frontend <-> Decoder nets |=============================================
  // fe_instr_word   : full instruction word produced by the frontend
  // fe_instr_kick   : frontend strobe qualifying fe_instr_word for the decoder
  // dec_fetch_ready : decoder -> frontend flow-control signal
  logic [`ISA_WIDTH-1:0] fe_instr_word;
  logic                  fe_instr_kick;
  logic                  dec_fetch_ready;

  // ===| Frontend : AXI-Lite CMD/STAT |==========================================
  // The status-encoder inputs are tied off here (IN_enc_stat = 0, IN_enc_valid = 0).
  ctrl_npu_frontend #() u_npu_frontend (
      .clk                (clk),
      .rst_n              (rst_n),
      .IN_clear           (i_clear),
      .S_AXIL_CTRL        (S_AXIL_CTRL),
      .OUT_RAW_instruction(fe_instr_word),
      .OUT_kick           (fe_instr_kick),
      .IN_enc_stat        ('0),
      .IN_enc_valid       (1'b0),
      .IN_fetch_ready     (dec_fetch_ready)
  );

  // ===| Decoder : opcode -> per-type valid pulses + instruction body |==========
  // Ports whose names match this module's signals use implicit .name connections;
  // the three frontend-facing ports are connected explicitly.
  ctrl_npu_decoder u_decoder (
      .clk,
      .rst_n,
      .IN_raw_instruction       (fe_instr_word),
      .raw_instruction_pop_valid(fe_instr_kick),
      .OUT_fetch_PC_ready       (dec_fetch_ready),
      .OUT_GEMV_op_x64_valid,
      .OUT_GEMM_op_x64_valid,
      .OUT_memcpy_op_x64_valid,
      .OUT_memset_op_x64_valid,
      .OUT_cvo_op_x64_valid,
      .OUT_op_x64
  );

endmodule

2. Instruction Decoder

ctrl_npu_decoder.sv parses the 64-bit VLIW instruction word: strips the 4-bit opcode and routes the 60-bit body into the appropriate typed struct (GEMV_op_x64_t, memcpy_op_x64_t, etc.).

Listing 8 hw/rtl/NPU_Controller/NPU_Control_Unit/ctrl_npu_decoder.sv
`timescale 1ns / 1ps
`include "GLOBAL_CONST.svh"

import isa_pkg::*;

// ===| NPU Opcode Decoder |======================================================
// Receives raw 64-bit VLIW instructions from the frontend FIFO.
// Strips the 4-bit opcode, asserts the matching valid pulse for one cycle,
// and forwards the 60-bit body to the Global Scheduler.
// ===============================================================================

module ctrl_npu_decoder (
    input logic clk,
    input logic rst_n,

    // ===| From Frontend |=======================================================
    // IN_raw_instruction        : full instruction word (opcode in the top bits)
    // raw_instruction_pop_valid : qualifies IN_raw_instruction for this cycle
    input logic [`ISA_WIDTH-1:0] IN_raw_instruction,
    input logic                  raw_instruction_pop_valid,

    // ===| Flow Control |========================================================
    // Constantly asserted; see the Backpressure section below.
    output logic OUT_fetch_PC_ready,

    // ===| Decoded Valid Pulses (one-hot, one cycle) |===========================
    output logic OUT_GEMV_op_x64_valid,
    output logic OUT_GEMM_op_x64_valid,
    output logic OUT_memcpy_op_x64_valid,
    output logic OUT_memset_op_x64_valid,
    output logic OUT_cvo_op_x64_valid,

    // ===| Instruction Body (60-bit, opcode stripped) |=========================
    // Registered together with the valid pulse; holds its value between
    // instructions (only rewritten when raw_instruction_pop_valid is high).
    output instruction_op_x64_t OUT_op_x64
);

  // ===| Internal |==============================================================
  // OUT_valid[0..3] back the GEMV / GEMM / MEMCPY / MEMSET valid outputs.
  logic [3:0] OUT_valid;
  assign OUT_GEMV_op_x64_valid   = OUT_valid[0];
  assign OUT_GEMM_op_x64_valid   = OUT_valid[1];
  assign OUT_memcpy_op_x64_valid = OUT_valid[2];
  assign OUT_memset_op_x64_valid = OUT_valid[3];
  // CVO valid uses a separate FF (5th opcode)
  logic cvo_valid_ff;
  assign OUT_cvo_op_x64_valid = cvo_valid_ff;

  // ===| Opcode Decoder |========================================================
  // Top `ISA_OPCODE_WIDTH bits are the opcode; the low `ISA_BODY_WIDTH bits are
  // the instruction body. Reset is synchronous (sampled on posedge clk).
  //
  // NOTE: OUT_fetch_PC_ready is deliberately NOT written in this process. It is
  // driven by the continuous assignment in the Backpressure section; writing it
  // here as well would give the variable two drivers, which is illegal for a
  // variable assigned inside always_ff.
  always_ff @(posedge clk) begin
    if (!rst_n) begin
      OUT_valid    <= 4'b0000;
      cvo_valid_ff <= 1'b0;
      OUT_op_x64   <= '0;
    end else begin
      // Default: every valid is deasserted, so each pulse lasts one cycle
      // unless another instruction of the same type arrives back-to-back.
      OUT_valid    <= 4'b0000;
      cvo_valid_ff <= 1'b0;

      if (raw_instruction_pop_valid) begin
        // Body: low bits of the word (opcode in the top bits is dropped by slicing)
        OUT_op_x64.instruction <= IN_raw_instruction[`ISA_BODY_WIDTH-1:0];

        case (IN_raw_instruction[`ISA_WIDTH-1:`ISA_WIDTH-`ISA_OPCODE_WIDTH])
          OP_GEMV:   OUT_valid <= 4'b0001;
          OP_GEMM:   OUT_valid <= 4'b0010;
          OP_MEMCPY: OUT_valid <= 4'b0100;
          OP_MEMSET: OUT_valid <= 4'b1000;
          OP_CVO:    cvo_valid_ff <= 1'b1;
          default:   ;  // unknown opcode: no valid is raised (body is still latched)
        endcase
      end
    end
  end

  // ===| Backpressure |==========================================================
  // Always ready — the frontend FIFO provides buffering; the decoder is single-cycle.
  // This is the sole driver of OUT_fetch_PC_ready.
  assign OUT_fetch_PC_ready = 1'b1;

endmodule

3. Instruction Dispatcher

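ctrl_npu_dispatcher.sv resolves Constant Cache pointer lookups (shape / size / scale), checks for address and resource hazards, and issues per-core control μops to GEMM, GEMV, CVO, and mem_dispatcher. Note that every line of the listing below is currently commented out, so the file as shown contributes no active logic.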

Listing 9 hw/rtl/NPU_Controller/NPU_Control_Unit/ctrl_npu_dispatcher.sv
// `timescale 1ns / 1ps
// `include "GEMM_Array.svh"
// `include "npu_interfaces.svh"
// `include "GLOBAL_CONST.svh"

// import isa_pkg::*;

// module cu_npu_dispatcher (
//     input  logic         clk,
//     input  logic         rst_n,
//     input  instruction_t IN_inst,
//     input  logic         IN_valid,
//     output logic         o_valid,


//     // GEMV / GEMM controls
//     output logic [3:0] OUT_activate_top,
//     output logic [3:0] OUT_activate_lane,
//     output logic       OUT_result_emax_align,
//     output logic       OUT_result_accm,
//     output logic       OUT_result_scale,


//     // memcpy
//     output memory_uop_t OUT_memcpy_cmd,

//     // if INT group size?
// );

//   /*─────────────────────────────────────────────
//   Lane activation bitmask
//   bit[0]=lane1, bit[1]=lane2 ...
//   ─────────────────────────────────────────────*/
//   localparam logic [3:0] LANE_1 = 4'b0001;
//   localparam logic [3:0] LANE_2 = 4'b0010;
//   localparam logic [3:0] LANE_3 = 4'b0100;
//   localparam logic [3:0] LANE_4 = 4'b1000;

//   always_ff @(posedge clk) begin
//     if (!rst_n) begin
//       o_valid           <= 1'b0;
//       OUT_activate_lane     <= '0;
//       OUT_result_emax_align        <= 1'b0;
//       OUT_result_accm              <= 1'b0;
//       OUT_result_scale             <= 1'b0;
//       OUT_memcpy_destination_queue <= '0;
//       for (int i = 0; i < `MAX_MATRIX_DIM; i++) OUT_memcpy_matrix_shape[i] <= '0;
//     end else begin

//       o_valid <= 1'b0;  // default : deassert every cycle

//       if (IN_valid) begin
//         case (IN_inst.opcode)

//           OP_GEMV: begin
//             o_valid <= 1'b1;

//             if (IN_inst.cmd_chaining) begin
//               // TODO: chaining logic
//             end
//             if (IN_inst.override) begin
//               // TODO: override logic
//             end

//             // lane activation (OR mask, cumulative)
//             case (IN_inst.payload.dotm.lane_idq)
//               2'b00: OUT_activate_lane <= LANE_1;
//               2'b01: OUT_activate_lane <= LANE_1 | LANE_2;
//               2'b10: OUT_activate_lane <= LANE_1 | LANE_2 | LANE_3;
//               2'b11: OUT_activate_lane <= LANE_1 | LANE_2 | LANE_3 | LANE_4;
//               default: begin
//                 o_valid <= 1'b0;  // unknown → drop + TODO: interrupt
//               end
//             endcase

//             OUT_result_emax_align <= IN_inst.payload.dotm.find_emax_align;
//             OUT_result_accm       <= IN_inst.payload.dotm.OUT_result_accm;
//             // activate when added to ISA
//             // OUT_result_scale      <= IN_inst.payload.dotm.OUT_result_scale;
//             OUT_activate_top[`TOP_GEMV] <= `TRUE;
//           end

//           OP_GEMM: begin
//             if (IN_inst.override) begin
//               if (IN_inst.cmd_chaining) begin
//                 // TODO
//               end else begin
//                 // TODO
//               end
//             end else begin
//               if (IN_inst.cmd_chaining) begin
//                 // TODO
//               end else begin
//                 o_valid <= 1'b1;

//                 OUT_result_emax_align <= IN_inst.payload.dotm.align;
//                 OUT_result_accm       <= IN_inst.payload.dotm.OUT_result_accm;
//                 OUT_activate_top[`TOP_GEMV] <= `TRUE;
//               end
//             end

//           end

//           OP_MEMCPY: begin
//             if (IN_inst.override) begin
//               if (IN_inst.cmd_chaining) begin
//                 // accumulate matrix shape across chained instructions
//                 OUT_memcpy_matrix_shape[IN_inst.payload.memcpy.dim_xyz]
//                     <= IN_inst.payload.memcpy.dim_x;


//               end else begin
//                 // chaining end → dispatch memcpy
//                 o_valid           <= 1'b1;
//                 OUT_memcpy_destination_queue <= IN_inst.payload.memcpy.dest_queue;

//                 case (IN_inst.payload.memcpy.dest_queue[3:2])
//                   `MASKING_WEIGHT: begin
//                     // TODO: → weight buffer
//                   end
//                   `MASKING_OUT_scale: begin
//                     // TODO: ACP → OUT_result_scale cache
//                   end
//                   `MASKING_FMAP: begin
//                     // TODO: ACP → find emax & align → cache
//                   end
//                   default: o_valid <= 1'b0;  // undefined
//                 endcase
//               end

//             end else begin
//               // non-override memcpy
//               // TODO
//             end

//             // Determine logic based on datatype and mask IN_inst.payload.memcpy.option_flags using bitwise AND (&)
//             if (IN_inst.payload.memcpy.datatype == `BF16) begin
//                 // Example: BF16 processing mode
//                 // Check if the 4th bit (ALIGN) is set to 1
//                 if ((IN_inst.payload.memcpy.option_flags & `MEMCPY_FLAG_BF16_ALIGN) != 4'b0000) begin
//                     OUT_align <= `TRUE;

//                     // Determine the alignment direction
//                     if ((IN_inst.payload.memcpy.option_flags & `MEMCPY_FLAG_BF16_ALIGN_V) != 4'b0000) begin
//                         OUT_align_dir <= `ALIGN_VERTICAL;
//                     end else if ((IN_inst.payload.memcpy.option_flags & `MEMCPY_FLAG_BF16_ALIGN_H) != 4'b0000) begin
//                         OUT_align_dir <= `ALIGN_HORIZONTAL;
//                     end else begin
//                         // Default direction if neither V nor H is specified
//                     end

//                 end else begin
//                     // If ALIGN flag is missing
//                     OUT_align <= `FALSE;
//                 end
//             end else begin
//                 // Example: INT processing mode
//                 if ((IN_inst.payload.memcpy.option_flags & `MEMCPY_OPT_INT_IS_SCALED) != 4'b0000) begin
//                     // Logic for scaled INT
//                     OUT_align <= `TRUE; // (Adjust according to your actual spec)
//                 end else begin
//                     OUT_align <= `FALSE;
//                 end
//             end

//             OUT_datatype <= IN_inst.payload.memcpy.datatype;

//           end
//           default: o_valid <= 1'b0;  // unknown opcode → drop
//         endcase
//       end
//     end
//   end

// endmodule

4. Global Scheduler

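Global_Scheduler.sv tracks in-flight async instructions, maintains the dependency scoreboard, and gates new dispatches when a hazard is detected. Note that the version in the listing below only translates decoded instructions into engine micro-ops (GEMM, GEMV, CVO, memory load/store, and memset); it contains no scoreboard or hazard-gating logic.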

Listing 10 hw/rtl/NPU_Controller/Global_Scheduler.sv
`timescale 1ns / 1ps
`include "GEMM_Array.svh"
`include "GLOBAL_CONST.svh"

import isa_pkg::*;

// ===| Global Scheduler |========================================================
// Translates decoded VLIW instructions into engine micro-ops.
//
// Single always_ff drives each output to avoid multiple-driver conflicts.
// Priority for OUT_LOAD_uop: GEMM > GEMV > MEMCPY > CVO (one active per cycle).
//
// OUT_STORE_uop  : registered at issue time; mem_dispatcher uses it to initiate
//                  result writeback after the engine signals completion.
// OUT_sram_rd_start : one-cycle pulse when a GEMM or GEMV load is dispatched,
//                     triggering preprocess_fmap to begin broadcasting from cache.
// ===============================================================================

module Global_Scheduler #() (
    input logic clk_core,
    input logic rst_n_core,

    // ===| From ctrl_npu_decoder |===============================================
    input logic IN_GEMV_op_x64_valid,
    input logic IN_GEMM_op_x64_valid,
    input logic IN_memcpy_op_x64_valid,
    input logic IN_memset_op_x64_valid,
    input logic IN_cvo_op_x64_valid,

    input instruction_op_x64_t instruction,

    // ===| Engine micro-ops |====================================================
    output gemm_control_uop_t   OUT_GEMM_uop,
    output GEMV_control_uop_t   OUT_GEMV_uop,
    output memory_control_uop_t OUT_LOAD_uop,
    output memory_control_uop_t OUT_STORE_uop,
    output memory_set_uop_t     OUT_mem_set_uop,
    output cvo_control_uop_t    OUT_CVO_uop,

    // ===| Datapath control |====================================================
    output logic OUT_sram_rd_start   // pulse: start fmap cache broadcast
);

  // ===| Typed views of the instruction body |===================================
  // The same body bits are reinterpreted as every instruction format in
  // parallel; the IN_*_valid inputs select which view is actually consumed.
  GEMV_op_x64_t   gemv_body;
  GEMM_op_x64_t   gemm_body;
  memcpy_op_x64_t memcpy_body;
  memset_op_x64_t memset_body;
  cvo_op_x64_t    cvo_body;

  assign gemv_body   = GEMV_op_x64_t'(instruction.instruction);
  assign gemm_body   = GEMM_op_x64_t'(instruction.instruction);
  assign memcpy_body = memcpy_op_x64_t'(instruction.instruction);
  assign memset_body = memset_op_x64_t'(instruction.instruction);
  assign cvo_body    = cvo_op_x64_t'(instruction.instruction);

  // ===| MEMCPY route translation |==============================================
  // Only the FROM_HOST -> TO_NPU combination selects from_host_to_L2; every
  // other from_device/to_device combination falls back to from_L2_to_host.
  data_route_e memcpy_path;
  always_comb begin
    memcpy_path = from_L2_to_host;
    if (memcpy_body.from_device == FROM_HOST && memcpy_body.to_device == TO_NPU)
      memcpy_path = from_host_to_L2;
  end

  // ===| LOAD uop + SRAM read-start pulse |======================================
  // Priority when several valids coincide: GEMM > GEMV > MEMCPY > CVO.
  // OUT_LOAD_uop keeps its previous value when no valid is asserted.
  // OUT_sram_rd_start is high only in the cycle after a GEMM or GEMV valid is
  // taken, and low otherwise (including during reset).
  always_ff @(posedge clk_core) begin
    OUT_sram_rd_start <= 1'b0;   // default; overridden below for GEMM / GEMV

    if (!rst_n_core) begin
      OUT_LOAD_uop <= '0;

    end else if (IN_GEMM_op_x64_valid) begin
      OUT_LOAD_uop <= '{
          async          : SYNC_OP,
          shape_ptr_addr : gemm_body.shape_ptr_addr,
          src_addr       : gemm_body.src_addr,
          dest_addr      : '0,
          data_dest      : from_L2_to_L1_GEMM
      };
      OUT_sram_rd_start <= 1'b1;

    end else if (IN_GEMV_op_x64_valid) begin
      OUT_LOAD_uop <= '{
          async          : SYNC_OP,
          shape_ptr_addr : gemv_body.shape_ptr_addr,
          src_addr       : gemv_body.src_addr,
          dest_addr      : '0,
          data_dest      : from_L2_to_L1_GEMV
      };
      OUT_sram_rd_start <= 1'b1;

    end else if (IN_memcpy_op_x64_valid) begin
      OUT_LOAD_uop <= '{
          async          : memcpy_body.async,
          shape_ptr_addr : memcpy_body.shape_ptr_addr,
          src_addr       : memcpy_body.src_addr,
          dest_addr      : memcpy_body.dest_addr,
          data_dest      : memcpy_path
      };

    end else if (IN_cvo_op_x64_valid) begin
      OUT_LOAD_uop <= '{
          async          : cvo_body.async,
          shape_ptr_addr : '0,
          src_addr       : cvo_body.src_addr,
          dest_addr      : '0,
          data_dest      : from_L2_to_CVO
      };
    end
  end

  // ===| STORE uop |=============================================================
  // Captured when a GEMM, GEMV or CVO instruction is issued (same priority
  // order) and held until the next such instruction or reset. MEMCPY and
  // MEMSET do not update it.
  always_ff @(posedge clk_core) begin
    if (!rst_n_core) begin
      OUT_STORE_uop <= '0;

    end else if (IN_GEMM_op_x64_valid) begin
      OUT_STORE_uop <= '{
          async          : SYNC_OP,
          shape_ptr_addr : gemm_body.shape_ptr_addr,
          src_addr       : '0,
          dest_addr      : gemm_body.dest_reg,
          data_dest      : from_GEMM_res_to_L2
      };

    end else if (IN_GEMV_op_x64_valid) begin
      OUT_STORE_uop <= '{
          async          : SYNC_OP,
          shape_ptr_addr : gemv_body.shape_ptr_addr,
          src_addr       : '0,
          dest_addr      : gemv_body.dest_reg,
          data_dest      : from_GEMV_res_to_L2
      };

    end else if (IN_cvo_op_x64_valid) begin
      OUT_STORE_uop <= '{
          async          : cvo_body.async,
          shape_ptr_addr : '0,
          src_addr       : '0,
          dest_addr      : cvo_body.dst_addr,
          data_dest      : from_CVO_res_to_L2
      };
    end
  end

  // ===| Engine-private uops : GEMM / GEMV / CVO / MEMSET |======================
  // Each register is updated independently by its own valid (no priority
  // between them) and otherwise holds its value; all four clear on reset.
  always_ff @(posedge clk_core) begin
    if (!rst_n_core) begin
      OUT_GEMM_uop    <= '0;
      OUT_GEMV_uop    <= '0;
      OUT_CVO_uop     <= '0;
      OUT_mem_set_uop <= '0;
    end else begin
      if (IN_GEMM_op_x64_valid) begin
        OUT_GEMM_uop <= '{
            parallel_lane : gemm_body.parallel_lane,
            size_ptr_addr : gemm_body.size_ptr_addr,
            flags         : gemm_body.flags
        };
      end

      if (IN_GEMV_op_x64_valid) begin
        OUT_GEMV_uop <= '{
            parallel_lane : gemv_body.parallel_lane,
            size_ptr_addr : gemv_body.size_ptr_addr,
            flags         : gemv_body.flags
        };
      end

      if (IN_cvo_op_x64_valid) begin
        OUT_CVO_uop <= '{
            async    : cvo_body.async,
            flags    : cvo_flags_t'(cvo_body.flags),
            length   : cvo_body.length,
            dst_addr : cvo_body.dst_addr,
            src_addr : cvo_body.src_addr,
            cvo_func : cvo_func_e'(cvo_body.cvo_func)
        };
      end

      if (IN_memset_op_x64_valid) begin
        OUT_mem_set_uop <= '{
            c_value    : memset_body.c_value,
            b_value    : memset_body.b_value,
            a_value    : memset_body.a_value,
            dest_addr  : memset_body.dest_addr,
            dest_cache : dest_cache_e'(memset_body.dest_cache)
        };
      end
    end
  end

endmodule

See also

Per-Instruction Dataflow — dependency and completion tracking.