Memory Control¶

This section covers all path-plumbing between the host DDR, the L2 URAM cache, the per-core L1 caches, and the CVO stream ingest. A dedicated dispatcher arbitrates the eight enumerated data_route_e routes (see the ISA §5), and the two constant-memory arrays feed the Matrix / Vector cores with shape and size descriptors.

Top-level plumbing¶

mem_dispatcher.sv — central arbiter: picks one of eight routes per cycle and drives the corresponding source / destination pair.
mem_L2_cache_fmap.sv — L2 URAM cache dedicated to feature maps (114,688 × 128-bit).
mem_HP_buffer.sv — four clock-domain-crossing weight FIFOs (one per HP port) queued between the HP-AXI slaves and the compute-core weight streams.
mem_CVO_stream_bridge.sv — bridge from the L2 cache into the CVO core’s streaming input.

mem_dispatcher.sv

`timescale 1ns / 1ps
`include "GEMM_Array.svh"
`include "GLOBAL_CONST.svh"
`include "mem_IO.svh"

import isa_pkg::*;

// ===| Memory Dispatcher |=======================================================
// Translates engine uops into L2 cache commands and routes data streams.
//
// Responsibilities:
//   - Shape constant RAM (MEMSET: fmap / weight array shapes)
//   - ACP DMA path   : host DDR4 ↔ L2 (MEMCPY host↔NPU)
//   - NPU burst path : L2 → GEMM fmap / GEMV fmap broadcast
//   - CVO stream     : L2 → CVO engine input; CVO output → L2
//                      (via mem_CVO_stream_bridge)
//
// Address convention for L2: 128-bit word units (word N = bytes [16N..16N+15]).
// ===============================================================================

module mem_dispatcher #() (
    input logic clk_core,
    input logic rst_n_core,
    input logic clk_axi,
    input logic rst_axi_n,

    // ===| AXI-Stream ACP (external) |==========================================
    axis_if.slave  S_AXIS_ACP_FMAP,
    axis_if.master M_AXIS_ACP_RESULT,

    // ===| Engine uop inputs |===================================================
    input  memory_control_uop_t IN_LOAD_uop,
    input  memory_set_uop_t     IN_mem_set_uop,
    input  cvo_control_uop_t    IN_CVO_uop,
    input  logic                IN_cvo_uop_valid,

    // ===| CVO streaming ports (to/from CVO_top) |==============================
    output logic [15:0] OUT_cvo_data,
    output logic        OUT_cvo_valid,
    input  logic        IN_cvo_data_ready,

    input  logic [15:0] IN_cvo_result,
    input  logic        IN_cvo_result_valid,
    output logic        OUT_cvo_result_ready,

    // ===| Status |=============================================================
    output logic OUT_fifo_full,
    output logic OUT_cvo_busy
);

  // ===| FIFO full aggregation |=================================================
  // OUT_fifo_full is raised when either command FIFO of the operation queue
  // reports full; OUT_cvo_busy mirrors the CVO bridge busy flag.
  logic acp_cmd_fifo_full;
  logic npu_cmd_fifo_full;
  logic cvo_bridge_busy;

  assign OUT_fifo_full = acp_cmd_fifo_full | npu_cmd_fifo_full;
  assign OUT_cvo_busy  = cvo_bridge_busy;

  // ===| Shape Constant RAM — FMap |=============================================
  // The write address is owned by the MEMSET handler and the read address by
  // the LOAD handler. They are separate registers because a variable written
  // from an always_ff process must not be written by any other process.
  logic        fmap_write_enable;
  logic [ 5:0] fmap_shape_write_address;  // MEMSET handler only
  logic [ 5:0] fmap_shape_read_address;   // LOAD handler only
  logic [16:0] fmap_arr_shape_X;
  logic [16:0] fmap_arr_shape_Y;
  logic [16:0] fmap_arr_shape_Z;
  logic [16:0] fmap_read_arr_shape_X;
  logic [16:0] fmap_read_arr_shape_Y;
  logic [16:0] fmap_read_arr_shape_Z;

  // ===| Shape Constant RAM — Weight |===========================================
  // Only the MEMSET handler touches the weight shape address, so one register
  // serves both the write and the read port of the weight shape RAM.
  logic        weight_write_enable;
  logic [ 5:0] weight_shape_read_address;
  logic [16:0] weight_arr_shape_X;
  logic [16:0] weight_arr_shape_Y;
  logic [16:0] weight_arr_shape_Z;
  logic [16:0] weight_read_arr_shape_X;
  logic [16:0] weight_read_arr_shape_Y;
  logic [16:0] weight_read_arr_shape_Z;

  // ===| MEMSET handler |========================================================
  // Registers the shape triple from IN_mem_set_uop and pulses the matching
  // write enable for one cycle. The write enables default to 0 every cycle.
  // NOTE(review): the uop carries no visible valid qualifier here, so a write
  // is issued on every cycle in which dest_cache selects a shape RAM — confirm
  // that the decoder only presents these codes for a single cycle.
  always_ff @(posedge clk_core) begin
    if (!rst_n_core) begin
      fmap_write_enable   <= 1'b0;
      weight_write_enable <= 1'b0;
    end else begin
      fmap_write_enable   <= 1'b0;
      weight_write_enable <= 1'b0;

      case (IN_mem_set_uop.dest_cache)
        data_to_fmap_shape: begin
          fmap_shape_write_address <= IN_mem_set_uop.dest_addr;
          fmap_arr_shape_X         <= IN_mem_set_uop.a_value;
          fmap_arr_shape_Y         <= IN_mem_set_uop.b_value;
          fmap_arr_shape_Z         <= IN_mem_set_uop.c_value;
          fmap_write_enable        <= 1'b1;
        end

        data_to_weight_shape: begin
          weight_shape_read_address <= IN_mem_set_uop.dest_addr;
          weight_arr_shape_X        <= IN_mem_set_uop.a_value;
          weight_arr_shape_Y        <= IN_mem_set_uop.b_value;
          weight_arr_shape_Z        <= IN_mem_set_uop.c_value;
          weight_write_enable       <= 1'b1;
        end

        default: ;
      endcase
    end
  end

  fmap_array_shape u_fmap_shape (
      .clk    (clk_core),
      .rst_n  (rst_n_core),
      .wr_en  (fmap_write_enable),
      .wr_addr(fmap_shape_write_address),
      .wr_val0(fmap_arr_shape_X),
      .wr_val1(fmap_arr_shape_Y),
      .wr_val2(fmap_arr_shape_Z),
      .rd_addr(fmap_shape_read_address),
      .rd_val0(fmap_read_arr_shape_X),
      .rd_val1(fmap_read_arr_shape_Y),
      .rd_val2(fmap_read_arr_shape_Z)
  );

  weight_array_shape u_weight_shape (
      .clk    (clk_core),
      .rst_n  (rst_n_core),
      .wr_en  (weight_write_enable),
      .wr_addr(weight_shape_read_address),
      .wr_val0(weight_arr_shape_X),
      .wr_val1(weight_arr_shape_Y),
      .wr_val2(weight_arr_shape_Z),
      .rd_addr(weight_shape_read_address),
      .rd_val0(weight_read_arr_shape_X),
      .rd_val1(weight_read_arr_shape_Y),
      .rd_val2(weight_read_arr_shape_Z)
  );

  // ===| Shape totals (word counts for DMA) |====================================
  logic [16:0] fmap_word_total;
  logic [16:0] weight_word_total;

  // Total BF16 elements → 128-bit words: ceil(X*Y*Z / 8).
  // The result is truncated to 17 bits by the assignment target.
  // weight_word_total is computed but not consumed inside this module.
  assign fmap_word_total   = (fmap_read_arr_shape_X   * fmap_read_arr_shape_Y   * fmap_read_arr_shape_Z   + 7) >> 3;
  assign weight_word_total = (weight_read_arr_shape_X * weight_read_arr_shape_Y * weight_read_arr_shape_Z + 7) >> 3;

  // ===| LOAD uop → ACP / NPU command translation |==============================
  // Builds a {write_en, base_addr, end_addr} command for the operation queue
  // and pulses the matching push strobe (IN_acp_rdy / IN_npu_rdy) for a cycle.
  // end_addr = base + fmap_word_total.
  //
  // NOTE(review): fmap_word_total is derived from the shape RAM output at the
  // *current* read address, while the new shape_ptr_addr is only registered in
  // this same cycle. The end address therefore appears to use the previously
  // selected shape — confirm the intended sequencing with the shape RAM's read
  // latency.
  // NOTE(review): as with MEMSET, no valid qualifier is visible on IN_LOAD_uop.
  logic     IN_acp_rdy;
  acp_uop_t acp_uop;

  logic     IN_npu_rdy;
  npu_uop_t npu_uop;

  always_ff @(posedge clk_core) begin
    if (!rst_n_core) begin
      IN_acp_rdy              <= 1'b0;
      IN_npu_rdy              <= 1'b0;
      fmap_shape_read_address <= '0;
    end else begin
      // Push strobes are single-cycle pulses.
      IN_acp_rdy <= 1'b0;
      IN_npu_rdy <= 1'b0;

      case (IN_LOAD_uop.data_dest)
        // Host DDR4 → L2 (feature map DMA in)
        from_host_to_L2: begin
          acp_uop <= '{
              write_en  : `PORT_MOD_E_WRITE,
              base_addr : IN_LOAD_uop.dest_addr,
              end_addr  : IN_LOAD_uop.dest_addr + 17'(fmap_word_total)
          };
          IN_acp_rdy              <= 1'b1;
          fmap_shape_read_address <= IN_LOAD_uop.shape_ptr_addr;
        end

        // L2 → host DDR4 (result DMA out)
        from_L2_to_host: begin
          acp_uop <= '{
              write_en  : `PORT_MOD_E_READ,
              base_addr : IN_LOAD_uop.src_addr,
              end_addr  : IN_LOAD_uop.src_addr + 17'(fmap_word_total)
          };
          IN_acp_rdy              <= 1'b1;
          fmap_shape_read_address <= IN_LOAD_uop.shape_ptr_addr;
        end

        // L2 → GEMM fmap broadcast and L2 → GEMV fmap broadcast build the
        // same read command on the NPU channel.
        from_L2_to_L1_GEMM,
        from_L2_to_L1_GEMV: begin
          npu_uop <= '{
              write_en  : `PORT_MOD_E_READ,
              base_addr : IN_LOAD_uop.src_addr,
              end_addr  : IN_LOAD_uop.src_addr + 17'(fmap_word_total)
          };
          IN_npu_rdy              <= 1'b1;
          fmap_shape_read_address <= IN_LOAD_uop.shape_ptr_addr;
        end

        // L2 → CVO input (handled by mem_CVO_stream_bridge below)
        from_L2_to_CVO: ;  // bridge watches IN_CVO_uop directly

        default: ;
      endcase
    end
  end

  // ===| Operation queues |======================================================
  // The queue presents a command as valid only while the matching cache port
  // reports not-busy; that valid is used directly as the cache start strobe.
  acp_uop_t OUT_acp_cmd;
  npu_uop_t OUT_npu_cmd;
  logic     OUT_acp_cmd_valid;
  logic     OUT_npu_cmd_valid;
  logic     acp_is_busy_wire;
  logic     npu_is_busy_wire;

  mem_u_operation_queue #() u_op_queue (
      .clk_core             (clk_core),
      .rst_n_core           (rst_n_core),
      .IN_acp_rdy           (IN_acp_rdy),
      .IN_acp_cmd           (acp_uop),
      .OUT_acp_cmd          (OUT_acp_cmd),
      .OUT_acp_cmd_valid    (OUT_acp_cmd_valid),
      .OUT_acp_cmd_fifo_full(acp_cmd_fifo_full),
      .IN_acp_is_busy       (acp_is_busy_wire),
      .IN_npu_rdy           (IN_npu_rdy),
      .IN_npu_cmd           (npu_uop),
      .OUT_npu_cmd          (OUT_npu_cmd),
      .OUT_npu_cmd_valid    (OUT_npu_cmd_valid),
      .OUT_npu_cmd_fifo_full(npu_cmd_fifo_full),
      .IN_npu_is_busy       (npu_is_busy_wire)
  );

  // ===| L2 cache controller |===================================================
  // Intended port-B arbitration: the CVO bridge wins while it is busy.
  //
  // NOTE(review): only the write-data leg of this mux (final_npu_wdata) is
  // connected to mem_GLOBAL_cache. The cache exposes no direct address / write
  // enable inputs, so cvo_l2_addr and cvo_l2_we never reach the URAM and the
  // bridge cannot currently steer port B. Completing this path needs extra
  // ports on mem_GLOBAL_cache.
  logic         cvo_l2_we;
  logic [ 16:0] cvo_l2_addr;
  logic [127:0] cvo_l2_wdata;
  logic [127:0] cvo_l2_rdata;

  logic         npu_l2_we;
  logic [ 16:0] npu_l2_addr;
  logic [127:0] npu_l2_wdata;
  logic [127:0] npu_l2_rdata;

  // No engine drives a direct port-B write in this module yet; tie the
  // placeholders off so the mux never propagates an undriven value.
  assign npu_l2_we    = 1'b0;
  assign npu_l2_addr  = '0;
  assign npu_l2_wdata = '0;

  logic         final_npu_we;
  logic [ 16:0] final_npu_addr;
  logic [127:0] final_npu_wdata;

  always_comb begin
    if (cvo_bridge_busy) begin
      final_npu_we    = cvo_l2_we;
      final_npu_addr  = cvo_l2_addr;
      final_npu_wdata = cvo_l2_wdata;
    end else begin
      final_npu_we    = npu_l2_we;
      final_npu_addr  = npu_l2_addr;
      final_npu_wdata = npu_l2_wdata;
    end
  end

  // The bridge observes the same port-B read bus as the NPU path.
  assign cvo_l2_rdata = npu_l2_rdata;

  mem_GLOBAL_cache #() u_l2_cache (
      .clk_core         (clk_core),
      .rst_n_core       (rst_n_core),
      .clk_axi          (clk_axi),
      .rst_axi_n        (rst_axi_n),

      .S_AXIS_ACP_FMAP  (S_AXIS_ACP_FMAP),
      .M_AXIS_ACP_RESULT(M_AXIS_ACP_RESULT),

      // ACP control
      .IN_acp_write_en  (OUT_acp_cmd.write_en),
      .IN_acp_base_addr (OUT_acp_cmd.base_addr),
      .IN_acp_end_addr  (OUT_acp_cmd.end_addr),
      .IN_acp_rx_start  (OUT_acp_cmd_valid),
      .OUT_acp_is_busy  (acp_is_busy_wire),

      // NPU port B (address sequencing is done inside the cache controller)
      .IN_npu_write_en  (OUT_npu_cmd.write_en),
      .IN_npu_base_addr (OUT_npu_cmd.base_addr),
      .IN_npu_end_addr  (OUT_npu_cmd.end_addr),
      .IN_npu_rx_start  (OUT_npu_cmd_valid),
      .OUT_npu_is_busy  (npu_is_busy_wire),

      .IN_npu_wdata     (final_npu_wdata),
      .OUT_npu_rdata    (npu_l2_rdata)
  );

  // ===| CVO Stream Bridge |=====================================================
  logic cvo_bridge_done;  // completion pulse; not consumed inside this module

  mem_CVO_stream_bridge u_cvo_bridge (
      .clk                (clk_core),
      .rst_n              (rst_n_core),

      .IN_cvo_uop         (IN_CVO_uop),
      .IN_cvo_uop_valid   (IN_cvo_uop_valid),
      .OUT_busy           (cvo_bridge_busy),
      .OUT_done           (cvo_bridge_done),

      // L2 port B direct access
      .OUT_l2_we          (cvo_l2_we),
      .OUT_l2_addr        (cvo_l2_addr),
      .OUT_l2_wdata       (cvo_l2_wdata),
      .IN_l2_rdata        (cvo_l2_rdata),

      // CVO data stream
      .OUT_cvo_data       (OUT_cvo_data),
      .OUT_cvo_valid      (OUT_cvo_valid),
      .IN_cvo_data_ready  (IN_cvo_data_ready),

      // CVO result stream
      .IN_cvo_result        (IN_cvo_result),
      .IN_cvo_result_valid  (IN_cvo_result_valid),
      .OUT_cvo_result_ready (OUT_cvo_result_ready)
  );

endmodule

mem_L2_cache_fmap.sv

`timescale 1ns / 1ps
`include "GLOBAL_CONST.svh"

// ===| L2 Feature Map & KV Cache (URAM) |========================================
// True dual-port URAM: Depth x 128-bit wide.
//   Default Depth = 114688 entries = 1.75 MB
//
// Port A — ACP DMA path  (host DDR4 ↔ L2 via ACP)
// Port B — NPU compute   (GEMM / GEMV / CVO streaming R/W)
//
// READ_LATENCY = 3 (URAM registered output, meets 400 MHz timing)
// WRITE_MODE   = no_change: an UltraRAM port performs either a read or a write
//                in a given cycle, so the read output simply holds its last
//                value while that port is writing. Do not rely on read data
//                during a write cycle of the same port.
//
// Both ports share clk_core (UltraRAM is a single-clock primitive).
// The address ports are fixed at 17 bits, which covers the default Depth;
// a Depth above 2^17 would not be addressable through this wrapper.
// ===============================================================================

module mem_L2_cache_fmap #(
    parameter int Depth = 114688   // 128-bit word entries (1.75 MB)
) (
    input  logic        clk_core,
    input  logic        rst_n_core,

    // ===| Port A — ACP host DMA |================================================
    input  logic         IN_acp_we,
    input  logic  [16:0] IN_acp_addr,
    input  logic [127:0] IN_acp_wdata,
    output logic [127:0] OUT_acp_rdata,

    // ===| Port B — NPU compute engines |=========================================
    input  logic         IN_npu_we,
    input  logic  [16:0] IN_npu_addr,
    input  logic [127:0] IN_npu_wdata,
    output logic [127:0] OUT_npu_rdata
);

  xpm_memory_tdpram #(
      // ===| Geometry |===
      .ADDR_WIDTH_A       (17),
      .ADDR_WIDTH_B       (17),
      .DATA_WIDTH_A       (128),
      .DATA_WIDTH_B       (128),
      // Byte-write width equals the data width, so each port has a single
      // write-enable bit covering the whole 128-bit word.
      .BYTE_WRITE_WIDTH_A (128),
      .BYTE_WRITE_WIDTH_B (128),
      .MEMORY_SIZE        (128 * Depth),  // total size in bits

      // ===| Implementation |===
      .MEMORY_PRIMITIVE   ("ultra"),      // Force URAM on UltraScale+
      .CLOCKING_MODE      ("common_clock"),
      .READ_LATENCY_A     (3),
      .READ_LATENCY_B     (3),
      // UltraRAM in true-dual-port mode has no write-first / read-first
      // behaviour; "no_change" is the mode that matches the primitive.
      .WRITE_MODE_A       ("no_change"),
      .WRITE_MODE_B       ("no_change"),

      // ===| Init / Misc |===
      .MEMORY_INIT_FILE   ("none"),
      .MEMORY_INIT_PARAM  ("0"),
      .USE_MEM_INIT       (0),
      .AUTO_SLEEP_TIME    (0),
      .WAKEUP_TIME        ("disable_sleep"),
      .ECC_MODE           ("no_ecc"),
      .USE_EMBEDDED_CONSTRAINT(0)
  ) u_l2_uram (
      // Port A — always enabled; reset is the inverted active-low core reset
      .clka           (clk_core),
      .rsta           (~rst_n_core),
      .ena            (1'b1),
      .wea            (IN_acp_we),
      .addra          (IN_acp_addr),
      .dina           (IN_acp_wdata),
      .douta          (OUT_acp_rdata),
      .regcea         (1'b1),
      .injectsbiterra (1'b0),
      .injectdbiterra (1'b0),
      .sbiterra       (),
      .dbiterra       (),

      // Port B — same clock and reset as port A
      .clkb           (clk_core),
      .rstb           (~rst_n_core),
      .enb            (1'b1),
      .web            (IN_npu_we),
      .addrb          (IN_npu_addr),
      .dinb           (IN_npu_wdata),
      .doutb          (OUT_npu_rdata),
      .regceb         (1'b1),
      .injectsbiterrb (1'b0),
      .injectdbiterrb (1'b0),
      .sbiterrb       (),
      .dbiterrb       ()
  );

endmodule

mem_HP_buffer.sv

`timescale 1ns / 1ps
`include "GEMM_Array.svh"
`include "GLOBAL_CONST.svh"

// Weight ingest buffer: one asynchronous AXI-Stream FIFO per HP port, moving
// 128-bit weight beats from the AXI clock domain into the core clock domain.
// Only tdata / tvalid / tready are carried across; sideband signals of the
// AXI-Stream interfaces are not connected.
module mem_HP_buffer #(
) (
    // ===| Clock & Reset |======================================
    input logic clk_core,  // 400MHz
    input logic rst_n_core,  // not used by the FIFOs, which reset from rst_axi_n
    input logic clk_axi,  // 250MHz
    input logic rst_axi_n,

    // ===| HP Ports (Weight) - AXI Side |=======================
    axis_if.slave S_AXI_HP0_WEIGHT,
    axis_if.slave S_AXI_HP1_WEIGHT,
    axis_if.slave S_AXI_HP2_WEIGHT,
    axis_if.slave S_AXI_HP3_WEIGHT,

    // ===| Weight Stream - Core Side (To L1 or Dispatcher) |====
    axis_if.master M_CORE_HP0_WEIGHT,
    axis_if.master M_CORE_HP1_WEIGHT,
    axis_if.master M_CORE_HP2_WEIGHT,
    axis_if.master M_CORE_HP3_WEIGHT
);

  // Depth of every weight FIFO: 4096 beats x 128 bit = 64 KB per FIFO.
  localparam int HP_FIFO_DEPTH = 4096;
  localparam int HP_FIFO_WIDTH = 128;

  // The FIFOs cross from clk_axi to clk_core, so they must be asynchronous.
  // UltraRAM is a single-clock primitive and cannot back an independent-clock
  // XPM FIFO, hence block RAM is selected here.
  // NOTE(review): this trades URAM for BRAM; revisit HP_FIFO_DEPTH if block RAM
  // is tight, or stage a small async FIFO in front of a common-clock URAM FIFO.
  localparam HP_FIFO_MEMORY_TYPE = "block";
  localparam HP_FIFO_CLOCKING    = "independent_clock";

  // [1] HP0 weight FIFO
  xpm_fifo_axis #(
      .FIFO_DEPTH      (HP_FIFO_DEPTH),
      .TDATA_WIDTH     (HP_FIFO_WIDTH),
      .FIFO_MEMORY_TYPE(HP_FIFO_MEMORY_TYPE),
      .CLOCKING_MODE   (HP_FIFO_CLOCKING)
  ) u_hp0_weight_fifo (
      // write side — AXI clock domain
      .s_aclk       (clk_axi),
      .s_aresetn    (rst_axi_n),
      .s_axis_tdata (S_AXI_HP0_WEIGHT.tdata),
      .s_axis_tvalid(S_AXI_HP0_WEIGHT.tvalid),
      .s_axis_tready(S_AXI_HP0_WEIGHT.tready),

      // read side — core clock domain
      .m_aclk       (clk_core),
      .m_axis_tdata (M_CORE_HP0_WEIGHT.tdata),
      .m_axis_tvalid(M_CORE_HP0_WEIGHT.tvalid),
      .m_axis_tready(M_CORE_HP0_WEIGHT.tready)
  );

  // [2] HP1 weight FIFO
  xpm_fifo_axis #(
      .FIFO_DEPTH      (HP_FIFO_DEPTH),
      .TDATA_WIDTH     (HP_FIFO_WIDTH),
      .FIFO_MEMORY_TYPE(HP_FIFO_MEMORY_TYPE),
      .CLOCKING_MODE   (HP_FIFO_CLOCKING)
  ) u_hp1_weight_fifo (
      .s_aclk       (clk_axi),
      .s_aresetn    (rst_axi_n),
      .s_axis_tdata (S_AXI_HP1_WEIGHT.tdata),
      .s_axis_tvalid(S_AXI_HP1_WEIGHT.tvalid),
      .s_axis_tready(S_AXI_HP1_WEIGHT.tready),

      .m_aclk       (clk_core),
      .m_axis_tdata (M_CORE_HP1_WEIGHT.tdata),
      .m_axis_tvalid(M_CORE_HP1_WEIGHT.tvalid),
      .m_axis_tready(M_CORE_HP1_WEIGHT.tready)
  );

  // [3] HP2 weight FIFO
  xpm_fifo_axis #(
      .FIFO_DEPTH      (HP_FIFO_DEPTH),
      .TDATA_WIDTH     (HP_FIFO_WIDTH),
      .FIFO_MEMORY_TYPE(HP_FIFO_MEMORY_TYPE),
      .CLOCKING_MODE   (HP_FIFO_CLOCKING)
  ) u_hp2_weight_fifo (
      .s_aclk       (clk_axi),
      .s_aresetn    (rst_axi_n),
      .s_axis_tdata (S_AXI_HP2_WEIGHT.tdata),
      .s_axis_tvalid(S_AXI_HP2_WEIGHT.tvalid),
      .s_axis_tready(S_AXI_HP2_WEIGHT.tready),

      .m_aclk       (clk_core),
      .m_axis_tdata (M_CORE_HP2_WEIGHT.tdata),
      .m_axis_tvalid(M_CORE_HP2_WEIGHT.tvalid),
      .m_axis_tready(M_CORE_HP2_WEIGHT.tready)
  );

  // [4] HP3 weight FIFO
  xpm_fifo_axis #(
      .FIFO_DEPTH      (HP_FIFO_DEPTH),
      .TDATA_WIDTH     (HP_FIFO_WIDTH),
      .FIFO_MEMORY_TYPE(HP_FIFO_MEMORY_TYPE),
      .CLOCKING_MODE   (HP_FIFO_CLOCKING)
  ) u_hp3_weight_fifo (
      .s_aclk       (clk_axi),
      .s_aresetn    (rst_axi_n),
      .s_axis_tdata (S_AXI_HP3_WEIGHT.tdata),
      .s_axis_tvalid(S_AXI_HP3_WEIGHT.tvalid),
      .s_axis_tready(S_AXI_HP3_WEIGHT.tready),

      .m_aclk       (clk_core),
      .m_axis_tdata (M_CORE_HP3_WEIGHT.tdata),
      .m_axis_tvalid(M_CORE_HP3_WEIGHT.tvalid),
      .m_axis_tready(M_CORE_HP3_WEIGHT.tready)
  );

endmodule

mem_CVO_stream_bridge.sv

`timescale 1ns / 1ps
`include "GLOBAL_CONST.svh"

import isa_pkg::*;

// ===| CVO L2 Stream Bridge |====================================================
// Bridges the 128-bit L2 port B to the 16-bit BF16 streaming interface of CVO_top.
//
// Operation flow:
//   Phase 1 — READ : one 128-bit read at a time from
//                    L2[src_addr..src_addr+N_words-1] → deserialise 8 x 16-bit
//                    per word → stream to CVO engine.
//                    CVO results are buffered in an internal XPM FIFO.
//                    The phase ends once every element has been fed AND
//                    `length` results have been captured.
//   Phase 2 — WRITE: drain FIFO → pack 8 x 16-bit → 128-bit words
//                    → write to L2[dst_addr..dst_addr+N_words-1].
//                    A final partial word is written with its unused upper
//                    lanes set to zero.
//
// The bridge expects exactly `length` results per uop; a uop with length 0
// completes immediately with a done pulse.
//
// L2 address unit  : 128-bit words.  base_addr N ↔ bytes [16*N .. 16*N+15].
// L2 read latency  : 3 clocks (URAM READ_LATENCY_B = 3).
// Max vector length: 2048 elements (16-bit each = 32 Kb = 4 KB → fits in 1 BRAM36).
// ===============================================================================

module mem_CVO_stream_bridge (
    input logic clk,
    input logic rst_n,

    // ===| Dispatch from mem_dispatcher |========================================
    input  cvo_control_uop_t IN_cvo_uop,
    input  logic             IN_cvo_uop_valid,  // sampled only while idle
    output logic             OUT_busy,
    output logic             OUT_done,          // one-cycle completion pulse

    // ===| L2 port B direct interface (128-bit) |================================
    // Single-address mux: write takes priority over read.
    output logic         OUT_l2_we,
    output logic [ 16:0] OUT_l2_addr,
    output logic [127:0] OUT_l2_wdata,
    input  logic [127:0] IN_l2_rdata,   // valid 3 cycles after OUT_l2_addr+~we

    // ===| CVO data stream (to CVO_top.IN_data) |=================================
    output logic [15:0] OUT_cvo_data,
    output logic        OUT_cvo_valid,
    input  logic        IN_cvo_data_ready,

    // ===| CVO result stream (from CVO_top.OUT_result) |==========================
    input  logic [15:0] IN_cvo_result,
    input  logic        IN_cvo_result_valid,
    output logic        OUT_cvo_result_ready
);

  // Clocks between presenting a read address and the data being usable.
  localparam int L2ReadLatency = 3;

  // ===| State Machine |=========================================================
  typedef enum logic [1:0] {
    ST_IDLE  = 2'b00,
    ST_READ  = 2'b01,  // reading L2 → CVO (buffering outputs)
    ST_WRITE = 2'b10,  // draining buffer → L2
    ST_DONE  = 2'b11
  } bridge_state_e;

  bridge_state_e state;

  // ===| Latched UOP |===========================================================
  logic [16:0] rd_base;      // L2 word address of src
  logic [16:0] wr_base;      // L2 word address of dst
  logic [15:0] total_elems;  // CVO length (elements)
  logic [13:0] total_words;  // ceil(total_elems / 8)

  // Computed from the upper bits plus a round-up bit so that the 16-bit
  // "+7" cannot wrap for lengths close to 65535.
  assign total_words = {1'b0, total_elems[15:3]} + {13'd0, (|total_elems[2:0])};

  // ===| Read-side state |=======================================================
  logic [ 13:0] rd_word_cnt;   // words requested so far
  logic [  2:0] rd_elem_idx;   // current element within 128-bit deser buffer
  logic [127:0] rd_deser_buf;  // latched 128-bit L2 word
  logic         rd_buf_valid;  // deser buffer holds valid data
  logic [ 15:0] elems_fed;     // elements delivered to CVO

  // Read tracking shift register.
  //   bit [0]             : address phase (OUT_l2_addr is driven this cycle)
  //   bit [L2ReadLatency] : IN_l2_rdata is valid and is captured this cycle
  logic [L2ReadLatency:0] rd_lat_pipe;
  logic                   rd_in_flight;

  assign rd_in_flight = |rd_lat_pipe;

  // ===| Result-capture state |==================================================
  logic [ 15:0] elems_captured;  // results pushed into the FIFO

  // ===| Write-side state |======================================================
  logic [  2:0] wr_elem_idx;      // lane that the next result is packed into
  logic [127:0] wr_ser_buf;       // packing buffer
  logic [ 13:0] wr_word_cnt;      // words written so far
  logic [ 15:0] elems_result;     // results drained from FIFO
  logic         fifo_dout_valid;  // fifo_dout carries a freshly read entry
  logic         wr_flush;         // wr_ser_buf holds a complete word to write

  // ===| Output FIFO (CVO results → write buffer) |==============================
  // XPM FIFO sync, depth=2048, width=16 bit (max 32 Kb = 1 BRAM36)
  logic         fifo_wr_en;
  logic         fifo_rd_en;
  logic [ 15:0] fifo_dout;
  logic         fifo_empty;
  logic         fifo_full;

  // Results are accepted only during READ and only while the FIFO has room;
  // a push happens exactly when the handshake completes.
  assign OUT_cvo_result_ready = ~fifo_full && (state == ST_READ);
  assign fifo_wr_en           = IN_cvo_result_valid && OUT_cvo_result_ready;

  // Standard read mode: dout becomes valid one clock after rd_en.
  assign fifo_rd_en = (state == ST_WRITE) && !fifo_empty;

  xpm_fifo_sync #(
      .FIFO_DEPTH       (2048),
      .WRITE_DATA_WIDTH (16),
      .READ_DATA_WIDTH  (16),
      .FIFO_MEMORY_TYPE ("block"),
      .READ_MODE        ("std"),
      .FIFO_READ_LATENCY(1),
      .FULL_RESET_VALUE (0)
  ) u_result_fifo (
      .sleep        (1'b0),
      .rst          (~rst_n),
      .wr_clk       (clk),   // the synchronous FIFO has a single clock port
      .wr_en        (fifo_wr_en),
      .din          (IN_cvo_result),
      .rd_en        (fifo_rd_en),
      .dout         (fifo_dout),
      .empty        (fifo_empty),
      .full         (fifo_full),
      .injectsbiterr(1'b0),
      .injectdbiterr(1'b0)
  );

  // ===| Main FSM |==============================================================
  always_ff @(posedge clk) begin
    if (!rst_n) begin
      state           <= ST_IDLE;
      rd_base         <= '0;
      wr_base         <= '0;
      total_elems     <= '0;
      rd_word_cnt     <= '0;
      rd_elem_idx     <= '0;
      rd_deser_buf    <= '0;
      rd_buf_valid    <= 1'b0;
      rd_lat_pipe     <= '0;
      elems_fed       <= '0;
      elems_captured  <= '0;
      wr_elem_idx     <= '0;
      wr_ser_buf      <= '0;
      wr_word_cnt     <= '0;
      elems_result    <= '0;
      fifo_dout_valid <= 1'b0;
      wr_flush        <= 1'b0;
      OUT_done        <= 1'b0;
    end else begin
      OUT_done <= 1'b0;

      case (state)
        // ===| IDLE: latch uop, convert element addresses to word addresses |===
        ST_IDLE: begin
          if (IN_cvo_uop_valid) begin
            // src/dst are element (16-bit) addresses; divide by 8 for 128-bit words
            rd_base         <= 17'(IN_cvo_uop.src_addr >> 3);
            wr_base         <= 17'(IN_cvo_uop.dst_addr >> 3);
            total_elems     <= IN_cvo_uop.length;
            rd_word_cnt     <= '0;
            rd_elem_idx     <= '0;
            rd_buf_valid    <= 1'b0;
            rd_lat_pipe     <= '0;
            elems_fed       <= '0;
            elems_captured  <= '0;
            wr_elem_idx     <= '0;
            wr_ser_buf      <= '0;
            wr_word_cnt     <= '0;
            elems_result    <= '0;
            fifo_dout_valid <= 1'b0;
            wr_flush        <= 1'b0;
            // Nothing to stream for an empty vector: finish immediately
            // instead of waiting for results that will never arrive.
            state           <= (IN_cvo_uop.length == '0) ? ST_DONE : ST_READ;
          end
        end

        // ===| READ: stream L2 → CVO; capture results in FIFO |================
        ST_READ: begin
          // Advance the read tracking shift register.
          rd_lat_pipe <= {rd_lat_pipe[L2ReadLatency-1:0], 1'b0};

          // Request the next word only when the deser buffer is empty and no
          // earlier request is still travelling through the latency pipe.
          if (!rd_buf_valid && !rd_in_flight && rd_word_cnt < total_words) begin
            rd_lat_pipe[0] <= 1'b1;  // address is driven in the next cycle
            rd_word_cnt    <= rd_word_cnt + 14'd1;
          end

          // Capture the L2 word once the full read latency has elapsed.
          if (rd_lat_pipe[L2ReadLatency]) begin
            rd_deser_buf <= IN_l2_rdata;
            rd_buf_valid <= 1'b1;
            rd_elem_idx  <= 3'd0;
          end

          // Feed CVO one element per accepted handshake.
          if (rd_buf_valid && IN_cvo_data_ready) begin
            rd_elem_idx <= rd_elem_idx + 3'd1;
            elems_fed   <= elems_fed + 16'd1;
            // Buffer is exhausted after lane 7 or after the very last element.
            if (rd_elem_idx == 3'd7 || elems_fed + 16'd1 == total_elems) begin
              rd_buf_valid <= 1'b0;
            end
          end

          // Count results as they enter the FIFO.
          if (fifo_wr_en) begin
            elems_captured <= elems_captured + 16'd1;
          end

          // Leave READ only when every element was fed and every result stored.
          if (elems_fed == total_elems && elems_captured == total_elems) begin
            state <= ST_WRITE;
          end
        end

        // ===| WRITE: drain FIFO → L2 |=========================================
        ST_WRITE: begin
          fifo_dout_valid <= fifo_rd_en;  // dout follows rd_en by one clock
          wr_flush        <= 1'b0;

          if (fifo_dout_valid) begin
            // Lane 0 starts a new word and clears the remaining lanes so that
            // a final partial word is zero-padded.
            if (wr_elem_idx == 3'd0) begin
              wr_ser_buf <= {112'd0, fifo_dout};
            end else begin
              wr_ser_buf[wr_elem_idx*16+:16] <= fifo_dout;
            end
            elems_result <= elems_result + 16'd1;

            // Word complete: all 8 lanes filled, or this was the last result.
            if (wr_elem_idx == 3'd7 || elems_result + 16'd1 == total_elems) begin
              wr_elem_idx <= 3'd0;
              wr_flush    <= 1'b1;
            end else begin
              wr_elem_idx <= wr_elem_idx + 3'd1;
            end
          end

          // The word is presented on the L2 port during the wr_flush cycle.
          if (wr_flush) begin
            wr_word_cnt <= wr_word_cnt + 14'd1;
            if (elems_result == total_elems) begin
              state <= ST_DONE;
            end
          end
        end

        // ===| DONE: pulse, return to IDLE |====================================
        ST_DONE: begin
          OUT_done <= 1'b1;
          state    <= ST_IDLE;
        end

        default: state <= ST_IDLE;
      endcase
    end
  end

  // ===| L2 port B output mux ===================================================
  // Priority: write (WRITE phase) > read (READ phase)
  always_comb begin
    OUT_l2_we    = 1'b0;
    OUT_l2_addr  = '0;
    OUT_l2_wdata = '0;

    if (state == ST_WRITE && wr_flush) begin
      // Write the packed 128-bit word to dst
      OUT_l2_we    = 1'b1;
      OUT_l2_addr  = wr_base + 17'(wr_word_cnt);
      OUT_l2_wdata = wr_ser_buf;
    end else if (state == ST_READ && rd_lat_pipe[0]) begin
      // Address phase of a read; rd_word_cnt was already incremented when the
      // request was accepted, hence the -1.
      OUT_l2_addr = rd_base + 17'(rd_word_cnt - 14'd1);
    end
  end

  // ===| CVO data output ========================================================
  // Select the current 16-bit lane of the deser buffer (lane 0 = bits [15:0]).
  always_comb begin
    OUT_cvo_data  = rd_deser_buf[rd_elem_idx*16+:16];
    OUT_cvo_valid = rd_buf_valid && (state == ST_READ);
  end

  // ===| Status |================================================================
  assign OUT_busy = (state != ST_IDLE);

endmodule

Memory modules¶

mem_GLOBAL_cache.sv — L2 cache controller: wraps the mem_L2_cache_fmap URAM and adds the ACP DMA (port A) and NPU (port B) address state machines.
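mem_BUFFER.sv — buffer block; in the listings on this page it is instantiated by mem_GLOBAL_cache as the clock-domain-crossing stage for the ACP streams (AXI clock ↔ core clock).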

mem_GLOBAL_cache.sv

`timescale 1ns / 1ps
`include "GEMM_Array.svh"
`include "GLOBAL_CONST.svh"
`include "mem_IO.svh"

import isa_pkg::*;

// ===| Global L2 Cache Controller |==============================================
// Wraps mem_L2_cache_fmap (URAM) and provides two access controllers:
//
//   Port A — ACP DMA  : host DDR4 ↔ L2 via AXI-Stream (with CDC FIFO via mem_BUFFER)
//   Port B — NPU      : compute engines (GEMM / GEMV / CVO) streaming R/W
//
// Arbitration: Port B is driven externally via IN_npu_* signals.
// Address unit: 128-bit words (address 0 = first 128-bit line).
// ===============================================================================

module mem_GLOBAL_cache (
    input logic clk_core,
    input logic rst_n_core,
    input logic clk_axi,
    input logic rst_axi_n,

    // ===| AXI-Stream ACP (external, AXI clock domain) |========================
    axis_if.slave  S_AXIS_ACP_FMAP,    // feature map in  (128-bit, from PS/DDR4)
    axis_if.master M_AXIS_ACP_RESULT,  // result out       (128-bit, to PS/DDR4)

    // ===| Port A — ACP DMA control |============================================
    input  logic        IN_acp_write_en,   // 1=write (DDR→L2), 0=read (L2→DDR)
    input  logic [16:0] IN_acp_base_addr,
    input  logic        IN_acp_rx_start,   // start ACP transfer (ignored while busy)
    input  logic [16:0] IN_acp_end_addr,   // exclusive end word address
    output logic        OUT_acp_is_busy,

    // ===| Port B — NPU compute direct access |==================================
    input  logic         IN_npu_write_en,
    input  logic  [16:0] IN_npu_base_addr,
    input  logic         IN_npu_rx_start,  // start NPU transfer (ignored while busy)
    input  logic  [16:0] IN_npu_end_addr,  // exclusive end word address
    output logic         OUT_npu_is_busy,

    input  logic [127:0] IN_npu_wdata,
    output logic [127:0] OUT_npu_rdata
);

  // ===| ACP CDC FIFO (AXI → Core clock) |=======================================
  // core_acp_rx_bus: host → L2 beats, already in the core clock domain.
  // core_acp_tx_bus: L2 → host beats, produced in the core clock domain.
  axis_if #(.DATA_WIDTH(128)) core_acp_rx_bus ();
  axis_if #(.DATA_WIDTH(128)) core_acp_tx_bus ();

  mem_BUFFER u_acp_cdc (
      .clk_core         (clk_core),
      .rst_n_core       (rst_n_core),
      .clk_axi          (clk_axi),
      .rst_axi_n        (rst_axi_n),
      .S_AXIS_ACP_FMAP  (S_AXIS_ACP_FMAP),
      .M_AXIS_ACP_RESULT(M_AXIS_ACP_RESULT),
      .M_CORE_ACP_RX    (core_acp_rx_bus),
      .S_CORE_ACP_TX    (core_acp_tx_bus)
  );

  // ===| Port A — ACP state machine (core clock domain) |=======================
  logic [16:0] acp_ptr;       // current L2 word address
  logic        acp_write_en;  // direction latched at transfer start
  logic        acp_is_busy;
  logic [16:0] acp_end_addr;  // transfer stops once acp_ptr + 1 >= acp_end_addr

  assign OUT_acp_is_busy = acp_is_busy;

  // One "beat" moves the pointer by one word.
  //   write: a host word is available and is stored at acp_ptr
  //   read : the outgoing stream is ready, so the word at acp_ptr is fetched
  logic acp_wr_beat;
  logic acp_rd_beat;

  assign acp_wr_beat = acp_is_busy &  acp_write_en & core_acp_rx_bus.tvalid;
  assign acp_rd_beat = acp_is_busy & ~acp_write_en & core_acp_tx_bus.tready;

  // ACP read pipeline: URAM READ_LATENCY=3.
  // A valid is injected only for cycles in which the pointer actually
  // advances, so every fetched address yields exactly one output beat.
  // NOTE(review): beats already inside this 3-stage pipe are not held if
  // tready drops before they emerge; the downstream buffer must be able to
  // absorb up to three more beats after deasserting tready — confirm against
  // mem_BUFFER.
  logic [2:0] acp_rd_valid_pipe;

  always_ff @(posedge clk_core) begin
    if (!rst_n_core) begin
      acp_rd_valid_pipe <= 3'b000;
    end else begin
      acp_rd_valid_pipe <= {acp_rd_valid_pipe[1:0], acp_rd_beat};
    end
  end

  assign core_acp_tx_bus.tvalid = acp_rd_valid_pipe[2];
  assign core_acp_tx_bus.tkeep  = '1;
  assign core_acp_tx_bus.tlast  = 1'b0;  // no packet boundary is generated

  // Host words are consumed only during an active write transfer.
  assign core_acp_rx_bus.tready = acp_is_busy & acp_write_en;

  always_ff @(posedge clk_core) begin
    if (!rst_n_core) begin
      acp_ptr      <= '0;
      acp_end_addr <= '0;
      acp_is_busy  <= 1'b0;
      acp_write_en <= 1'b0;
    end else if (acp_is_busy) begin
      if (acp_wr_beat || acp_rd_beat) begin
        acp_ptr <= acp_ptr + 17'd1;
        // The beat being processed is the last one of the transfer.
        if (acp_ptr + 17'd1 >= acp_end_addr) acp_is_busy <= 1'b0;
      end
    end else if (IN_acp_rx_start) begin
      acp_ptr      <= IN_acp_base_addr;
      acp_end_addr <= IN_acp_end_addr;
      acp_is_busy  <= 1'b1;
      acp_write_en <= IN_acp_write_en;
    end
  end

  // ===| Port B — NPU state machine |============================================
  // Free-running address sweep: once started, the pointer advances every
  // cycle from the base address until it reaches the end address.
  logic [16:0] npu_ptr;
  logic        npu_write_en;
  logic        npu_is_busy;
  logic [16:0] npu_end_addr;

  assign OUT_npu_is_busy = npu_is_busy;

  always_ff @(posedge clk_core) begin
    if (!rst_n_core) begin
      npu_ptr      <= '0;
      npu_end_addr <= '0;
      npu_is_busy  <= 1'b0;
      npu_write_en <= 1'b0;
    end else if (npu_is_busy) begin
      npu_ptr <= npu_ptr + 17'd1;
      if (npu_ptr + 17'd1 >= npu_end_addr) npu_is_busy <= 1'b0;
    end else if (IN_npu_rx_start) begin
      npu_ptr      <= IN_npu_base_addr;
      npu_end_addr <= IN_npu_end_addr;
      npu_is_busy  <= 1'b1;
      npu_write_en <= IN_npu_write_en;
    end
  end

  // ===| L2 URAM |===============================================================
  // Port A → ACP DMA (write when host→L2, read when L2→host)
  // Port B → NPU compute (address generated by the NPU state machine above)
  mem_L2_cache_fmap #(
      .Depth(114688)
  ) u_l2_uram (
      .clk_core    (clk_core),
      .rst_n_core  (rst_n_core),

      // Port A — ACP. The write strobe is qualified with the busy flag so that
      // a host word waiting on the stream after a finished write transfer
      // cannot overwrite the word at the final pointer position.
      .IN_acp_we    (acp_wr_beat),
      .IN_acp_addr  (acp_ptr),
      .IN_acp_wdata (core_acp_rx_bus.tdata),
      .OUT_acp_rdata(core_acp_tx_bus.tdata),

      // Port B — NPU compute
      .IN_npu_we    (npu_write_en & npu_is_busy),
      .IN_npu_addr  (npu_ptr),
      .IN_npu_wdata (IN_npu_wdata),
      .OUT_npu_rdata(OUT_npu_rdata)
  );

endmodule

Constant memory (shape / size)¶

fmap_array_shape.sv — small constant-memory array holding feature map shape descriptors referenced by shape_ptr_addr.
weight_array_shape.sv — same structure, for weight shapes.

IO¶

mem_IO.svh — AXI / ACP pin-level types and parameters.
mem_u_operation_queue.sv — micro-op queue between the controller and the memory dispatcher.

mem_u_operation_queue.sv

`timescale 1ns / 1ps
`include "GEMM_Array.svh"
`include "GLOBAL_CONST.svh"

import isa_pkg::*;

// ===| Memory Operation Queue |==================================================
// Decouples the scheduler from the L2 cache controller.
// Two independent FIFO channels: ACP (host DMA) and NPU (internal compute).
//
// acp_uop_t / npu_uop_t are both 35-bit packed structs:
//   {write_en[0], base_addr[16:0], end_addr[16:0]} = 1+17+17 = 35 bits
// The FIFO data widths are derived from the types with $bits() so they track
// any future change to the struct layout.
//
// Per-channel behaviour (identical for ACP and NPU):
//   push : a uop is written when IN_*_rdy is high and the channel's
//          programmable-full flag is low; a push while the flag is high is
//          dropped.
//   full : OUT_*_cmd_fifo_full is the FIFO's prog_full flag (PROG_FULL_THRESH),
//          so it asserts before the FIFO is physically full.
//   pop  : a uop is read whenever the consumer is idle (IN_*_is_busy low) and
//          the FIFO is not empty; OUT_*_cmd_valid is high in that same cycle.
//
// NOTE(review): READ_MODE is "std" with the default read latency, for which the
// XPM documentation specifies dout updating the cycle AFTER rd_en. That would
// make OUT_*_cmd lag OUT_*_cmd_valid by one clock — confirm the consumer
// samples accordingly, or move to "fwft" with FIFO_READ_LATENCY = 0.
// ===============================================================================

module mem_u_operation_queue #() (
    input logic clk_core,
    input logic rst_n_core,

    // ===| ACP channel |=========================================================
    input  logic     IN_acp_rdy,
    input  acp_uop_t IN_acp_cmd,
    output acp_uop_t OUT_acp_cmd,
    output logic     OUT_acp_cmd_valid,
    output logic     OUT_acp_cmd_fifo_full,
    input  logic     IN_acp_is_busy,

    // ===| NPU internal channel |================================================
    input  logic     IN_npu_rdy,
    input  npu_uop_t IN_npu_cmd,
    output npu_uop_t OUT_npu_cmd,
    output logic     OUT_npu_cmd_valid,
    output logic     OUT_npu_cmd_fifo_full,
    input  logic     IN_npu_is_busy
);

  // Widths follow the uop types (both 1 + 17 + 17 = 35 bits today).
  localparam int AcpUopWidth    = $bits(acp_uop_t);
  localparam int NpuUopWidth    = $bits(npu_uop_t);
  localparam int FifoDepth      = 128;  // entries per channel
  localparam int ProgFullThresh = 100;  // early back-pressure threshold

  logic acp_fifo_empty;
  logic acp_fifo_full;
  logic acp_pop;
  logic npu_fifo_empty;
  logic npu_fifo_full;
  logic npu_pop;

  assign OUT_acp_cmd_fifo_full = acp_fifo_full;
  assign OUT_npu_cmd_fifo_full = npu_fifo_full;

  // One pop condition per channel drives both the FIFO read enable and the
  // valid flag, so the two can never disagree.
  always_comb begin
    acp_pop = ~IN_acp_is_busy & ~acp_fifo_empty;
    npu_pop = ~IN_npu_is_busy & ~npu_fifo_empty;

    OUT_acp_cmd_valid = acp_pop;
    OUT_npu_cmd_valid = npu_pop;
  end

  // ===| ACP FIFO |==============================================================
  // xpm_fifo_sync is single-clock: wr_clk is its only clock input (there is no
  // rd_clk port), and its depth parameter is FIFO_WRITE_DEPTH.
  xpm_fifo_sync #(
      .FIFO_WRITE_DEPTH  (FifoDepth),
      .WRITE_DATA_WIDTH  (AcpUopWidth),
      .READ_DATA_WIDTH   (AcpUopWidth),
      .FIFO_MEMORY_TYPE  ("block"),
      .READ_MODE         ("std"),
      .FULL_RESET_VALUE  (0),
      .PROG_FULL_THRESH  (ProgFullThresh)
  ) u_acp_uop_fifo (
      .sleep        (1'b0),
      .rst          (~rst_n_core),
      .wr_clk       (clk_core),
      .wr_en        (IN_acp_rdy & ~acp_fifo_full),
      .din          (IN_acp_cmd),
      .prog_full    (acp_fifo_full),
      .rd_en        (acp_pop),
      .dout         (OUT_acp_cmd),
      .empty        (acp_fifo_empty),
      .injectsbiterr(1'b0),  // ECC unused: tie the injection inputs off
      .injectdbiterr(1'b0)
  );

  // ===| NPU FIFO |==============================================================
  xpm_fifo_sync #(
      .FIFO_WRITE_DEPTH  (FifoDepth),
      .WRITE_DATA_WIDTH  (NpuUopWidth),
      .READ_DATA_WIDTH   (NpuUopWidth),
      .FIFO_MEMORY_TYPE  ("block"),
      .READ_MODE         ("std"),
      .FULL_RESET_VALUE  (0),
      .PROG_FULL_THRESH  (ProgFullThresh)
  ) u_npu_uop_fifo (
      .sleep        (1'b0),
      .rst          (~rst_n_core),
      .wr_clk       (clk_core),
      .wr_en        (IN_npu_rdy & ~npu_fifo_full),
      .din          (IN_npu_cmd),
      .prog_full    (npu_fifo_full),
      .rd_en        (npu_pop),
      .dout         (OUT_npu_cmd),
      .empty        (npu_fifo_empty),
      .injectsbiterr(1'b0),  // ECC unused: tie the injection inputs off
      .injectdbiterr(1'b0)
  );

endmodule