Preprocess

The preprocess stage consumes the raw BF16 feature map arriving from the host via ACP, converts it into the 27-bit fixed-point representation native to the DSP48E2 MACs, and stages it into per-core L1 caches.

See also

pccx: Parallel Compute Core eXecutor

See the preprocess_fmap box in the top-level diagram.

Modules

  • preprocess_fmap.sv — top-level preprocess wrapper feeding both Matrix and Vector cores.

  • fmap_cache.sv — L1 feature-map cache: one slice per consumer.

  • preprocess_bf16_fixed_pipeline.sv — BF16 → 27-bit fixed-point conversion pipeline with Emax alignment.

Source

preprocess_fmap.sv
`include "GLOBAL_CONST.svh"
`timescale 1ns / 1ps
`include "GEMM_Array.svh"
`include "npu_interfaces.svh"

/**
 * Module: preprocess_fmap
 *
 * Role:
 * - Buffers the incoming feature-map AXI4-Stream in a 256-bit XPM FIFO.
 * - Captures the exponent field of every BF16 element: two consecutive
 *   16-element beats form one 32-entry group, which is written into an
 *   exponent cache (emax_cache_mem).
 * - Converts the BF16 stream into fixed-point mantissas through
 *   preprocess_bf16_fixed_pipeline.
 * - Stores the converted mantissas in fmap_cache, which broadcasts them to
 *   the compute engines (branch point).
 *
 * Notes:
 * - i_clear resets only the counters/flags owned by this wrapper. It is not
 *   connected to u_fmap_fifo, u_fmap_shifter or u_fmap_sram.
 * - The exponent capture loop writes active_emax[0..31], so `ARRAY_SIZE_H
 *   must be at least 32.
 */
module preprocess_fmap #(
    // Retained for interface compatibility. The internal datapath is fixed at
    // FMAP_BEAT_WIDTH bits because the FIFO, the exponent capture loop and the
    // BF16 converter all operate on beats of 16 BF16 elements.
    parameter fmap_width = `ACP_PORT_IN
) (
    input logic clk,
    input logic rst_n,
    input logic i_clear,

    // AXI4-Stream interface from ACP (feeds the 256-bit FIFO below)
    axis_if.slave S_AXIS_ACP_FMAP,

    // Control from Brain
    input logic i_rd_start,

    // Output to Branch Engines (Systolic / GEMV / CVO)
    output logic [`FIXED_MANT_WIDTH-1:0] o_fmap_broadcast[0:`ARRAY_SIZE_H-1],
    output logic                         o_fmap_valid,

    output logic [`BF16_EXP_WIDTH-1:0] o_cached_emax[0:`ARRAY_SIZE_H-1]
);

  // One stream beat carries 16 BF16 elements = 256 bits.
  localparam int BF16_PER_BEAT   = 16;
  localparam int FMAP_BEAT_WIDTH = 256;

  // ===| 256-bit FIFO for FMap |=======
  // The bus is declared with the exact FIFO/converter width. The previous
  // [fmap_width:0] declaration was one bit too wide and depended on a macro
  // unrelated to the 256-bit ports it is connected to.
  logic [FMAP_BEAT_WIDTH-1:0] fmap_fifo_data;
  logic                       fmap_fifo_valid;
  logic                       fmap_fifo_ready;

  xpm_fifo_axis #(
      .FIFO_DEPTH(`XPM_FIFO_DEPTH),
      .TDATA_WIDTH(FMAP_BEAT_WIDTH),
      .FIFO_MEMORY_TYPE("block"),
      .CLOCKING_MODE("common_clock")
  ) u_fmap_fifo (
      .s_aclk(clk),
      .m_aclk(clk),
      .s_aresetn(rst_n),
      .s_axis_tdata(S_AXIS_ACP_FMAP.tdata),
      .s_axis_tvalid(S_AXIS_ACP_FMAP.tvalid),
      .s_axis_tready(S_AXIS_ACP_FMAP.tready),
      .m_axis_tdata(fmap_fifo_data),
      .m_axis_tvalid(fmap_fifo_valid),
      .m_axis_tready(fmap_fifo_ready)
  );

  // ===| e_max parsing & cache logic |=======
  // active_emax collects the exponent field (bits [14:7]) of each BF16 element.
  // fmap_word_toggle selects which half of the 32-entry group the current beat
  // fills: 0 -> entries 0..15, 1 -> entries 16..31.
  logic [`BF16_EXP_WIDTH-1:0] active_emax[0:`ARRAY_SIZE_H-1];
  logic fmap_word_toggle;
  logic emax_group_valid;

  always_ff @(posedge clk) begin
    if (!rst_n || i_clear) begin
      fmap_word_toggle <= 1'b0;
      emax_group_valid <= 1'b0;
    end else if (fmap_fifo_valid && fmap_fifo_ready) begin
      fmap_word_toggle <= ~fmap_word_toggle;
      for (int k = 0; k < BF16_PER_BEAT; k++) begin
        if (fmap_word_toggle == 1'b0) active_emax[k] <= fmap_fifo_data[(k*16)+7+:8];
        else active_emax[k+BF16_PER_BEAT] <= fmap_fifo_data[(k*16)+7+:8];
      end
      // Pulse for one cycle once the second beat of a group has been captured.
      emax_group_valid <= (fmap_word_toggle == 1'b1);
    end else begin
      emax_group_valid <= 1'b0;
    end
  end

  // Exponent cache: 1024 groups, one entry per lane.
  logic [`BF16_EXP_WIDTH-1:0] emax_cache_mem[0:1023][0:`ARRAY_SIZE_H-1];
  logic [9:0] emax_wr_addr, emax_rd_addr;

  // Write side: append one completed group per emax_group_valid pulse.
  always_ff @(posedge clk) begin
    if (!rst_n || i_clear) begin
      emax_wr_addr <= 0;
    end else if (emax_group_valid) begin
      for (int i = 0; i < `ARRAY_SIZE_H; i++) begin
        emax_cache_mem[emax_wr_addr][i] <= active_emax[i];
      end
      emax_wr_addr <= emax_wr_addr + 1;
    end
  end

  // Read side: registered read of the group selected by emax_rd_addr.
  // NOTE(review): emax_rd_addr is only ever loaded with 0 in this module, so
  // o_cached_emax always reflects cache entry 0. Confirm whether the read
  // pointer is meant to advance while the feature map is being broadcast.
  always_ff @(posedge clk) begin
    if (!rst_n || i_clear) begin
      emax_rd_addr <= 0;
    end else if (i_rd_start) begin
      emax_rd_addr <= 0;
    end
    for (int i = 0; i < `ARRAY_SIZE_H; i++) begin
      o_cached_emax[i] <= emax_cache_mem[emax_rd_addr][i];
    end
  end

  // ===| Mantissa Shifter & SRAM Cache |=======
  logic [431:0] fixed_fmap;  // 16 x 27-bit mantissas from the converter
  logic         fixed_fmap_valid;
  logic         fmap_shifter_ready;

  preprocess_bf16_fixed_pipeline u_fmap_shifter (
      .clk(clk),
      .rst_n(rst_n),
      .s_axis_tdata(fmap_fifo_data),
      .s_axis_tvalid(fmap_fifo_valid),
      .s_axis_tready(fmap_shifter_ready),
      .m_axis_tdata(fixed_fmap),
      .m_axis_tvalid(fixed_fmap_valid),
      .m_axis_tready(1'b1)  // the cache write port never back-pressures
  );
  // The FIFO is drained whenever the converter is ready to accept a beat.
  assign fmap_fifo_ready = fmap_shifter_ready;

  // Cache write pointer: advances by one for every converted beat.
  logic [6:0] sram_wr_addr;
  always_ff @(posedge clk) begin
    if (!rst_n || i_clear) sram_wr_addr <= 0;
    else if (fixed_fmap_valid) sram_wr_addr <= sram_wr_addr + 1;
  end

  fmap_cache #(
      .DATA_WIDTH(`FIXED_MANT_WIDTH),
      .WRITE_LANES(BF16_PER_BEAT),
      .CACHE_DEPTH(`FMAP_CACHE_DEPTH),
      .LANES(`ARRAY_SIZE_H)
  ) u_fmap_sram (
      .clk(clk),
      .rst_n(rst_n),
      .wr_data(fixed_fmap),
      .wr_valid(fixed_fmap_valid),
      .wr_addr(sram_wr_addr),
      .wr_en(1'b1),
      .rd_start(i_rd_start),
      .rd_data_broadcast(o_fmap_broadcast),
      .rd_valid(o_fmap_valid)
  );

endmodule
fmap_cache.sv
`timescale 1ns / 1ps
`include "GEMM_Array.svh"
`include "GLOBAL_CONST.svh"

/**
 * Module: fmap_cache
 * Description:
 *   SRAM-based feature-map cache with asymmetric write/read ports.
 *   - Stores CACHE_DEPTH fixed-point words of DATA_WIDTH bits
 *     (default: a 1x2048 vector of 27-bit mantissas).
 *   - Write interface: WRITE_LANES words per write (default 16 x 27 = 432 bits).
 *   - Read interface: one DATA_WIDTH word per clock, copied to all LANES outputs.
 *
 * Read sequence:
 *   A pulse on rd_start (re)starts a sweep from address 0 up to CACHE_DEPTH-1.
 *   rd_valid and rd_data_broadcast are aligned with each other; they trail the
 *   address by the 2-cycle BRAM read latency plus one output register stage.
 */
module fmap_cache #(
    parameter DATA_WIDTH  = 27,    // Fixed-point Mantissa width
    parameter WRITE_LANES = 16,    // 16 words per write
    parameter CACHE_DEPTH = 2048,  // Accommodates 1x2048 vector
    parameter LANES       = 32     // Number of vertical lanes to feed
) (
    input logic clk,
    input logic rst_n,

    // ===| Write Interface (From BF16-to-Fixed Shifter) |=======
    input logic [(DATA_WIDTH*WRITE_LANES)-1:0] wr_data,
    input logic                                wr_valid,
    input logic [                         6:0] wr_addr,   // 7-bit port kept for interface compatibility
    input logic                                wr_en,

    // ===| Read Interface (To Staggered Delay Line) |=======
    input  logic                  rd_start,                      // Trigger to start broadcasting
    output logic [DATA_WIDTH-1:0] rd_data_broadcast[0:LANES-1],  // LANES identical copies
    output logic                  rd_valid
);
  // ===| Derived geometry |=======
  // Address widths follow the parameters instead of being hard-coded, so a
  // CACHE_DEPTH / WRITE_LANES override keeps the RAM ports consistent with
  // MEMORY_SIZE. With the defaults these evaluate to 7 (write) and 11 (read).
  localparam int WR_DATA_W = DATA_WIDTH * WRITE_LANES;
  localparam int WR_DEPTH  = CACHE_DEPTH / WRITE_LANES;
  localparam int WR_ADDR_W = (WR_DEPTH > 1) ? $clog2(WR_DEPTH) : 1;
  localparam int RD_ADDR_W = (CACHE_DEPTH > 1) ? $clog2(CACHE_DEPTH) : 1;

  // ===| SRAM Instantiation (XPM Memory) |=======
  // Xilinx Parameterized Macro for a simple dual-port RAM with asymmetric ports.
  // Port A: wide write from the shifter. Port B: narrow read towards the array.
  //
  // In GEMV (1x2048 dot 2048x2048) a single feature-map element (e.g. FMap[0])
  // is broadcast across all columns and multiplied with row 0 of the weights;
  // on the next clock FMap[1] is broadcast for row 1, and so on. Therefore ONE
  // word is read per clock and fanned out to every lane.
  logic [ DATA_WIDTH-1:0] sram_rd_data;
  logic [  RD_ADDR_W-1:0] rd_addr;
  logic                   is_reading;

  xpm_memory_sdpram #(
      .ADDR_WIDTH_A           (WR_ADDR_W),                 // Write: CACHE_DEPTH/WRITE_LANES rows
      .ADDR_WIDTH_B           (RD_ADDR_W),                 // Read: CACHE_DEPTH words
      .AUTO_SLEEP_TIME        (0),
      .BYTE_WRITE_WIDTH_A     (WR_DATA_W),                 // Full word write
      .CLOCKING_MODE          ("common_clock"),
      .MEMORY_INIT_FILE       ("none"),
      .MEMORY_INIT_PARAM      ("0"),
      .MEMORY_OPTIMIZATION    ("true"),
      .MEMORY_PRIMITIVE       ("block"),                   // Force BRAM usage
      .MEMORY_SIZE            (DATA_WIDTH * CACHE_DEPTH),  // default 27 * 2048 = 55296 bits
      .MESSAGE_CONTROL        (0),
      .READ_DATA_WIDTH_B      (DATA_WIDTH),                // Read: one word
      .READ_LATENCY_B         (2),                         // 2-cycle latency for 400MHz
      .USE_EMBEDDED_CONSTRAINT(0),
      .USE_MEM_INIT           (1),
      .WAKEUP_TIME            ("disable_sleep"),
      .WRITE_DATA_WIDTH_A     (WR_DATA_W),                 // Write: WRITE_LANES words
      .WRITE_MODE_B           ("read_first")
  ) u_fmap_bram (
      .clka(clk),
      .ena(1'b1),
      .wea(wr_en & wr_valid),
      // Resize the fixed 7-bit port to the derived write address width.
      .addra(WR_ADDR_W'(wr_addr)),
      .dina(wr_data),
      .injectsbiterra(1'b0),
      .injectdbiterra(1'b0),

      .clkb(clk),
      .enb(is_reading),
      .addrb(rd_addr),
      .doutb(sram_rd_data),
      .sbiterrb(),
      .dbiterrb(),
      .sleep(1'b0),
      .rstb(~rst_n),
      .regceb(1'b1)
  );

  // ===| FSM / Read Controller |=======
  // Sweeps rd_addr through all CACHE_DEPTH cached elements.
  // The two pipe flags mirror READ_LATENCY_B = 2 above; keep them in sync if
  // the latency is ever changed.
  logic rd_valid_pipe_1, rd_valid_pipe_2;

  always_ff @(posedge clk) begin
    if (!rst_n) begin
      is_reading <= 1'b0;
      rd_addr <= '0;
      rd_valid_pipe_1 <= 1'b0;
      rd_valid_pipe_2 <= 1'b0;
      rd_valid <= 1'b0;
    end else begin
      // rd_start has priority: it (re)starts the sweep even while one is running.
      if (rd_start) begin
        is_reading <= 1'b1;
        rd_addr <= '0;
      end  // Increment address while reading
      else if (is_reading) begin
        if (rd_addr == CACHE_DEPTH - 1) begin
          is_reading <= 1'b0;  // Stop after the last word has been addressed
        end else begin
          rd_addr <= rd_addr + 1'b1;
        end
      end

      // Pipeline the valid signal to match BRAM latency
      rd_valid_pipe_1 <= is_reading;
      rd_valid_pipe_2 <= rd_valid_pipe_1;
      rd_valid        <= rd_valid_pipe_2;

      // Register the read word into every lane; rd_valid rises on the same edge.
      if (rd_valid_pipe_2) begin
        for (int i = 0; i < LANES; i++) begin
          rd_data_broadcast[i] <= sram_rd_data;
        end
      end
    end
  end

endmodule
preprocess_bf16_fixed_pipeline.sv
`include "GLOBAL_CONST.svh"
`timescale 1ns / 1ps
`include "GEMM_Array.svh"

/**
 * Module: preprocess_bf16_fixed_pipeline
 * Description:
 *   16-lane pipelined BF16 to fixed-point converter.
 *   - Input: 256-bit beat (16 x BF16 elements) per accepted clock.
 *   - Block size: 32 elements (two beats form one block).
 *   - Operation:
 *       1. Finds the global e_max among the 32 exponents of a block.
 *       2. Right-shifts each 27-bit mantissa by (e_max - exponent) and
 *          converts negative values to 2's complement.
 *   - Output: 432-bit (16 x 27-bit mantissas) per clock, two beats per block.
 *
 * Flow control:
 *   - s_axis_tready is tied high; every cycle with s_axis_tvalid is consumed.
 *   - m_axis_tready is NOT used: the output cannot be back-pressured.
 */
module preprocess_bf16_fixed_pipeline (
    input logic clk,
    input logic rst_n,

    // AXI-Stream Slave (Input from 256-bit FIFO)
    input  logic [255:0] s_axis_tdata,
    input  logic         s_axis_tvalid,
    output logic         s_axis_tready,

    // AXI-Stream Master (Output to SRAM Cache - 16 x 27-bit = 432-bit)
    output logic [431:0] m_axis_tdata,
    output logic         m_axis_tvalid,
    input  logic         m_axis_tready
);

  // ===| Stage 1: Input Buffering & Local Max Exponent |===
  // The first beat of a block is held while waiting for the second one.
  logic [255:0] buffer_low;
  logic [  7:0] local_max_low;
  logic         first_half_valid;  // set once the first beat is stored; not consumed in this module

  logic         phase;  // 0: Expecting Low 16, 1: Expecting High 16

  // Returns the largest exponent field (bits [14:7]) among 16 BF16 elements.
  function automatic logic [7:0] find_max_e_16(input logic [255:0] data);
    logic [7:0] max_val;
    max_val = 8'd0;
    for (int i = 0; i < 16; i++) begin
      if (data[(i*16)+7+:8] > max_val) begin
        max_val = data[(i*16)+7+:8];
      end
    end
    return max_val;
  endfunction

  assign s_axis_tready = 1'b1;  // Always ready to sink data in this pipeline design

  // Maximum exponent of the beat currently on the input bus. Computed at
  // module scope so that no variable has to be declared between statements
  // inside the sequential block (declarations must precede statements).
  logic [7:0] beat_max_e;
  assign beat_max_e = find_max_e_16(s_axis_tdata);

  // Buffer registers for 32 elements
  logic [255:0] block_data_low;
  logic [255:0] block_data_high;
  logic [  7:0] global_emax;
  logic         block_valid;

  always_ff @(posedge clk) begin
    if (!rst_n) begin
      phase <= 1'b0;
      first_half_valid <= 1'b0;
      block_valid <= 1'b0;
    end else if (s_axis_tvalid) begin
      if (phase == 1'b0) begin
        // Store first 16 elements and their max exponent
        buffer_low       <= s_axis_tdata;
        local_max_low    <= beat_max_e;
        first_half_valid <= 1'b1;
        block_valid      <= 1'b0;
        phase            <= 1'b1;
      end else begin
        // Second 16 elements arrived: combine to form the 32-element block.
        block_data_low  <= buffer_low;
        block_data_high <= s_axis_tdata;

        // Larger of the two half-block maxima is the GLOBAL e_max
        global_emax <= (local_max_low > beat_max_e) ? local_max_low : beat_max_e;

        block_valid <= 1'b1;
        phase       <= 1'b0;  // Reset for next block
      end
    end else begin
      block_valid <= 1'b0;
    end
  end

  // ===| Stage 2: Parallel Shifting (16 Lanes at a time) |===
  // To save resources, the 32 elements are shifted over 2 clock cycles.
  // Cycle 1: Shift block_data_low
  // Cycle 2: Shift block_data_high

  logic         shift_phase;  // 0: shifting low, 1: shifting high
  logic [255:0] shift_target_data;
  logic [  7:0] shift_target_emax;
  logic         shift_trigger;

  always_ff @(posedge clk) begin
    if (!rst_n) begin
      shift_phase   <= 1'b0;
      shift_trigger <= 1'b0;
    end else begin
      if (block_valid) begin
        // Start shifting process with the low half
        shift_phase <= 1'b0;
        shift_target_data <= block_data_low;
        shift_target_emax <= global_emax;
        shift_trigger <= 1'b1;
      end else if (shift_trigger && shift_phase == 1'b0) begin
        // Next cycle, shift the high half; shift_target_emax is held
        shift_phase <= 1'b1;
        shift_target_data <= block_data_high;
        shift_trigger <= 1'b1;
      end else begin
        shift_trigger <= 1'b0;
      end
    end
  end

  // The 16 Parallel Shifters (With Sign & 2's Complement Handling)
  logic [431:0] shifted_mantissas;  // 16 * 27-bit

  genvar i;
  generate
    for (i = 0; i < 16; i++) begin : gen_shifters
      logic [15:0] word;
      logic        sign;
      logic [ 7:0] e_val;
      logic [ 6:0] m_val;
      logic        hidden_bit;
      logic [26:0] base_mant;  // 7(headroom) + 1(implicit) + 7(m) + 12(pad) = 27 bits
      logic [26:0] shifted_mant;
      logic [26:0] final_fixed;
      logic [ 7:0] delta_e;

      assign word = shift_target_data[(i*16)+:16];
      assign sign = word[15];
      assign e_val = word[14:7];
      assign m_val = word[6:0];

      // 1. Prepare Magnitude (Add hidden bit)
      // 27-bit container: hidden bit at [19], mantissa at [18:12], zero pad
      // below. The hidden bit is 0 when the exponent field is zero.
      // The concatenation is exactly 27 bits wide, so nothing is truncated.
      assign hidden_bit = (e_val != 8'd0);
      assign base_mant = {7'b0, hidden_bit, m_val, 12'b0};
      assign delta_e = shift_target_emax - e_val;

      // 2. Align by Shifting Right (anything shifted by >= 27 is flushed to 0)
      assign shifted_mant = (delta_e >= 27) ? 27'd0 : (base_mant >> delta_e);

      // 3. Convert to 2's Complement if Sign is negative
      // This is CRITICAL for signed multiplication and accumulation in the engines.
      assign final_fixed = sign ? (~shifted_mant + 1'b1) : shifted_mant;

      assign shifted_mantissas[(i*27)+:27] = final_fixed;
    end
  endgenerate


  // ===| Stage 3: Output Register |===
  // m_axis_tvalid follows shift_trigger by one cycle; data is only updated on
  // valid beats and holds its previous value otherwise.
  always_ff @(posedge clk) begin
    if (!rst_n) begin
      m_axis_tvalid <= 1'b0;
      m_axis_tdata  <= 0;
    end else begin
      m_axis_tvalid <= shift_trigger;
      if (shift_trigger) begin
        m_axis_tdata <= shifted_mantissas;
      end
    end
  end

endmodule