Host API (C driver)
The host-side C library that builds VLIW instructions and writes them
to the NPU’s AXI-Lite control surface. Split into a thin HAL (MMIO
register accessors + IRQ-free wait_idle polling) and a public API
that mirrors the ISA 1:1.
See also
- Developer Reference for pccx v001 Host API
Human-readable developer reference for the same API.
Public API
uCA_v1_api.h — public function prototypes (uca_init, uca_deinit, uca_gemv, uca_gemm, uca_cvo, uca_memcpy, uca_memset, uca_sync). uCA_v1_api.c — implementation: build_*_instr helpers (one shared by GEMV/GEMM, one for CVO) plus inline packing for MEMCPY and MEMSET, all of which pack arguments into 64-bit VLIWs and hand them to the HAL.
uCA_v1_api.h
// ===| uCA API (High-Level Driver Interface) |====================================
// uCA: micro Compute Architecture — AI model acceleration API for FPGA NPU.
//
// This is the "CUDA equivalent" for the uCA NPU. Application code
// (sw/gemma3NE4B/ and future projects) should call only functions from
// this layer — never touch the HAL directly.
//
// This layer builds 64-bit VLIW instructions per the ISA spec (docs/ISA.md)
// and issues them via the HAL. The NPU frontend is fully decoupled: each
// uca_* call returns immediately after issuing to the instruction FIFO.
// Call uca_sync() to wait for all in-flight operations to complete.
//
// Encoding reference: docs/ISA.md
// ================================================================================
#ifndef UCA_V1_API_H
#define UCA_V1_API_H
#include <stdint.h>
// ===| Opcode Definitions |======================================================
// Must match ISA.md §2 and isa_x64.svh opcode_e
#define UCA_OP_GEMV 0x0
#define UCA_OP_GEMM 0x1
#define UCA_OP_MEMCPY 0x2
#define UCA_OP_MEMSET 0x3
#define UCA_OP_CVO 0x4
// ===| GEMV / GEMM Flags (6-bit) |===============================================
// Must match ISA.md §4 and flags_t in isa_x64.svh
#define UCA_FLAG_FINDEMAX (1U << 5) // Find e_max over output (for softmax)
#define UCA_FLAG_ACCM (1U << 4) // Accumulate into dest (do not overwrite)
#define UCA_FLAG_W_SCALE (1U << 3) // Apply weight scale factor during MAC
// ===| CVO Function Codes (4-bit) |==============================================
// Must match ISA.md §3.4.1 and cvo_func_e in isa_cvo.svh
#define UCA_CVO_EXP 0x0 // Element-wise exp(x) — SFU
#define UCA_CVO_SQRT 0x1 // Element-wise sqrt(x) — SFU
#define UCA_CVO_GELU 0x2 // Element-wise GELU(x) — SFU
#define UCA_CVO_SIN 0x3 // Element-wise sin(x) — CORDIC
#define UCA_CVO_COS 0x4 // Element-wise cos(x) — CORDIC
#define UCA_CVO_REDUCE_SUM 0x5 // Sum reduction → scalar at dst — SFU+Adder
#define UCA_CVO_SCALE 0x6 // Element-wise multiply by scalar — SFU
#define UCA_CVO_RECIP 0x7 // Element-wise 1/x — SFU
// ===| CVO Flags (5-bit) |=======================================================
// Must match ISA.md §3.4.2 and cvo_flags_t in isa_cvo.svh
#define UCA_CVO_FLAG_SUB_EMAX (1U << 4) // Subtract e_max before operation
#define UCA_CVO_FLAG_RECIP_SCALE (1U << 3) // Use 1/scalar for SCALE op
#define UCA_CVO_FLAG_ACCM (1U << 2) // Accumulate into dst
// ===| Memory Route Codes |======================================================
// Must match ISA.md §5 and data_route_e in isa_memctrl.svh
// Upper nibble = from_device, lower nibble = to_device
#define UCA_ROUTE_HOST_TO_L2 0x01
#define UCA_ROUTE_L2_TO_HOST 0x10
#define UCA_ROUTE_L2_TO_L1_GEMM 0x12
#define UCA_ROUTE_L2_TO_L1_GEMV 0x13
#define UCA_ROUTE_GEMV_RES_TO_L2 0x31
#define UCA_ROUTE_GEMM_RES_TO_L2 0x21
#define UCA_ROUTE_CVO_RES_TO_L2 0x41
// ===| API Init |================================================================
int uca_init(void); // Calls uca_hal_init() and verifies NPU is responsive
void uca_deinit(void);
// ===| Compute: Vector Core (GEMV) |=============================================
// Issue a GEMV instruction (INT4 weight × BF16/INT8 activation → BF16 out).
//
// dest_reg : destination register / L2 address (17-bit)
// src_addr : source fmap address (17-bit)
// flags : OR of UCA_FLAG_* constants
// size_ptr : pointer to size descriptor in shape cache (6-bit)
// shape_ptr : pointer to shape descriptor in shape cache (6-bit)
// lanes : number of active parallel μV-Core lanes (5-bit, 1–4)
void uca_gemv(uint32_t dest_reg, uint32_t src_addr,
uint8_t flags, uint8_t size_ptr,
uint8_t shape_ptr, uint8_t lanes);
// ===| Compute: Matrix Core (GEMM) |=============================================
// Issue a GEMM instruction (systolic 32×32 array).
// Same field layout as GEMV; differs only in opcode routing.
void uca_gemm(uint32_t dest_reg, uint32_t src_addr,
uint8_t flags, uint8_t size_ptr,
uint8_t shape_ptr, uint8_t lanes);
// ===| Compute: CVO Core (Complex Vector Operations) |==========================
// Issue a CVO instruction to one of the 2× μCVO-Cores.
// Used for: softmax (EXP, REDUCE_SUM, SCALE), RMSNorm (SQRT, RECIP, SCALE),
// activation functions (GELU), attention (SIN/COS for RoPE).
//
// cvo_func : one of UCA_CVO_* function codes
// src_addr : source address in L2 cache (17-bit)
// dst_addr : destination address in L2 cache (17-bit)
// length : number of elements to process (16-bit)
// flags : OR of UCA_CVO_FLAG_* constants
// async : 0=block until done, 1=fire-and-forget
void uca_cvo(uint8_t cvo_func, uint32_t src_addr,
uint32_t dst_addr, uint16_t length,
uint8_t flags, uint8_t async);
// ===| Memory: MEMCPY |=========================================================
// Issue a DMA transfer between host and NPU memory, or between NPU caches.
//
// route : one of UCA_ROUTE_* constants
// dest_addr : destination address (17-bit)
// src_addr : source address (17-bit)
// shape_ptr : pointer to shape descriptor (6-bit)
// async : 0=blocking, 1=fire-and-forget
void uca_memcpy(uint8_t route, uint32_t dest_addr,
uint32_t src_addr, uint8_t shape_ptr,
uint8_t async);
// ===| Memory: MEMSET |=========================================================
// Set shape descriptor values in the shape cache.
//
// dest_cache : 0=fmap_shape cache, 1=weight_shape cache
// dest_addr : target pointer address in the shape cache (6-bit)
// a, b, c : values to write (16-bit each, typically dimension sizes)
void uca_memset(uint8_t dest_cache, uint8_t dest_addr,
uint16_t a, uint16_t b, uint16_t c);
// ===| Synchronization |=========================================================
// Block until all issued instructions complete (polls UCA_STAT_BUSY).
// Returns 0 on success, -1 on timeout.
int uca_sync(uint32_t timeout_us);
#endif // UCA_V1_API_H
uCA_v1_api.c
// ===| uCA API Implementation |==================================================
// Builds 64-bit VLIW instructions from structured arguments and issues them
// to the NPU via the HAL. Encoding per docs/ISA.md.
// ================================================================================
#include "uCA_v1_api.h"
#include "uCA_v1_hal.h"
// ===| Instruction Builder Helpers |=============================================
// ===| build_compute_instr |===
// Packs GEMV or GEMM instruction into a 64-bit word.
// Layout (ISA.md §3.1):
// [63:60] opcode 4-bit
// [59:43] dest_reg 17-bit
// [42:26] src_addr 17-bit
// [25:20] flags 6-bit
// [19:14] size_ptr 6-bit
// [13:8] shape_ptr 6-bit
// [7:3] lanes 5-bit
// [2:0] reserved 3-bit
// Encode a GEMV or GEMM instruction as a single 64-bit VLIW word.
//
// Every argument is truncated to its field width before being shifted into
// place, so out-of-range bits are silently dropped rather than spilling into
// a neighbouring field.
//
// opcode    : 4-bit opcode            -> bits [63:60]
// dest_reg  : 17-bit destination      -> bits [59:43]
// src_addr  : 17-bit source address   -> bits [42:26]
// flags     : 6-bit flag set          -> bits [25:20]
// size_ptr  : 6-bit size pointer      -> bits [19:14]
// shape_ptr : 6-bit shape pointer     -> bits [13:8]
// lanes     : 5-bit lane count        -> bits [7:3]
//
// Bits [2:0] are always zero. Returns the packed word.
static uint64_t build_compute_instr(uint8_t opcode, uint32_t dest_reg,
                                    uint32_t src_addr, uint8_t flags,
                                    uint8_t size_ptr, uint8_t shape_ptr,
                                    uint8_t lanes) {
    const uint64_t op_field    = (uint64_t)(opcode & 0xF) << 60;
    const uint64_t dest_field  = (uint64_t)(dest_reg & 0x1FFFF) << 43;
    const uint64_t src_field   = (uint64_t)(src_addr & 0x1FFFF) << 26;
    const uint64_t flag_field  = (uint64_t)(flags & 0x3F) << 20;
    const uint64_t size_field  = (uint64_t)(size_ptr & 0x3F) << 14;
    const uint64_t shape_field = (uint64_t)(shape_ptr & 0x3F) << 8;
    const uint64_t lane_field  = (uint64_t)(lanes & 0x1F) << 3;

    return op_field | dest_field | src_field | flag_field
         | size_field | shape_field | lane_field;
}
// ===| build_cvo_instr |===
// Packs a CVO instruction into a 64-bit word.
// Layout (ISA.md §3.4):
// [63:60] opcode (UCA_OP_CVO = 4'h4) 4-bit
// [59:56] cvo_func 4-bit
// [55:39] src_addr 17-bit
// [38:22] dst_addr 17-bit
// [21:6] length 16-bit
// [5:1] flags 5-bit
// [0] async 1-bit
// Encode a CVO instruction as a single 64-bit VLIW word.
//
// The opcode field is fixed to UCA_OP_CVO. Every argument is truncated to
// its field width before being shifted into place.
//
// cvo_func : 4-bit function code      -> bits [59:56]
// src_addr : 17-bit source address    -> bits [55:39]
// dst_addr : 17-bit destination       -> bits [38:22]
// length   : 16-bit element count     -> bits [21:6]
// flags    : 5-bit flag set           -> bits [5:1]
// async    : 1-bit async flag         -> bit  [0]
//
// Returns the packed word.
static uint64_t build_cvo_instr(uint8_t cvo_func, uint32_t src_addr,
                                uint32_t dst_addr, uint16_t length,
                                uint8_t flags, uint8_t async) {
    const uint64_t op_field    = (uint64_t)(UCA_OP_CVO & 0xF) << 60;
    const uint64_t func_field  = (uint64_t)(cvo_func & 0xF) << 56;
    const uint64_t src_field   = (uint64_t)(src_addr & 0x1FFFF) << 39;
    const uint64_t dst_field   = (uint64_t)(dst_addr & 0x1FFFF) << 22;
    const uint64_t len_field   = (uint64_t)(length & 0xFFFF) << 6;
    const uint64_t flag_field  = (uint64_t)(flags & 0x1F) << 1;
    const uint64_t async_field = (uint64_t)(async & 0x1) << 0;

    return op_field | func_field | src_field | dst_field
         | len_field | flag_field | async_field;
}
// ===| API Init |================================================================
// Bring up the driver by initialising the HAL.
//
// Returns whatever uca_hal_init() reports, unchanged.
int uca_init(void) {
    const int hal_status = uca_hal_init();
    return hal_status;
}
// Shut the driver down by delegating to uca_hal_deinit().
// Issues no instruction and returns nothing.
void uca_deinit(void) {
uca_hal_deinit();
}
// ===| Compute: Vector Core (GEMV) |=============================================
// Issue one GEMV instruction.
//
// Packs the arguments with opcode UCA_OP_GEMV via build_compute_instr() and
// hands the resulting word to the HAL. This function does no status polling
// of its own.
//
// dest_reg  : destination register / address (17 bits kept)
// src_addr  : source address (17 bits kept)
// flags     : OR of UCA_FLAG_* (6 bits kept)
// size_ptr  : size descriptor pointer (6 bits kept)
// shape_ptr : shape descriptor pointer (6 bits kept)
// lanes     : lane count (5 bits kept)
void uca_gemv(uint32_t dest_reg, uint32_t src_addr,
              uint8_t flags, uint8_t size_ptr,
              uint8_t shape_ptr, uint8_t lanes) {
    uca_hal_issue_instr(
        build_compute_instr(UCA_OP_GEMV, dest_reg, src_addr,
                            flags, size_ptr, shape_ptr, lanes));
}
// ===| Compute: Matrix Core (GEMM) |=============================================
// Issue one GEMM instruction.
//
// Packs the arguments with opcode UCA_OP_GEMM via build_compute_instr() and
// hands the resulting word to the HAL. This function does no status polling
// of its own.
//
// dest_reg  : destination register / address (17 bits kept)
// src_addr  : source address (17 bits kept)
// flags     : OR of UCA_FLAG_* (6 bits kept)
// size_ptr  : size descriptor pointer (6 bits kept)
// shape_ptr : shape descriptor pointer (6 bits kept)
// lanes     : lane count (5 bits kept)
void uca_gemm(uint32_t dest_reg, uint32_t src_addr,
              uint8_t flags, uint8_t size_ptr,
              uint8_t shape_ptr, uint8_t lanes) {
    uca_hal_issue_instr(
        build_compute_instr(UCA_OP_GEMM, dest_reg, src_addr,
                            flags, size_ptr, shape_ptr, lanes));
}
// ===| Compute: CVO Core (Complex Vector Operation Core) |=======================
// Issue one CVO instruction.
//
// Packs the arguments via build_cvo_instr() and hands the resulting word to
// the HAL. The async argument is only encoded into the instruction; this
// function does no status polling of its own regardless of its value.
//
// cvo_func : one of the UCA_CVO_* function codes (4 bits kept)
// src_addr : source address (17 bits kept)
// dst_addr : destination address (17 bits kept)
// length   : element count (16 bits)
// flags    : OR of UCA_CVO_FLAG_* (5 bits kept)
// async    : async flag (bit 0 kept)
void uca_cvo(uint8_t cvo_func, uint32_t src_addr,
             uint32_t dst_addr, uint16_t length,
             uint8_t flags, uint8_t async) {
    uca_hal_issue_instr(
        build_cvo_instr(cvo_func, src_addr, dst_addr,
                        length, flags, async));
}
// ===| Memory: MEMCPY |=========================================================
// Issue one MEMCPY instruction.
//
// Packs the arguments into a 64-bit word with opcode UCA_OP_MEMCPY and hands
// it to the HAL. The async argument is only encoded into the instruction;
// this function does no status polling of its own.
//
// route     : route code; bit 4 becomes from_dev, bit 0 becomes to_dev
// dest_addr : destination address (17 bits kept)
// src_addr  : source address (17 bits kept)
// shape_ptr : shape descriptor pointer (6 bits kept)
// async     : async flag (bit 0 kept)
//
// NOTE(review): only the lowest bit of each route nibble reaches the
// instruction word, so UCA_ROUTE_* codes whose nibbles differ only in higher
// bits (e.g. 0x10 vs 0x12, or 0x01 vs 0x21 vs 0x41) encode identically.
// Confirm against ISA.md whether the 1-bit from_dev/to_dev fields are
// intended or the route encoding needs more bits.
void uca_memcpy(uint8_t route, uint32_t dest_addr,
                uint32_t src_addr, uint8_t shape_ptr,
                uint8_t async) {
    // Layout (ISA.md §3.2):
    //   [63:60] opcode     4-bit
    //   [59]    from_dev   1-bit  (bit 0 of the route's upper nibble)
    //   [58]    to_dev     1-bit  (bit 0 of the route's lower nibble)
    //   [57:41] dest_addr 17-bit
    //   [40:24] src_addr  17-bit
    //   [23:7]  aux_addr  17-bit  (reserved, zero)
    //   [6:1]   shape_ptr  6-bit
    //   [0]     async      1-bit
    const uint64_t from_bit = (uint64_t)((route >> 4) & 0x1);
    const uint64_t to_bit   = (uint64_t)(route & 0x1);

    const uint64_t word =
          ((uint64_t)(UCA_OP_MEMCPY & 0xF) << 60)
        | (from_bit << 59)
        | (to_bit << 58)
        | ((uint64_t)(dest_addr & 0x1FFFF) << 41)
        | ((uint64_t)(src_addr & 0x1FFFF) << 24)
        // aux_addr [23:7] intentionally left as zero
        | ((uint64_t)(shape_ptr & 0x3F) << 1)
        | ((uint64_t)(async & 0x1) << 0);

    uca_hal_issue_instr(word);
}
// ===| Memory: MEMSET |=========================================================
// Issue one MEMSET instruction.
//
// Packs the arguments into a 64-bit word with opcode UCA_OP_MEMSET and hands
// it to the HAL. This function does no status polling of its own.
//
// dest_cache : target cache selector (2 bits kept)
// dest_addr  : target pointer address (6 bits kept)
// a, b, c    : three 16-bit values carried in the instruction
void uca_memset(uint8_t dest_cache, uint8_t dest_addr,
                uint16_t a, uint16_t b, uint16_t c) {
    // Layout (ISA.md §3.3):
    //   [63:60] opcode      4-bit
    //   [59:58] dest_cache  2-bit
    //   [57:52] dest_addr   6-bit
    //   [51:36] a_value    16-bit
    //   [35:20] b_value    16-bit
    //   [19:4]  c_value    16-bit
    //   [3:0]   reserved    4-bit
    const uint64_t word =
          ((uint64_t)(UCA_OP_MEMSET & 0xF) << 60)
        | ((uint64_t)(dest_cache & 0x3) << 58)
        | ((uint64_t)(dest_addr & 0x3F) << 52)
        | ((uint64_t)(a & 0xFFFF) << 36)
        | ((uint64_t)(b & 0xFFFF) << 20)
        | ((uint64_t)(c & 0xFFFF) << 4);

    uca_hal_issue_instr(word);
}
// ===| Synchronization |=========================================================
// Wait for the NPU to go idle by delegating to uca_hal_wait_idle().
//
// timeout_us : timeout passed straight through to the HAL
// Returns the HAL's result unchanged.
int uca_sync(uint32_t timeout_us) {
    const int wait_result = uca_hal_wait_idle(timeout_us);
    return wait_result;
}
Hardware Abstraction Layer
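uCA_v1_hal.h — HAL prototypes: uca_hal_init / deinit / write32 / read32 / issue_instr / read_status / wait_idle. uCA_v1_hal.c — MMIO implementation: targets bare metal, so instead of opening /dev/mem (or a device-tree handle) and mapping the region, it points a volatile pointer directly at the AXI-Lite region at UCA_MMIO_BASE_ADDR, and writes VLIWs as paired 32-bit stores at 0x00 / 0x04.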
uCA_v1_hal.h
// ===| uCA HAL (Hardware Abstraction Layer) |=====================================
// Low-level AXI-Lite MMIO register access for the uCA NPU.
// This layer owns all physical address reads/writes.
// Nothing above this layer should touch hardware addresses directly.
//
// uCA: micro Compute Architecture — the FPGA NPU driver stack.
// Target: Kria KV260 bare-metal (no OS, no mmap)
// Interface: AXI-Lite (HPM port) at UCA_MMIO_BASE_ADDR
// ================================================================================
#ifndef UCA_V1_HAL_H
#define UCA_V1_HAL_H
#include <stdint.h>
// ===| MMIO Base Address |=======================================================
// Must match the AXI-Lite slave address assigned in the Vivado block design.
#define UCA_MMIO_BASE_ADDR 0xA0000000UL
// ===| Register Offsets |========================================================
// All offsets are byte offsets from UCA_MMIO_BASE_ADDR.
// The 64-bit instruction register is split into two 32-bit words.
// Write LO first; writing HI triggers the NPU instruction latch.
#define UCA_REG_INSTR_LO 0x00 // [31:0] lower 32 bits of 64-bit VLIW instruction
#define UCA_REG_INSTR_HI 0x04 // [63:32] upper 32 bits; writing this latches the instruction
#define UCA_REG_STATUS 0x08 // [31:0] NPU status (read-only)
// ===| Status Register Bit Fields |==============================================
#define UCA_STAT_BUSY (1U << 0) // NPU is executing — do not issue new instruction
#define UCA_STAT_DONE (1U << 1) // Last operation completed successfully
// ===| HAL Init / Teardown |=====================================================
int uca_hal_init(void); // Set MMIO base pointer and verify hardware presence
void uca_hal_deinit(void); // Nullify MMIO base pointer
// ===| Raw Register Access |=====================================================
void uca_hal_write32(uint32_t offset, uint32_t val);
uint32_t uca_hal_read32(uint32_t offset);
// ===| Instruction Issue |=======================================================
// Writes a 64-bit VLIW instruction to the NPU (LO then HI).
// Caller must ensure the NPU is idle (UCA_STAT_BUSY == 0) before calling.
void uca_hal_issue_instr(uint64_t instr);
// ===| Status Polling |==========================================================
uint32_t uca_hal_read_status(void);
int uca_hal_wait_idle(uint32_t timeout_us); // 0 = success, -1 = timeout
#endif // UCA_V1_HAL_H
uCA_v1_hal.c
// ===| uCA HAL Implementation |==================================================
// AXI-Lite MMIO access for bare-metal KV260.
// Direct pointer-based memory-mapped I/O — no OS, no mmap, no syscalls.
// ================================================================================
#include "uCA_v1_hal.h"
#include <stddef.h>
// ===| MMIO Base Pointer |=======================================================
// Volatile: prevents the compiler from optimizing away HW reads/writes.
static volatile uint32_t *g_mmio_base = NULL;
// ===| HAL Init / Teardown |=====================================================
// Point the HAL at the NPU register block and probe it.
//
// Sets the MMIO base pointer to UCA_MMIO_BASE_ADDR, then reads the status
// register once. The base pointer stays set even when the probe fails.
//
// Returns 0 when the status read looks sane, -1 when it reads 0xFFFFFFFF.
int uca_hal_init(void) {
    // On bare-metal KV260, physical addresses are directly accessible.
    g_mmio_base = (volatile uint32_t *)UCA_MMIO_BASE_ADDR;

    // An unconnected AXI bus reads back as all-ones; treat that as
    // "hardware not responding".
    const uint32_t probe = uca_hal_read32(UCA_REG_STATUS);
    return (probe == 0xFFFFFFFFU) ? -1 : 0;
}
// Release the HAL by resetting the MMIO base pointer to NULL.
// No hardware register is touched. Run uca_hal_init() again before any
// further register access.
void uca_hal_deinit(void) {
g_mmio_base = NULL;
}
// ===| Raw Register Access |=====================================================
// Write one 32-bit register.
//
// offset : byte offset from the MMIO base; converted to a word index by
//          dividing by 4, so it is expected to be 4-byte aligned
// val    : value to store
//
// If the HAL has not been initialised (base pointer is NULL) the write is
// dropped: indexing a NULL base is undefined behaviour and, on a bare-metal
// target, would scribble over whatever lives at address 0.
void uca_hal_write32(uint32_t offset, uint32_t val) {
    if (g_mmio_base == NULL) {
        return;
    }
    g_mmio_base[offset / 4] = val;
}
// Read one 32-bit register.
//
// offset : byte offset from the MMIO base; converted to a word index by
//          dividing by 4, so it is expected to be 4-byte aligned
//
// Returns the register value. If the HAL has not been initialised (base
// pointer is NULL) returns 0xFFFFFFFF instead of dereferencing NULL; this is
// the same all-ones value uca_hal_init() treats as "hardware not responding",
// and it has UCA_STAT_BUSY set, so status polls on an uninitialised HAL time
// out rather than reporting idle.
uint32_t uca_hal_read32(uint32_t offset) {
    if (g_mmio_base == NULL) {
        return 0xFFFFFFFFU;
    }
    return g_mmio_base[offset / 4];
}
// ===| Instruction Issue |=======================================================
// Push one 64-bit VLIW instruction to the NPU as two 32-bit register writes.
//
// instr : packed instruction word
//
// The store order matters: the low half goes to UCA_REG_INSTR_LO first, and
// the store of the high half to UCA_REG_INSTR_HI is what triggers the NPU
// instruction latch (ISA §8). This function does not check UCA_STAT_BUSY.
void uca_hal_issue_instr(uint64_t instr) {
    const uint32_t lo_word = (uint32_t)(instr & 0xFFFFFFFFULL);
    const uint32_t hi_word = (uint32_t)(instr >> 32);

    uca_hal_write32(UCA_REG_INSTR_LO, lo_word);
    uca_hal_write32(UCA_REG_INSTR_HI, hi_word);  // latches the instruction
}
// ===| Status Polling |==========================================================
// Read the NPU status register (UCA_REG_STATUS).
//
// Returns the raw 32-bit register value; callers test it against the
// UCA_STAT_* bit masks.
uint32_t uca_hal_read_status(void) {
    const uint32_t status = uca_hal_read32(UCA_REG_STATUS);
    return status;
}
// Poll the status register until UCA_STAT_BUSY clears or the poll budget
// runs out.
//
// timeout_us : approximate timeout in microseconds; 0 performs a single
//              non-blocking status check
//
// Returns 0 as soon as the NPU reports idle, -1 if it is still busy once the
// budget is exhausted.
int uca_hal_wait_idle(uint32_t timeout_us) {
    // Bare-metal busy-wait.
    // TODO: replace with a hardware timer once a timer driver is available.
    //
    // Budget: 400 polls per requested microsecond, i.e. one poll per cycle at
    // an assumed 400 MHz. This is uncalibrated, so the wall-clock timeout is
    // only approximate.
    //
    // The product is computed in 64 bits: in 32 bits it would wrap once
    // timeout_us exceeds UINT32_MAX / 400 (about 10.7 s) and silently turn a
    // long timeout into a short one.
    uint64_t polls_left = (uint64_t)timeout_us * 400U;

    // Test before decrementing so the status is sampled at least once, even
    // when timeout_us is 0.
    do {
        if (!(uca_hal_read_status() & UCA_STAT_BUSY)) {
            return 0;  // Idle
        }
    } while (polls_left-- != 0);

    return -1;  // Timeout
}