호스트 API (C 드라이버)¶
VLIW 명령어를 구성해 NPU 의 AXI-Lite 제어 서피스에 쓰는 호스트 측
C 라이브러리입니다. MMIO 레지스터 접근자 + IRQ 없는 wait_idle
폴링의 얇은 HAL, 그리고 ISA 와 1:1 대응하는 공개 API 로 나뉩니다.
더 보기
- pccx v001 호스트 API 개발자 레퍼런스
같은 API 의 사람이 읽기 쉬운 개발자 레퍼런스.
Public API¶
- uCA_v1_api.h — 공개 함수 프로토타입 (uca_init, uca_deinit, uca_gemv, uca_gemm, uca_cvo, uca_memcpy, uca_memset, uca_sync).
- uCA_v1_api.c — 구현. build_compute_instr / build_cvo_instr 헬퍼와 uca_memcpy / uca_memset 내부의 패킹 코드가 인자를 64 비트 VLIW 로 패킹해 HAL 에 전달.
uCA_v1_api.h
// ===| uCA API (High-Level Driver Interface) |====================================
// uCA: micro Compute Architecture — AI model acceleration API for FPGA NPU.
//
// This is the "CUDA equivalent" for the uCA NPU. Application code
// (sw/gemma3NE4B/ and future projects) should call only functions from
// this layer — never touch the HAL directly.
//
// This layer builds 64-bit VLIW instructions per the ISA spec (docs/ISA.md)
// and issues them via the HAL. The NPU frontend is fully decoupled: each
// uca_* call returns immediately after issuing to the instruction FIFO.
// Call uca_sync() to wait for all in-flight operations to complete.
//
// Encoding reference: docs/ISA.md
// ================================================================================
#ifndef UCA_V1_API_H
#define UCA_V1_API_H
#include <stdint.h>
// ===| Opcode Definitions |======================================================
// Must match ISA.md §2 and isa_x64.svh opcode_e
// The instruction builders in uCA_v1_api.c mask each opcode to 4 bits and
// place it in instruction bits [63:60].
#define UCA_OP_GEMV 0x0 // issued by uca_gemv()
#define UCA_OP_GEMM 0x1 // issued by uca_gemm()
#define UCA_OP_MEMCPY 0x2 // issued by uca_memcpy()
#define UCA_OP_MEMSET 0x3 // issued by uca_memset()
#define UCA_OP_CVO 0x4 // issued by uca_cvo()
// ===| GEMV / GEMM Flags (6-bit) |===============================================
// Must match ISA.md §4 and flags_t in isa_x64.svh
// OR these together for the `flags` argument of uca_gemv() / uca_gemm().
// The builder keeps the low 6 bits and places them in instruction bits [25:20].
// Only bits 5..3 have named constants in this header.
#define UCA_FLAG_FINDEMAX (1U << 5) // Find e_max over output (for softmax)
#define UCA_FLAG_ACCM (1U << 4) // Accumulate into dest (do not overwrite)
#define UCA_FLAG_W_SCALE (1U << 3) // Apply weight scale factor during MAC
// ===| CVO Function Codes (4-bit) |==============================================
// Must match ISA.md §3.4.1 and cvo_func_e in isa_cvo.svh
// Passed as the `cvo_func` argument of uca_cvo(); the builder masks the value
// to 4 bits and places it in instruction bits [59:56].
// Codes 0x8..0xF have no named constant in this header.
#define UCA_CVO_EXP 0x0 // Element-wise exp(x) — SFU
#define UCA_CVO_SQRT 0x1 // Element-wise sqrt(x) — SFU
#define UCA_CVO_GELU 0x2 // Element-wise GELU(x) — SFU
#define UCA_CVO_SIN 0x3 // Element-wise sin(x) — CORDIC
#define UCA_CVO_COS 0x4 // Element-wise cos(x) — CORDIC
#define UCA_CVO_REDUCE_SUM 0x5 // Sum reduction → scalar at dst — SFU+Adder
#define UCA_CVO_SCALE 0x6 // Element-wise multiply by scalar — SFU
#define UCA_CVO_RECIP 0x7 // Element-wise 1/x — SFU
// ===| CVO Flags (5-bit) |=======================================================
// Must match ISA.md §3.4.2 and cvo_flags_t in isa_cvo.svh
// OR these together for the `flags` argument of uca_cvo(). The builder keeps
// the low 5 bits and places them in instruction bits [5:1].
// Only bits 4..2 have named constants in this header.
#define UCA_CVO_FLAG_SUB_EMAX (1U << 4) // Subtract e_max before operation
#define UCA_CVO_FLAG_RECIP_SCALE (1U << 3) // Use 1/scalar for SCALE op
#define UCA_CVO_FLAG_ACCM (1U << 2) // Accumulate into dst
// ===| Memory Route Codes |======================================================
// Must match ISA.md §5 and data_route_e in isa_memctrl.svh
// Upper nibble = from_device, lower nibble = to_device
// Passed as the `route` argument of uca_memcpy().
// NOTE(review): uca_memcpy() keeps only bit 0 of each nibble (1-bit from_dev
// at [59], 1-bit to_dev at [58]). Codes whose nibbles are >= 2 therefore alias
// other routes once encoded: 0x12 encodes like 0x10, 0x21 and 0x41 like 0x01,
// and 0x31 like 0x13. Confirm the intended field widths against ISA.md §3.2/§5.
#define UCA_ROUTE_HOST_TO_L2 0x01
#define UCA_ROUTE_L2_TO_HOST 0x10
#define UCA_ROUTE_L2_TO_L1_GEMM 0x12
#define UCA_ROUTE_L2_TO_L1_GEMV 0x13
#define UCA_ROUTE_GEMV_RES_TO_L2 0x31
#define UCA_ROUTE_GEMM_RES_TO_L2 0x21
#define UCA_ROUTE_CVO_RES_TO_L2 0x41
// ===| API Init |================================================================
// uca_init   : forwards to uca_hal_init() and returns its result
//              (0 = status register readable, -1 = it read back all-ones).
// uca_deinit : forwards to uca_hal_deinit(), which clears the MMIO base pointer.
int uca_init(void); // Calls uca_hal_init() and verifies NPU is responsive
void uca_deinit(void);
// ===| Compute: Vector Core (GEMV) |=============================================
// Issue a GEMV instruction (INT4 weight × BF16/INT8 activation → BF16 out).
// The call packs the arguments into one 64-bit word with opcode UCA_OP_GEMV
// and hands it to uca_hal_issue_instr(); it does not wait for completion.
//
// dest_reg : destination register / L2 address (17-bit)
// src_addr : source fmap address (17-bit)
// flags : OR of UCA_FLAG_* constants
// size_ptr : pointer to size descriptor in shape cache (6-bit)
// shape_ptr : pointer to shape descriptor in shape cache (6-bit)
// lanes : number of active parallel μV-Core lanes (5-bit, 1–4)
//
// Every argument is masked to its field width; out-of-range values (including
// lanes outside 1–4) are not rejected or reported.
void uca_gemv(uint32_t dest_reg, uint32_t src_addr,
uint8_t flags, uint8_t size_ptr,
uint8_t shape_ptr, uint8_t lanes);
// ===| Compute: Matrix Core (GEMM) |=============================================
// Issue a GEMM instruction (systolic 32×32 array).
// Same field layout as GEMV; differs only in opcode routing (UCA_OP_GEMM).
// Arguments are masked to their field widths without validation, and the call
// returns as soon as the instruction word has been written to the HAL.
void uca_gemm(uint32_t dest_reg, uint32_t src_addr,
uint8_t flags, uint8_t size_ptr,
uint8_t shape_ptr, uint8_t lanes);
// ===| Compute: CVO Core (Complex Vector Operations) |==========================
// Issue a CVO instruction to one of the 2× μCVO-Cores.
// Used for: softmax (EXP, REDUCE_SUM, SCALE), RMSNorm (SQRT, RECIP, SCALE),
// activation functions (GELU), attention (SIN/COS for RoPE).
//
// cvo_func : one of UCA_CVO_* function codes
// src_addr : source address in L2 cache (17-bit)
// dst_addr : destination address in L2 cache (17-bit)
// length : number of elements to process (16-bit)
// flags : OR of UCA_CVO_FLAG_* constants
// async : 0=block until done, 1=fire-and-forget
//
// `async` is only encoded into instruction bit [0]. The host-side call itself
// never polls: it issues the word and returns in both cases. Use uca_sync()
// when the host needs to wait.
void uca_cvo(uint8_t cvo_func, uint32_t src_addr,
uint32_t dst_addr, uint16_t length,
uint8_t flags, uint8_t async);
// ===| Memory: MEMCPY |=========================================================
// Issue a DMA transfer between host and NPU memory, or between NPU caches.
//
// route : one of UCA_ROUTE_* constants
// dest_addr : destination address (17-bit)
// src_addr : source address (17-bit)
// shape_ptr : pointer to shape descriptor (6-bit)
// async : 0=blocking, 1=fire-and-forget
//
// `async` is only encoded into instruction bit [0]; the host-side call issues
// the word and returns without polling in both cases.
// NOTE(review): the implementation encodes just bit 0 of each route nibble,
// so several UCA_ROUTE_* codes produce the same instruction word — confirm
// against ISA.md §3.2.
void uca_memcpy(uint8_t route, uint32_t dest_addr,
uint32_t src_addr, uint8_t shape_ptr,
uint8_t async);
// ===| Memory: MEMSET |=========================================================
// Set shape descriptor values in the shape cache.
//
// dest_cache : 0=fmap_shape cache, 1=weight_shape cache
// dest_addr : target pointer address in the shape cache (6-bit)
// a, b, c : values to write (16-bit each, typically dimension sizes)
//
// dest_cache is encoded as a 2-bit field and dest_addr as a 6-bit field; both
// are masked to width without validation. The call returns once the word has
// been handed to the HAL.
void uca_memset(uint8_t dest_cache, uint8_t dest_addr,
uint16_t a, uint16_t b, uint16_t c);
// ===| Synchronization |=========================================================
// Block until all issued instructions complete (polls UCA_STAT_BUSY).
// Returns 0 on success, -1 on timeout.
// Forwards to uca_hal_wait_idle(). The timeout is turned into a busy-wait
// iteration budget rather than measured with a timer, so the wall-clock
// duration is approximate.
int uca_sync(uint32_t timeout_us);
#endif // UCA_V1_API_H
uCA_v1_api.c
// ===| uCA API Implementation |==================================================
// Builds 64-bit VLIW instructions from structured arguments and issues them
// to the NPU via the HAL. Encoding per docs/ISA.md.
// ================================================================================
#include "uCA_v1_api.h"
#include "uCA_v1_hal.h"
// ===| Instruction Builder Helpers |=============================================
// ===| build_compute_instr |===
// Assembles the 64-bit instruction word shared by GEMV and GEMM.
// Each argument is masked to its field width, shifted into position, and the
// fields are OR-ed together. Bits [2:0] are reserved and stay zero.
// Field map (ISA.md §3.1):
//   [63:60] opcode     4-bit
//   [59:43] dest_reg  17-bit
//   [42:26] src_addr  17-bit
//   [25:20] flags      6-bit
//   [19:14] size_ptr   6-bit
//   [13:8]  shape_ptr  6-bit
//   [7:3]   lanes      5-bit
//   [2:0]   reserved   3-bit
static uint64_t build_compute_instr(uint8_t opcode, uint32_t dest_reg,
                                    uint32_t src_addr, uint8_t flags,
                                    uint8_t size_ptr, uint8_t shape_ptr,
                                    uint8_t lanes) {
    const uint64_t opcode_bits = (uint64_t)(opcode & 0xF) << 60;
    const uint64_t dest_bits = (uint64_t)(dest_reg & 0x1FFFF) << 43;
    const uint64_t src_bits = (uint64_t)(src_addr & 0x1FFFF) << 26;
    const uint64_t flag_bits = (uint64_t)(flags & 0x3F) << 20;
    const uint64_t size_bits = (uint64_t)(size_ptr & 0x3F) << 14;
    const uint64_t shape_bits = (uint64_t)(shape_ptr & 0x3F) << 8;
    const uint64_t lane_bits = (uint64_t)(lanes & 0x1F) << 3;

    return opcode_bits | dest_bits | src_bits | flag_bits |
           size_bits | shape_bits | lane_bits;
}
// ===| build_cvo_instr |===
// Assembles the 64-bit CVO instruction word; the opcode field is always
// UCA_OP_CVO. Each argument is masked to its field width before placement.
// Field map (ISA.md §3.4):
//   [63:60] opcode (UCA_OP_CVO = 4'h4)  4-bit
//   [59:56] cvo_func                    4-bit
//   [55:39] src_addr                   17-bit
//   [38:22] dst_addr                   17-bit
//   [21:6]  length                     16-bit
//   [5:1]   flags                       5-bit
//   [0]     async                       1-bit
static uint64_t build_cvo_instr(uint8_t cvo_func, uint32_t src_addr,
                                uint32_t dst_addr, uint16_t length,
                                uint8_t flags, uint8_t async) {
    const uint64_t opcode_bits = (uint64_t)(UCA_OP_CVO & 0xF) << 60;
    const uint64_t func_bits = (uint64_t)(cvo_func & 0xF) << 56;
    const uint64_t src_bits = (uint64_t)(src_addr & 0x1FFFF) << 39;
    const uint64_t dst_bits = (uint64_t)(dst_addr & 0x1FFFF) << 22;
    const uint64_t len_bits = (uint64_t)(length & 0xFFFF) << 6;
    const uint64_t flag_bits = (uint64_t)(flags & 0x1F) << 1;
    const uint64_t async_bit = (uint64_t)(async & 0x1);

    return opcode_bits | func_bits | src_bits | dst_bits |
           len_bits | flag_bits | async_bit;
}
// ===| API Init |================================================================
// Bring up the driver by initialising the HAL.
// Returns whatever uca_hal_init() reports (0 on success, -1 on failure).
int uca_init(void) {
    const int hal_status = uca_hal_init();
    return hal_status;
}
// Tear down the API layer by forwarding to uca_hal_deinit().
void uca_deinit(void) {
uca_hal_deinit();
}
// ===| Compute: Vector Core (GEMV) |=============================================
// Encode a GEMV instruction (opcode UCA_OP_GEMV) from the given fields and
// hand the resulting 64-bit word to the HAL. Returns without waiting for the
// operation to finish; field values are masked by build_compute_instr().
void uca_gemv(uint32_t dest_reg, uint32_t src_addr,
              uint8_t flags, uint8_t size_ptr,
              uint8_t shape_ptr, uint8_t lanes) {
    uca_hal_issue_instr(
        build_compute_instr(UCA_OP_GEMV, dest_reg, src_addr,
                            flags, size_ptr, shape_ptr, lanes));
}
// ===| Compute: Matrix Core (GEMM) |=============================================
// Encode a GEMM instruction (opcode UCA_OP_GEMM) from the given fields and
// hand the resulting 64-bit word to the HAL. Uses the same field layout as
// uca_gemv(); returns without waiting for the operation to finish.
void uca_gemm(uint32_t dest_reg, uint32_t src_addr,
              uint8_t flags, uint8_t size_ptr,
              uint8_t shape_ptr, uint8_t lanes) {
    uca_hal_issue_instr(
        build_compute_instr(UCA_OP_GEMM, dest_reg, src_addr,
                            flags, size_ptr, shape_ptr, lanes));
}
// ===| Compute: CVO Core (Complex Vector Operation Core) |=======================
// Encode a CVO instruction via build_cvo_instr() and hand it to the HAL.
// The `async` argument only sets instruction bit [0]; this function itself
// returns right after the word is issued regardless of its value.
void uca_cvo(uint8_t cvo_func, uint32_t src_addr,
             uint32_t dst_addr, uint16_t length,
             uint8_t flags, uint8_t async) {
    uca_hal_issue_instr(
        build_cvo_instr(cvo_func, src_addr, dst_addr, length, flags, async));
}
// ===| Memory: MEMCPY |=========================================================
// Encode a MEMCPY instruction and hand it to the HAL.
// The `async` argument only sets instruction bit [0]; this function returns
// right after the word is issued regardless of its value.
void uca_memcpy(uint8_t route, uint32_t dest_addr,
                uint32_t src_addr, uint8_t shape_ptr,
                uint8_t async) {
    // Field map (ISA.md §3.2):
    //   [63:60] opcode     4-bit
    //   [59]    from_dev   1-bit (taken from the upper nibble of route)
    //   [58]    to_dev     1-bit (taken from the lower nibble of route)
    //   [57:41] dest_addr 17-bit
    //   [40:24] src_addr  17-bit
    //   [23:7]  aux_addr  17-bit (reserved, zero)
    //   [6:1]   shape_ptr  6-bit
    //   [0]     async      1-bit
    //
    // NOTE(review): only bit 0 of each route nibble survives the 1-bit fields,
    // so route codes with nibble values >= 2 alias other routes. Confirm the
    // intended encoding against ISA.md.
    const uint64_t from_bit = (uint64_t)((route >> 4) & 0x1);
    const uint64_t to_bit = (uint64_t)(route & 0x1);

    // aux_addr [23:7] is never OR-ed in, so it stays zero.
    const uint64_t word =
        ((uint64_t)(UCA_OP_MEMCPY & 0xF) << 60) |
        (from_bit << 59) |
        (to_bit << 58) |
        ((uint64_t)(dest_addr & 0x1FFFF) << 41) |
        ((uint64_t)(src_addr & 0x1FFFF) << 24) |
        ((uint64_t)(shape_ptr & 0x3F) << 1) |
        (uint64_t)(async & 0x1);

    uca_hal_issue_instr(word);
}
// ===| Memory: MEMSET |=========================================================
// Encode a MEMSET instruction (writes three 16-bit values to a shape-cache
// entry) and hand it to the HAL. Arguments are masked to their field widths.
void uca_memset(uint8_t dest_cache, uint8_t dest_addr,
                uint16_t a, uint16_t b, uint16_t c) {
    // Field map (ISA.md §3.3):
    //   [63:60] opcode      4-bit
    //   [59:58] dest_cache  2-bit
    //   [57:52] dest_addr   6-bit
    //   [51:36] a_value    16-bit
    //   [35:20] b_value    16-bit
    //   [19:4]  c_value    16-bit
    //   [3:0]   reserved    4-bit (left zero)
    const uint64_t word =
        ((uint64_t)(UCA_OP_MEMSET & 0xF) << 60) |
        ((uint64_t)(dest_cache & 0x3) << 58) |
        ((uint64_t)(dest_addr & 0x3F) << 52) |
        ((uint64_t)(a & 0xFFFF) << 36) |
        ((uint64_t)(b & 0xFFFF) << 20) |
        ((uint64_t)(c & 0xFFFF) << 4);

    uca_hal_issue_instr(word);
}
// ===| Synchronization |=========================================================
// Wait for the NPU to go idle by delegating to the HAL poll loop.
// Returns the HAL result unchanged: 0 when idle was observed, -1 on timeout.
int uca_sync(uint32_t timeout_us) {
    const int wait_result = uca_hal_wait_idle(timeout_us);
    return wait_result;
}
하드웨어 추상화 계층 (HAL)¶
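- uCA_v1_hal.h — HAL 프로토타입: uca_hal_init / deinit / write32 / read32 / issue_instr / read_status / wait_idle.
- uCA_v1_hal.c — MMIO 구현. 아래 소스는 bare-metal 용으로, /dev/mem (또는 device tree 핸들) 을 열어 매핑하는 대신 UCA_MMIO_BASE_ADDR 를 volatile 포인터로 직접 접근하며, 0x00 / 0x04 에 32 비트 쓰기 한 쌍 (LO → HI) 으로 VLIW 를 발행.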
uCA_v1_hal.h
// ===| uCA HAL (Hardware Abstraction Layer) |=====================================
// Low-level AXI-Lite MMIO register access for the uCA NPU.
// This layer owns all physical address reads/writes.
// Nothing above this layer should touch hardware addresses directly.
//
// uCA: micro Compute Architecture — the FPGA NPU driver stack.
// Target: Kria KV260 bare-metal (no OS, no mmap)
// Interface: AXI-Lite (HPM port) at UCA_MMIO_BASE_ADDR
// ================================================================================
#ifndef UCA_V1_HAL_H
#define UCA_V1_HAL_H
#include <stdint.h>
// ===| MMIO Base Address |=======================================================
// Must match the AXI-Lite slave address assigned in the Vivado block design.
// uca_hal_init() casts this value directly to a volatile uint32_t pointer.
#define UCA_MMIO_BASE_ADDR 0xA0000000UL
// ===| Register Offsets |========================================================
// All offsets are byte offsets from UCA_MMIO_BASE_ADDR.
// The raw accessors divide the offset by 4 to index 32-bit words, so every
// offset must be a multiple of 4.
// The 64-bit instruction register is split into two 32-bit words.
// Write LO first; writing HI triggers the NPU instruction latch.
#define UCA_REG_INSTR_LO 0x00 // [31:0] lower 32 bits of 64-bit VLIW instruction
#define UCA_REG_INSTR_HI 0x04 // [63:32] upper 32 bits; writing this latches the instruction
#define UCA_REG_STATUS 0x08 // [31:0] NPU status (read-only)
// ===| Status Register Bit Fields |==============================================
// Bit masks for the value read from UCA_REG_STATUS.
// uca_hal_wait_idle() polls UCA_STAT_BUSY. UCA_STAT_DONE is defined here but
// is not read by the HAL/API sources shown alongside this header.
#define UCA_STAT_BUSY (1U << 0) // NPU is executing — do not issue new instruction
#define UCA_STAT_DONE (1U << 1) // Last operation completed successfully
// ===| HAL Init / Teardown |=====================================================
// uca_hal_init returns 0 on success, or -1 when UCA_REG_STATUS reads back as
// 0xFFFFFFFF (treated as "hardware not responding").
int uca_hal_init(void); // Set MMIO base pointer and verify hardware presence
void uca_hal_deinit(void); // Nullify MMIO base pointer
// ===| Raw Register Access |=====================================================
// 32-bit store / load on the register at byte `offset` from the MMIO base.
// `offset` is converted to a word index by integer division by 4, so its low
// two bits are ignored. Call uca_hal_init() before using either accessor.
void uca_hal_write32(uint32_t offset, uint32_t val);
uint32_t uca_hal_read32(uint32_t offset);
// ===| Instruction Issue |=======================================================
// Writes a 64-bit VLIW instruction to the NPU (LO then HI).
// Caller must ensure the NPU is idle (UCA_STAT_BUSY == 0) before calling.
// The function itself performs only the two register writes; it neither reads
// the status register nor waits.
void uca_hal_issue_instr(uint64_t instr);
// ===| Status Polling |==========================================================
// uca_hal_read_status : returns the raw 32-bit value of UCA_REG_STATUS.
// uca_hal_wait_idle   : busy-polls until UCA_STAT_BUSY reads as 0. timeout_us
//                       is translated into a poll-iteration budget (no hardware
//                       timer is used), so the wall-clock limit is approximate.
uint32_t uca_hal_read_status(void);
int uca_hal_wait_idle(uint32_t timeout_us); // 0 = success, -1 = timeout
#endif // UCA_V1_HAL_H
uCA_v1_hal.c
// ===| uCA HAL Implementation |==================================================
// AXI-Lite MMIO access for bare-metal KV260.
// Direct pointer-based memory-mapped I/O — no OS, no mmap, no syscalls.
// ================================================================================
#include "uCA_v1_hal.h"
#include <stddef.h>
// ===| MMIO Base Pointer |=======================================================
// Volatile: prevents the compiler from optimizing away HW reads/writes.
// Starts as NULL; uca_hal_init() points it at UCA_MMIO_BASE_ADDR and
// uca_hal_deinit() resets it to NULL. The raw register accessors index it in
// 32-bit words.
static volatile uint32_t *g_mmio_base = NULL;
// ===| HAL Init / Teardown |=====================================================
// Point the HAL at the NPU register window and probe for the hardware.
// Returns 0 when the status register yields anything other than all-ones,
// -1 otherwise. The base pointer is set in either case.
int uca_hal_init(void) {
    // Bare-metal KV260: the physical address is used as-is, no mapping step.
    g_mmio_base = (volatile uint32_t *)UCA_MMIO_BASE_ADDR;

    // An unconnected AXI bus reads back as all-ones, so treat that pattern as
    // "hardware not responding".
    const uint32_t probe = uca_hal_read32(UCA_REG_STATUS);
    return (probe == 0xFFFFFFFFU) ? -1 : 0;
}
// Detach the HAL from the hardware by resetting the MMIO base pointer to NULL.
// No register is written as part of teardown.
void uca_hal_deinit(void) {
g_mmio_base = NULL;
}
// ===| Raw Register Access |=====================================================
// Store `val` to the 32-bit register at byte `offset` from the MMIO base.
// `offset` is converted to a word index by dividing by 4.
// If the HAL is not initialised (base pointer is NULL, i.e. before
// uca_hal_init() or after uca_hal_deinit()) the write is dropped instead of
// dereferencing NULL, which on a bare-metal target would silently store to
// low memory.
void uca_hal_write32(uint32_t offset, uint32_t val) {
    if (g_mmio_base == NULL) {
        return; // HAL not attached to hardware: nothing to write to
    }
    g_mmio_base[offset / 4] = val;
}
// Load the 32-bit register at byte `offset` from the MMIO base.
// `offset` is converted to a word index by dividing by 4.
// If the HAL is not initialised (base pointer is NULL) this returns
// 0xFFFFFFFF — the same all-ones pattern uca_hal_init() treats as "hardware
// not responding" — instead of dereferencing NULL.
uint32_t uca_hal_read32(uint32_t offset) {
    if (g_mmio_base == NULL) {
        return 0xFFFFFFFFU; // mimic an unconnected bus
    }
    return g_mmio_base[offset / 4];
}
// ===| Instruction Issue |=======================================================
// Push one 64-bit instruction to the NPU as two 32-bit register writes.
// Order matters: the low word goes first because the write to the high word
// is what triggers the NPU instruction latch (ISA §8).
void uca_hal_issue_instr(uint64_t instr) {
    const uint32_t lo_word = (uint32_t)instr;
    const uint32_t hi_word = (uint32_t)(instr >> 32);

    uca_hal_write32(UCA_REG_INSTR_LO, lo_word);
    uca_hal_write32(UCA_REG_INSTR_HI, hi_word);
}
// ===| Status Polling |==========================================================
// Return the current raw value of the NPU status register (UCA_REG_STATUS).
uint32_t uca_hal_read_status(void) {
    const uint32_t status_word = uca_hal_read32(UCA_REG_STATUS);
    return status_word;
}
// Busy-poll the status register until UCA_STAT_BUSY clears.
//
// timeout_us : approximate limit; converted to a poll-iteration budget of
//              400 polls per microsecond (an uncalibrated estimate — the real
//              duration depends on CPU clock and AXI read latency).
// Returns 0 as soon as the NPU is observed idle, -1 once the budget runs out.
//
// The status register is always sampled at least once, so timeout_us == 0
// acts as a non-blocking "is it idle right now?" check.
// TODO: replace with a hardware timer once a timer driver is available.
int uca_hal_wait_idle(uint32_t timeout_us) {
    // 64-bit budget: timeout_us * 400 would wrap in uint32_t for timeouts
    // above ~10.7 s and silently shorten the wait.
    uint64_t polls_left = (uint64_t)timeout_us * 400U;

    do {
        if ((uca_hal_read_status() & UCA_STAT_BUSY) == 0U) {
            return 0; // Idle
        }
    } while (polls_left-- != 0U);

    return -1; // Timeout
}