호스트 API (C 드라이버)

VLIW 명령어를 구성해 NPU 의 AXI-Lite 제어 서피스에 쓰는 호스트 측 C 라이브러리입니다. MMIO 레지스터 접근자 + IRQ 없는 wait_idle 폴링의 얇은 HAL, 그리고 ISA 와 1:1 대응하는 공개 API 로 나뉩니다.

더 보기

pccx v001 호스트 API 개발자 레퍼런스

같은 API 의 사람이 읽기 쉬운 개발자 레퍼런스.

Public API

  • uCA_v1_api.h — 공개 함수 프로토타입 (uca_init, uca_deinit, uca_gemv, uca_gemm, uca_cvo, uca_memcpy, uca_memset, uca_sync).

  • uCA_v1_api.c — 구현. 오피코드별 build_*_instr 헬퍼가 인자를 64 비트 VLIW 로 패킹해 HAL 에 전달.

uCA_v1_api.h
// ===| uCA API (High-Level Driver Interface) |====================================
// uCA: micro Compute Architecture — AI model acceleration API for FPGA NPU.
//
// This is the "CUDA equivalent" for the uCA NPU. Application code
// (sw/gemma3NE4B/ and future projects) should call only functions from
// this layer — never touch the HAL directly.
//
// This layer builds 64-bit VLIW instructions per the ISA spec (docs/ISA.md)
// and issues them via the HAL. The NPU frontend is fully decoupled: each
// uca_* call returns immediately after issuing to the instruction FIFO.
// Call uca_sync() to wait for all in-flight operations to complete.
//
// Encoding reference: docs/ISA.md
// ================================================================================

#ifndef UCA_V1_API_H
#define UCA_V1_API_H

#include <stdint.h>

// ===| Opcode Definitions |======================================================
// Must match ISA.md §2 and isa_x64.svh opcode_e
// These values are placed in bits [63:60] of every instruction word by the
// builders in uCA_v1_api.c (masked to 4 bits).
#define UCA_OP_GEMV    0x0
#define UCA_OP_GEMM    0x1
#define UCA_OP_MEMCPY  0x2
#define UCA_OP_MEMSET  0x3
#define UCA_OP_CVO     0x4

// ===| GEMV / GEMM Flags (6-bit) |===============================================
// Must match ISA.md §4 and flags_t in isa_x64.svh
// Only bits 5..3 have names here; bits 2..0 of the 6-bit field are not
// defined in this header.
#define UCA_FLAG_FINDEMAX  (1U << 5)  // Find e_max over output (for softmax)
#define UCA_FLAG_ACCM      (1U << 4)  // Accumulate into dest (do not overwrite)
#define UCA_FLAG_W_SCALE   (1U << 3)  // Apply weight scale factor during MAC

// ===| CVO Function Codes (4-bit) |==============================================
// Must match ISA.md §3.4.1 and cvo_func_e in isa_cvo.svh
// Codes 0x8..0xF fit in the 4-bit field but have no name in this header.
#define UCA_CVO_EXP          0x0  // Element-wise exp(x)             — SFU
#define UCA_CVO_SQRT         0x1  // Element-wise sqrt(x)            — SFU
#define UCA_CVO_GELU         0x2  // Element-wise GELU(x)            — SFU
#define UCA_CVO_SIN          0x3  // Element-wise sin(x)             — CORDIC
#define UCA_CVO_COS          0x4  // Element-wise cos(x)             — CORDIC
#define UCA_CVO_REDUCE_SUM   0x5  // Sum reduction → scalar at dst   — SFU+Adder
#define UCA_CVO_SCALE        0x6  // Element-wise multiply by scalar — SFU
#define UCA_CVO_RECIP        0x7  // Element-wise 1/x                — SFU

// ===| CVO Flags (5-bit) |=======================================================
// Must match ISA.md §3.4.2 and cvo_flags_t in isa_cvo.svh
// Only bits 4..2 have names here; bits 1..0 of the 5-bit field are not
// defined in this header.
#define UCA_CVO_FLAG_SUB_EMAX      (1U << 4)  // Subtract e_max before operation
#define UCA_CVO_FLAG_RECIP_SCALE   (1U << 3)  // Use 1/scalar for SCALE op
#define UCA_CVO_FLAG_ACCM          (1U << 2)  // Accumulate into dst

// ===| Memory Route Codes |======================================================
// Must match ISA.md §5 and data_route_e in isa_memctrl.svh
// Upper nibble = from_device, lower nibble = to_device
// NOTE(review): uca_memcpy() in uCA_v1_api.c packs only bit 0 of each nibble
// into the instruction word, so codes that differ only in the higher nibble
// bits (e.g. 0x10 vs 0x12, or 0x01 vs 0x21 vs 0x41) encode identically —
// confirm the intended field widths against ISA.md.
#define UCA_ROUTE_HOST_TO_L2        0x01
#define UCA_ROUTE_L2_TO_HOST        0x10
#define UCA_ROUTE_L2_TO_L1_GEMM    0x12
#define UCA_ROUTE_L2_TO_L1_GEMV    0x13
#define UCA_ROUTE_GEMV_RES_TO_L2   0x31
#define UCA_ROUTE_GEMM_RES_TO_L2   0x21
#define UCA_ROUTE_CVO_RES_TO_L2    0x41

// ===| API Init |================================================================
// uca_init   : forwards to uca_hal_init() and returns its result unchanged
//              (0 = OK, -1 = status register read back as 0xFFFFFFFF).
// uca_deinit : forwards to uca_hal_deinit(), which clears the HAL's MMIO
//              base pointer. No other teardown is performed at this layer.
int  uca_init(void);    // Calls uca_hal_init() and verifies NPU is responsive
void uca_deinit(void);

// ===| Compute: Vector Core (GEMV) |=============================================
// Issue a GEMV instruction (INT4 weight × BF16/INT8 activation → BF16 out).
//
// The call only encodes the arguments into one 64-bit instruction word and
// writes it to the NPU; it does not wait for completion and reports no status.
// Arguments wider than their field are silently truncated (masked) by the
// encoder — they are not range-checked.
//
//   dest_reg   : destination register / L2 address (17-bit)
//   src_addr   : source fmap address (17-bit)
//   flags      : OR of UCA_FLAG_* constants
//   size_ptr   : pointer to size descriptor in shape cache (6-bit)
//   shape_ptr  : pointer to shape descriptor in shape cache (6-bit)
//   lanes      : number of active parallel μV-Core lanes (5-bit, 1–4)
//                (the 1–4 range is not enforced by the driver)
void uca_gemv(uint32_t dest_reg,   uint32_t src_addr,
              uint8_t  flags,      uint8_t  size_ptr,
              uint8_t  shape_ptr,  uint8_t  lanes);

// ===| Compute: Matrix Core (GEMM) |=============================================
// Issue a GEMM instruction (systolic 32×32 array).
// Same field layout as GEMV; differs only in opcode routing.
// Parameters have the same meaning and widths as for uca_gemv(); the host
// side shares one encoder for both and only the opcode (UCA_OP_GEMM) differs.
// Like uca_gemv(), the call does not wait for completion.
void uca_gemm(uint32_t dest_reg,   uint32_t src_addr,
              uint8_t  flags,      uint8_t  size_ptr,
              uint8_t  shape_ptr,  uint8_t  lanes);

// ===| Compute: CVO Core (Complex Vector Operations) |==========================
// Issue a CVO instruction to one of the 2× μCVO-Cores.
// Used for: softmax (EXP, REDUCE_SUM, SCALE), RMSNorm (SQRT, RECIP, SCALE),
//           activation functions (GELU), attention (SIN/COS for RoPE).
//
// Arguments wider than their field are silently truncated (masked).
//
//   cvo_func   : one of UCA_CVO_* function codes
//   src_addr   : source address in L2 cache (17-bit)
//   dst_addr   : destination address in L2 cache (17-bit)
//   length     : number of elements to process (16-bit)
//   flags      : OR of UCA_CVO_FLAG_* constants
//   async      : 0=block until done, 1=fire-and-forget
//                Only bit 0 is encoded into the instruction. The host-side
//                call itself never waits in either case; any blocking is
//                presumably done by the NPU — confirm against ISA.md.
void uca_cvo(uint8_t  cvo_func,   uint32_t src_addr,
             uint32_t dst_addr,   uint16_t length,
             uint8_t  flags,      uint8_t  async);

// ===| Memory: MEMCPY |=========================================================
// Issue a DMA transfer between host and NPU memory, or between NPU caches.
//
//   route      : one of UCA_ROUTE_* constants
//                NOTE(review): the encoder keeps only bit 0 of each route
//                nibble (1-bit from_dev, 1-bit to_dev), so several
//                UCA_ROUTE_* values produce the same instruction word —
//                confirm against ISA.md.
//   dest_addr  : destination address (17-bit)
//   src_addr   : source address (17-bit)
//   shape_ptr  : pointer to shape descriptor (6-bit)
//   async      : 0=blocking, 1=fire-and-forget
//                Only bit 0 is encoded; the host-side call never waits.
void uca_memcpy(uint8_t  route,    uint32_t dest_addr,
                uint32_t src_addr, uint8_t  shape_ptr,
                uint8_t  async);

// ===| Memory: MEMSET |=========================================================
// Set shape descriptor values in the shape cache.
//
//   dest_cache : 0=fmap_shape cache, 1=weight_shape cache
//                (encoded as a 2-bit field; values 2 and 3 are not named here)
//   dest_addr  : target pointer address in the shape cache (6-bit)
//   a, b, c    : values to write (16-bit each, typically dimension sizes)
//
// The call writes one instruction word and returns without waiting.
void uca_memset(uint8_t  dest_cache, uint8_t  dest_addr,
                uint16_t a,          uint16_t b,  uint16_t c);

// ===| Synchronization |=========================================================
// Block until all issued instructions complete (polls UCA_STAT_BUSY).
// Returns 0 on success, -1 on timeout.
// timeout_us is passed straight to uca_hal_wait_idle(); that routine busy-waits
// on an iteration count rather than a hardware timer, so the timeout is only
// an approximation of wall-clock microseconds.
int uca_sync(uint32_t timeout_us);

#endif // UCA_V1_API_H
uCA_v1_api.c
// ===| uCA API Implementation |==================================================
// Builds 64-bit VLIW instructions from structured arguments and issues them
// to the NPU via the HAL. Encoding per docs/ISA.md.
// ================================================================================

#include "uCA_v1_api.h"
#include "uCA_v1_hal.h"

// ===| Instruction Builder Helpers |=============================================

// ===| build_compute_instr |===
// Encodes one GEMV/GEMM instruction as a 64-bit VLIW word (ISA.md §3.1).
// Each argument is masked to its field width, then shifted into place:
//
//   field       bits      width
//   opcode      [63:60]   4
//   dest_reg    [59:43]   17
//   src_addr    [42:26]   17
//   flags       [25:20]   6
//   size_ptr    [19:14]   6
//   shape_ptr   [13:8]    6
//   lanes       [7:3]     5
//   reserved    [2:0]     3   (always zero)
//
// Out-of-range argument bits are dropped silently, never reported.
static uint64_t build_compute_instr(uint8_t  opcode,    uint32_t dest_reg,
                                    uint32_t src_addr,  uint8_t  flags,
                                    uint8_t  size_ptr,  uint8_t  shape_ptr,
                                    uint8_t  lanes) {
    const uint64_t op_field    = (uint64_t)(opcode    & 0xF)     << 60;
    const uint64_t dest_field  = (uint64_t)(dest_reg  & 0x1FFFF) << 43;
    const uint64_t src_field   = (uint64_t)(src_addr  & 0x1FFFF) << 26;
    const uint64_t flag_field  = (uint64_t)(flags     & 0x3F)    << 20;
    const uint64_t size_field  = (uint64_t)(size_ptr  & 0x3F)    << 14;
    const uint64_t shape_field = (uint64_t)(shape_ptr & 0x3F)    <<  8;
    const uint64_t lane_field  = (uint64_t)(lanes     & 0x1F)    <<  3;

    // Fields occupy disjoint bit ranges, so OR-ing them composes the word.
    return op_field | dest_field | src_field | flag_field
         | size_field | shape_field | lane_field;
}

// ===| build_cvo_instr |===
// Encodes one CVO instruction as a 64-bit VLIW word (ISA.md §3.4).
// The opcode is fixed to UCA_OP_CVO; every argument is masked to its field
// width before being shifted into place:
//
//   field       bits      width
//   opcode      [63:60]   4    (UCA_OP_CVO)
//   cvo_func    [59:56]   4
//   src_addr    [55:39]   17
//   dst_addr    [38:22]   17
//   length      [21:6]    16
//   flags       [5:1]     5
//   async       [0]       1
//
// Out-of-range argument bits are dropped silently, never reported.
static uint64_t build_cvo_instr(uint8_t  cvo_func,  uint32_t src_addr,
                                uint32_t dst_addr,  uint16_t length,
                                uint8_t  flags,     uint8_t  async) {
    const uint64_t op_field    = (uint64_t)(UCA_OP_CVO & 0xF)    << 60;
    const uint64_t func_field  = (uint64_t)(cvo_func   & 0xF)    << 56;
    const uint64_t src_field   = (uint64_t)(src_addr   & 0x1FFFF) << 39;
    const uint64_t dst_field   = (uint64_t)(dst_addr   & 0x1FFFF) << 22;
    const uint64_t len_field   = (uint64_t)(length     & 0xFFFF) <<  6;
    const uint64_t flag_field  = (uint64_t)(flags      & 0x1F)   <<  1;
    const uint64_t async_field = (uint64_t)(async      & 0x1)    <<  0;

    // Fields occupy disjoint bit ranges, so OR-ing them composes the word.
    return op_field | func_field | src_field | dst_field
         | len_field | flag_field | async_field;
}

// ===| API Init |================================================================
// Bring up the driver by delegating to the HAL.
// Returns whatever uca_hal_init() returns: 0 on success, -1 when the status
// register reads back as 0xFFFFFFFF. No additional checks are done here.
int uca_init(void) {
    return uca_hal_init();
}

// Tear down the driver by delegating to the HAL, which clears its MMIO base
// pointer. Issuing instructions after this call dereferences a NULL base.
void uca_deinit(void) {
    uca_hal_deinit();
}

// ===| Compute: Vector Core (GEMV) |=============================================
// Encode the arguments as a GEMV instruction (opcode UCA_OP_GEMV) and hand
// the resulting 64-bit word to the HAL. Does not wait for completion and
// does not check the NPU status before issuing.
void uca_gemv(uint32_t dest_reg,   uint32_t src_addr,
              uint8_t  flags,      uint8_t  size_ptr,
              uint8_t  shape_ptr,  uint8_t  lanes) {
    uca_hal_issue_instr(build_compute_instr(UCA_OP_GEMV, dest_reg, src_addr,
                                            flags, size_ptr, shape_ptr,
                                            lanes));
}

// ===| Compute: Matrix Core (GEMM) |=============================================
// Encode the arguments as a GEMM instruction (opcode UCA_OP_GEMM) and hand
// the resulting 64-bit word to the HAL. Uses the same encoder as uca_gemv();
// only the opcode differs. Does not wait for completion.
void uca_gemm(uint32_t dest_reg,   uint32_t src_addr,
              uint8_t  flags,      uint8_t  size_ptr,
              uint8_t  shape_ptr,  uint8_t  lanes) {
    uca_hal_issue_instr(build_compute_instr(UCA_OP_GEMM, dest_reg, src_addr,
                                            flags, size_ptr, shape_ptr,
                                            lanes));
}

// ===| Compute: CVO Core (Complex Vector Operation Core) |=======================
// Encode the arguments as a CVO instruction and hand the resulting 64-bit
// word to the HAL. The async argument is only encoded into bit 0 of the
// instruction; this function itself returns right after issuing either way.
void uca_cvo(uint8_t  cvo_func,  uint32_t src_addr,
             uint32_t dst_addr,  uint16_t length,
             uint8_t  flags,     uint8_t  async) {
    uca_hal_issue_instr(build_cvo_instr(cvo_func, src_addr, dst_addr,
                                        length, flags, async));
}

// ===| Memory: MEMCPY |=========================================================
// Encode a MEMCPY instruction and hand the 64-bit word to the HAL.
//
// Word layout (ISA.md §3.2):
//
//   field       bits      width
//   opcode      [63:60]   4    (UCA_OP_MEMCPY)
//   from_dev    [59]      1    (bit 0 of the route's upper nibble)
//   to_dev      [58]      1    (bit 0 of the route's lower nibble)
//   dest_addr   [57:41]   17
//   src_addr    [40:24]   17
//   aux_addr    [23:7]    17   (reserved, always zero)
//   shape_ptr   [6:1]     6
//   async       [0]       1
//
// NOTE(review): because only one bit of each route nibble is kept, distinct
// UCA_ROUTE_* codes collapse to the same encoding (0x10 and 0x12; 0x01, 0x21
// and 0x41; 0x13 and 0x31). Confirm the from/to field widths against ISA.md.
//
// The function returns right after issuing; async is only encoded, not acted
// on by the host.
void uca_memcpy(uint8_t  route,    uint32_t dest_addr,
                uint32_t src_addr, uint8_t  shape_ptr,
                uint8_t  async) {
    // Bit 4 of route is the LSB of the from-device nibble, bit 0 the LSB of
    // the to-device nibble.
    const uint64_t from_field  = (uint64_t)((route >> 4) & 0x1)   << 59;
    const uint64_t to_field    = (uint64_t)(route & 0x1)          << 58;

    const uint64_t op_field    = (uint64_t)(UCA_OP_MEMCPY & 0xF)  << 60;
    const uint64_t dest_field  = (uint64_t)(dest_addr & 0x1FFFF)  << 41;
    const uint64_t src_field   = (uint64_t)(src_addr  & 0x1FFFF)  << 24;
    const uint64_t shape_field = (uint64_t)(shape_ptr & 0x3F)     <<  1;
    const uint64_t async_field = (uint64_t)(async     & 0x1)      <<  0;

    // aux_addr [23:7] is intentionally left as zero.
    uca_hal_issue_instr(op_field | from_field | to_field | dest_field
                        | src_field | shape_field | async_field);
}

// ===| Memory: MEMSET |=========================================================
// Encode a MEMSET instruction and hand the 64-bit word to the HAL.
//
// Word layout (ISA.md §3.3):
//
//   field       bits      width
//   opcode      [63:60]   4    (UCA_OP_MEMSET)
//   dest_cache  [59:58]   2
//   dest_addr   [57:52]   6
//   a_value     [51:36]   16
//   b_value     [35:20]   16
//   c_value     [19:4]    16
//   reserved    [3:0]     4    (always zero)
//
// Arguments are masked to their field width; the call does not wait for the
// NPU to finish.
void uca_memset(uint8_t  dest_cache, uint8_t  dest_addr,
                uint16_t a,          uint16_t b,  uint16_t c) {
    const uint64_t op_field    = (uint64_t)(UCA_OP_MEMSET & 0xF) << 60;
    const uint64_t cache_field = (uint64_t)(dest_cache & 0x3)    << 58;
    const uint64_t addr_field  = (uint64_t)(dest_addr  & 0x3F)   << 52;
    const uint64_t a_field     = (uint64_t)(a          & 0xFFFF) << 36;
    const uint64_t b_field     = (uint64_t)(b          & 0xFFFF) << 20;
    const uint64_t c_field     = (uint64_t)(c          & 0xFFFF) <<  4;

    uca_hal_issue_instr(op_field | cache_field | addr_field
                        | a_field | b_field | c_field);
}

// ===| Synchronization |=========================================================
// Wait for the NPU to go idle by delegating to uca_hal_wait_idle().
//   timeout_us : forwarded unchanged to the HAL.
// Returns the HAL's result: 0 when the busy bit cleared, -1 on timeout.
int uca_sync(uint32_t timeout_us) {
    return uca_hal_wait_idle(timeout_us);
}

하드웨어 추상화 계층 (HAL)

  • uCA_v1_hal.h — HAL 프로토타입: uca_hal_init / uca_hal_deinit / uca_hal_write32 / uca_hal_read32 / uca_hal_issue_instr / uca_hal_read_status / uca_hal_wait_idle.

  • uCA_v1_hal.c — 베어메탈 MMIO 구현. OS 나 mmap 없이 UCA_MMIO_BASE_ADDR (0xA0000000) 의 AXI-Lite 영역을 volatile 포인터로 직접 접근하고, 0x00 / 0x04 에 32 비트 쓰기 한 쌍 (LO 먼저, HI 가 래치 트리거) 으로 VLIW 를 발행.

uCA_v1_hal.h
// ===| uCA HAL (Hardware Abstraction Layer) |=====================================
// Low-level AXI-Lite MMIO register access for the uCA NPU.
// This layer owns all physical address reads/writes.
// Nothing above this layer should touch hardware addresses directly.
//
// uCA: micro Compute Architecture — the FPGA NPU driver stack.
// Target: Kria KV260 bare-metal (no OS, no mmap)
// Interface: AXI-Lite (HPM port) at UCA_MMIO_BASE_ADDR
// ================================================================================

#ifndef UCA_V1_HAL_H
#define UCA_V1_HAL_H

#include <stdint.h>

// ===| MMIO Base Address |=======================================================
// Must match the AXI-Lite slave address assigned in the Vivado block design.
// uca_hal_init() casts this value directly to a pointer (no address
// translation), so it is used as-is by every register access.
#define UCA_MMIO_BASE_ADDR  0xA0000000UL

// ===| Register Offsets |========================================================
// All offsets are byte offsets from UCA_MMIO_BASE_ADDR.
// The register accessors convert an offset to a 32-bit word index by dividing
// by 4, so every offset must be a multiple of 4.
// The 64-bit instruction register is split into two 32-bit words.
// Write LO first; writing HI triggers the NPU instruction latch.
#define UCA_REG_INSTR_LO    0x00  // [31:0]  lower 32 bits of 64-bit VLIW instruction
#define UCA_REG_INSTR_HI    0x04  // [63:32] upper 32 bits; writing this latches the instruction
#define UCA_REG_STATUS      0x08  // [31:0]  NPU status (read-only)

// ===| Status Register Bit Fields |==============================================
// Only UCA_STAT_BUSY is consulted by uca_hal_wait_idle(); UCA_STAT_DONE is
// defined but not read by the HAL code in uCA_v1_hal.c.
#define UCA_STAT_BUSY       (1U << 0)  // NPU is executing — do not issue new instruction
#define UCA_STAT_DONE       (1U << 1)  // Last operation completed successfully

// ===| HAL Init / Teardown |=====================================================
// uca_hal_init returns 0 on success, -1 when UCA_REG_STATUS reads 0xFFFFFFFF.
int  uca_hal_init(void);    // Set MMIO base pointer and verify hardware presence
void uca_hal_deinit(void);  // Nullify MMIO base pointer

// ===| Raw Register Access |=====================================================
// offset is a byte offset from UCA_MMIO_BASE_ADDR and must be a multiple of 4.
// Neither accessor checks that uca_hal_init() has been called; using them
// before init (or after deinit) dereferences a NULL base pointer.
void     uca_hal_write32(uint32_t offset, uint32_t val);
uint32_t uca_hal_read32(uint32_t offset);

// ===| Instruction Issue |=======================================================
// Writes a 64-bit VLIW instruction to the NPU (LO then HI).
// Caller must ensure the NPU is idle (UCA_STAT_BUSY == 0) before calling.
// NOTE(review): the API layer (uCA_v1_api.c) calls this without checking
// UCA_STAT_BUSY and documents an instruction FIFO — confirm which contract
// the hardware actually implements.
void uca_hal_issue_instr(uint64_t instr);

// ===| Status Polling |==========================================================
// uca_hal_read_status returns the raw UCA_REG_STATUS value.
// uca_hal_wait_idle busy-polls until UCA_STAT_BUSY is clear; the timeout is
// derived from an iteration count, not a hardware timer.
uint32_t uca_hal_read_status(void);
int      uca_hal_wait_idle(uint32_t timeout_us);  // 0 = success, -1 = timeout

#endif // UCA_V1_HAL_H
uCA_v1_hal.c
// ===| uCA HAL Implementation |==================================================
// AXI-Lite MMIO access for bare-metal KV260.
// Direct pointer-based memory-mapped I/O — no OS, no mmap, no syscalls.
// ================================================================================

#include "uCA_v1_hal.h"
#include <stddef.h>

// ===| MMIO Base Pointer |=======================================================
// Volatile: prevents the compiler from optimizing away HW reads/writes.
// NULL until uca_hal_init() runs and again after uca_hal_deinit(). Indexed in
// units of 32-bit words by the register accessors, which do not NULL-check it.
static volatile uint32_t *g_mmio_base = NULL;

// ===| HAL Init / Teardown |=====================================================
// Point the HAL at the NPU's register block and probe it once.
//
// Returns 0 when the status register reads anything other than 0xFFFFFFFF,
// and -1 when it reads exactly 0xFFFFFFFF, which is treated as "hardware not
// responding" (an unconnected AXI bus is expected to read as all-ones).
// The base pointer stays set even when -1 is returned.
int uca_hal_init(void) {
    // Bare-metal: the physical address is used directly as the pointer value.
    g_mmio_base = (volatile uint32_t *)UCA_MMIO_BASE_ADDR;

    // Single probe read of the status register decides the outcome.
    return (uca_hal_read32(UCA_REG_STATUS) == 0xFFFFFFFFU) ? -1 : 0;
}

// Drop the MMIO mapping by resetting the base pointer to NULL.
// No hardware register is touched; any register access made afterwards
// (before another uca_hal_init) dereferences NULL.
void uca_hal_deinit(void) {
    g_mmio_base = NULL;
}

// ===| Raw Register Access |=====================================================
// Store one 32-bit value to the register at the given byte offset.
//   offset : byte offset from the MMIO base; converted to a word index by
//            dividing by 4, so it should be a multiple of 4.
//   val    : value to write.
// The base pointer is not checked for NULL.
void uca_hal_write32(uint32_t offset, uint32_t val) {
    volatile uint32_t *reg = g_mmio_base + (offset / 4);
    *reg = val;
}

// Load one 32-bit value from the register at the given byte offset.
//   offset : byte offset from the MMIO base; converted to a word index by
//            dividing by 4, so it should be a multiple of 4.
// Returns the value read. The base pointer is not checked for NULL.
uint32_t uca_hal_read32(uint32_t offset) {
    volatile uint32_t *reg = g_mmio_base + (offset / 4);
    return *reg;
}

// ===| Instruction Issue |=======================================================
// Push one 64-bit VLIW instruction to the NPU as two 32-bit register writes.
// Order matters: the low word goes first, because the write to the high word
// is what triggers the NPU instruction latch (ISA §8).
// The NPU status is not checked here.
void uca_hal_issue_instr(uint64_t instr) {
    const uint32_t lo_word = (uint32_t)(instr & 0xFFFFFFFFULL);
    const uint32_t hi_word = (uint32_t)(instr >> 32);

    uca_hal_write32(UCA_REG_INSTR_LO, lo_word);
    uca_hal_write32(UCA_REG_INSTR_HI, hi_word);  // latches the instruction
}

// ===| Status Polling |==========================================================
// Return the raw 32-bit contents of UCA_REG_STATUS.
// Callers test individual bits with the UCA_STAT_* masks.
uint32_t uca_hal_read_status(void) {
    return uca_hal_read32(UCA_REG_STATUS);
}

// Busy-poll the status register until UCA_STAT_BUSY clears or the poll budget
// derived from timeout_us is used up.
//
//   timeout_us : requested timeout in microseconds. It is converted to a
//                number of polling iterations, not measured against a clock,
//                so the real duration depends on CPU and bus speed.
//                0 performs a single non-blocking status check.
//
// Returns 0 as soon as the NPU reports idle, -1 if it is still busy when the
// budget runs out.
int uca_hal_wait_idle(uint32_t timeout_us) {
    // Bare-metal busy-wait.
    // TODO: replace with a hardware timer once a timer driver is available.
    //
    // Budget of 400 polls per requested microsecond (rough estimate that
    // assumes about one poll per cycle at 400 MHz; not calibrated).
    const uint64_t polls_per_us = 400;

    // Computed in 64 bits: a 32-bit product wraps once timeout_us exceeds
    // UINT32_MAX / 400 (about 10.7 s), which would silently shorten the wait.
    uint64_t polls_left = (uint64_t)timeout_us * polls_per_us;

    for (;;) {
        // Always sample the status at least once, so an idle NPU is reported
        // as idle even with a zero timeout.
        if (!(uca_hal_read_status() & UCA_STAT_BUSY)) {
            return 0;  // Idle
        }
        if (polls_left == 0) {
            return -1;  // Timeout
        }
        polls_left--;
    }
}