Last updated: 2026-06-12
API Server
Runtime
Backend-specific server runtime aliases and wrappers.
Keeps the HTTP/routes layer shared across Vulkan and Metal backends.
20 exports shown
constant
is_metal
pub const is_metal = gpu.is_metal Whether the active GPU backend is Apple Metal.
constant
is_vulkan
pub const is_vulkan = gpu.is_vulkan Whether the active GPU backend is Vulkan.
constant
supports_model_management
pub const supports_model_management = gpu.is_vulkan or gpu.is_metal Whether the backend supports loading/unloading models at runtime.
constant
supports_sampling_controls
pub const supports_sampling_controls = gpu.is_vulkan or gpu.is_metal Whether the backend supports temperature, top-p, top-k, and repetition penalty.
constant
supports_runtime_profiling
pub const supports_runtime_profiling = gpu.is_vulkan or gpu.is_metal Whether the backend supports GPU kernel profiling during inference.
constant
tokenizer_mod
pub const tokenizer_mod = @import("../model/tokenizer.zig") Tokenizer module (shared across all backends).
constant
forward_mod
pub const forward_mod = if (gpu.is_metal) @import("../compute/forward_metal.zig") else @import("../compute/forward.zig") Forward-pass module, selected by the active backend.
constant
loader_mod
pub const loader_mod = if (gpu.is_metal) @import("../model/loader_metal.zig") else @import("../model/loader.zig") Model-loading module, selected by the active backend.
constant
model_manager_mod
pub const model_manager_mod = if (gpu.is_metal) @import("model_manager_metal.zig") else @import("model_manager.zig") Model-manager module, selected by the active backend.
constant
InferenceEngine
pub const InferenceEngine = forward_mod.InferenceEngine Backend-specific inference engine that runs the forward pass.
constant
DecodeState
pub const DecodeState = forward_mod.DecodeState Per-sequence decode state (KV cache position, token history, etc.).
constant
Model
pub const Model = loader_mod.Model Loaded model handle (weights, hyperparams, GGUF metadata).
constant
ModelManager
pub const ModelManager = model_manager_mod.ModelManager Manages loading, unloading, and switching between models at runtime.
constant
SamplingParams
pub const SamplingParams = forward_mod.SamplingParams Token sampling parameters, re-exported from the backend-selected forward module so the same name is usable on both Vulkan and Metal backends.
function
enableLogitsReadback
pub fn enableLogitsReadback(_engine: *InferenceEngine) void Enable logits readback from GPU so sampling can inspect raw logits.
On Metal, the unified memory architecture (UMA) means logits are always CPU-accessible, so this is a no-op.
function
logitsReadbackEnabled
pub fn logitsReadbackEnabled(_engine: *const InferenceEngine) bool Return whether logits readback is currently enabled on the engine.
Always returns `true` on Metal because UMA makes logits CPU-accessible without an explicit readback step.
function
setLogitsReadbackEnabled
pub fn setLogitsReadbackEnabled(_engine: *InferenceEngine, _enabled: bool) void Record whether the caller intends to read back logits, for backends that can skip materializing the full logit vector.
function
enableProfiling
pub fn enableProfiling(_engine: *InferenceEngine) !void Enable GPU kernel profiling on the inference engine.
Supported on both Vulkan and Metal backends; calls the backend's own `enableProfiling` method.
function
decodeStep
pub fn decodeStep(_engine: *InferenceEngine, _state: *DecodeState, _token_id: u32, _collect_output: bool) !void Run a single autoregressive decode step, advancing the KV cache by one token.
function
sample
pub fn sample(_engine: *const InferenceEngine, _state: *const DecodeState, _params: SamplingParams, _random: std.Random) u32 Sample the next token from the model's logit distribution.