Runtime default — npu for both llama_cpp and qairt.
"npu"
Hexagon NPU acceleration. Recommended on Snapdragon.
"gpu"
Adreno GPU via OpenCL (llama_cpp only).
"cpu"
Pure CPU. Forces nGpuLayers = 0.
Qualcomm AI Engine Direct only supports NPU. Passing "cpu" or "gpu" with a Qualcomm AI Hub Model logs a warning and falls back to NPU — it won’t error.
/**
 * Describes a model to pull.
 *
 * @property model_name Model identifier: `"org/repo"` or an alias.
 * @property precision Precision (quantization), e.g. `"Q4_0"` or `"Q4_K_M"`.
 * @property hub Source hub; [HubSource.AUTO] routes by [model_name].
 * @property local_path Only used when [hub] is `HubSource.LOCALFS`.
 * @property hf_token Access token; falls back to the `GENIEX_HFTOKEN` environment variable.
 * @property chipset Required for Qualcomm AI Hub on Android (e.g. `"SM8750"`).
 * @property display_name Optional display name (NOTE(review): presumably a UI label — confirm).
 */
data class ModelPullInput(
    val model_name: String,
    val precision: String? = null,
    val hub: HubSource = HubSource.AUTO,
    val local_path: String? = null,
    val hf_token: String? = null,
    val chipset: String? = null,
    val display_name: String? = null,
)
Returned by getPaths(). Feed fields directly into LlmCreateInput / VlmCreateInput:
/**
 * Resolved on-disk locations of a model, as returned by `getPaths()`.
 *
 * The fields are meant to be fed directly into `LlmCreateInput` / `VlmCreateInput`.
 *
 * @property model_path Path to the model weights.
 * @property model_dir Directory associated with the model.
 * @property model_name Name of the model.
 * @property runtime_id Runtime identifier; authoritative — prefer it over a UI selection.
 * @property mmproj_path VLM projection weights, when present.
 * @property tokenizer_path Tokenizer path, when present.
 * @property compute_unit Compute unit, when present.
 */
data class ModelPaths(
    val model_path: String,
    val model_dir: String,
    val model_name: String,
    val runtime_id: String,
    val mmproj_path: String? = null,
    val tokenizer_path: String? = null,
    val compute_unit: String? = null,
)
Qualcomm AI Hub pulls on Android require an explicit chipset. The Rust side only auto-detects on Windows on Snapdragon. Use "SM8750" for Snapdragon 8 Elite or "SM8850" for Snapdragon 8 Elite Gen 5.
/**
 * Parameters for creating an LLM instance.
 *
 * @property model_name Name of the model.
 * @property model_path Path to the model weights.
 * @property tokenizer_path Optional tokenizer path.
 * @property config Model configuration.
 * @property runtime_id Optional runtime identifier (the examples use `"llama_cpp"` and `"qairt"`).
 * @property compute_unit Optional compute unit (`"npu"`, `"gpu"` or `"cpu"`); `null` selects the
 *   runtime default.
 */
data class LlmCreateInput(
    val model_name: String,
    val model_path: String,
    val tokenizer_path: String? = null,
    val config: ModelConfig,
    val runtime_id: String? = null,
    val compute_unit: String? = null,
)
/**
 * Parameters for creating a VLM instance.
 *
 * @property model_name Name of the model.
 * @property model_path Path to the LLM weights.
 * @property mmproj_path Vision projection weights (GGUF VLMs).
 * @property config Model configuration.
 * @property runtime_id Optional runtime identifier (the examples use `"llama_cpp"` and `"qairt"`).
 * @property compute_unit Optional compute unit (`"npu"`, `"gpu"` or `"cpu"`); `null` selects the
 *   runtime default.
 */
data class VlmCreateInput(
    val model_name: String,
    val model_path: String,
    val mmproj_path: String? = null,
    val config: ModelConfig,
    val runtime_id: String? = null,
    val compute_unit: String? = null,
)
/**
 * Model-level configuration passed at creation time.
 *
 * @property nCtx Context size; `0` means the model default.
 * @property nThreads Thread count.
 * @property nThreadsBatch Thread count for batch processing.
 * @property nBatch Batch size.
 * @property nUBatch Micro-batch size.
 * @property nSeqMax Maximum number of sequences.
 * @property nGpuLayers Number of layers to offload. The JNI rewrites this based on the
 *   compute unit: `cpu` forces `0`, `npu` forces `999`; for `gpu` the value is used as-is.
 * @property chat_template_path Path to a chat template file; empty by default.
 * @property chat_template_content Inline chat template content; empty by default.
 * @property max_tokens Maximum number of tokens.
 * @property enable_thinking Whether thinking mode is enabled.
 * @property verbose Whether verbose output is enabled.
 */
data class ModelConfig(
    var nCtx: Int = 2048,
    var nThreads: Int = 8,
    var nThreadsBatch: Int = 8,
    var nBatch: Int = 2048,
    var nUBatch: Int = 512,
    var nSeqMax: Int = 1,
    var nGpuLayers: Int = 0,
    val chat_template_path: String = "",
    val chat_template_content: String = "",
    val max_tokens: Int = 2048,
    val enable_thinking: Boolean = false,
    val verbose: Boolean = false,
)
nGpuLayers is rewritten by the JNI based on compute_unit: cpu forces 0, npu forces 999. For gpu the value you pass is used as-is — set 999 to offload all layers.
/**
 * One chat message for a VLM conversation.
 *
 * @property role `"system"`, `"user"` or `"assistant"`.
 * @property contents Ordered content parts of the message.
 */
data class VlmChatMessage(
    val role: String?,
    val contents: List<VlmContent>,
)

/**
 * A single content part of a [VlmChatMessage].
 *
 * @property type `"text"` or `"image"`.
 * @property text Text content, or the absolute file path when [type] is `"image"`.
 */
data class VlmContent(
    val type: String?,
    val text: String?,
)
/**
 * Per-request generation settings.
 *
 * @property maxTokens Maximum number of tokens to generate; defaults to `32`.
 * @property stopWords Optional stop words.
 * @property stopCount NOTE(review): presumably the number of entries in [stopWords] — confirm.
 * @property nPast Defaults to `0`.
 * @property samplerConfig Optional sampler configuration.
 * @property imagePaths Optional image paths.
 * @property imageCount NOTE(review): presumably the number of entries in [imagePaths] — confirm.
 * @property audioPaths Optional audio paths.
 * @property audioCount NOTE(review): presumably the number of entries in [audioPaths] — confirm.
 */
data class GenerationConfig(
    var maxTokens: Int = 32,
    var stopWords: Array<String>? = null,
    var stopCount: Int = 0,
    var nPast: Int = 0,
    var samplerConfig: SamplerConfig? = null,
    var imagePaths: Array<String>? = null,
    var imageCount: Int = 0,
    var audioPaths: Array<String>? = null,
    var audioCount: Int = 0,
)
The default maxTokens is 32. Most use cases should set a higher value (e.g. maxTokens = 2048).
/** Events emitted while streaming a generation. */
sealed class LlmStreamResult {
    /** A chunk of generated [text]. */
    data class Token(val text: String) : LlmStreamResult()

    /** Generation finished; [profile] carries the profiling data. */
    data class Completed(val profile: ProfilingData) : LlmStreamResult()

    /** Generation failed with [throwable]. */
    data class Error(val throwable: Throwable) : LlmStreamResult()
}
GGUF VLMs need two artifacts: the LLM weights (model_path) and the vision projection (mmproj_path). Both come from getPaths():
// Resolve the downloaded artifacts; getPaths() yields null when the model is not available.
val paths = ModelManagerWrapper.getPaths("unsloth/Qwen3-VL-2B-Instruct-GGUF")
    ?: error("Model not downloaded")

// GGUF VLMs need both the LLM weights (model_path) and the vision projection (mmproj_path).
VlmWrapper.builder()
    .vlmCreateInput(
        VlmCreateInput(
            model_name = paths.model_name,
            model_path = paths.model_path,
            mmproj_path = paths.mmproj_path,
            config = ModelConfig(nCtx = 4096),
            runtime_id = "llama_cpp",
            compute_unit = null,
        )
    )
    .build()
    .onSuccess { vlmWrapper = it }
    // Surface build failures instead of silently continuing.
    .onFailure { println("Error: ${it.message}") }

// For an image part, the second argument is the absolute file path.
val msg = VlmChatMessage(
    role = "user",
    contents = listOf(
        VlmContent("image", "/storage/emulated/0/Pictures/example.jpg"),
        VlmContent("text", "Describe this image."),
    ),
)
val chat = arrayListOf(msg)

vlmWrapper.applyChatTemplate(chat.toTypedArray(), null, false).onSuccess { t ->
    val gen = vlmWrapper.injectMediaPathsToConfig(
        chat.toTypedArray(),
        GenerationConfig(maxTokens = 2048),
    )
    // Pass the chat-templated prompt, not the raw user text.
    vlmWrapper.generateStreamFlow(t.formattedText, gen).collect { result ->
        when (result) {
            is LlmStreamResult.Token -> print(result.text)
            is LlmStreamResult.Completed -> println("\nDone")
            is LlmStreamResult.Error -> println("Error: ${result.throwable}")
        }
    }
}
Always pass t.formattedText (the chat-templated prompt) into generateStreamFlow, not the raw user text. The native pipeline treats the prompt as already-templated.
// Resolve the downloaded artifacts; getPaths() yields null when the model is not available.
val paths = ModelManagerWrapper.getPaths("ai-hub-models/Qwen3-4B-Instruct-2507")
    ?: error("Model not downloaded")

LlmWrapper.builder()
    .llmCreateInput(
        LlmCreateInput(
            model_name = paths.model_name,
            model_path = paths.model_path,
            config = ModelConfig(max_tokens = 2048, enable_thinking = false),
            runtime_id = "qairt",
            // null → NPU (only option for Qualcomm AI Engine Direct)
            compute_unit = null,
        )
    )
    .build()
    .onSuccess { llmWrapper = it }
    .onFailure { println("Error: ${it.message}") }

val chat = arrayListOf(ChatMessage("user", "What is AI?"))

llmWrapper.applyChatTemplate(chat.toTypedArray(), null, false).onSuccess { t ->
    // GenerationConfig defaults to maxTokens = 32, which truncates most answers;
    // request 2048 explicitly, matching the other examples.
    val gen = GenerationConfig(maxTokens = 2048)
    // Pass the chat-templated prompt, not the raw user text.
    llmWrapper.generateStreamFlow(t.formattedText, gen).collect { result ->
        when (result) {
            is LlmStreamResult.Token -> print(result.text)
            is LlmStreamResult.Completed -> println("\nDone")
            is LlmStreamResult.Error -> println("Error: ${result.throwable}")
        }
    }
}
Qualcomm AI Engine Direct rejects nGpuLayers != 0 and nCtx != 0 with PARAM_NOT_SUPPORTED — the KV cache and context length are fixed at compile time by the Qualcomm AI Hub bundle. Leave both at defaults and use max_tokens / enable_thinking only.
// Resolve the downloaded artifacts; getPaths() yields null when the model is not available.
val paths = ModelManagerWrapper.getPaths("ai-hub-models/Qwen2.5-VL-7B-Instruct")
    ?: error("Model not downloaded")

VlmWrapper.builder()
    .vlmCreateInput(
        VlmCreateInput(
            model_name = paths.model_name,
            model_path = paths.model_path,
            mmproj_path = paths.mmproj_path,
            config = ModelConfig(max_tokens = 2048, enable_thinking = false),
            runtime_id = "qairt",
            compute_unit = null,
        )
    )
    .build()
    .onSuccess { vlmWrapper = it }
    // Surface build failures instead of silently continuing.
    .onFailure { println("Error: ${it.message}") }

// For an image part, the second argument is the absolute file path.
val msg = VlmChatMessage(
    role = "user",
    contents = listOf(
        VlmContent("image", "/storage/emulated/0/Pictures/cat.jpg"),
        VlmContent("text", "What's in this image?"),
    ),
)
val chat = arrayListOf(msg)

vlmWrapper.applyChatTemplate(chat.toTypedArray(), null, false).onSuccess { t ->
    val gen = vlmWrapper.injectMediaPathsToConfig(
        chat.toTypedArray(),
        GenerationConfig(maxTokens = 2048),
    )
    // Pass the chat-templated prompt, never raw user text.
    vlmWrapper.generateStreamFlow(t.formattedText, gen).collect { result ->
        when (result) {
            is LlmStreamResult.Token -> print(result.text)
            is LlmStreamResult.Completed -> println("\nDone")
            is LlmStreamResult.Error -> println("Error: ${result.throwable}")
        }
    }
}
Pass the chat-templated prompt (t.formattedText) to generateStreamFlow, never raw user text. Qualcomm AI Engine Direct VLM treats its prompt as already-templated — raw text produces degenerate output.