Skip to content

Commit 7062e5a

Browse files
Merge remote-tracking branch 'origin/main' into shubham-rag-fix
Made-with: Cursor

# Conflicts:
#	sdk/runanywhere-commons/src/backends/llamacpp/llamacpp_backend.h
2 parents 6378d6b + 99216a5 commit 7062e5a

50 files changed

Lines changed: 976 additions & 311 deletions

Some content is hidden

Large commits have some content hidden by default. Use the search box below for content that may be hidden.

examples/android/RunAnywhereAI/app/src/main/java/com/runanywhere/runanywhereai/data/ModelList.kt

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,36 @@ object ModelList {
3333
url = "https://huggingface.co/Triangle104/Qwen2.5-0.5B-Instruct-Q6_K-GGUF/resolve/main/qwen2.5-0.5b-instruct-q6_k.gguf",
3434
framework = InferenceFramework.LLAMA_CPP, category = ModelCategory.LANGUAGE,
3535
memoryRequirement = 600_000_000, supportsLoraAdapters = true),
36+
AppModel(id = "qwen2.5-1.5b-instruct-q4_k_m", name = "Qwen 2.5 1.5B Instruct Q4_K_M",
37+
url = "https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct-GGUF/resolve/main/qwen2.5-1.5b-instruct-q4_k_m.gguf",
38+
framework = InferenceFramework.LLAMA_CPP, category = ModelCategory.LANGUAGE,
39+
memoryRequirement = 2_500_000_000),
40+
// Qwen3 models
41+
AppModel(id = "qwen3-0.6b-q4_k_m", name = "Qwen3 0.6B Q4_K_M",
42+
url = "https://huggingface.co/unsloth/Qwen3-0.6B-GGUF/resolve/main/Qwen3-0.6B-Q4_K_M.gguf",
43+
framework = InferenceFramework.LLAMA_CPP, category = ModelCategory.LANGUAGE,
44+
memoryRequirement = 500_000_000),
45+
AppModel(id = "qwen3-1.7b-q4_k_m", name = "Qwen3 1.7B Q4_K_M",
46+
url = "https://huggingface.co/unsloth/Qwen3-1.7B-GGUF/resolve/main/Qwen3-1.7B-Q4_K_M.gguf",
47+
framework = InferenceFramework.LLAMA_CPP, category = ModelCategory.LANGUAGE,
48+
memoryRequirement = 1_200_000_000),
49+
AppModel(id = "qwen3-4b-q4_k_m", name = "Qwen3 4B Q4_K_M",
50+
url = "https://huggingface.co/unsloth/Qwen3-4B-GGUF/resolve/main/Qwen3-4B-Q4_K_M.gguf",
51+
framework = InferenceFramework.LLAMA_CPP, category = ModelCategory.LANGUAGE,
52+
memoryRequirement = 2_800_000_000),
53+
// Qwen3.5 models
54+
AppModel(id = "qwen3.5-0.8b-q4_k_m", name = "Qwen3.5 0.8B Q4_K_M",
55+
url = "https://huggingface.co/unsloth/Qwen3.5-0.8B-GGUF/resolve/main/Qwen3.5-0.8B-Q4_K_M.gguf",
56+
framework = InferenceFramework.LLAMA_CPP, category = ModelCategory.LANGUAGE,
57+
memoryRequirement = 600_000_000),
58+
AppModel(id = "qwen3.5-2b-q4_k_m", name = "Qwen3.5 2B Q4_K_M",
59+
url = "https://huggingface.co/unsloth/Qwen3.5-2B-GGUF/resolve/main/Qwen3.5-2B-Q4_K_M.gguf",
60+
framework = InferenceFramework.LLAMA_CPP, category = ModelCategory.LANGUAGE,
61+
memoryRequirement = 1_500_000_000),
62+
AppModel(id = "qwen3.5-4b-q4_k_m", name = "Qwen3.5 4B Q4_K_M",
63+
url = "https://huggingface.co/unsloth/Qwen3.5-4B-GGUF/resolve/main/Qwen3.5-4B-Q4_K_M.gguf",
64+
framework = InferenceFramework.LLAMA_CPP, category = ModelCategory.LANGUAGE,
65+
memoryRequirement = 2_800_000_000),
3666
AppModel(id = "lfm2-350m-q4_k_m", name = "LiquidAI LFM2 350M Q4_K_M",
3767
url = "https://huggingface.co/LiquidAI/LFM2-350M-GGUF/resolve/main/LFM2-350M-Q4_K_M.gguf",
3868
framework = InferenceFramework.LLAMA_CPP, category = ModelCategory.LANGUAGE,

examples/ios/RunAnywhereAI/RunAnywhereAI/App/RunAnywhereAIApp.swift

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -267,6 +267,64 @@ struct RunAnywhereAIApp: App {
267267
)
268268
}
269269

270+
// Qwen3 models
271+
if let qwen3_06bURL = URL(string: "https://huggingface.co/unsloth/Qwen3-0.6B-GGUF/resolve/main/Qwen3-0.6B-Q4_K_M.gguf") {
272+
RunAnywhere.registerModel(
273+
id: "qwen3-0.6b-q4_k_m",
274+
name: "Qwen3 0.6B Q4_K_M",
275+
url: qwen3_06bURL,
276+
framework: .llamaCpp,
277+
memoryRequirement: 500_000_000
278+
)
279+
}
280+
if let qwen3_17bURL = URL(string: "https://huggingface.co/unsloth/Qwen3-1.7B-GGUF/resolve/main/Qwen3-1.7B-Q4_K_M.gguf") {
281+
RunAnywhere.registerModel(
282+
id: "qwen3-1.7b-q4_k_m",
283+
name: "Qwen3 1.7B Q4_K_M",
284+
url: qwen3_17bURL,
285+
framework: .llamaCpp,
286+
memoryRequirement: 1_200_000_000
287+
)
288+
}
289+
if let qwen3_4bURL = URL(string: "https://huggingface.co/unsloth/Qwen3-4B-GGUF/resolve/main/Qwen3-4B-Q4_K_M.gguf") {
290+
RunAnywhere.registerModel(
291+
id: "qwen3-4b-q4_k_m",
292+
name: "Qwen3 4B Q4_K_M",
293+
url: qwen3_4bURL,
294+
framework: .llamaCpp,
295+
memoryRequirement: 2_800_000_000
296+
)
297+
}
298+
299+
// Qwen3.5 models
300+
if let qwen35_08bURL = URL(string: "https://huggingface.co/unsloth/Qwen3.5-0.8B-GGUF/resolve/main/Qwen3.5-0.8B-Q4_K_M.gguf") {
301+
RunAnywhere.registerModel(
302+
id: "qwen3.5-0.8b-q4_k_m",
303+
name: "Qwen3.5 0.8B Q4_K_M",
304+
url: qwen35_08bURL,
305+
framework: .llamaCpp,
306+
memoryRequirement: 600_000_000
307+
)
308+
}
309+
if let qwen35_2bURL = URL(string: "https://huggingface.co/unsloth/Qwen3.5-2B-GGUF/resolve/main/Qwen3.5-2B-Q4_K_M.gguf") {
310+
RunAnywhere.registerModel(
311+
id: "qwen3.5-2b-q4_k_m",
312+
name: "Qwen3.5 2B Q4_K_M",
313+
url: qwen35_2bURL,
314+
framework: .llamaCpp,
315+
memoryRequirement: 1_500_000_000
316+
)
317+
}
318+
if let qwen35_4bURL = URL(string: "https://huggingface.co/unsloth/Qwen3.5-4B-GGUF/resolve/main/Qwen3.5-4B-Q4_K_M.gguf") {
319+
RunAnywhere.registerModel(
320+
id: "qwen3.5-4b-q4_k_m",
321+
name: "Qwen3.5 4B Q4_K_M",
322+
url: qwen35_4bURL,
323+
framework: .llamaCpp,
324+
memoryRequirement: 2_800_000_000
325+
)
326+
}
327+
270328
logger.info("✅ LLM models registered (including tool-calling optimized models)")
271329

272330
// Register VLM (Vision Language) models

examples/web/RunAnywhereAI/src/services/model-manager.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,8 @@ const REGISTERED_MODELS: CompactModelDef[] = [
102102
modality: ModelCategory.Multimodal,
103103
memoryRequirement: 600_000_000,
104104
},
105+
// NOTE: Qwen2-VL uses M-RoPE which produces NaN logits on WebGPU. It falls
106+
// back to CPU WASM (~1 tok/s) — noticeably slower than LFM2-VL on WebGPU.
105107
{
106108
id: 'qwen2-vl-2b-instruct-q4_k_m',
107109
name: 'Qwen2-VL 2B Instruct Q4_K_M',

examples/web/RunAnywhereAI/src/views/speak.ts

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -21,9 +21,7 @@ const SURPRISE_TEXTS = [
2121
];
2222

2323
let ttsIsSpeaking = false;
24-
let ttsPlayback: InstanceType<
25-
typeof import('../../../../../sdk/runanywhere-web/packages/onnx/src/Infrastructure/AudioPlayback').AudioPlayback
26-
> | null = null;
24+
let ttsPlayback: import('../../../../../sdk/runanywhere-web/packages/core/src/Infrastructure/AudioPlayback').AudioPlayback | null = null;
2725

2826
// ---------------------------------------------------------------------------
2927
// Init
@@ -139,7 +137,10 @@ async function handleSpeak(): Promise<void> {
139137
statusEl.textContent = 'Synthesizing speech...';
140138
const speed = parseFloat(speedSlider.value);
141139

142-
const { TTS, AudioPlayback } = await import(
140+
const { AudioPlayback } = await import(
141+
'../../../../../sdk/runanywhere-web/packages/core/src/index'
142+
);
143+
const { TTS } = await import(
143144
'../../../../../sdk/runanywhere-web/packages/onnx/src/index'
144145
);
145146

examples/web/RunAnywhereAI/src/views/transcribe.ts

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,8 @@
44
*/
55

66
import type { TabLifecycle } from '../app';
7-
import { AudioCapture, VAD, SpeechActivity } from '../../../../../sdk/runanywhere-web/packages/onnx/src/index';
7+
import { AudioCapture, SpeechActivity } from '../../../../../sdk/runanywhere-web/packages/core/src/index';
8+
import { VAD } from '../../../../../sdk/runanywhere-web/packages/onnx/src/index';
89
import { ModelManager, ModelCategory, ensureVADLoaded, type ModelInfo } from '../services/model-manager';
910
import { showModelSelectionSheet } from '../components/model-selection';
1011

examples/web/RunAnywhereAI/src/views/vision.ts

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,8 @@
1212
import type { TabLifecycle } from '../app';
1313
import { ModelManager, ModelCategory, type ModelInfo } from '../services/model-manager';
1414
import { showModelSelectionSheet } from '../components/model-selection';
15-
import { VLMWorkerBridge, VideoCapture, type CapturedFrame } from '../../../../../sdk/runanywhere-web/packages/llamacpp/src/index';
15+
import { VideoCapture, type CapturedFrame } from '../../../../../sdk/runanywhere-web/packages/core/src/index';
16+
import { VLMWorkerBridge } from '../../../../../sdk/runanywhere-web/packages/llamacpp/src/index';
1617

1718
// ---------------------------------------------------------------------------
1819
// Constants (matching iOS VLMViewModel defaults)
@@ -412,7 +413,7 @@ async function processFrame(frame: CapturedFrame, prompt: string, maxTokens: num
412413
frame.width,
413414
frame.height,
414415
prompt,
415-
{ maxTokens, temperature: 0.7 },
416+
{ maxTokens, temperature: 0.7, systemPrompt: 'You are a helpful assistant.' },
416417
);
417418

418419
// Compute metrics from JS wall clock

examples/web/RunAnywhereAI/src/views/voice.ts

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,8 @@
88
import type { TabLifecycle } from '../app';
99
import { showModelSelectionSheet } from '../components/model-selection';
1010
import { ModelManager, ModelCategory, ensureVADLoaded } from '../services/model-manager';
11-
import { VoicePipeline, PipelineState } from '../../../../../sdk/runanywhere-web/packages/core/src/index';
12-
import {
13-
AudioCapture, AudioPlayback, VAD, SpeechActivity,
14-
} from '../../../../../sdk/runanywhere-web/packages/onnx/src/index';
11+
import { VoicePipeline, PipelineState, AudioCapture, AudioPlayback, SpeechActivity } from '../../../../../sdk/runanywhere-web/packages/core/src/index';
12+
import { VAD } from '../../../../../sdk/runanywhere-web/packages/onnx/src/index';
1513

1614
/** Shared AudioCapture instance for this view (replaces app-level MicCapture singleton). */
1715
const micCapture = new AudioCapture();

sdk/runanywhere-commons/VERSIONS

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -72,9 +72,9 @@ SHERPA_ONNX_VERSION_LINUX=1.12.23
7272
# =============================================================================
7373
# llama.cpp (LLM inference)
7474
# =============================================================================
75-
# b8011 - latest stable release (Feb 2026), includes GGML_WEBGPU backend
75+
# b8201 - latest stable release (Feb 2026), includes GGML_WEBGPU backend
7676
# NOTE: Bumped from b7650 to enable WebGPU acceleration for WASM builds
77-
LLAMACPP_VERSION=b8011
77+
LLAMACPP_VERSION=b8201
7878

7979
# =============================================================================
8080
# nlohmann/json

sdk/runanywhere-commons/src/backends/llamacpp/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -153,6 +153,8 @@ if(RAC_VLM_USE_MTMD)
153153
${llamacpp_SOURCE_DIR}/tools/mtmd/models/whisper-enc.cpp
154154
${llamacpp_SOURCE_DIR}/tools/mtmd/models/kimik25.cpp
155155
${llamacpp_SOURCE_DIR}/tools/mtmd/models/mobilenetv5.cpp
156+
${llamacpp_SOURCE_DIR}/tools/mtmd/models/paddleocr.cpp
157+
${llamacpp_SOURCE_DIR}/tools/mtmd/models/nemotron-v2-vl.cpp
156158
)
157159
endif()
158160

sdk/runanywhere-commons/src/backends/llamacpp/llamacpp_backend.cpp

Lines changed: 30 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -380,8 +380,9 @@ bool LlamaCppTextGeneration::unload_model_internal() {
380380

381381
// Clear LoRA adapters from context before freeing
382382
// (adapter memory is freed automatically with the model per llama.cpp API)
383+
// Best-effort during teardown: log but don't fail unload on error.
383384
if (context_ && !lora_adapters_.empty()) {
384-
llama_clear_adapter_lora(context_);
385+
llama_set_adapters_lora(context_, nullptr, 0, nullptr);
385386
}
386387
lora_adapters_.clear();
387388

@@ -1162,13 +1163,31 @@ bool LlamaCppTextGeneration::recreate_context() {
11621163
}
11631164

11641165
bool LlamaCppTextGeneration::apply_lora_adapters() {
1166+
if (lora_adapters_.empty()) {
1167+
llama_set_adapters_lora(context_, nullptr, 0, nullptr);
1168+
return true;
1169+
}
1170+
1171+
std::vector<llama_adapter_lora*> adapters;
1172+
std::vector<float> scales;
1173+
adapters.reserve(lora_adapters_.size());
1174+
scales.reserve(lora_adapters_.size());
1175+
11651176
for (auto& entry : lora_adapters_) {
1166-
int32_t result = llama_set_adapter_lora(context_, entry.adapter, entry.scale);
1167-
if (result != 0) {
1168-
LOGE("Failed to apply LoRA adapter: %s (error=%d)", entry.path.c_str(), result);
1177+
adapters.push_back(entry.adapter);
1178+
scales.push_back(entry.scale);
1179+
}
1180+
1181+
int32_t result = llama_set_adapters_lora(context_, adapters.data(), adapters.size(), scales.data());
1182+
if (result != 0) {
1183+
LOGE("Failed to apply LoRA adapters (error=%d)", result);
1184+
for (auto& entry : lora_adapters_) {
11691185
entry.applied = false;
1170-
return false;
11711186
}
1187+
return false;
1188+
}
1189+
1190+
for (auto& entry : lora_adapters_) {
11721191
entry.applied = true;
11731192
LOGI("Applied LoRA adapter: %s (scale=%.2f)", entry.path.c_str(), entry.scale);
11741193
}
@@ -1245,17 +1264,14 @@ bool LlamaCppTextGeneration::remove_lora_adapter(const std::string& adapter_path
12451264
return false;
12461265
}
12471266

1248-
// Remove from context
1249-
int32_t result = llama_rm_adapter_lora(context_, it->adapter);
1250-
if (result != 0) {
1251-
LOGE("Failed to remove LoRA adapter from context: %s (error=%d)", adapter_path.c_str(), result);
1267+
lora_adapters_.erase(it);
1268+
1269+
// Re-apply remaining adapters (or clear if none left)
1270+
if (!apply_lora_adapters()) {
1271+
LOGE("Failed to re-apply remaining LoRA adapters after removal");
12521272
return false;
12531273
}
12541274

1255-
// Remove from tracking (adapter memory is freed automatically with the model
1256-
// per llama.cpp API — llama_adapter_lora_free is deprecated since b8011)
1257-
lora_adapters_.erase(it);
1258-
12591275
// Clear KV cache after adapter changes
12601276
llama_memory_clear(llama_get_memory(context_), true);
12611277

@@ -1271,7 +1287,7 @@ void LlamaCppTextGeneration::clear_lora_adapters() {
12711287
}
12721288

12731289
if (context_) {
1274-
llama_clear_adapter_lora(context_);
1290+
llama_set_adapters_lora(context_, nullptr, 0, nullptr);
12751291
llama_memory_clear(llama_get_memory(context_), true);
12761292
}
12771293

0 commit comments

Comments
 (0)