RunanywhereAI
diff --git a/examples/web/RunAnywhereAI/src/services/model-manager.ts b/examples/web/RunAnywhereAI/src/services/model-manager.ts
Lines changed: 2 additions & 0 deletions
diff --git a/examples/web/RunAnywhereAI/src/views/speak.ts b/examples/web/RunAnywhereAI/src/views/speak.ts
Lines changed: 5 additions & 4 deletions
diff --git a/examples/web/RunAnywhereAI/src/views/transcribe.ts b/examples/web/RunAnywhereAI/src/views/transcribe.ts
Lines changed: 2 additions & 1 deletion
diff --git a/examples/web/RunAnywhereAI/src/views/vision.ts b/examples/web/RunAnywhereAI/src/views/vision.ts
Lines changed: 3 additions & 2 deletions
diff --git a/examples/web/RunAnywhereAI/src/views/voice.ts b/examples/web/RunAnywhereAI/src/views/voice.ts
Lines changed: 2 additions & 4 deletions
diff --git a/sdk/runanywhere-commons/src/backends/llamacpp/llamacpp_backend.cpp b/sdk/runanywhere-commons/src/backends/llamacpp/llamacpp_backend.cpp
Lines changed: 1 addition & 3 deletions
@@ -102,6 +102,8 @@ const REGISTERED_MODELS: CompactModelDef[] = [
     modality: ModelCategory.Multimodal,
     memoryRequirement: 600_000_000,
   },
+  // NOTE: Qwen2-VL uses M-RoPE which produces NaN logits on WebGPU. It falls
+  // back to CPU WASM (~1 tok/s) — noticeably slower than LFM2-VL on WebGPU.
   {
     id: 'qwen2-vl-2b-instruct-q4_k_m',
     name: 'Qwen2-VL 2B Instruct Q4_K_M',
 
@@ -21,9 +21,7 @@ const SURPRISE_TEXTS = [
 ];
 
 let ttsIsSpeaking = false;
-let ttsPlayback: InstanceType<
-  typeof import('../../../../../sdk/runanywhere-web/packages/onnx/src/Infrastructure/AudioPlayback').AudioPlayback
-> | null = null;
+let ttsPlayback: import('../../../../../sdk/runanywhere-web/packages/core/src/Infrastructure/AudioPlayback').AudioPlayback | null = null;
 
 // ---------------------------------------------------------------------------
 // Init
@@ -139,7 +137,10 @@ async function handleSpeak(): Promise<void> {
     statusEl.textContent = 'Synthesizing speech...';
     const speed = parseFloat(speedSlider.value);
 
-    const { TTS, AudioPlayback } = await import(
+    const { AudioPlayback } = await import(
+      '../../../../../sdk/runanywhere-web/packages/core/src/index'
+    );
+    const { TTS } = await import(
       '../../../../../sdk/runanywhere-web/packages/onnx/src/index'
     );
 
 
@@ -4,7 +4,8 @@
  */
 
 import type { TabLifecycle } from '../app';
-import { AudioCapture, VAD, SpeechActivity } from '../../../../../sdk/runanywhere-web/packages/onnx/src/index';
+import { AudioCapture, SpeechActivity } from '../../../../../sdk/runanywhere-web/packages/core/src/index';
+import { VAD } from '../../../../../sdk/runanywhere-web/packages/onnx/src/index';
 import { ModelManager, ModelCategory, ensureVADLoaded, type ModelInfo } from '../services/model-manager';
 import { showModelSelectionSheet } from '../components/model-selection';
 
 
@@ -12,7 +12,8 @@
 import type { TabLifecycle } from '../app';
 import { ModelManager, ModelCategory, type ModelInfo } from '../services/model-manager';
 import { showModelSelectionSheet } from '../components/model-selection';
-import { VLMWorkerBridge, VideoCapture, type CapturedFrame } from '../../../../../sdk/runanywhere-web/packages/llamacpp/src/index';
+import { VideoCapture, type CapturedFrame } from '../../../../../sdk/runanywhere-web/packages/core/src/index';
+import { VLMWorkerBridge } from '../../../../../sdk/runanywhere-web/packages/llamacpp/src/index';
 
 // ---------------------------------------------------------------------------
 // Constants (matching iOS VLMViewModel defaults)
@@ -412,7 +413,7 @@ async function processFrame(frame: CapturedFrame, prompt: string, maxTokens: num
       frame.width,
       frame.height,
       prompt,
-      { maxTokens, temperature: 0.7 },
+      { maxTokens, temperature: 0.7, systemPrompt: 'You are a helpful assistant.' },
     );
 
     // Compute metrics from JS wall clock
 
@@ -8,10 +8,8 @@
 import type { TabLifecycle } from '../app';
 import { showModelSelectionSheet } from '../components/model-selection';
 import { ModelManager, ModelCategory, ensureVADLoaded } from '../services/model-manager';
-import { VoicePipeline, PipelineState } from '../../../../../sdk/runanywhere-web/packages/core/src/index';
-import {
-  AudioCapture, AudioPlayback, VAD, SpeechActivity,
-} from '../../../../../sdk/runanywhere-web/packages/onnx/src/index';
+import { VoicePipeline, PipelineState, AudioCapture, AudioPlayback, SpeechActivity } from '../../../../../sdk/runanywhere-web/packages/core/src/index';
+import { VAD } from '../../../../../sdk/runanywhere-web/packages/onnx/src/index';
 
 /** Shared AudioCapture instance for this view (replaces app-level MicCapture singleton). */
 const micCapture = new AudioCapture();
 
@@ -370,6 +370,7 @@ bool LlamaCppTextGeneration::unload_model_internal() {
 
     // Clear LoRA adapters from context before freeing
     // (adapter memory is freed automatically with the model per llama.cpp API)
+    // Best-effort during teardown: the call's result is not checked, so unload proceeds regardless.
     if (context_ && !lora_adapters_.empty()) {
         llama_set_adapters_lora(context_, nullptr, 0, nullptr);
     }
@@ -829,7 +830,6 @@ bool LlamaCppTextGeneration::recreate_context() {
 
 bool LlamaCppTextGeneration::apply_lora_adapters() {
     if (lora_adapters_.empty()) {
-        // Clear all adapters from context
         llama_set_adapters_lora(context_, nullptr, 0, nullptr);
         return true;
     }
@@ -930,8 +930,6 @@ bool LlamaCppTextGeneration::remove_lora_adapter(const std::string& adapter_path
         return false;
     }
 
-    // Remove from tracking (adapter memory is freed automatically with the model
-    // per llama.cpp API — llama_adapter_lora_free is deprecated since b8011)
     lora_adapters_.erase(it);
 
     // Re-apply remaining adapters (or clear if none left)
Original file line number	Diff line number	Diff line change
`@@ -102,6 +102,8 @@ const REGISTERED_MODELS: CompactModelDef[] = [`
`102`	`102`	`modality: ModelCategory.Multimodal,`
`103`	`103`	`memoryRequirement: 600_000_000,`
`104`	`104`	`},`
	`105`	`+ // NOTE: Qwen2-VL uses M-RoPE which produces NaN logits on WebGPU. It falls`
	`106`	`+ // back to CPU WASM (~1 tok/s) — noticeably slower than LFM2-VL on WebGPU.`
`105`	`107`	`{`
`106`	`108`	`id: 'qwen2-vl-2b-instruct-q4_k_m',`
`107`	`109`	`name: 'Qwen2-VL 2B Instruct Q4_K_M',`
Original file line number	Diff line number	Diff line change
`@@ -370,6 +370,7 @@ bool LlamaCppTextGeneration::unload_model_internal() {`
`370`	`370`
`371`	`371`	`// Clear LoRA adapters from context before freeing`
`372`	`372`	`// (adapter memory is freed automatically with the model per llama.cpp API)`
	`373`	`+ // Best-effort during teardown: the call's result is not checked, so unload proceeds regardless.`
`373`	`374`	`if (context_ && !lora_adapters_.empty()) {`
`374`	`375`	`llama_set_adapters_lora(context_, nullptr, 0, nullptr);`
`375`	`376`	`}`
`@@ -829,7 +830,6 @@ bool LlamaCppTextGeneration::recreate_context() {`
`829`	`830`
`830`	`831`	`bool LlamaCppTextGeneration::apply_lora_adapters() {`
`831`	`832`	`if (lora_adapters_.empty()) {`
`832`		`- // Clear all adapters from context`
`833`	`833`	`llama_set_adapters_lora(context_, nullptr, 0, nullptr);`
`834`	`834`	`return true;`
`835`	`835`	`}`
`@@ -930,8 +930,6 @@ bool LlamaCppTextGeneration::remove_lora_adapter(const std::string& adapter_path`
`930`	`930`	`return false;`
`931`	`931`	`}`
`932`	`932`
`933`		`- // Remove from tracking (adapter memory is freed automatically with the model`
`934`		`- // per llama.cpp API — llama_adapter_lora_free is deprecated since b8011)`
`935`	`933`	`lora_adapters_.erase(it);`
`936`	`934`
`937`	`935`	`// Re-apply remaining adapters (or clear if none left)`