Skip to content

Commit 99216a5

Browse files
[Web-SDK] [Web-Example] Web SDK fixes (#433)
* Refactor audio and video infrastructure in the RunAnywhere Web SDK
  - Updated imports in `speak.ts`, `transcribe.ts`, `vision.ts`, and `voice.ts` to use the new core package for audio and video functionalities.
  - Introduced new `AudioCapture`, `AudioPlayback`, and `VideoCapture` classes to handle audio and video processing more efficiently.
  - Added backend-agnostic types for STT, TTS, LLM, and VLM in the core types module.
  - Implemented streaming capabilities for model downloads in `ModelDownloader`.
  - Enhanced the overall structure for better modularity and maintainability.
* Minor fixes
* Upgrade llama.cpp to b8179 and fix WASM VLM build for new mtmd model sources
  - Bump LLAMACPP_VERSION from b8011 to b8179 for Vulkan/WebGPU precision fixes and FlashAttention improvements
  - Adapt LoRA adapter API calls in llamacpp_backend.cpp to the new llama_set_adapters_lora() batch API (replaces removed per-adapter functions)
  - Add nemotron-v2-vl.cpp and paddleocr.cpp to CMakeLists.txt for new clip_graph_* model types introduced in b8179
  - Document Qwen2-VL CPU fallback performance (~1 tok/s) caused by M-RoPE NaN logits on WebGPU in rac_vlm_llamacpp.cpp, VLMWorkerBridge.ts, and model-manager.ts
* Addressed CodeRabbit comments
* Minor fixes
* Bumped up the version
* Refactor VLM model type resolution and system prompt handling
  - Updated the logic for determining the effective system prompt to handle empty strings.
  - Introduced a new function to resolve the effective VLM model type based on options, simplifying the code in `rac_vlm_llamacpp_process`.
  - Improved download progress tracking in `ModelDownloader` by using cumulative byte counts for better accuracy.
  - Enhanced type exports in `llamacpp` and `onnx` packages for better compatibility and clarity.
  - Adjusted regex for identifying Qwen VL models to ensure more accurate matching.
---------
Co-authored-by: Shubham Malhotra <shubham.malhotra28@gmail.com>
2 parents 0b82551 + c0a1533 commit 99216a5

45 files changed

Lines changed: 854 additions & 297 deletions

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

examples/web/RunAnywhereAI/src/services/model-manager.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,8 @@ const REGISTERED_MODELS: CompactModelDef[] = [
102102
modality: ModelCategory.Multimodal,
103103
memoryRequirement: 600_000_000,
104104
},
105+
// NOTE: Qwen2-VL uses M-RoPE which produces NaN logits on WebGPU. It falls
106+
// back to CPU WASM (~1 tok/s) — noticeably slower than LFM2-VL on WebGPU.
105107
{
106108
id: 'qwen2-vl-2b-instruct-q4_k_m',
107109
name: 'Qwen2-VL 2B Instruct Q4_K_M',

examples/web/RunAnywhereAI/src/views/speak.ts

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -21,9 +21,7 @@ const SURPRISE_TEXTS = [
2121
];
2222

2323
let ttsIsSpeaking = false;
24-
let ttsPlayback: InstanceType<
25-
typeof import('../../../../../sdk/runanywhere-web/packages/onnx/src/Infrastructure/AudioPlayback').AudioPlayback
26-
> | null = null;
24+
let ttsPlayback: import('../../../../../sdk/runanywhere-web/packages/core/src/Infrastructure/AudioPlayback').AudioPlayback | null = null;
2725

2826
// ---------------------------------------------------------------------------
2927
// Init
@@ -139,7 +137,10 @@ async function handleSpeak(): Promise<void> {
139137
statusEl.textContent = 'Synthesizing speech...';
140138
const speed = parseFloat(speedSlider.value);
141139

142-
const { TTS, AudioPlayback } = await import(
140+
const { AudioPlayback } = await import(
141+
'../../../../../sdk/runanywhere-web/packages/core/src/index'
142+
);
143+
const { TTS } = await import(
143144
'../../../../../sdk/runanywhere-web/packages/onnx/src/index'
144145
);
145146

examples/web/RunAnywhereAI/src/views/transcribe.ts

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,8 @@
44
*/
55

66
import type { TabLifecycle } from '../app';
7-
import { AudioCapture, VAD, SpeechActivity } from '../../../../../sdk/runanywhere-web/packages/onnx/src/index';
7+
import { AudioCapture, SpeechActivity } from '../../../../../sdk/runanywhere-web/packages/core/src/index';
8+
import { VAD } from '../../../../../sdk/runanywhere-web/packages/onnx/src/index';
89
import { ModelManager, ModelCategory, ensureVADLoaded, type ModelInfo } from '../services/model-manager';
910
import { showModelSelectionSheet } from '../components/model-selection';
1011

examples/web/RunAnywhereAI/src/views/vision.ts

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,8 @@
1212
import type { TabLifecycle } from '../app';
1313
import { ModelManager, ModelCategory, type ModelInfo } from '../services/model-manager';
1414
import { showModelSelectionSheet } from '../components/model-selection';
15-
import { VLMWorkerBridge, VideoCapture, type CapturedFrame } from '../../../../../sdk/runanywhere-web/packages/llamacpp/src/index';
15+
import { VideoCapture, type CapturedFrame } from '../../../../../sdk/runanywhere-web/packages/core/src/index';
16+
import { VLMWorkerBridge } from '../../../../../sdk/runanywhere-web/packages/llamacpp/src/index';
1617

1718
// ---------------------------------------------------------------------------
1819
// Constants (matching iOS VLMViewModel defaults)
@@ -412,7 +413,7 @@ async function processFrame(frame: CapturedFrame, prompt: string, maxTokens: num
412413
frame.width,
413414
frame.height,
414415
prompt,
415-
{ maxTokens, temperature: 0.7 },
416+
{ maxTokens, temperature: 0.7, systemPrompt: 'You are a helpful assistant.' },
416417
);
417418

418419
// Compute metrics from JS wall clock

examples/web/RunAnywhereAI/src/views/voice.ts

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,8 @@
88
import type { TabLifecycle } from '../app';
99
import { showModelSelectionSheet } from '../components/model-selection';
1010
import { ModelManager, ModelCategory, ensureVADLoaded } from '../services/model-manager';
11-
import { VoicePipeline, PipelineState } from '../../../../../sdk/runanywhere-web/packages/core/src/index';
12-
import {
13-
AudioCapture, AudioPlayback, VAD, SpeechActivity,
14-
} from '../../../../../sdk/runanywhere-web/packages/onnx/src/index';
11+
import { VoicePipeline, PipelineState, AudioCapture, AudioPlayback, SpeechActivity } from '../../../../../sdk/runanywhere-web/packages/core/src/index';
12+
import { VAD } from '../../../../../sdk/runanywhere-web/packages/onnx/src/index';
1513

1614
/** Shared AudioCapture instance for this view (replaces app-level MicCapture singleton). */
1715
const micCapture = new AudioCapture();

sdk/runanywhere-commons/src/backends/llamacpp/llamacpp_backend.cpp

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -370,6 +370,7 @@ bool LlamaCppTextGeneration::unload_model_internal() {
370370

371371
// Clear LoRA adapters from context before freeing
372372
// (adapter memory is freed automatically with the model per llama.cpp API)
373+
// Best-effort during teardown: the call's result is not checked, so a failure here never fails the unload.
373374
if (context_ && !lora_adapters_.empty()) {
374375
llama_set_adapters_lora(context_, nullptr, 0, nullptr);
375376
}
@@ -829,7 +830,6 @@ bool LlamaCppTextGeneration::recreate_context() {
829830

830831
bool LlamaCppTextGeneration::apply_lora_adapters() {
831832
if (lora_adapters_.empty()) {
832-
// Clear all adapters from context
833833
llama_set_adapters_lora(context_, nullptr, 0, nullptr);
834834
return true;
835835
}
@@ -930,8 +930,6 @@ bool LlamaCppTextGeneration::remove_lora_adapter(const std::string& adapter_path
930930
return false;
931931
}
932932

933-
// Remove from tracking (adapter memory is freed automatically with the model
934-
// per llama.cpp API — llama_adapter_lora_free is deprecated since b8011)
935933
lora_adapters_.erase(it);
936934

937935
// Re-apply remaining adapters (or clear if none left)

0 commit comments

Comments
 (0)