Skip to content

Commit 7062e5a

Browse files
Merge remote-tracking branch 'origin/main' into shubham-rag-fix
Made-with: Cursor

# Conflicts:
#	sdk/runanywhere-commons/src/backends/llamacpp/llamacpp_backend.h
2 parents 6378d6b + 99216a5 commit 7062e5a

50 files changed

Lines changed: 976 additions & 311 deletions

Some content is hidden

Large commits have some content hidden by default. Use the search box below for content that may be hidden.

examples/android/RunAnywhereAI/app/src/main/java/com/runanywhere/runanywhereai/data/ModelList.kt

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,36 @@ object ModelList {
3333
url = "https://huggingface.co/Triangle104/Qwen2.5-0.5B-Instruct-Q6_K-GGUF/resolve/main/qwen2.5-0.5b-instruct-q6_k.gguf",
3434
framework = InferenceFramework.LLAMA_CPP, category = ModelCategory.LANGUAGE,
3535
memoryRequirement = 600_000_000, supportsLoraAdapters = true),
36+
AppModel(id = "qwen2.5-1.5b-instruct-q4_k_m", name = "Qwen 2.5 1.5B Instruct Q4_K_M",
37+
url = "https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct-GGUF/resolve/main/qwen2.5-1.5b-instruct-q4_k_m.gguf",
38+
framework = InferenceFramework.LLAMA_CPP, category = ModelCategory.LANGUAGE,
39+
memoryRequirement = 2_500_000_000),
40+
// Qwen3 models
41+
AppModel(id = "qwen3-0.6b-q4_k_m", name = "Qwen3 0.6B Q4_K_M",
42+
url = "https://huggingface.co/unsloth/Qwen3-0.6B-GGUF/resolve/main/Qwen3-0.6B-Q4_K_M.gguf",
43+
framework = InferenceFramework.LLAMA_CPP, category = ModelCategory.LANGUAGE,
44+
memoryRequirement = 500_000_000),
45+
AppModel(id = "qwen3-1.7b-q4_k_m", name = "Qwen3 1.7B Q4_K_M",
46+
url = "https://huggingface.co/unsloth/Qwen3-1.7B-GGUF/resolve/main/Qwen3-1.7B-Q4_K_M.gguf",
47+
framework = InferenceFramework.LLAMA_CPP, category = ModelCategory.LANGUAGE,
48+
memoryRequirement = 1_200_000_000),
49+
AppModel(id = "qwen3-4b-q4_k_m", name = "Qwen3 4B Q4_K_M",
50+
url = "https://huggingface.co/unsloth/Qwen3-4B-GGUF/resolve/main/Qwen3-4B-Q4_K_M.gguf",
51+
framework = InferenceFramework.LLAMA_CPP, category = ModelCategory.LANGUAGE,
52+
memoryRequirement = 2_800_000_000),
53+
// Qwen3.5 models
54+
AppModel(id = "qwen3.5-0.8b-q4_k_m", name = "Qwen3.5 0.8B Q4_K_M",
55+
url = "https://huggingface.co/unsloth/Qwen3.5-0.8B-GGUF/resolve/main/Qwen3.5-0.8B-Q4_K_M.gguf",
56+
framework = InferenceFramework.LLAMA_CPP, category = ModelCategory.LANGUAGE,
57+
memoryRequirement = 600_000_000),
58+
AppModel(id = "qwen3.5-2b-q4_k_m", name = "Qwen3.5 2B Q4_K_M",
59+
url = "https://huggingface.co/unsloth/Qwen3.5-2B-GGUF/resolve/main/Qwen3.5-2B-Q4_K_M.gguf",
60+
framework = InferenceFramework.LLAMA_CPP, category = ModelCategory.LANGUAGE,
61+
memoryRequirement = 1_500_000_000),
62+
AppModel(id = "qwen3.5-4b-q4_k_m", name = "Qwen3.5 4B Q4_K_M",
63+
url = "https://huggingface.co/unsloth/Qwen3.5-4B-GGUF/resolve/main/Qwen3.5-4B-Q4_K_M.gguf",
64+
framework = InferenceFramework.LLAMA_CPP, category = ModelCategory.LANGUAGE,
65+
memoryRequirement = 2_800_000_000),
3666
AppModel(id = "lfm2-350m-q4_k_m", name = "LiquidAI LFM2 350M Q4_K_M",
3767
url = "https://huggingface.co/LiquidAI/LFM2-350M-GGUF/resolve/main/LFM2-350M-Q4_K_M.gguf",
3868
framework = InferenceFramework.LLAMA_CPP, category = ModelCategory.LANGUAGE,

examples/ios/RunAnywhereAI/RunAnywhereAI/App/RunAnywhereAIApp.swift

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -267,6 +267,64 @@ struct RunAnywhereAIApp: App {
267267
)
268268
}
269269

270+
// Qwen3 models
271+
if let qwen3_06bURL = URL(string: "https://huggingface.co/unsloth/Qwen3-0.6B-GGUF/resolve/main/Qwen3-0.6B-Q4_K_M.gguf") {
272+
RunAnywhere.registerModel(
273+
id: "qwen3-0.6b-q4_k_m",
274+
name: "Qwen3 0.6B Q4_K_M",
275+
url: qwen3_06bURL,
276+
framework: .llamaCpp,
277+
memoryRequirement: 500_000_000
278+
)
279+
}
280+
if let qwen3_17bURL = URL(string: "https://huggingface.co/unsloth/Qwen3-1.7B-GGUF/resolve/main/Qwen3-1.7B-Q4_K_M.gguf") {
281+
RunAnywhere.registerModel(
282+
id: "qwen3-1.7b-q4_k_m",
283+
name: "Qwen3 1.7B Q4_K_M",
284+
url: qwen3_17bURL,
285+
framework: .llamaCpp,
286+
memoryRequirement: 1_200_000_000
287+
)
288+
}
289+
if let qwen3_4bURL = URL(string: "https://huggingface.co/unsloth/Qwen3-4B-GGUF/resolve/main/Qwen3-4B-Q4_K_M.gguf") {
290+
RunAnywhere.registerModel(
291+
id: "qwen3-4b-q4_k_m",
292+
name: "Qwen3 4B Q4_K_M",
293+
url: qwen3_4bURL,
294+
framework: .llamaCpp,
295+
memoryRequirement: 2_800_000_000
296+
)
297+
}
298+
299+
// Qwen3.5 models
300+
if let qwen35_08bURL = URL(string: "https://huggingface.co/unsloth/Qwen3.5-0.8B-GGUF/resolve/main/Qwen3.5-0.8B-Q4_K_M.gguf") {
301+
RunAnywhere.registerModel(
302+
id: "qwen3.5-0.8b-q4_k_m",
303+
name: "Qwen3.5 0.8B Q4_K_M",
304+
url: qwen35_08bURL,
305+
framework: .llamaCpp,
306+
memoryRequirement: 600_000_000
307+
)
308+
}
309+
if let qwen35_2bURL = URL(string: "https://huggingface.co/unsloth/Qwen3.5-2B-GGUF/resolve/main/Qwen3.5-2B-Q4_K_M.gguf") {
310+
RunAnywhere.registerModel(
311+
id: "qwen3.5-2b-q4_k_m",
312+
name: "Qwen3.5 2B Q4_K_M",
313+
url: qwen35_2bURL,
314+
framework: .llamaCpp,
315+
memoryRequirement: 1_500_000_000
316+
)
317+
}
318+
if let qwen35_4bURL = URL(string: "https://huggingface.co/unsloth/Qwen3.5-4B-GGUF/resolve/main/Qwen3.5-4B-Q4_K_M.gguf") {
319+
RunAnywhere.registerModel(
320+
id: "qwen3.5-4b-q4_k_m",
321+
name: "Qwen3.5 4B Q4_K_M",
322+
url: qwen35_4bURL,
323+
framework: .llamaCpp,
324+
memoryRequirement: 2_800_000_000
325+
)
326+
}
327+
270328
logger.info("✅ LLM models registered (including tool-calling optimized models)")
271329

272330
// Register VLM (Vision Language) models

examples/web/RunAnywhereAI/src/services/model-manager.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,8 @@ const REGISTERED_MODELS: CompactModelDef[] = [
102102
modality: ModelCategory.Multimodal,
103103
memoryRequirement: 600_000_000,
104104
},
105+
// NOTE: Qwen2-VL uses M-RoPE which produces NaN logits on WebGPU. It falls
106+
// back to CPU WASM (~1 tok/s) — noticeably slower than LFM2-VL on WebGPU.
105107
{
106108
id: 'qwen2-vl-2b-instruct-q4_k_m',
107109
name: 'Qwen2-VL 2B Instruct Q4_K_M',

examples/web/RunAnywhereAI/src/views/speak.ts

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -21,9 +21,7 @@ const SURPRISE_TEXTS = [
2121
];
2222

2323
let ttsIsSpeaking = false;
24-
let ttsPlayback: InstanceType<
25-
typeof import('../../../../../sdk/runanywhere-web/packages/onnx/src/Infrastructure/AudioPlayback').AudioPlayback
26-
> | null = null;
24+
let ttsPlayback: import('../../../../../sdk/runanywhere-web/packages/core/src/Infrastructure/AudioPlayback').AudioPlayback | null = null;
2725

2826
// ---------------------------------------------------------------------------
2927
// Init
@@ -139,7 +137,10 @@ async function handleSpeak(): Promise<void> {
139137
statusEl.textContent = 'Synthesizing speech...';
140138
const speed = parseFloat(speedSlider.value);
141139

142-
const { TTS, AudioPlayback } = await import(
140+
const { AudioPlayback } = await import(
141+
'../../../../../sdk/runanywhere-web/packages/core/src/index'
142+
);
143+
const { TTS } = await import(
143144
'../../../../../sdk/runanywhere-web/packages/onnx/src/index'
144145
);
145146

examples/web/RunAnywhereAI/src/views/transcribe.ts

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,8 @@
44
*/
55

66
import type { TabLifecycle } from '../app';
7-
import { AudioCapture, VAD, SpeechActivity } from '../../../../../sdk/runanywhere-web/packages/onnx/src/index';
7+
import { AudioCapture, SpeechActivity } from '../../../../../sdk/runanywhere-web/packages/core/src/index';
8+
import { VAD } from '../../../../../sdk/runanywhere-web/packages/onnx/src/index';
89
import { ModelManager, ModelCategory, ensureVADLoaded, type ModelInfo } from '../services/model-manager';
910
import { showModelSelectionSheet } from '../components/model-selection';
1011

examples/web/RunAnywhereAI/src/views/vision.ts

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,8 @@
1212
import type { TabLifecycle } from '../app';
1313
import { ModelManager, ModelCategory, type ModelInfo } from '../services/model-manager';
1414
import { showModelSelectionSheet } from '../components/model-selection';
15-
import { VLMWorkerBridge, VideoCapture, type CapturedFrame } from '../../../../../sdk/runanywhere-web/packages/llamacpp/src/index';
15+
import { VideoCapture, type CapturedFrame } from '../../../../../sdk/runanywhere-web/packages/core/src/index';
16+
import { VLMWorkerBridge } from '../../../../../sdk/runanywhere-web/packages/llamacpp/src/index';
1617

1718
// ---------------------------------------------------------------------------
1819
// Constants (matching iOS VLMViewModel defaults)
@@ -412,7 +413,7 @@ async function processFrame(frame: CapturedFrame, prompt: string, maxTokens: num
412413
frame.width,
413414
frame.height,
414415
prompt,
415-
{ maxTokens, temperature: 0.7 },
416+
{ maxTokens, temperature: 0.7, systemPrompt: 'You are a helpful assistant.' },
416417
);
417418

418419
// Compute metrics from JS wall clock

examples/web/RunAnywhereAI/src/views/voice.ts

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,8 @@
88
import type { TabLifecycle } from '../app';
99
import { showModelSelectionSheet } from '../components/model-selection';
1010
import { ModelManager, ModelCategory, ensureVADLoaded } from '../services/model-manager';
11-
import { VoicePipeline, PipelineState } from '../../../../../sdk/runanywhere-web/packages/core/src/index';
12-
import {
13-
AudioCapture, AudioPlayback, VAD, SpeechActivity,
14-
} from '../../../../../sdk/runanywhere-web/packages/onnx/src/index';
11+
import { VoicePipeline, PipelineState, AudioCapture, AudioPlayback, SpeechActivity } from '../../../../../sdk/runanywhere-web/packages/core/src/index';
12+
import { VAD } from '../../../../../sdk/runanywhere-web/packages/onnx/src/index';
1513

1614
/** Shared AudioCapture instance for this view (replaces app-level MicCapture singleton). */
1715
const micCapture = new AudioCapture();

sdk/runanywhere-commons/VERSIONS

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -72,9 +72,9 @@ SHERPA_ONNX_VERSION_LINUX=1.12.23
7272
# =============================================================================
7373
# llama.cpp (LLM inference)
7474
# =============================================================================
75-
# b8011 - latest stable release (Feb 2026), includes GGML_WEBGPU backend
75+
# b8201 - latest stable release (Feb 2026), includes GGML_WEBGPU backend
7676
# NOTE: Bumped from b7650 to enable WebGPU acceleration for WASM builds
77-
LLAMACPP_VERSION=b8011
77+
LLAMACPP_VERSION=b8201
7878

7979
# =============================================================================
8080
# nlohmann/json

sdk/runanywhere-commons/src/backends/llamacpp/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -153,6 +153,8 @@ if(RAC_VLM_USE_MTMD)
153153
${llamacpp_SOURCE_DIR}/tools/mtmd/models/whisper-enc.cpp
154154
${llamacpp_SOURCE_DIR}/tools/mtmd/models/kimik25.cpp
155155
${llamacpp_SOURCE_DIR}/tools/mtmd/models/mobilenetv5.cpp
156+
${llamacpp_SOURCE_DIR}/tools/mtmd/models/paddleocr.cpp
157+
${llamacpp_SOURCE_DIR}/tools/mtmd/models/nemotron-v2-vl.cpp
156158
)
157159
endif()
158160

sdk/runanywhere-commons/src/backends/llamacpp/llamacpp_backend.cpp

Lines changed: 30 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -380,8 +380,9 @@ bool LlamaCppTextGeneration::unload_model_internal() {
380380

381381
// Clear LoRA adapters from context before freeing
382382
// (adapter memory is freed automatically with the model per llama.cpp API)
383+
// Best-effort during teardown: log but don't fail unload on error.
383384
if (context_ && !lora_adapters_.empty()) {
384-
llama_clear_adapter_lora(context_);
385+
llama_set_adapters_lora(context_, nullptr, 0, nullptr);
385386
}
386387
lora_adapters_.clear();
387388

@@ -1162,13 +1163,31 @@ bool LlamaCppTextGeneration::recreate_context() {
11621163
}
11631164

11641165
bool LlamaCppTextGeneration::apply_lora_adapters() {
1166+
if (lora_adapters_.empty()) {
1167+
llama_set_adapters_lora(context_, nullptr, 0, nullptr);
1168+
return true;
1169+
}
1170+
1171+
std::vector<llama_adapter_lora*> adapters;
1172+
std::vector<float> scales;
1173+
adapters.reserve(lora_adapters_.size());
1174+
scales.reserve(lora_adapters_.size());
1175+
11651176
for (auto& entry : lora_adapters_) {
1166-
int32_t result = llama_set_adapter_lora(context_, entry.adapter, entry.scale);
1167-
if (result != 0) {
1168-
LOGE("Failed to apply LoRA adapter: %s (error=%d)", entry.path.c_str(), result);
1177+
adapters.push_back(entry.adapter);
1178+
scales.push_back(entry.scale);
1179+
}
1180+
1181+
int32_t result = llama_set_adapters_lora(context_, adapters.data(), adapters.size(), scales.data());
1182+
if (result != 0) {
1183+
LOGE("Failed to apply LoRA adapters (error=%d)", result);
1184+
for (auto& entry : lora_adapters_) {
11691185
entry.applied = false;
1170-
return false;
11711186
}
1187+
return false;
1188+
}
1189+
1190+
for (auto& entry : lora_adapters_) {
11721191
entry.applied = true;
11731192
LOGI("Applied LoRA adapter: %s (scale=%.2f)", entry.path.c_str(), entry.scale);
11741193
}
@@ -1245,17 +1264,14 @@ bool LlamaCppTextGeneration::remove_lora_adapter(const std::string& adapter_path
12451264
return false;
12461265
}
12471266

1248-
// Remove from context
1249-
int32_t result = llama_rm_adapter_lora(context_, it->adapter);
1250-
if (result != 0) {
1251-
LOGE("Failed to remove LoRA adapter from context: %s (error=%d)", adapter_path.c_str(), result);
1267+
lora_adapters_.erase(it);
1268+
1269+
// Re-apply remaining adapters (or clear if none left)
1270+
if (!apply_lora_adapters()) {
1271+
LOGE("Failed to re-apply remaining LoRA adapters after removal");
12521272
return false;
12531273
}
12541274

1255-
// Remove from tracking (adapter memory is freed automatically with the model
1256-
// per llama.cpp API — llama_adapter_lora_free is deprecated since b8011)
1257-
lora_adapters_.erase(it);
1258-
12591275
// Clear KV cache after adapter changes
12601276
llama_memory_clear(llama_get_memory(context_), true);
12611277

@@ -1271,7 +1287,7 @@ void LlamaCppTextGeneration::clear_lora_adapters() {
12711287
}
12721288

12731289
if (context_) {
1274-
llama_clear_adapter_lora(context_);
1290+
llama_set_adapters_lora(context_, nullptr, 0, nullptr);
12751291
llama_memory_clear(llama_get_memory(context_), true);
12761292
}
12771293

0 commit comments

Comments
 (0)