Skip to content

Commit 6730759

Browse files
Upgrade llama.cpp to b8179 and fix WASM VLM build for new mtmd model sources
- Bump LLAMACPP_VERSION from b8011 to b8179 for Vulkan/WebGPU precision fixes and FlashAttention improvements - Adapt LoRA adapter API calls in llamacpp_backend.cpp to the new llama_set_adapters_lora() batch API (replaces removed per-adapter functions) - Add nemotron-v2-vl.cpp and paddleocr.cpp to CMakeLists.txt for new clip_graph_* model types introduced in b8179 - Document Qwen2-VL CPU fallback performance (~1 tok/s) caused by M-RoPE NaN logits on WebGPU in rac_vlm_llamacpp.cpp, VLMWorkerBridge.ts, and model-manager.ts
1 parent f7cbb78 commit 6730759

6 files changed

Lines changed: 197 additions & 29 deletions

File tree

examples/web/RunAnywhereAI/src/services/model-manager.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,8 @@ const REGISTERED_MODELS: CompactModelDef[] = [
102102
modality: ModelCategory.Multimodal,
103103
memoryRequirement: 600_000_000,
104104
},
105+
// NOTE: Qwen2-VL uses M-RoPE which produces NaN logits on WebGPU. It falls
106+
// back to CPU WASM (~1 tok/s) — noticeably slower than LFM2-VL on WebGPU.
105107
{
106108
id: 'qwen2-vl-2b-instruct-q4_k_m',
107109
name: 'Qwen2-VL 2B Instruct Q4_K_M',

sdk/runanywhere-commons/VERSIONS

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -72,9 +72,10 @@ SHERPA_ONNX_VERSION_LINUX=1.12.23
7272
# =============================================================================
7373
# llama.cpp (LLM inference)
7474
# =============================================================================
75-
# b8011 - latest stable release (Feb 2026), includes GGML_WEBGPU backend
76-
# NOTE: Bumped from b7650 to enable WebGPU acceleration for WASM builds
77-
LLAMACPP_VERSION=b8011
75+
# b8179 - latest stable release (Feb 2026), includes GGML_WEBGPU backend
76+
# NOTE: Bumped from b8011. Includes Vulkan/WebGPU precision fixes and
77+
# FlashAttention improvements. Qwen2-VL M-RoPE WebGPU NaN may be resolved.
78+
LLAMACPP_VERSION=b8179
7879

7980
# =============================================================================
8081
# nlohmann/json

sdk/runanywhere-commons/src/backends/llamacpp/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -153,6 +153,8 @@ if(RAC_VLM_USE_MTMD)
153153
${llamacpp_SOURCE_DIR}/tools/mtmd/models/whisper-enc.cpp
154154
${llamacpp_SOURCE_DIR}/tools/mtmd/models/kimik25.cpp
155155
${llamacpp_SOURCE_DIR}/tools/mtmd/models/mobilenetv5.cpp
156+
${llamacpp_SOURCE_DIR}/tools/mtmd/models/nemotron-v2-vl.cpp
157+
${llamacpp_SOURCE_DIR}/tools/mtmd/models/paddleocr.cpp
156158
)
157159
endif()
158160

sdk/runanywhere-commons/src/backends/llamacpp/llamacpp_backend.cpp

Lines changed: 35 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -371,7 +371,7 @@ bool LlamaCppTextGeneration::unload_model_internal() {
371371
// Clear LoRA adapters from context before freeing
372372
// (adapter memory is freed automatically with the model per llama.cpp API)
373373
if (context_ && !lora_adapters_.empty()) {
374-
llama_clear_adapter_lora(context_);
374+
llama_set_adapters_lora(context_, nullptr, 0, nullptr);
375375
}
376376
lora_adapters_.clear();
377377

@@ -828,13 +828,29 @@ bool LlamaCppTextGeneration::recreate_context() {
828828
}
829829

830830
bool LlamaCppTextGeneration::apply_lora_adapters() {
831+
if (lora_adapters_.empty()) {
832+
return true;
833+
}
834+
835+
std::vector<llama_adapter_lora*> adapters;
836+
std::vector<float> scales;
837+
adapters.reserve(lora_adapters_.size());
838+
scales.reserve(lora_adapters_.size());
839+
840+
for (auto& entry : lora_adapters_) {
841+
adapters.push_back(entry.adapter);
842+
scales.push_back(entry.scale);
843+
}
844+
845+
int32_t result = llama_set_adapters_lora(context_, adapters.data(),
846+
adapters.size(), scales.data());
847+
if (result != 0) {
848+
LOGE("Failed to apply LoRA adapters (error=%d)", result);
849+
for (auto& entry : lora_adapters_) { entry.applied = false; }
850+
return false;
851+
}
852+
831853
for (auto& entry : lora_adapters_) {
832-
int32_t result = llama_set_adapter_lora(context_, entry.adapter, entry.scale);
833-
if (result != 0) {
834-
LOGE("Failed to apply LoRA adapter: %s (error=%d)", entry.path.c_str(), result);
835-
entry.applied = false;
836-
return false;
837-
}
838854
entry.applied = true;
839855
LOGI("Applied LoRA adapter: %s (scale=%.2f)", entry.path.c_str(), entry.scale);
840856
}
@@ -911,17 +927,19 @@ bool LlamaCppTextGeneration::remove_lora_adapter(const std::string& adapter_path
911927
return false;
912928
}
913929

914-
// Remove from context
915-
int32_t result = llama_rm_adapter_lora(context_, it->adapter);
916-
if (result != 0) {
917-
LOGE("Failed to remove LoRA adapter from context: %s (error=%d)", adapter_path.c_str(), result);
918-
return false;
919-
}
920-
921-
// Remove from tracking (adapter memory is freed automatically with the model
922-
// per llama.cpp API — llama_adapter_lora_free is deprecated since b8011)
930+
// Remove from tracking and re-apply remaining adapters
923931
lora_adapters_.erase(it);
924932

933+
// Re-apply remaining adapters (or clear if none left)
934+
std::vector<llama_adapter_lora*> adapters;
935+
std::vector<float> scales;
936+
for (auto& entry : lora_adapters_) {
937+
adapters.push_back(entry.adapter);
938+
scales.push_back(entry.scale);
939+
}
940+
llama_set_adapters_lora(context_, adapters.empty() ? nullptr : adapters.data(),
941+
adapters.size(), adapters.empty() ? nullptr : scales.data());
942+
925943
// Clear KV cache after adapter changes
926944
llama_memory_clear(llama_get_memory(context_), true);
927945

@@ -937,7 +955,7 @@ void LlamaCppTextGeneration::clear_lora_adapters() {
937955
}
938956

939957
if (context_) {
940-
llama_clear_adapter_lora(context_);
958+
llama_set_adapters_lora(context_, nullptr, 0, nullptr);
941959
llama_memory_clear(llama_get_memory(context_), true);
942960
}
943961

sdk/runanywhere-commons/src/backends/llamacpp/rac_vlm_llamacpp.cpp

Lines changed: 132 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -359,7 +359,7 @@ void configure_sampler(LlamaCppVLMBackend* backend, const rac_vlm_options_t* opt
359359
llama_sampler_chain_add(backend->sampler, llama_sampler_init_temp(temperature));
360360
llama_sampler_chain_add(backend->sampler, llama_sampler_init_dist(LLAMA_DEFAULT_SEED));
361361

362-
RAC_LOG_INFO(LOG_CAT, "Sampler configured: temp=%.2f, top_p=%.2f, repeat=1.3, freq=0.1, pres=0.1, DRY=0.8, min_p=0.1",
362+
RAC_LOG_INFO(LOG_CAT, "[v3] Sampler: temp=%.2f top_p=%.2f repeat=1.3 freq=0.1 pres=0.1 DRY=0.8 min_p=0.1 + repeat_guard=4",
363363
temperature, top_p);
364364
}
365365

@@ -423,15 +423,53 @@ rac_result_t rac_vlm_llamacpp_load_model(rac_handle_t handle, const char* model_
423423
llama_backend_init();
424424

425425
// Load model
426+
int gpu_layers = backend->config.gpu_layers;
426427
llama_model_params model_params = llama_model_default_params();
427-
model_params.n_gpu_layers = backend->config.gpu_layers;
428+
model_params.n_gpu_layers = gpu_layers;
428429

429430
backend->model = llama_model_load_from_file(model_path, model_params);
430431
if (!backend->model) {
431432
RAC_LOG_ERROR(LOG_CAT, "Failed to load model: %s", model_path);
432433
return RAC_ERROR_MODEL_LOAD_FAILED;
433434
}
434435

436+
// Detect model type early — M-RoPE models (Qwen2-VL) produce NaN logits on
437+
// WebGPU due to shader precision limitations in the rotary position encoding.
438+
// The upstream WebGPU RoPE shader does contain M-RoPE handling, but f16
439+
// accumulation overflow causes all 151k+ logits to become NaN.
440+
//
441+
// Force CPU execution for these models by reloading with n_gpu_layers=0.
442+
// NOTE: default gpu_layers is -1 (all layers), so we check != 0 not > 0.
443+
//
444+
// PERFORMANCE: CPU fallback runs at ~1 tok/s in single-threaded WASM, which
445+
// is significantly slower than WebGPU-accelerated models like LFM2-VL (~15-20
446+
// tok/s). This is a correctness-over-speed trade-off until the WebGPU backend
447+
// resolves the M-RoPE precision issue.
448+
// TODO: re-test Qwen2-VL on WebGPU after future llama.cpp upgrades — the
449+
// Vulkan fp16 FA fix (b8168) and related precision work may eventually land
450+
// in the WebGPU backend as well.
451+
backend->model_type = detect_vlm_model_type(backend->model);
452+
bool force_cpu = false;
453+
454+
#ifdef RAC_VLM_USE_MTMD
455+
if (backend->model_type == VLMModelType::Qwen2VL && gpu_layers != 0) {
456+
RAC_LOG_WARNING(LOG_CAT, "Qwen2-VL uses M-RoPE which is incompatible with WebGPU "
457+
"(gpu_layers=%d) — reloading with n_gpu_layers=0 for CPU execution",
458+
gpu_layers);
459+
llama_model_free(backend->model);
460+
backend->model = nullptr;
461+
462+
model_params.n_gpu_layers = 0;
463+
backend->model = llama_model_load_from_file(model_path, model_params);
464+
if (!backend->model) {
465+
RAC_LOG_ERROR(LOG_CAT, "Failed to reload model for CPU: %s", model_path);
466+
return RAC_ERROR_MODEL_LOAD_FAILED;
467+
}
468+
force_cpu = true;
469+
gpu_layers = 0;
470+
}
471+
#endif
472+
435473
// Determine context size
436474
int ctx_size = backend->config.context_size;
437475
if (ctx_size <= 0) {
@@ -464,7 +502,8 @@ rac_result_t rac_vlm_llamacpp_load_model(rac_handle_t handle, const char* model_
464502
// Initialize mtmd context if mmproj provided
465503
if (mmproj_path && mmproj_path[0]) {
466504
mtmd_context_params mparams = mtmd_context_params_default();
467-
mparams.use_gpu = backend->config.use_gpu_vision;
505+
// Force CPU for vision encoder too when model requires CPU (M-RoPE)
506+
mparams.use_gpu = force_cpu ? false : backend->config.use_gpu_vision;
468507
mparams.n_threads = n_threads;
469508
mparams.print_timings = false;
470509
mparams.warmup = true;
@@ -475,7 +514,8 @@ rac_result_t rac_vlm_llamacpp_load_model(rac_handle_t handle, const char* model_
475514
// Continue without vision - will work as text-only LLM
476515
RAC_LOG_WARNING(LOG_CAT, "VLM will operate in text-only mode");
477516
} else {
478-
RAC_LOG_INFO(LOG_CAT, "Vision projector loaded successfully");
517+
RAC_LOG_INFO(LOG_CAT, "Vision projector loaded successfully%s",
518+
force_cpu ? " (CPU mode for M-RoPE compat)" : "");
479519
}
480520
backend->mmproj_path = mmproj_path;
481521
}
@@ -485,10 +525,8 @@ rac_result_t rac_vlm_llamacpp_load_model(rac_handle_t handle, const char* model_
485525
backend->model_loaded = true;
486526
backend->n_past = 0;
487527

488-
// Detect model type for chat template
489-
backend->model_type = detect_vlm_model_type(backend->model);
490-
491-
RAC_LOG_INFO(LOG_CAT, "VLM model loaded successfully (ctx=%d, threads=%d)", ctx_size, n_threads);
528+
RAC_LOG_INFO(LOG_CAT, "VLM model loaded (ctx=%d, threads=%d, gpu_layers=%d%s) [build:v4-cpu-mrope]",
529+
ctx_size, n_threads, gpu_layers, force_cpu ? ", forced-cpu" : "");
492530
return RAC_SUCCESS;
493531
}
494532

@@ -621,6 +659,10 @@ rac_result_t rac_vlm_llamacpp_process(rac_handle_t handle, const rac_vlm_image_t
621659
full_prompt = format_vlm_prompt_with_template(backend->model, prompt, image_marker, has_image,
622660
system_prompt, effective_model_type);
623661

662+
RAC_LOG_INFO(LOG_CAT, "[v3-process] Prompt (%d chars, img=%d, type=%d): %.200s",
663+
(int)full_prompt.length(), has_image ? 1 : 0, (int)effective_model_type,
664+
full_prompt.c_str());
665+
624666
// Tokenize and evaluate
625667
if (backend->mtmd_ctx && bitmap) {
626668
mtmd_input_chunks* chunks = mtmd_input_chunks_init();
@@ -709,14 +751,70 @@ rac_result_t rac_vlm_llamacpp_process(rac_handle_t handle, const rac_vlm_image_t
709751
llama_batch batch = llama_batch_init(1, 0, 1);
710752
const llama_vocab* vocab = llama_model_get_vocab(backend->model);
711753

754+
// Runtime repetition guard: track last token and consecutive repeat count.
755+
// If the same token appears too many times in a row, the model is stuck and
756+
// we force-stop to avoid emitting garbage like "gó gó gó gó ...".
757+
llama_token prev_token = -1;
758+
int repeat_run = 0;
759+
constexpr int MAX_CONSECUTIVE_REPEATS = 4;
760+
712761
for (int i = 0; i < max_tokens && !backend->cancel_requested; i++) {
762+
// Diagnostic: on first token, inspect logits for NaN/corruption
763+
if (i == 0) {
764+
float* logits = llama_get_logits(backend->ctx);
765+
int n_vocab = llama_vocab_n_tokens(vocab);
766+
if (logits && n_vocab > 0) {
767+
float max_logit = logits[0];
768+
int max_idx = 0;
769+
int nan_count = 0;
770+
int inf_count = 0;
771+
for (int v = 0; v < n_vocab; v++) {
772+
if (logits[v] != logits[v]) nan_count++; // NaN check
773+
if (logits[v] > 1e30f || logits[v] < -1e30f) inf_count++;
774+
if (logits[v] > max_logit) { max_logit = logits[v]; max_idx = v; }
775+
}
776+
RAC_LOG_INFO(LOG_CAT, "[v3-diag] Logits: n_vocab=%d, max_logit=%.4f at token %d, NaN=%d, Inf=%d",
777+
n_vocab, max_logit, max_idx, nan_count, inf_count);
778+
// Log top 5 logits
779+
float top5_val[5] = {-1e30f, -1e30f, -1e30f, -1e30f, -1e30f};
780+
int top5_idx[5] = {0, 0, 0, 0, 0};
781+
for (int v = 0; v < n_vocab; v++) {
782+
if (logits[v] != logits[v]) continue; // skip NaN
783+
for (int k = 0; k < 5; k++) {
784+
if (logits[v] > top5_val[k]) {
785+
for (int j = 4; j > k; j--) { top5_val[j] = top5_val[j-1]; top5_idx[j] = top5_idx[j-1]; }
786+
top5_val[k] = logits[v]; top5_idx[k] = v;
787+
break;
788+
}
789+
}
790+
}
791+
RAC_LOG_INFO(LOG_CAT, "[v3-diag] Top5: [%d]=%.2f [%d]=%.2f [%d]=%.2f [%d]=%.2f [%d]=%.2f",
792+
top5_idx[0], top5_val[0], top5_idx[1], top5_val[1],
793+
top5_idx[2], top5_val[2], top5_idx[3], top5_val[3],
794+
top5_idx[4], top5_val[4]);
795+
}
796+
}
797+
713798
llama_token token = llama_sampler_sample(backend->sampler, backend->ctx, -1);
714799
llama_sampler_accept(backend->sampler, token);
715800

716801
if (llama_vocab_is_eog(vocab, token)) {
717802
break;
718803
}
719804

805+
// Detect stuck generation: same token repeated consecutively
806+
if (token == prev_token) {
807+
repeat_run++;
808+
if (repeat_run >= MAX_CONSECUTIVE_REPEATS) {
809+
RAC_LOG_WARNING(LOG_CAT, "Repetition guard: token %d repeated %d times, stopping",
810+
token, repeat_run + 1);
811+
break;
812+
}
813+
} else {
814+
repeat_run = 0;
815+
}
816+
prev_token = token;
817+
720818
char buf[256];
721819
int len = llama_token_to_piece(vocab, token, buf, sizeof(buf), 0, true);
722820
if (len > 0) {
@@ -813,10 +911,14 @@ rac_result_t rac_vlm_llamacpp_process_stream(rac_handle_t handle, const rac_vlm_
813911
}
814912
}
815913

816-
// Format prompt using model's built-in chat template
914+
// Format prompt using model's built-in chat template (streaming path)
817915
full_prompt = format_vlm_prompt_with_template(backend->model, prompt, image_marker, has_image,
818916
system_prompt, effective_model_type);
819917

918+
RAC_LOG_INFO(LOG_CAT, "[v3-stream] Prompt (%d chars, img=%d, type=%d): %.200s",
919+
(int)full_prompt.length(), has_image ? 1 : 0, (int)effective_model_type,
920+
full_prompt.c_str());
921+
820922
// Tokenize and evaluate
821923
if (backend->mtmd_ctx && bitmap) {
822924
mtmd_input_chunks* chunks = mtmd_input_chunks_init();
@@ -901,12 +1003,33 @@ rac_result_t rac_vlm_llamacpp_process_stream(rac_handle_t handle, const rac_vlm_
9011003
llama_batch batch = llama_batch_init(1, 0, 1);
9021004
const llama_vocab* vocab = llama_model_get_vocab(backend->model);
9031005

1006+
// Runtime repetition guard (same as non-streaming path)
1007+
llama_token prev_token = -1;
1008+
int repeat_run = 0;
1009+
constexpr int MAX_CONSECUTIVE_REPEATS = 4;
1010+
9041011
for (int i = 0; i < max_tokens && !backend->cancel_requested; i++) {
9051012
llama_token token = llama_sampler_sample(backend->sampler, backend->ctx, -1);
9061013
llama_sampler_accept(backend->sampler, token);
9071014

9081015
bool is_eog = llama_vocab_is_eog(vocab, token);
9091016

1017+
// Detect stuck generation
1018+
if (!is_eog) {
1019+
if (token == prev_token) {
1020+
repeat_run++;
1021+
if (repeat_run >= MAX_CONSECUTIVE_REPEATS) {
1022+
RAC_LOG_WARNING(LOG_CAT, "Repetition guard: token %d repeated %d times, stopping",
1023+
token, repeat_run + 1);
1024+
callback("", RAC_TRUE, user_data);
1025+
break;
1026+
}
1027+
} else {
1028+
repeat_run = 0;
1029+
}
1030+
prev_token = token;
1031+
}
1032+
9101033
char buf[256];
9111034
int len = llama_token_to_piece(vocab, token, buf, sizeof(buf), 0, true);
9121035
if (len > 0) {

sdk/runanywhere-web/packages/llamacpp/src/Infrastructure/VLMWorkerBridge.ts

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -240,6 +240,28 @@ export class VLMWorkerBridge {
240240
await this.init();
241241
}
242242

243+
// M-RoPE models (Qwen2-VL) produce NaN logits on WebGPU due to f16
244+
// accumulation overflow in the rotary position encoding shader. If we
245+
// detect one, restart the Worker with the CPU WASM binary so the entire
246+
// inference runs on the CPU backend.
247+
//
248+
// PERFORMANCE: The CPU WASM binary is single-threaded (pthreads OFF), so
249+
// Qwen2-VL runs at ~1 tok/s vs ~15-20 tok/s for WebGPU models (LFM2-VL).
250+
// This is a correctness-over-speed trade-off.
251+
// TODO: re-test on WebGPU periodically as llama.cpp's WebGPU backend
252+
// matures — the Vulkan fp16 FA fix (b8168) may eventually be ported.
253+
const bridge = LlamaCppBridge.shared;
254+
const isQwenVL = /qwen/i.test(params.modelId) || /qwen/i.test(params.modelName);
255+
if (isQwenVL && bridge.accelerationMode === 'webgpu') {
256+
const currentUrl = bridge.wasmUrl ?? '';
257+
const cpuUrl = currentUrl.replace(/-webgpu\.js$/, '.js');
258+
if (cpuUrl !== currentUrl) {
259+
logger.info('Qwen2-VL detected — restarting VLM Worker with CPU WASM (M-RoPE compat)');
260+
this.terminate();
261+
await this.init(cpuUrl);
262+
}
263+
}
264+
243265
// Transfer data buffers when provided (zero-copy to Worker)
244266
const transferables: Transferable[] = [];
245267
if (params.modelData) transferables.push(params.modelData);

0 commit comments

Comments
 (0)