@@ -359,7 +359,7 @@ void configure_sampler(LlamaCppVLMBackend* backend, const rac_vlm_options_t* opt
     llama_sampler_chain_add(backend->sampler, llama_sampler_init_temp(temperature));
     llama_sampler_chain_add(backend->sampler, llama_sampler_init_dist(LLAMA_DEFAULT_SEED));
 
-    RAC_LOG_INFO(LOG_CAT, "Sampler configured: temp=%.2f, top_p=%.2f, repeat=1.3, freq=0.1, pres=0.1, DRY=0.8, min_p=0.1",
+    RAC_LOG_INFO(LOG_CAT, "[v3] Sampler: temp=%.2f top_p=%.2f repeat=1.3 freq=0.1 pres=0.1 DRY=0.8 min_p=0.1 + repeat_guard=4",
                  temperature, top_p);
 }
 
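The log line enumerates samplers whose construction falls above this hunk. A minimal sketch of how such a chain could be assembled with llama.cpp's sampler API, using the values from the log string; the ordering and the `penalty_last_n`/`min_keep` windows are assumptions, and this assumes a recent llama.cpp where `llama_sampler_init_penalties` takes `(last_n, repeat, freq, present)`:

```cpp
// Hypothetical reconstruction of the chain summarized by the "[v3] Sampler" log.
// Values come from the log string; window sizes and ordering are assumptions.
static void build_sampler_chain(LlamaCppVLMBackend* backend, float temperature, float top_p) {
    backend->sampler = llama_sampler_chain_init(llama_sampler_chain_default_params());

    // repeat=1.3, freq=0.1, pres=0.1 over the last 64 tokens (assumed window)
    llama_sampler_chain_add(backend->sampler,
        llama_sampler_init_penalties(/*penalty_last_n=*/64, /*repeat=*/1.3f,
                                     /*freq=*/0.1f, /*present=*/0.1f));
    // DRY=0.8 would be added here via llama_sampler_init_dry(); its initializer
    // signature varies across llama.cpp versions, so it is omitted from this sketch.
    llama_sampler_chain_add(backend->sampler, llama_sampler_init_min_p(0.1f, /*min_keep=*/1));
    llama_sampler_chain_add(backend->sampler, llama_sampler_init_top_p(top_p, /*min_keep=*/1));
    llama_sampler_chain_add(backend->sampler, llama_sampler_init_temp(temperature));
    llama_sampler_chain_add(backend->sampler, llama_sampler_init_dist(LLAMA_DEFAULT_SEED));
}
```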
@@ -423,15 +423,53 @@ rac_result_t rac_vlm_llamacpp_load_model(rac_handle_t handle, const char* model_
     llama_backend_init();
 
     // Load model
+    int gpu_layers = backend->config.gpu_layers;
     llama_model_params model_params = llama_model_default_params();
-    model_params.n_gpu_layers = backend->config.gpu_layers;
+    model_params.n_gpu_layers = gpu_layers;
 
     backend->model = llama_model_load_from_file(model_path, model_params);
     if (!backend->model) {
         RAC_LOG_ERROR(LOG_CAT, "Failed to load model: %s", model_path);
         return RAC_ERROR_MODEL_LOAD_FAILED;
     }
 
+    // Detect model type early: M-RoPE models (Qwen2-VL) produce NaN logits on
+    // WebGPU due to shader precision limitations in the rotary position encoding.
+    // The upstream WebGPU RoPE shader does contain M-RoPE handling, but f16
+    // accumulation overflow causes all 151k+ logits to become NaN.
+    //
+    // Force CPU execution for these models by reloading with n_gpu_layers=0.
+    // NOTE: the default gpu_layers is -1 (all layers), so we check != 0, not > 0.
+    //
+    // PERFORMANCE: CPU fallback runs at ~1 tok/s in single-threaded WASM, which
+    // is significantly slower than WebGPU-accelerated models like LFM2-VL (~15-20
+    // tok/s). This is a correctness-over-speed trade-off until the WebGPU backend
+    // resolves the M-RoPE precision issue.
+    // TODO: re-test Qwen2-VL on WebGPU after future llama.cpp upgrades; the
+    // Vulkan fp16 FA fix (b8168) and related precision work may eventually land
+    // in the WebGPU backend as well.
+    backend->model_type = detect_vlm_model_type(backend->model);
+    bool force_cpu = false;
+
+#ifdef RAC_VLM_USE_MTMD
+    if (backend->model_type == VLMModelType::Qwen2VL && gpu_layers != 0) {
+        RAC_LOG_WARNING(LOG_CAT, "Qwen2-VL uses M-RoPE which is incompatible with WebGPU "
+                        "(gpu_layers=%d); reloading with n_gpu_layers=0 for CPU execution",
+                        gpu_layers);
+        llama_model_free(backend->model);
+        backend->model = nullptr;
+
+        model_params.n_gpu_layers = 0;
+        backend->model = llama_model_load_from_file(model_path, model_params);
+        if (!backend->model) {
+            RAC_LOG_ERROR(LOG_CAT, "Failed to reload model for CPU: %s", model_path);
+            return RAC_ERROR_MODEL_LOAD_FAILED;
+        }
+        force_cpu = true;
+        gpu_layers = 0;
+    }
+#endif
+
     // Determine context size
     int ctx_size = backend->config.context_size;
     if (ctx_size <= 0) {
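`detect_vlm_model_type()` and the `VLMModelType` enum are defined outside this diff. One plausible shape for the detector, assuming it keys off the GGUF `general.architecture` metadata (Qwen2-VL models report `qwen2vl`); the `Unknown` and `Generic` enum values here are guesses:

```cpp
#include <cstring>  // strcmp

// Hypothetical sketch; the real implementation is not shown in this diff.
static VLMModelType detect_vlm_model_type(const llama_model* model) {
    char arch[64] = {0};
    if (llama_model_meta_val_str(model, "general.architecture", arch, sizeof(arch)) < 0) {
        return VLMModelType::Unknown;  // assumed enum value
    }
    if (strcmp(arch, "qwen2vl") == 0) return VLMModelType::Qwen2VL;
    return VLMModelType::Generic;      // assumed enum value
}
```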
@@ -464,7 +502,8 @@ rac_result_t rac_vlm_llamacpp_load_model(rac_handle_t handle, const char* model_
     // Initialize mtmd context if mmproj provided
     if (mmproj_path && mmproj_path[0]) {
         mtmd_context_params mparams = mtmd_context_params_default();
-        mparams.use_gpu = backend->config.use_gpu_vision;
+        // Force CPU for the vision encoder too when the model requires CPU (M-RoPE)
+        mparams.use_gpu = force_cpu ? false : backend->config.use_gpu_vision;
         mparams.n_threads = n_threads;
         mparams.print_timings = false;
         mparams.warmup = true;
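The call that consumes `mparams` sits just outside this hunk; in the upstream mtmd API it would presumably look like:

```cpp
// mtmd_init_from_file(mmproj path, text model, params) returns the mtmd context.
backend->mtmd_ctx = mtmd_init_from_file(mmproj_path, backend->model, mparams);
```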
@@ -475,7 +514,8 @@ rac_result_t rac_vlm_llamacpp_load_model(rac_handle_t handle, const char* model_
             // Continue without vision - will work as text-only LLM
             RAC_LOG_WARNING(LOG_CAT, "VLM will operate in text-only mode");
         } else {
-            RAC_LOG_INFO(LOG_CAT, "Vision projector loaded successfully");
+            RAC_LOG_INFO(LOG_CAT, "Vision projector loaded successfully%s",
+                         force_cpu ? " (CPU mode for M-RoPE compat)" : "");
         }
         backend->mmproj_path = mmproj_path;
     }
@@ -485,10 +525,8 @@ rac_result_t rac_vlm_llamacpp_load_model(rac_handle_t handle, const char* model_
     backend->model_loaded = true;
     backend->n_past = 0;
 
-    // Detect model type for chat template
-    backend->model_type = detect_vlm_model_type(backend->model);
-
-    RAC_LOG_INFO(LOG_CAT, "VLM model loaded successfully (ctx=%d, threads=%d)", ctx_size, n_threads);
+    RAC_LOG_INFO(LOG_CAT, "VLM model loaded (ctx=%d, threads=%d, gpu_layers=%d%s) [build:v4-cpu-mrope]",
+                 ctx_size, n_threads, gpu_layers, force_cpu ? ", forced-cpu" : "");
     return RAC_SUCCESS;
 }
 
@@ -621,6 +659,10 @@ rac_result_t rac_vlm_llamacpp_process(rac_handle_t handle, const rac_vlm_image_t
     full_prompt = format_vlm_prompt_with_template(backend->model, prompt, image_marker, has_image,
                                                   system_prompt, effective_model_type);
 
+    RAC_LOG_INFO(LOG_CAT, "[v3-process] Prompt (%d chars, img=%d, type=%d): %.200s",
+                 (int)full_prompt.length(), has_image ? 1 : 0, (int)effective_model_type,
+                 full_prompt.c_str());
+
     // Tokenize and evaluate
     if (backend->mtmd_ctx && bitmap) {
         mtmd_input_chunks* chunks = mtmd_input_chunks_init();
@@ -709,14 +751,70 @@ rac_result_t rac_vlm_llamacpp_process(rac_handle_t handle, const rac_vlm_image_t
     llama_batch batch = llama_batch_init(1, 0, 1);
     const llama_vocab* vocab = llama_model_get_vocab(backend->model);
 
+    // Runtime repetition guard: track the last token and the consecutive repeat count.
+    // If the same token appears too many times in a row, the model is stuck and
+    // we force-stop to avoid emitting garbage like "gó gó gó gó ...".
+    llama_token prev_token = -1;
+    int repeat_run = 0;
+    constexpr int MAX_CONSECUTIVE_REPEATS = 4;
+
     for (int i = 0; i < max_tokens && !backend->cancel_requested; i++) {
+        // Diagnostic: on the first token, inspect the logits for NaN/corruption
+        if (i == 0) {
+            float* logits = llama_get_logits(backend->ctx);
+            int n_vocab = llama_vocab_n_tokens(vocab);
+            if (logits && n_vocab > 0) {
+                float max_logit = logits[0];
+                int max_idx = 0;
+                int nan_count = 0;
+                int inf_count = 0;
+                for (int v = 0; v < n_vocab; v++) {
+                    if (logits[v] != logits[v]) nan_count++;  // NaN check
+                    if (logits[v] > 1e30f || logits[v] < -1e30f) inf_count++;
+                    if (logits[v] > max_logit) { max_logit = logits[v]; max_idx = v; }
+                }
+                RAC_LOG_INFO(LOG_CAT, "[v3-diag] Logits: n_vocab=%d, max_logit=%.4f at token %d, NaN=%d, Inf=%d",
+                             n_vocab, max_logit, max_idx, nan_count, inf_count);
+                // Log the top 5 logits
+                float top5_val[5] = {-1e30f, -1e30f, -1e30f, -1e30f, -1e30f};
+                int top5_idx[5] = {0, 0, 0, 0, 0};
+                for (int v = 0; v < n_vocab; v++) {
+                    if (logits[v] != logits[v]) continue;  // skip NaN
+                    for (int k = 0; k < 5; k++) {
+                        if (logits[v] > top5_val[k]) {
+                            for (int j = 4; j > k; j--) { top5_val[j] = top5_val[j-1]; top5_idx[j] = top5_idx[j-1]; }
+                            top5_val[k] = logits[v]; top5_idx[k] = v;
+                            break;
+                        }
+                    }
+                }
+                RAC_LOG_INFO(LOG_CAT, "[v3-diag] Top5: [%d]=%.2f [%d]=%.2f [%d]=%.2f [%d]=%.2f [%d]=%.2f",
+                             top5_idx[0], top5_val[0], top5_idx[1], top5_val[1],
+                             top5_idx[2], top5_val[2], top5_idx[3], top5_val[3],
+                             top5_idx[4], top5_val[4]);
+            }
+        }
+
         llama_token token = llama_sampler_sample(backend->sampler, backend->ctx, -1);
         llama_sampler_accept(backend->sampler, token);
 
         if (llama_vocab_is_eog(vocab, token)) {
             break;
         }
 
+        // Detect stuck generation: the same token repeated consecutively
+        if (token == prev_token) {
+            repeat_run++;
+            if (repeat_run >= MAX_CONSECUTIVE_REPEATS) {
+                RAC_LOG_WARNING(LOG_CAT, "Repetition guard: token %d repeated %d times, stopping",
+                                token, repeat_run + 1);
+                break;
+            }
+        } else {
+            repeat_run = 0;
+        }
+        prev_token = token;
+
         char buf[256];
         int len = llama_token_to_piece(vocab, token, buf, sizeof(buf), 0, true);
         if (len > 0) {
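One limitation worth noting: the guard only catches a single token repeated back-to-back, so a multi-token cycle (e.g. " gó" tokenized as two alternating pieces) never satisfies `token == prev_token`. A sketch of a period-k cycle check over a short tail buffer that would also catch such loops; the window sizes are illustrative, not from this patch:

```cpp
#include <deque>
// llama_token comes from llama.h

// Returns true if the tail ends in min_cycles repetitions of a unit of
// length 1..max_period. The caller pushes each sampled token into `tail`
// (capped at max_period * min_cycles entries) and stops when this fires.
static bool tail_is_cyclic(const std::deque<llama_token>& tail,
                           int max_period = 4, int min_cycles = 3) {
    for (int p = 1; p <= max_period; p++) {
        if ((int)tail.size() < p * min_cycles) continue;
        bool cyclic = true;
        // Compare the last p*(min_cycles-1) tokens against themselves shifted by p
        for (size_t i = tail.size() - p * (min_cycles - 1); i < tail.size(); i++) {
            if (tail[i] != tail[i - p]) { cyclic = false; break; }
        }
        if (cyclic) return true;
    }
    return false;
}
```

With `p = 1` and `min_cycles = 3` this reduces to the consecutive-token check in the patch, so it is a strict generalization.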
@@ -813,10 +911,14 @@ rac_result_t rac_vlm_llamacpp_process_stream(rac_handle_t handle, const rac_vlm_
         }
     }
 
-    // Format prompt using model's built-in chat template
+    // Format prompt using model's built-in chat template (streaming path)
     full_prompt = format_vlm_prompt_with_template(backend->model, prompt, image_marker, has_image,
                                                   system_prompt, effective_model_type);
 
+    RAC_LOG_INFO(LOG_CAT, "[v3-stream] Prompt (%d chars, img=%d, type=%d): %.200s",
+                 (int)full_prompt.length(), has_image ? 1 : 0, (int)effective_model_type,
+                 full_prompt.c_str());
+
     // Tokenize and evaluate
     if (backend->mtmd_ctx && bitmap) {
         mtmd_input_chunks* chunks = mtmd_input_chunks_init();
@@ -901,12 +1003,33 @@ rac_result_t rac_vlm_llamacpp_process_stream(rac_handle_t handle, const rac_vlm_
     llama_batch batch = llama_batch_init(1, 0, 1);
     const llama_vocab* vocab = llama_model_get_vocab(backend->model);
 
+    // Runtime repetition guard (same as the non-streaming path)
+    llama_token prev_token = -1;
+    int repeat_run = 0;
+    constexpr int MAX_CONSECUTIVE_REPEATS = 4;
+
     for (int i = 0; i < max_tokens && !backend->cancel_requested; i++) {
         llama_token token = llama_sampler_sample(backend->sampler, backend->ctx, -1);
         llama_sampler_accept(backend->sampler, token);
 
         bool is_eog = llama_vocab_is_eog(vocab, token);
 
+        // Detect stuck generation
+        if (!is_eog) {
+            if (token == prev_token) {
+                repeat_run++;
+                if (repeat_run >= MAX_CONSECUTIVE_REPEATS) {
+                    RAC_LOG_WARNING(LOG_CAT, "Repetition guard: token %d repeated %d times, stopping",
+                                    token, repeat_run + 1);
+                    callback("", RAC_TRUE, user_data);
+                    break;
+                }
+            } else {
+                repeat_run = 0;
+            }
+            prev_token = token;
+        }
+
         char buf[256];
         int len = llama_token_to_piece(vocab, token, buf, sizeof(buf), 0, true);
         if (len > 0) {
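For reference, a hypothetical consumer of this streaming entry point, assuming the callback shape inferred from the `callback("", RAC_TRUE, user_data)` call site above (text chunk, final flag, user pointer); the boolean parameter is simplified to `int` here since the `rac_bool` typedef is not shown in this diff:

```cpp
#include <cstdio>
#include <string>

// Accumulates streamed pieces into a std::string passed via user_data and
// prints the result when the final-chunk flag is set.
static void on_piece(const char* piece, int is_final, void* user_data) {
    auto* out = static_cast<std::string*>(user_data);
    out->append(piece);
    if (is_final) std::printf("final (%zu chars): %s\n", out->size(), out->c_str());
}
```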