Commit f7cbb78 ("minor fixes")
Parent: 4a2ea17

4 files changed: 102 additions, 13 deletions

examples/web/RunAnywhereAI/src/views/vision.ts

Lines changed: 1 addition & 1 deletion
@@ -413,7 +413,7 @@ async function processFrame(frame: CapturedFrame, prompt: string, maxTokens: num
       frame.width,
       frame.height,
       prompt,
-      { maxTokens, temperature: 0.7 },
+      { maxTokens, temperature: 0.7, systemPrompt: 'You are a helpful assistant.' },
     );

     // Compute metrics from JS wall clock

sdk/runanywhere-commons/src/backends/llamacpp/rac_vlm_llamacpp.cpp

Lines changed: 93 additions & 10 deletions
@@ -161,9 +161,14 @@ VLMModelType detect_vlm_model_type(llama_model* model) {
 /**
  * Format prompt using model's built-in chat template via llama_chat_apply_template.
  * Falls back to manual formatting if template application fails.
+ *
+ * When system_prompt is provided, it is prepended as a system message.
+ * For models that expect a system message (e.g. Qwen2-VL), a default is
+ * injected based on the detected model_type when no explicit prompt is given.
  */
 std::string format_vlm_prompt_with_template(llama_model* model, const std::string& user_prompt,
-                                            const char* image_marker, bool has_image) {
+                                            const char* image_marker, bool has_image,
+                                            const char* system_prompt, VLMModelType model_type) {
     // Build user content with image marker if present
     std::string user_content;
     if (has_image) {
@@ -172,18 +177,44 @@ std::string format_vlm_prompt_with_template(llama_model* model, const std::strin
         user_content = user_prompt;
     }

+    // Resolve system prompt: use explicit value, or inject a default for Qwen2-VL
+    const char* effective_system = system_prompt;
+    if (!effective_system && model_type == VLMModelType::Qwen2VL) {
+        effective_system = "You are a helpful assistant.";
+    }
+
     // Get the model's chat template
     const char* tmpl = llama_model_chat_template(model, nullptr);

     // Try to use llama_chat_apply_template
     if (tmpl) {
         RAC_LOG_DEBUG(LOG_CAT, "Using model chat template: %.80s...", tmpl);

+        if (effective_system) {
+            llama_chat_message messages[2];
+            messages[0].role = "system";
+            messages[0].content = effective_system;
+            messages[1].role = "user";
+            messages[1].content = user_content.c_str();
+
+            int32_t size = llama_chat_apply_template(tmpl, messages, 2, true, nullptr, 0);
+            if (size > 0) {
+                std::vector<char> buf(size + 1);
+                int32_t result = llama_chat_apply_template(tmpl, messages, 2, true, buf.data(), buf.size());
+                if (result > 0) {
+                    std::string formatted(buf.data(), result);
+                    RAC_LOG_DEBUG(LOG_CAT, "Template-formatted prompt with system (%d chars): %s",
+                                  (int)formatted.length(), formatted.c_str());
+                    return formatted;
+                }
+            }
+            RAC_LOG_WARNING(LOG_CAT, "llama_chat_apply_template with system failed (size=%d), trying without", size);
+        }
+
         llama_chat_message messages[1];
         messages[0].role = "user";
         messages[0].content = user_content.c_str();

-        // First call to get required buffer size
         int32_t size = llama_chat_apply_template(tmpl, messages, 1, true, nullptr, 0);
         if (size > 0) {
             std::vector<char> buf(size + 1);
@@ -201,7 +232,13 @@ std::string format_vlm_prompt_with_template(llama_model* model, const std::strin
     }

     // Fallback: manual chatml format (works for most models)
-    std::string formatted = "<|im_start|>user\n";
+    std::string formatted;
+    if (effective_system) {
+        formatted = "<|im_start|>system\n";
+        formatted += effective_system;
+        formatted += "<|im_end|>\n";
+    }
+    formatted += "<|im_start|>user\n";
     formatted += user_content;
     formatted += "<|im_end|>\n<|im_start|>assistant\n";

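Note: with the change above, the manual chatml fallback now emits a system block before the user turn. A minimal TypeScript mirror of that string construction, for illustration only (this helper is not part of the SDK, and the '<image>' marker below is just a placeholder for the backend's real image marker):

// Illustrative mirror of the C++ chatml fallback above; not SDK code.
function chatmlFallback(userContent: string, systemPrompt?: string): string {
  let formatted = '';
  if (systemPrompt) {
    formatted += '<|im_start|>system\n' + systemPrompt + '<|im_end|>\n';
  }
  formatted += '<|im_start|>user\n' + userContent + '<|im_end|>\n<|im_start|>assistant\n';
  return formatted;
}

// A Qwen2-VL request with no explicit system prompt picks up the injected default:
// chatmlFallback('<image>Describe the scene.', 'You are a helpful assistant.');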
@@ -300,14 +337,30 @@ void configure_sampler(LlamaCppVLMBackend* backend, const rac_vlm_options_t* opt
         }
     }

-    // Build new sampler chain
+    // Build new sampler chain.
+    // Order follows llama.cpp common_sampler_init: penalties → DRY → top_p → min_p → temp → dist.
+    // Penalties and DRY must be applied to raw logits before temperature softens them.
     llama_sampler_chain_params sampler_params = llama_sampler_chain_default_params();
     backend->sampler = llama_sampler_chain_init(sampler_params);
-    llama_sampler_chain_add(backend->sampler, llama_sampler_init_temp(temperature));
+
+    // Token-level repetition penalty + frequency/presence penalties
+    llama_sampler_chain_add(backend->sampler, llama_sampler_init_penalties(256, 1.3f, 0.1f, 0.1f));
+
+    // DRY sampler: catches n-gram (sequence) repetition like "gó gó gó" where individual
+    // tokens may alternate. Multiplier=0.8, base=1.75, allowed_length=2, last_n=256.
+    const llama_vocab* vocab = llama_model_get_vocab(backend->model);
+    static const char* dry_breakers[] = { "\n", ":", "\"", "*" };
+    llama_sampler_chain_add(backend->sampler, llama_sampler_init_dry(
+        vocab, llama_model_n_ctx_train(backend->model),
+        0.8f, 1.75f, 2, 256, dry_breakers, 4));
+
     llama_sampler_chain_add(backend->sampler, llama_sampler_init_top_p(top_p, 1));
+    llama_sampler_chain_add(backend->sampler, llama_sampler_init_min_p(0.1f, 1));
+    llama_sampler_chain_add(backend->sampler, llama_sampler_init_temp(temperature));
     llama_sampler_chain_add(backend->sampler, llama_sampler_init_dist(LLAMA_DEFAULT_SEED));

-    RAC_LOG_DEBUG(LOG_CAT, "Sampler configured: temp=%.2f, top_p=%.2f", temperature, top_p);
+    RAC_LOG_INFO(LOG_CAT, "Sampler configured: temp=%.2f, top_p=%.2f, repeat=1.3, freq=0.1, pres=0.1, DRY=0.8, min_p=0.1",
+                 temperature, top_p);
 }

 } // namespace
@@ -524,6 +577,19 @@ rac_result_t rac_vlm_llamacpp_process(rac_handle_t handle, const rac_vlm_image_t
     }
     backend->n_past = 0;

+    // Resolve effective model type: options override > auto-detected at load time
+    VLMModelType effective_model_type = backend->model_type;
+    if (options && options->model_family != RAC_VLM_MODEL_FAMILY_AUTO) {
+        switch (options->model_family) {
+            case RAC_VLM_MODEL_FAMILY_QWEN2_VL: effective_model_type = VLMModelType::Qwen2VL; break;
+            case RAC_VLM_MODEL_FAMILY_SMOLVLM: effective_model_type = VLMModelType::SmolVLM; break;
+            case RAC_VLM_MODEL_FAMILY_LLAVA: effective_model_type = VLMModelType::LLaVA; break;
+            default: effective_model_type = VLMModelType::Generic; break;
+        }
+    }
+
+    const char* system_prompt = (options && options->system_prompt) ? options->system_prompt : nullptr;
+
     // Build the prompt with proper chat template formatting
     std::string full_prompt;
     bool has_image = false;
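The resolution block above gives request options precedence over load-time auto-detection. A small TypeScript sketch of that precedence rule, using illustrative string names rather than the real VLMModelType / rac_vlm_model_family_t enums:

// Sketch only: string names stand in for the C/C++ enums.
type ModelFamily = 'Auto' | 'Qwen2VL' | 'SmolVLM' | 'LLaVA' | 'Generic';

function resolveModelType(detectedAtLoad: ModelFamily, requested?: ModelFamily): ModelFamily {
  // An explicit, non-auto family in the request wins; otherwise keep what was detected at load.
  return requested && requested !== 'Auto' ? requested : detectedAtLoad;
}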
@@ -552,7 +618,8 @@ rac_result_t rac_vlm_llamacpp_process(rac_handle_t handle, const rac_vlm_image_t
     }

     // Format prompt using model's built-in chat template
-    full_prompt = format_vlm_prompt_with_template(backend->model, prompt, image_marker, has_image);
+    full_prompt = format_vlm_prompt_with_template(backend->model, prompt, image_marker, has_image,
+                                                  system_prompt, effective_model_type);

     // Tokenize and evaluate
     if (backend->mtmd_ctx && bitmap) {
@@ -599,7 +666,8 @@ rac_result_t rac_vlm_llamacpp_process(rac_handle_t handle, const rac_vlm_image_t
 #endif
     {
         // Text-only mode - still apply chat template for consistent formatting
-        full_prompt = format_vlm_prompt_with_template(backend->model, prompt, image_marker, false);
+        full_prompt = format_vlm_prompt_with_template(backend->model, prompt, image_marker, false,
+                                                      system_prompt, effective_model_type);

         const llama_vocab* vocab = llama_model_get_vocab(backend->model);
         std::vector<llama_token> tokens(full_prompt.size() + 16);
@@ -710,6 +778,19 @@ rac_result_t rac_vlm_llamacpp_process_stream(rac_handle_t handle, const rac_vlm_
     backend->n_past = 0;
     RAC_LOG_DEBUG(LOG_CAT, "Cleared KV cache for new request");

+    // Resolve effective model type: options override > auto-detected at load time
+    VLMModelType effective_model_type = backend->model_type;
+    if (options && options->model_family != RAC_VLM_MODEL_FAMILY_AUTO) {
+        switch (options->model_family) {
+            case RAC_VLM_MODEL_FAMILY_QWEN2_VL: effective_model_type = VLMModelType::Qwen2VL; break;
+            case RAC_VLM_MODEL_FAMILY_SMOLVLM: effective_model_type = VLMModelType::SmolVLM; break;
+            case RAC_VLM_MODEL_FAMILY_LLAVA: effective_model_type = VLMModelType::LLaVA; break;
+            default: effective_model_type = VLMModelType::Generic; break;
+        }
+    }
+
+    const char* system_prompt = (options && options->system_prompt) ? options->system_prompt : nullptr;
+
     // Build the prompt with proper chat template formatting
     std::string full_prompt;
     bool has_image = false;
@@ -733,7 +814,8 @@ rac_result_t rac_vlm_llamacpp_process_stream(rac_handle_t handle, const rac_vlm_
     }

     // Format prompt using model's built-in chat template
-    full_prompt = format_vlm_prompt_with_template(backend->model, prompt, image_marker, has_image);
+    full_prompt = format_vlm_prompt_with_template(backend->model, prompt, image_marker, has_image,
+                                                  system_prompt, effective_model_type);

     // Tokenize and evaluate
     if (backend->mtmd_ctx && bitmap) {
@@ -780,7 +862,8 @@ rac_result_t rac_vlm_llamacpp_process_stream(rac_handle_t handle, const rac_vlm_
 #endif
     {
         // Text-only mode - still apply chat template for consistent formatting
-        full_prompt = format_vlm_prompt_with_template(backend->model, prompt, image_marker, false);
+        full_prompt = format_vlm_prompt_with_template(backend->model, prompt, image_marker, false,
+                                                      system_prompt, effective_model_type);

         const llama_vocab* vocab = llama_model_get_vocab(backend->model);
         std::vector<llama_token> tokens(full_prompt.size() + 16);

sdk/runanywhere-web/packages/llamacpp/src/Infrastructure/VLMWorkerBridge.ts

Lines changed: 4 additions & 1 deletion
@@ -49,7 +49,7 @@ export type VLMWorkerCommand =
       type: 'process'; id: number; payload: {
         rgbPixels: ArrayBuffer; width: number; height: number;
         prompt: string; maxTokens: number; temperature: number;
-        topP: number; systemPrompt?: string;
+        topP: number; systemPrompt?: string; modelFamily?: number;
       };
     }
   | { type: 'cancel'; id: number }
@@ -99,6 +99,8 @@ export interface VLMProcessOptions {
   topP?: number;
   /** System prompt prepended to the user prompt inside the Worker. */
   systemPrompt?: string;
+  /** Model family enum value (maps to rac_vlm_model_family_t). 0 = auto-detect. */
+  modelFamily?: number;
 }

 /**
@@ -291,6 +293,7 @@ export class VLMWorkerBridge {
         temperature: options.temperature ?? 0.7,
         topP: options.topP ?? 0.9,
         systemPrompt: options.systemPrompt,
+        modelFamily: options.modelFamily,
       },
       [buffer],
     );
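With modelFamily now carried through the bridge payload, a caller can set both new per-request fields on VLMProcessOptions. A minimal sketch (assuming maxTokens and temperature are also fields of VLMProcessOptions, as the payload above suggests):

import type { VLMProcessOptions } from './VLMWorkerBridge';

// Hypothetical per-request options; systemPrompt and modelFamily are the fields
// touched by this commit, the others mirror the defaults applied in the payload.
const options: VLMProcessOptions = {
  maxTokens: 100,
  temperature: 0.7,
  topP: 0.9,
  systemPrompt: 'You are a helpful assistant.',
  modelFamily: 0, // 0 = auto-detect (rac_vlm_model_family_t)
};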

sdk/runanywhere-web/packages/llamacpp/src/Infrastructure/VLMWorkerRuntime.ts

Lines changed: 4 additions & 1 deletion
@@ -565,6 +565,7 @@ async function processImage(
   prompt: string,
   maxTokens: number, temperature: number,
   topP: number, systemPrompt?: string,
+  modelFamily?: number,
 ): Promise<VLMWorkerResult> {
   const m = wasmModule;
   const pixelArray = new Uint8Array(rgbPixels);
@@ -603,6 +604,8 @@ async function processImage(
     m.setValue(optPtr + vo.systemPrompt, systemPromptPtr, '*');
   }

+  m.setValue(optPtr + vo.modelFamily, modelFamily ?? 0, 'i32');
+
   const promptPtr = allocString(prompt);

   // Result struct
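For context, the Worker runtime writes the new field into the native options struct alongside the existing systemPrompt pointer. A condensed sketch of that pattern (the shapes of vo and m below are assumptions based on the surrounding VLMWorkerRuntime.ts code, not exact SDK declarations):

// Condensed sketch of the struct writes above; `vo` holds byte offsets into
// rac_vlm_options_t and `m` is the Emscripten module (both assumed, see runtime).
declare const m: { setValue(ptr: number, value: number, type: string): void };
declare const vo: { systemPrompt: number; modelFamily: number };

function writeVlmOptions(optPtr: number, systemPromptPtr: number, modelFamily?: number): void {
  if (systemPromptPtr) {
    m.setValue(optPtr + vo.systemPrompt, systemPromptPtr, '*'); // char* written as a pointer
  }
  m.setValue(optPtr + vo.modelFamily, modelFamily ?? 0, 'i32'); // 0 = auto-detect
}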
@@ -679,7 +682,7 @@ function handleMessage(e: MessageEvent<VLMWorkerCommand>): void {
       const result = await processImage(
         p.rgbPixels, p.width, p.height,
         p.prompt, p.maxTokens, p.temperature,
-        p.topP, p.systemPrompt,
+        p.topP, p.systemPrompt, p.modelFamily,
       );
       self.postMessage({ id, type: 'result', payload: result } satisfies VLMWorkerResponse);
       break;
