@@ -161,9 +161,14 @@ VLMModelType detect_vlm_model_type(llama_model* model) {
 /**
  * Format prompt using model's built-in chat template via llama_chat_apply_template.
  * Falls back to manual formatting if template application fails.
+ *
+ * When system_prompt is provided, it is prepended as a system message.
+ * For models that expect a system message (e.g. Qwen2-VL), a default is
+ * injected based on the detected model_type when no explicit prompt is given.
  */
 std::string format_vlm_prompt_with_template(llama_model* model, const std::string& user_prompt,
-                                            const char * image_marker, bool has_image) {
+                                            const char * image_marker, bool has_image,
+                                            const char * system_prompt, VLMModelType model_type) {
     // Build user content with image marker if present
     std::string user_content;
     if (has_image) {
@@ -172,18 +177,44 @@ std::string format_vlm_prompt_with_template(llama_model* model, const std::strin
         user_content = user_prompt;
     }
 
+    // Resolve system prompt: use explicit value, or inject a default for Qwen2-VL
+    const char * effective_system = system_prompt;
+    if (!effective_system && model_type == VLMModelType::Qwen2VL) {
+        effective_system = "You are a helpful assistant.";
+    }
+
     // Get the model's chat template
     const char * tmpl = llama_model_chat_template(model, nullptr);
 
     // Try to use llama_chat_apply_template
     if (tmpl) {
         RAC_LOG_DEBUG(LOG_CAT, "Using model chat template: %.80s...", tmpl);
 
+        if (effective_system) {
+            llama_chat_message messages[2];
+            messages[0].role = "system";
+            messages[0].content = effective_system;
+            messages[1].role = "user";
+            messages[1].content = user_content.c_str();
+
+            int32_t size = llama_chat_apply_template(tmpl, messages, 2, true, nullptr, 0);
+            if (size > 0) {
+                std::vector<char> buf(size + 1);
+                int32_t result = llama_chat_apply_template(tmpl, messages, 2, true, buf.data(), buf.size());
+                if (result > 0) {
+                    std::string formatted(buf.data(), result);
+                    RAC_LOG_DEBUG(LOG_CAT, "Template-formatted prompt with system (%d chars): %s",
+                                  (int)formatted.length(), formatted.c_str());
+                    return formatted;
+                }
+            }
+            RAC_LOG_WARNING(LOG_CAT, "llama_chat_apply_template with system failed (size=%d), trying without", size);
+        }
+
         llama_chat_message messages[1];
         messages[0].role = "user";
         messages[0].content = user_content.c_str();
 
-        // First call to get required buffer size
         int32_t size = llama_chat_apply_template(tmpl, messages, 1, true, nullptr, 0);
         if (size > 0) {
             std::vector<char> buf(size + 1);
@@ -201,7 +232,13 @@ std::string format_vlm_prompt_with_template(llama_model* model, const std::strin
     }
 
     // Fallback: manual chatml format (works for most models)
-    std::string formatted = "<|im_start|>user\n";
+    std::string formatted;
+    if (effective_system) {
+        formatted = "<|im_start|>system\n";
+        formatted += effective_system;
+        formatted += "<|im_end|>\n";
+    }
+    formatted += "<|im_start|>user\n";
     formatted += user_content;
     formatted += "<|im_end|>\n<|im_start|>assistant\n";
 
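For reference, here is what the fallback path above produces for a Qwen2-VL model when the caller passes no system prompt. This is a hedged sketch: it assumes `user_content` is the image marker followed by the user prompt (the construction is cut off in the first hunk), and `<image>` merely stands in for whatever `image_marker` the caller supplies.

```cpp
// Hypothetical call hitting the manual ChatML fallback (no chat template):
std::string p = format_vlm_prompt_with_template(model, "Describe this picture.",
                                                "<image>", /*has_image=*/true,
                                                /*system_prompt=*/nullptr,
                                                VLMModelType::Qwen2VL);
// p should then read:
//   <|im_start|>system
//   You are a helpful assistant.<|im_end|>
//   <|im_start|>user
//   <image>Describe this picture.<|im_end|>
//   <|im_start|>assistant
```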
@@ -300,14 +337,30 @@ void configure_sampler(LlamaCppVLMBackend* backend, const rac_vlm_options_t* opt
         }
     }
 
-    // Build new sampler chain
+    // Build new sampler chain.
+    // Order follows llama.cpp common_sampler_init: penalties → DRY → top_p → min_p → temp → dist.
+    // Penalties and DRY must be applied to raw logits before temperature softens them.
     llama_sampler_chain_params sampler_params = llama_sampler_chain_default_params();
     backend->sampler = llama_sampler_chain_init(sampler_params);
-    llama_sampler_chain_add(backend->sampler, llama_sampler_init_temp(temperature));
+
+    // Token-level repetition penalty + frequency/presence penalties
+    llama_sampler_chain_add(backend->sampler, llama_sampler_init_penalties(256, 1.3f, 0.1f, 0.1f));
+
+    // DRY sampler: catches n-gram (sequence) repetition like "gó gó gó" where individual
+    // tokens may alternate. Multiplier=0.8, base=1.75, allowed_length=2, last_n=256.
+    const llama_vocab* vocab = llama_model_get_vocab(backend->model);
+    static const char * dry_breakers[] = { "\n", ":", "\"", "*" };
+    llama_sampler_chain_add(backend->sampler, llama_sampler_init_dry(
+        vocab, llama_model_n_ctx_train(backend->model),
+        0.8f, 1.75f, 2, 256, dry_breakers, 4));
+
     llama_sampler_chain_add(backend->sampler, llama_sampler_init_top_p(top_p, 1));
+    llama_sampler_chain_add(backend->sampler, llama_sampler_init_min_p(0.1f, 1));
+    llama_sampler_chain_add(backend->sampler, llama_sampler_init_temp(temperature));
     llama_sampler_chain_add(backend->sampler, llama_sampler_init_dist(LLAMA_DEFAULT_SEED));
 
-    RAC_LOG_DEBUG(LOG_CAT, "Sampler configured: temp=%.2f, top_p=%.2f", temperature, top_p);
+    RAC_LOG_INFO(LOG_CAT, "Sampler configured: temp=%.2f, top_p=%.2f, repeat=1.3, freq=0.1, pres=0.1, DRY=0.8, min_p=0.1",
+                 temperature, top_p);
 }
 
 } // namespace
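The chain above is built once per configuration and then consumed token by token during decoding. A minimal sketch of that loop, assuming the backend also holds a `llama_context* ctx` (not shown in this diff):

```cpp
// Sketch of the decode loop consuming backend->sampler (ctx is an assumption).
while (n_generated < max_tokens) {
    // llama_sampler_sample() runs every stage of the chain over the logits of
    // the last decode (idx = -1) and accepts the pick, so the penalty and DRY
    // stages see it in their history on the next step.
    llama_token tok = llama_sampler_sample(backend->sampler, ctx, -1);
    if (llama_vocab_is_eog(llama_model_get_vocab(backend->model), tok)) break;
    // ... detokenize and emit tok, llama_decode() it, ++n_generated ...
}
```

Note the design choice the hunk comments call out: penalties and DRY sit ahead of top_p/min_p/temp, matching llama.cpp's own common_sampler_init, so repetition is damped on raw logits before truncation and temperature reshape the distribution.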
@@ -524,6 +577,19 @@ rac_result_t rac_vlm_llamacpp_process(rac_handle_t handle, const rac_vlm_image_t
     }
     backend->n_past = 0;
 
+    // Resolve effective model type: options override > auto-detected at load time
+    VLMModelType effective_model_type = backend->model_type;
+    if (options && options->model_family != RAC_VLM_MODEL_FAMILY_AUTO) {
+        switch (options->model_family) {
+            case RAC_VLM_MODEL_FAMILY_QWEN2_VL: effective_model_type = VLMModelType::Qwen2VL; break;
+            case RAC_VLM_MODEL_FAMILY_SMOLVLM:  effective_model_type = VLMModelType::SmolVLM; break;
+            case RAC_VLM_MODEL_FAMILY_LLAVA:    effective_model_type = VLMModelType::LLaVA;   break;
+            default:                            effective_model_type = VLMModelType::Generic; break;
+        }
+    }
+
+    const char * system_prompt = (options && options->system_prompt) ? options->system_prompt : nullptr;
+
     // Build the prompt with proper chat template formatting
     std::string full_prompt;
     bool has_image = false;
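From the caller's side, only two fields drive this block. A hypothetical usage sketch, using the `rac_vlm_options_t` field names exactly as referenced above (the full struct and the process signature are truncated in this diff, so everything else is elided):

```cpp
rac_vlm_options_t options = {};                         // zero-init: other fields keep defaults
options.model_family  = RAC_VLM_MODEL_FAMILY_QWEN2_VL;  // bypass auto-detection
options.system_prompt = "You are a terse image-captioning assistant.";
// ... then pass &options to rac_vlm_llamacpp_process or rac_vlm_llamacpp_process_stream
```

Leaving `model_family` at RAC_VLM_MODEL_FAMILY_AUTO keeps the type detected by detect_vlm_model_type at load time; leaving `system_prompt` null lets the Qwen2-VL default kick in.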
@@ -552,7 +618,8 @@ rac_result_t rac_vlm_llamacpp_process(rac_handle_t handle, const rac_vlm_image_t
     }
 
     // Format prompt using model's built-in chat template
-    full_prompt = format_vlm_prompt_with_template(backend->model, prompt, image_marker, has_image);
+    full_prompt = format_vlm_prompt_with_template(backend->model, prompt, image_marker, has_image,
+                                                  system_prompt, effective_model_type);
 
     // Tokenize and evaluate
     if (backend->mtmd_ctx && bitmap) {
@@ -599,7 +666,8 @@ rac_result_t rac_vlm_llamacpp_process(rac_handle_t handle, const rac_vlm_image_t
 #endif
     {
         // Text-only mode - still apply chat template for consistent formatting
-        full_prompt = format_vlm_prompt_with_template(backend->model, prompt, image_marker, false);
+        full_prompt = format_vlm_prompt_with_template(backend->model, prompt, image_marker, false,
+                                                      system_prompt, effective_model_type);
 
         const llama_vocab* vocab = llama_model_get_vocab(backend->model);
         std::vector<llama_token> tokens(full_prompt.size() + 16);
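The hunk cuts off at the token buffer, so the actual tokenize call is not shown; a plausible continuation under the llama.cpp API used elsewhere in this diff would look like the sketch below. `parse_special=true` is the important bit: it lets the `<|im_start|>`/`<|im_end|>` markers emitted by format_vlm_prompt_with_template map to their dedicated token IDs instead of being split as plain text.

```cpp
// Sketch only -- not the file's actual next lines.
int32_t n_tokens = llama_tokenize(vocab,
                                  full_prompt.c_str(), (int32_t)full_prompt.size(),
                                  tokens.data(), (int32_t)tokens.size(),
                                  /*add_special=*/true, /*parse_special=*/true);
if (n_tokens < 0) {               // buffer too small; -n_tokens is the required size
    tokens.resize(-n_tokens);
    n_tokens = llama_tokenize(vocab, full_prompt.c_str(), (int32_t)full_prompt.size(),
                              tokens.data(), (int32_t)tokens.size(), true, true);
}
tokens.resize(n_tokens);
```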
@@ -710,6 +778,19 @@ rac_result_t rac_vlm_llamacpp_process_stream(rac_handle_t handle, const rac_vlm_
     backend->n_past = 0;
     RAC_LOG_DEBUG(LOG_CAT, "Cleared KV cache for new request");
 
+    // Resolve effective model type: options override > auto-detected at load time
+    VLMModelType effective_model_type = backend->model_type;
+    if (options && options->model_family != RAC_VLM_MODEL_FAMILY_AUTO) {
+        switch (options->model_family) {
+            case RAC_VLM_MODEL_FAMILY_QWEN2_VL: effective_model_type = VLMModelType::Qwen2VL; break;
+            case RAC_VLM_MODEL_FAMILY_SMOLVLM:  effective_model_type = VLMModelType::SmolVLM; break;
+            case RAC_VLM_MODEL_FAMILY_LLAVA:    effective_model_type = VLMModelType::LLaVA;   break;
+            default:                            effective_model_type = VLMModelType::Generic; break;
+        }
+    }
+
+    const char * system_prompt = (options && options->system_prompt) ? options->system_prompt : nullptr;
+
     // Build the prompt with proper chat template formatting
     std::string full_prompt;
     bool has_image = false;
@@ -733,7 +814,8 @@ rac_result_t rac_vlm_llamacpp_process_stream(rac_handle_t handle, const rac_vlm_
     }
 
     // Format prompt using model's built-in chat template
-    full_prompt = format_vlm_prompt_with_template(backend->model, prompt, image_marker, has_image);
+    full_prompt = format_vlm_prompt_with_template(backend->model, prompt, image_marker, has_image,
+                                                  system_prompt, effective_model_type);
 
     // Tokenize and evaluate
     if (backend->mtmd_ctx && bitmap) {
@@ -780,7 +862,8 @@ rac_result_t rac_vlm_llamacpp_process_stream(rac_handle_t handle, const rac_vlm_
 #endif
     {
         // Text-only mode - still apply chat template for consistent formatting
-        full_prompt = format_vlm_prompt_with_template(backend->model, prompt, image_marker, false);
+        full_prompt = format_vlm_prompt_with_template(backend->model, prompt, image_marker, false,
+                                                      system_prompt, effective_model_type);
 
         const llama_vocab* vocab = llama_model_get_vocab(backend->model);
         std::vector<llama_token> tokens(full_prompt.size() + 16);