Skip to content

Commit c0a1533

Browse files
Refactor VLM model type resolution and system prompt handling
- Updated the logic for determining the effective system prompt to handle empty strings.
- Introduced a new function to resolve the effective VLM model type based on options, simplifying the code in `rac_vlm_llamacpp_process`.
- Improved download progress tracking in `ModelDownloader` by using cumulative byte counts for better accuracy.
- Enhanced type exports in `llamacpp` and `onnx` packages for better compatibility and clarity.
- Adjusted regex for identifying Qwen VL models to ensure more accurate matching.
1 parent 0af259c commit c0a1533

6 files changed

Lines changed: 70 additions & 48 deletions

File tree

sdk/runanywhere-commons/src/backends/llamacpp/rac_vlm_llamacpp.cpp

Lines changed: 23 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -178,7 +178,7 @@ std::string format_vlm_prompt_with_template(llama_model* model, const std::strin
178178
}
179179

180180
// Resolve system prompt: use explicit value, or inject a default for Qwen2-VL
181-
const char* effective_system = system_prompt;
181+
const char* effective_system = (system_prompt && system_prompt[0] != '\0') ? system_prompt : nullptr;
182182
if (!effective_system && model_type == VLMModelType::Qwen2VL) {
183183
effective_system = "You are a helpful assistant.";
184184
}
@@ -208,15 +208,14 @@ std::string format_vlm_prompt_with_template(llama_model* model, const std::strin
208208
return formatted;
209209
}
210210
}
211-
bool has_explicit_system = (system_prompt && system_prompt[0] != '\0');
212-
if (has_explicit_system) {
211+
if (effective_system) {
213212
RAC_LOG_WARNING(LOG_CAT, "Template with system failed (size=%d); falling back to manual to preserve explicit system prompt", size);
214213
} else {
215214
RAC_LOG_WARNING(LOG_CAT, "llama_chat_apply_template with system failed (size=%d), trying without", size);
216215
}
217216
// If the caller passed an explicit system prompt, skip user-only
218217
// template to avoid silently dropping it -- go straight to manual.
219-
if (has_explicit_system) {
218+
if (effective_system) {
220219
goto manual_fallback;
221220
}
222221
}
@@ -376,6 +375,22 @@ void configure_sampler(LlamaCppVLMBackend* backend, const rac_vlm_options_t* opt
376375
temperature, top_p);
377376
}
378377

378+
/**
379+
* Resolve the effective VLM model type from options override or auto-detected default.
380+
*/
381+
static VLMModelType resolve_effective_model_type(VLMModelType detected,
382+
const rac_vlm_options_t* options) {
383+
if (options && options->model_family != RAC_VLM_MODEL_FAMILY_AUTO) {
384+
switch (options->model_family) {
385+
case RAC_VLM_MODEL_FAMILY_QWEN2_VL: return VLMModelType::Qwen2VL;
386+
case RAC_VLM_MODEL_FAMILY_SMOLVLM: return VLMModelType::SmolVLM;
387+
case RAC_VLM_MODEL_FAMILY_LLAVA: return VLMModelType::LLaVA;
388+
default: return VLMModelType::Generic;
389+
}
390+
}
391+
return detected;
392+
}
393+
379394
} // namespace
380395

381396
// =============================================================================
@@ -629,15 +644,7 @@ rac_result_t rac_vlm_llamacpp_process(rac_handle_t handle, const rac_vlm_image_t
629644
backend->n_past = 0;
630645

631646
// Resolve effective model type: options override > auto-detected at load time
632-
VLMModelType effective_model_type = backend->model_type;
633-
if (options && options->model_family != RAC_VLM_MODEL_FAMILY_AUTO) {
634-
switch (options->model_family) {
635-
case RAC_VLM_MODEL_FAMILY_QWEN2_VL: effective_model_type = VLMModelType::Qwen2VL; break;
636-
case RAC_VLM_MODEL_FAMILY_SMOLVLM: effective_model_type = VLMModelType::SmolVLM; break;
637-
case RAC_VLM_MODEL_FAMILY_LLAVA: effective_model_type = VLMModelType::LLaVA; break;
638-
default: effective_model_type = VLMModelType::Generic; break;
639-
}
640-
}
647+
VLMModelType effective_model_type = resolve_effective_model_type(backend->model_type, options);
641648

642649
const char* system_prompt = (options && options->system_prompt) ? options->system_prompt : nullptr;
643650

@@ -772,6 +779,7 @@ rac_result_t rac_vlm_llamacpp_process(rac_handle_t handle, const rac_vlm_image_t
772779

773780
for (int i = 0; i < max_tokens && !backend->cancel_requested; i++) {
774781
// Diagnostic: on first token, inspect logits for NaN/corruption
782+
#ifdef RAC_VLM_ENABLE_DIAGNOSTICS
775783
if (i == 0) {
776784
float* logits = llama_get_logits(backend->ctx);
777785
int n_vocab = llama_vocab_n_tokens(vocab);
@@ -806,6 +814,7 @@ rac_result_t rac_vlm_llamacpp_process(rac_handle_t handle, const rac_vlm_image_t
806814
top5_idx[4], top5_val[4]);
807815
}
808816
}
817+
#endif
809818

810819
llama_token token = llama_sampler_sample(backend->sampler, backend->ctx, -1);
811820
llama_sampler_accept(backend->sampler, token);
@@ -889,15 +898,7 @@ rac_result_t rac_vlm_llamacpp_process_stream(rac_handle_t handle, const rac_vlm_
889898
RAC_LOG_DEBUG(LOG_CAT, "Cleared KV cache for new request");
890899

891900
// Resolve effective model type: options override > auto-detected at load time
892-
VLMModelType effective_model_type = backend->model_type;
893-
if (options && options->model_family != RAC_VLM_MODEL_FAMILY_AUTO) {
894-
switch (options->model_family) {
895-
case RAC_VLM_MODEL_FAMILY_QWEN2_VL: effective_model_type = VLMModelType::Qwen2VL; break;
896-
case RAC_VLM_MODEL_FAMILY_SMOLVLM: effective_model_type = VLMModelType::SmolVLM; break;
897-
case RAC_VLM_MODEL_FAMILY_LLAVA: effective_model_type = VLMModelType::LLaVA; break;
898-
default: effective_model_type = VLMModelType::Generic; break;
899-
}
900-
}
901+
VLMModelType effective_model_type = resolve_effective_model_type(backend->model_type, options);
901902

902903
const char* system_prompt = (options && options->system_prompt) ? options->system_prompt : nullptr;
903904

sdk/runanywhere-web/packages/core/src/Infrastructure/ModelDownloader.ts

Lines changed: 23 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -221,22 +221,21 @@ export class ModelDownloader {
221221

222222
try {
223223
const totalFiles = 1 + (model.additionalFiles?.length ?? 0);
224-
let totalBytesDownloaded = 0;
225-
let totalBytesExpected = 0;
224+
let cumulativeBytesDownloaded = 0;
225+
let cumulativeBytesExpected = 0;
226+
const completedFileSizes: number[] = [];
226227

227-
// Try streaming the primary file directly to storage (keeps memory constant).
228-
// Falls back to buffered download + store if streaming is not possible.
229228
const primaryProgressCb = (progress: number, bytesDown: number, bytesTotal: number) => {
230-
totalBytesDownloaded = bytesDown;
231-
totalBytesExpected = bytesTotal * totalFiles;
229+
cumulativeBytesDownloaded = bytesDown;
230+
cumulativeBytesExpected = bytesTotal * totalFiles;
232231
const overallProgress = progress / totalFiles;
233232
this.registry.updateModel(modelId, { downloadProgress: overallProgress });
234233
this.emitDownloadProgress({
235234
modelId,
236235
stage: DownloadStage.Downloading,
237236
progress: overallProgress,
238-
bytesDownloaded: totalBytesDownloaded,
239-
totalBytes: totalBytesExpected,
237+
bytesDownloaded: cumulativeBytesDownloaded,
238+
totalBytes: cumulativeBytesExpected,
240239
currentFile: model.url.split('/').pop(),
241240
filesCompleted: 0,
242241
filesTotal: totalFiles,
@@ -249,13 +248,18 @@ export class ModelDownloader {
249248
await this.storeInOPFS(modelId, primaryData);
250249
primarySize = primaryData.length;
251250
}
251+
completedFileSizes.push(primarySize);
252252

253253
// Download additional files (e.g., mmproj for VLM)
254254
if (model.additionalFiles && model.additionalFiles.length > 0) {
255255
for (let i = 0; i < model.additionalFiles.length; i++) {
256256
const file = model.additionalFiles[i];
257257
const fileKey = this.additionalFileKey(modelId, file.filename);
258+
const priorCompleted = completedFileSizes.reduce((a, b) => a + b, 0);
259+
258260
const fileProgressCb = (progress: number, bytesDown: number, bytesTotal: number) => {
261+
cumulativeBytesDownloaded = priorCompleted + bytesDown;
262+
cumulativeBytesExpected = priorCompleted + bytesTotal;
259263
const baseProgress = (1 + i) / totalFiles;
260264
const fileProgress = progress / totalFiles;
261265
const overallProgress = baseProgress + fileProgress;
@@ -264,42 +268,40 @@ export class ModelDownloader {
264268
modelId,
265269
stage: DownloadStage.Downloading,
266270
progress: overallProgress,
267-
bytesDownloaded: bytesDown,
268-
totalBytes: bytesTotal,
271+
bytesDownloaded: cumulativeBytesDownloaded,
272+
totalBytes: cumulativeBytesExpected,
269273
currentFile: file.filename,
270274
filesCompleted: 1 + i,
271275
filesTotal: totalFiles,
272276
});
273277
};
274278

279+
let fileSize: number;
275280
const streamedSize = await this.downloadAndStoreStreaming(file.url, fileKey, fileProgressCb);
276281
if (streamedSize === null) {
277282
const fileData = await this.downloadFile(file.url, fileProgressCb);
278283
await this.storeInOPFS(fileKey, fileData);
284+
fileSize = fileData.length;
285+
} else {
286+
fileSize = streamedSize;
279287
}
288+
completedFileSizes.push(fileSize);
280289
}
281290
}
282291

292+
const totalSize = completedFileSizes.reduce((a, b) => a + b, 0);
293+
283294
// Validating stage
284295
this.emitDownloadProgress({
285296
modelId,
286297
stage: DownloadStage.Validating,
287298
progress: 0.95,
288-
bytesDownloaded: totalBytesDownloaded,
289-
totalBytes: totalBytesExpected,
299+
bytesDownloaded: totalSize,
300+
totalBytes: totalSize,
290301
filesCompleted: totalFiles,
291302
filesTotal: totalFiles,
292303
});
293304

294-
let totalSize = primarySize;
295-
if (model.additionalFiles) {
296-
for (const file of model.additionalFiles) {
297-
const fileKey = this.additionalFileKey(modelId, file.filename);
298-
const size = await this.storage.getFileSize(fileKey);
299-
if (size !== null) totalSize += size;
300-
}
301-
}
302-
303305
this.registry.updateModel(modelId, {
304306
status: ModelStatus.Downloaded,
305307
downloadProgress: 1,

sdk/runanywhere-web/packages/llamacpp/src/Infrastructure/VLMWorkerBridge.ts

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -251,7 +251,7 @@ export class VLMWorkerBridge {
251251
// TODO: re-test on WebGPU periodically as llama.cpp's WebGPU backend
252252
// matures — the Vulkan fp16 FA fix (b8168) may eventually be ported.
253253
const bridge = LlamaCppBridge.shared;
254-
const isQwenVL = /qwen/i.test(params.modelId) || /qwen/i.test(params.modelName);
254+
const isQwenVL = /qwen.*vl/i.test(params.modelId) || /qwen.*vl/i.test(params.modelName);
255255
if (isQwenVL && bridge.accelerationMode === 'webgpu') {
256256
const currentUrl = bridge.wasmUrl ?? '';
257257
const cpuUrl = currentUrl.replace(/-webgpu\.js$/, '.js');

sdk/runanywhere-web/packages/llamacpp/src/Infrastructure/VLMWorkerRuntime.ts

Lines changed: 12 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -47,14 +47,22 @@ let offsets: AllOffsets | null = null;
4747
// module's _rac_wasm_offsetof_* / _rac_wasm_sizeof_* exports.
4848
// ---------------------------------------------------------------------------
4949

50-
function workerOffsetOf(m: any, name: string): number {
50+
function workerOffsetOf(m: any, name: string, required = true): number {
5151
const fn = m[`_rac_wasm_offsetof_${name}`];
52-
return typeof fn === 'function' ? fn() : 0;
52+
if (typeof fn === 'function') return fn();
53+
if (required) {
54+
throw new Error(`Missing WASM offsetof export: _rac_wasm_offsetof_${name} — ABI mismatch between WASM binary and TS`);
55+
}
56+
return 0;
5357
}
5458

55-
function workerSizeOf(m: any, name: string): number {
59+
function workerSizeOf(m: any, name: string, required = true): number {
5660
const fn = m[`_rac_wasm_sizeof_${name}`];
57-
return typeof fn === 'function' ? fn() : 0;
61+
if (typeof fn === 'function') return fn();
62+
if (required) {
63+
throw new Error(`Missing WASM sizeof export: _rac_wasm_sizeof_${name} — ABI mismatch between WASM binary and TS`);
64+
}
65+
return 0;
5866
}
5967

6068
function loadOffsetsFromModule(m: any): AllOffsets {

sdk/runanywhere-web/packages/llamacpp/src/index.ts

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -32,6 +32,8 @@ export type { LlamaCppModule } from './Foundation/LlamaCppBridge';
3232
// Extensions (backend-specific implementations)
3333
export { TextGeneration } from './Extensions/RunAnywhere+TextGeneration';
3434
export { VLM, VLMModelFamily } from './Extensions/RunAnywhere+VLM';
35+
export { VLMImageFormat } from './Extensions/VLMTypes';
36+
export type { VLMImage, VLMGenerationOptions, VLMGenerationResult, VLMStreamingResult } from './Extensions/VLMTypes';
3537
export { ToolCalling, ToolCallFormat, toToolValue, fromToolValue, getStringArg, getNumberArg } from './Extensions/RunAnywhere+ToolCalling';
3638
export type {
3739
ToolValue, ToolParameterType, ToolParameter, ToolDefinition,

sdk/runanywhere-web/packages/onnx/src/index.ts

Lines changed: 9 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -33,5 +33,14 @@ export type { TTSVoiceConfig } from './Extensions/RunAnywhere+TTS';
3333
export { VAD } from './Extensions/RunAnywhere+VAD';
3434
export type { VADModelConfig } from './Extensions/RunAnywhere+VAD';
3535

36+
// Backward-compatible re-exports of shared contract types
37+
export type {
38+
STTTranscriptionResult, STTWord, STTTranscribeOptions,
39+
STTStreamCallback, STTStreamingSession,
40+
TTSSynthesisResult, TTSSynthesizeOptions,
41+
SpeechActivityCallback, SpeechSegment,
42+
} from '@runanywhere/web';
43+
export { SpeechActivity } from '@runanywhere/web';
44+
3645
// Foundation
3746
export { SherpaONNXBridge } from './Foundation/SherpaONNXBridge';

0 commit comments

Comments (0)