Skip to content

Commit 274fb63

Browse files
updates
1 parent ff52534 commit 274fb63

8 files changed

Lines changed: 375 additions & 61 deletions

File tree

examples/ios/RunAnywhereAI/RunAnywhereAI/App/RunAnywhereAIApp.swift

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -465,7 +465,7 @@ struct RunAnywhereAIApp: App {
465465
)
466466
}
467467

468-
// --- MetalRT VLM model (Qwen3-VL) ---
468+
// --- MetalRT VLM models ---
469469

470470
if let url = URL(string: "\(metalrtBase)/qwen3-vl-2b-metalrt.tar.gz") {
471471
RunAnywhere.registerModel(
@@ -479,6 +479,18 @@ struct RunAnywhereAIApp: App {
479479
)
480480
}
481481

482+
if let url = URL(string: "\(metalrtBase)/lfm25-vl-metalrt.tar.gz") {
483+
RunAnywhere.registerModel(
484+
id: "lfm25-vl-metalrt",
485+
name: "LFM2.5-VL 1.6B (MetalRT)",
486+
url: url,
487+
framework: .metalrt,
488+
modality: .multimodal,
489+
artifactType: .archive(.tarGz, structure: .nestedDirectory),
490+
memoryRequirement: 1_600_000_000
491+
)
492+
}
493+
482494
logger.info("✅ MetalRT models registered")
483495
#else
484496
logger.info("ℹ️ MetalRT not available (MetalRTRuntime not linked)")

examples/ios/RunAnywhereAI/RunAnywhereAI/Features/Benchmarks/Services/DiffusionBenchmarkProvider.swift

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -57,9 +57,7 @@ struct DiffusionBenchmarkProvider: BenchmarkScenarioProvider {
5757
guidanceScale: 0.0,
5858
seed: 42
5959
)
60-
// Note: prompt: is required by the SDK API signature, but is ignored when options is provided
61-
// (the SDK uses `options ?? DiffusionGenerationOptions(prompt: prompt)`).
62-
let result = try await RunAnywhere.generateImage(prompt: options.prompt, options: options)
60+
let result = try await RunAnywhere.generateImage(prompt: options.prompt, options: options)
6361

6462
metrics.endToEndLatencyMs = Date().timeIntervalSince(benchStart) * 1000
6563
metrics.generationTimeMs = Double(result.generationTimeMs)

examples/ios/RunAnywhereAI/RunAnywhereAI/Features/Benchmarks/Services/VLMBenchmarkProvider.swift

Lines changed: 22 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,7 @@ struct VLMBenchmarkProvider: BenchmarkScenarioProvider {
1717

1818
func scenarios() -> [BenchmarkScenario] {
1919
[
20-
BenchmarkScenario(name: "Solid Red Image", category: .vlm, parameters: ["type": "solid"]),
21-
BenchmarkScenario(name: "Gradient Image", category: .vlm, parameters: ["type": "gradient"]),
20+
BenchmarkScenario(name: "Image Description", category: .vlm, parameters: ["type": "gradient"]),
2221
]
2322
}
2423

@@ -31,27 +30,34 @@ struct VLMBenchmarkProvider: BenchmarkScenarioProvider {
3130

3231
let memBefore = SyntheticInputGenerator.availableMemoryBytes()
3332

34-
// Load (pass ModelInfo object)
33+
// Ensure clean state: unload any VLM model left over from Camera or a previous run
34+
await RunAnywhere.unloadVLMModel()
35+
// Also unload any lingering LLM model to free memory headroom
36+
try? await RunAnywhere.unloadModel()
37+
// Brief pause to let iOS reclaim GPU/Metal memory from the previous model
38+
try await Task.sleep(nanoseconds: 500_000_000) // 0.5s
39+
40+
// Load
3541
let loadStart = Date()
3642
try await RunAnywhere.loadVLMModel(model)
3743
metrics.loadTimeMs = Date().timeIntervalSince(loadStart) * 1000
3844

3945
do {
40-
// Generate image
41-
let image: UIImage
42-
switch scenario.parameters?["type"] {
43-
case "solid":
44-
image = SyntheticInputGenerator.solidColorImage()
45-
default:
46-
image = SyntheticInputGenerator.gradientImage()
46+
// Generate a small synthetic image inside an autoreleasepool so CoreGraphics
47+
// intermediates are released promptly before we allocate the vision encoder.
48+
let vlmImage: VLMImage = autoreleasepool {
49+
let image = SyntheticInputGenerator.gradientImage()
50+
return VLMImage(image: image)
4751
}
48-
let vlmImage = VLMImage(image: image)
4952

50-
// Warmup
53+
// Warmup: single token to prime the pipeline without large KV allocation
5154
let warmupStart = Date()
52-
_ = try await RunAnywhere.processImage(vlmImage, prompt: "Hi", maxTokens: 5, temperature: 0.0)
55+
_ = try await RunAnywhere.processImage(vlmImage, prompt: "Hi", maxTokens: 1, temperature: 0.0)
5356
metrics.warmupTimeMs = Date().timeIntervalSince(warmupStart) * 1000
5457

58+
// Cancel to flush any lingering generation state / KV cache before the real run
59+
await RunAnywhere.cancelVLMGeneration()
60+
5561
// Benchmark
5662
let result = try await RunAnywhere.processImage(
5763
vlmImage,
@@ -68,9 +74,12 @@ struct VLMBenchmarkProvider: BenchmarkScenarioProvider {
6874
metrics.memoryDeltaBytes = memBefore - memAfter
6975

7076
await RunAnywhere.unloadVLMModel()
77+
// Give iOS time to release GPU/Metal buffers before the next model loads
78+
try? await Task.sleep(nanoseconds: 300_000_000) // 0.3s
7179
return metrics
7280
} catch {
7381
await RunAnywhere.unloadVLMModel()
82+
try? await Task.sleep(nanoseconds: 300_000_000)
7483
throw error
7584
}
7685
#else

examples/ios/RunAnywhereAI/RunAnywhereAI/Features/Benchmarks/Views/BenchmarkDashboardView.swift

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -248,7 +248,7 @@ private struct CategoryScenariosRow: View {
248248
case .tts:
249249
return "Short text, Medium text — measures audio duration, char throughput"
250250
case .vlm:
251-
return "Solid color, Gradient image (224×224) — measures tok/s, completion tokens"
251+
return "Gradient image (224×224) — measures tok/s, completion tokens"
252252
case .diffusion:
253253
return "Simple prompt, 10 steps, seed 42 — measures generation time"
254254
}

examples/ios/RunAnywhereAI/RunAnywhereAI/Features/Benchmarks/Views/BenchmarkDetailView.swift

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -213,7 +213,9 @@ private struct MetricsGrid: View {
213213
if let chars = metrics.charactersProcessed { items.append(("Chars", "\(chars)")) }
214214
case .vlm:
215215
if let tps = metrics.tokensPerSecond { items.append(("tok/s", String(format: "%.1f", tps))) }
216-
if let ct = metrics.completionTokens { items.append(("Tokens", "\(ct)")) }
216+
if let pt = metrics.promptTokens, pt > 0 { items.append(("Prompt Tok", "\(pt)")) }
217+
if let ct = metrics.completionTokens { items.append(("Comp Tok", "\(ct)")) }
218+
if metrics.warmupTimeMs > 0 { items.append(("Warmup", String(format: "%.0fms", metrics.warmupTimeMs))) }
217219
case .diffusion:
218220
if let gen = metrics.generationTimeMs { items.append(("Gen", String(format: "%.0fms", gen))) }
219221
}

sdk/runanywhere-commons/src/backends/metalrt/rac_vlm_metalrt.cpp

Lines changed: 44 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -5,16 +5,32 @@
55

66
#include "rac_vlm_metalrt.h"
77

8+
#include <cstdint>
89
#include <cstdio>
910
#include <cstdlib>
1011
#include <cstring>
12+
#include <string>
13+
#include <vector>
1114

1215
#include "metalrt_c_api.h"
1316

1417
#include "rac/core/rac_logger.h"
1518

1619
static const char* LOG_CAT = "VLM.MetalRT";
1720

21+
// Expand 3-byte RGB to 4-byte RGBA (alpha=0xFF) for MetalRT's pixel API.
22+
static std::vector<uint8_t> rgb_to_rgba(const uint8_t* rgb, uint32_t w, uint32_t h) {
23+
size_t n_pixels = (size_t)w * h;
24+
std::vector<uint8_t> rgba(n_pixels * 4);
25+
for (size_t i = 0; i < n_pixels; i++) {
26+
rgba[i * 4 + 0] = rgb[i * 3 + 0];
27+
rgba[i * 4 + 1] = rgb[i * 3 + 1];
28+
rgba[i * 4 + 2] = rgb[i * 3 + 2];
29+
rgba[i * 4 + 3] = 0xFF;
30+
}
31+
return rgba;
32+
}
33+
1834
struct rac_vlm_metalrt_impl {
1935
void* handle; // metalrt_vision_create() handle
2036
bool loaded;
@@ -65,34 +81,29 @@ rac_result_t rac_vlm_metalrt_process(rac_handle_t handle, const rac_vlm_image_t*
6581
auto* impl = static_cast<rac_vlm_metalrt_impl*>(handle);
6682
if (!impl->loaded) return RAC_ERROR_BACKEND_NOT_READY;
6783

68-
// MetalRT needs a file path — handle different image formats
69-
const char* image_path = nullptr;
70-
char tmp_path[256] = {};
71-
72-
if (image->format == RAC_VLM_IMAGE_FORMAT_FILE_PATH) {
73-
image_path = image->file_path;
74-
} else {
75-
// For non-file formats, write to a temp file
76-
// This is a simplification — production code would handle RGB/base64 properly
77-
RAC_LOG_ERROR(LOG_CAT, "MetalRT VLM only supports FILE_PATH image format");
78-
return RAC_ERROR_VALIDATION_FAILED;
79-
}
80-
81-
if (!image_path || image_path[0] == '\0') {
82-
return RAC_ERROR_NULL_POINTER;
83-
}
84-
8584
struct MetalRTVisionOptions vopts = {};
8685
vopts.max_tokens = options ? options->max_tokens : 256;
8786
vopts.temperature = options ? options->temperature : 0.0f;
8887
vopts.top_k = 40;
8988
vopts.think = false;
9089

91-
struct MetalRTVisionResult result = metalrt_vision_analyze(impl->handle, image_path, prompt, &vopts);
90+
struct MetalRTVisionResult result = {};
91+
92+
if (image->format == RAC_VLM_IMAGE_FORMAT_FILE_PATH && image->file_path) {
93+
result = metalrt_vision_analyze(impl->handle, image->file_path, prompt, &vopts);
94+
} else if (image->format == RAC_VLM_IMAGE_FORMAT_RGB_PIXELS && image->pixel_data) {
95+
auto rgba = rgb_to_rgba(image->pixel_data, image->width, image->height);
96+
result = metalrt_vision_analyze_pixels(impl->handle, rgba.data(),
97+
(int)image->width, (int)image->height,
98+
prompt, &vopts);
99+
} else {
100+
RAC_LOG_ERROR(LOG_CAT, "Unsupported image format: %d", image->format);
101+
return RAC_ERROR_VALIDATION_FAILED;
102+
}
92103

93104
out_result->text = result.text ? strdup(result.text) : nullptr;
94105
out_result->prompt_tokens = result.prompt_tokens;
95-
out_result->image_tokens = 0; // MetalRT doesn't separate image token count
106+
out_result->image_tokens = 0;
96107
out_result->completion_tokens = result.generated_tokens;
97108
out_result->total_tokens = result.prompt_tokens + result.generated_tokens;
98109
out_result->time_to_first_token_ms = static_cast<int64_t>(result.prefill_ms);
@@ -125,20 +136,27 @@ rac_result_t rac_vlm_metalrt_process_stream(rac_handle_t handle, const rac_vlm_i
125136
auto* impl = static_cast<rac_vlm_metalrt_impl*>(handle);
126137
if (!impl->loaded) return RAC_ERROR_BACKEND_NOT_READY;
127138

128-
if (image->format != RAC_VLM_IMAGE_FORMAT_FILE_PATH || !image->file_path) {
129-
RAC_LOG_ERROR(LOG_CAT, "MetalRT VLM only supports FILE_PATH image format");
130-
return RAC_ERROR_VALIDATION_FAILED;
131-
}
132-
133139
struct MetalRTVisionOptions vopts = {};
134140
vopts.max_tokens = options ? options->max_tokens : 256;
135141
vopts.temperature = options ? options->temperature : 0.0f;
136142
vopts.top_k = 40;
137143
vopts.think = false;
138144

139145
VLMStreamCtx ctx = {callback, user_data};
140-
struct MetalRTVisionResult result = metalrt_vision_analyze_stream(
141-
impl->handle, image->file_path, prompt, vlm_stream_bridge, &ctx, &vopts);
146+
struct MetalRTVisionResult result = {};
147+
148+
if (image->format == RAC_VLM_IMAGE_FORMAT_FILE_PATH && image->file_path) {
149+
result = metalrt_vision_analyze_stream(
150+
impl->handle, image->file_path, prompt, vlm_stream_bridge, &ctx, &vopts);
151+
} else if (image->format == RAC_VLM_IMAGE_FORMAT_RGB_PIXELS && image->pixel_data) {
152+
auto rgba = rgb_to_rgba(image->pixel_data, image->width, image->height);
153+
result = metalrt_vision_analyze_pixels_stream(
154+
impl->handle, rgba.data(), (int)image->width, (int)image->height,
155+
prompt, vlm_stream_bridge, &ctx, &vopts);
156+
} else {
157+
RAC_LOG_ERROR(LOG_CAT, "Unsupported image format for streaming: %d", image->format);
158+
return RAC_ERROR_VALIDATION_FAILED;
159+
}
142160

143161
metalrt_vision_free_result(result);
144162
return RAC_SUCCESS;

sdk/runanywhere-commons/src/features/vlm/vlm_component.cpp

Lines changed: 20 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -441,25 +441,29 @@ extern "C" rac_result_t rac_vlm_component_load_model_by_id(rac_handle_t handle,
441441
}
442442
}
443443

444-
// 3. Resolve model files within the directory
445-
char model_path[1024] = {};
446-
char mmproj_path[1024] = {};
447-
result = rac_vlm_resolve_model_files(model_folder, model_path, sizeof(model_path), mmproj_path,
448-
sizeof(mmproj_path));
449-
if (result != RAC_SUCCESS) {
450-
RAC_LOG_ERROR(LOG_CAT, "Failed to resolve model files in: %s", model_folder);
451-
rac_model_info_free(model_info);
452-
return result;
453-
}
454-
455-
// 4. Delegate to the existing load function
456-
const char* mmproj = mmproj_path[0] != '\0' ? mmproj_path : nullptr;
444+
// 3. For directory-based models (MetalRT), pass the directory directly.
445+
// For GGUF-based models (llama.cpp), resolve .gguf + mmproj files.
457446
const char* name = model_info->name ? model_info->name : model_id;
458447

459-
RAC_LOG_INFO(LOG_CAT, "Loading VLM model by ID: %s (model=%s, mmproj=%s)", model_id, model_path,
460-
mmproj ? mmproj : "none");
448+
if (rac_framework_uses_directory_based_models(model_info->framework) == RAC_TRUE) {
449+
RAC_LOG_INFO(LOG_CAT, "Loading directory-based VLM model by ID: %s (dir=%s)", model_id, model_folder);
450+
result = rac_vlm_component_load_model(handle, model_folder, nullptr, model_id, name);
451+
} else {
452+
char model_path[1024] = {};
453+
char mmproj_path[1024] = {};
454+
result = rac_vlm_resolve_model_files(model_folder, model_path, sizeof(model_path), mmproj_path,
455+
sizeof(mmproj_path));
456+
if (result != RAC_SUCCESS) {
457+
RAC_LOG_ERROR(LOG_CAT, "Failed to resolve model files in: %s", model_folder);
458+
rac_model_info_free(model_info);
459+
return result;
460+
}
461461

462-
result = rac_vlm_component_load_model(handle, model_path, mmproj, model_id, name);
462+
const char* mmproj = mmproj_path[0] != '\0' ? mmproj_path : nullptr;
463+
RAC_LOG_INFO(LOG_CAT, "Loading VLM model by ID: %s (model=%s, mmproj=%s)", model_id, model_path,
464+
mmproj ? mmproj : "none");
465+
result = rac_vlm_component_load_model(handle, model_path, mmproj, model_id, name);
466+
}
463467

464468
rac_model_info_free(model_info);
465469
return result;

0 commit comments

Comments (0)