Skip to content

Commit 6e6c19e

Browse files
feat: integrate MetalRT backend for iOS with benchmark improvements
MetalRT Backend Integration:
- Extend build-ios.sh to compile MetalRT engine, build Metal shaders, and create RABackendMetalRT.xcframework (device-only)
- Add MetalRT binary target and runtime resources to Package.swift
- Register MetalRT LLM/STT/TTS/VLM models in the iOS example app with HuggingFace-hosted .tar.gz model archives
- Add RAC_FRAMEWORK_METALRT support across model paths, registry, discovery, and service dispatch (LLM, STT, TTS, VLM)
- Fix nested model directory resolution in MetalRT backend adapter
- Add MetalRT to is_model_file callback and directory-based model detection for proper persistence across app restarts

C++ Infrastructure Fixes:
- Fix lifecycle_manager to auto-unload previous model before loading a new one, preventing GPU resource leaks when switching backends
- Add max_tokens enforcement at the component-level streaming callback
- Use actual streaming callback count for completion_tokens instead of character-length estimate
- Add client-side max_tokens cap in MetalRT streaming bridge

Benchmark Improvements:
- Add prefill tok/s, decode tok/s, warmup time, input/output token metrics to benchmark results and detail view
- Add model selection UI to benchmark dashboard so users can pick which models to benchmark for fair comparisons
- Add explicit model unload before load in LLM benchmark provider
- Update benchmark prompts for more consistent token output
- Extract prompt_tokens from C++ streaming result for accurate prefill calculation

Made-with: Cursor
1 parent ed7eb57 commit 6e6c19e

28 files changed

Lines changed: 705 additions & 74 deletions

File tree

Package.swift

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -213,7 +213,7 @@ let package = Package(
213213
// =================================================================
214214
.target(
215215
name: "MetalRTBackend",
216-
dependencies: [],
216+
dependencies: ["RABackendMetalRTBinary"],
217217
path: "sdk/runanywhere-swift/Sources/MetalRTRuntime/include",
218218
publicHeadersPath: "."
219219
),
@@ -226,13 +226,19 @@ let package = Package(
226226
dependencies: [
227227
"RunAnywhere",
228228
"MetalRTBackend",
229+
"RABackendMetalRTBinary",
229230
],
230231
path: "sdk/runanywhere-swift/Sources/MetalRTRuntime",
231232
exclude: ["include"],
233+
resources: [
234+
.copy("Resources/default.metallib"),
235+
],
232236
linkerSettings: [
233237
.linkedLibrary("c++"),
234238
.linkedFramework("Accelerate"),
235239
.linkedFramework("Metal"),
240+
.linkedFramework("CoreGraphics"),
241+
.linkedFramework("ImageIO"),
236242
]
237243
),
238244

@@ -291,6 +297,10 @@ func binaryTargets() -> [Target] {
291297
name: "RABackendONNXBinary",
292298
path: "sdk/runanywhere-swift/Binaries/RABackendONNX.xcframework"
293299
),
300+
.binaryTarget(
301+
name: "RABackendMetalRTBinary",
302+
path: "sdk/runanywhere-swift/Binaries/RABackendMetalRT.xcframework"
303+
),
294304
]
295305

296306
// ONNX Runtime xcframeworks - split by platform
@@ -330,6 +340,11 @@ func binaryTargets() -> [Target] {
330340
url: "https://github.com/RunanywhereAI/runanywhere-sdks/releases/download/v\(sdkVersion)/RABackendONNX-v\(sdkVersion).zip",
331341
checksum: "809e2510da49f71f6d019e77bcc0a7e12e967f3b739ba0b9eea7adb77936edc0"
332342
),
343+
.binaryTarget(
344+
name: "RABackendMetalRTBinary",
345+
url: "https://github.com/RunanywhereAI/runanywhere-sdks/releases/download/v\(sdkVersion)/RABackendMetalRT-v\(sdkVersion).zip",
346+
// NOTE(review): this all-zero value looks like a placeholder rather than a real SHA-256 — confirm,
// and replace it with the `swift package compute-checksum` output for the published zip before release.
checksum: "0000000000000000000000000000000000000000000000000000000000000000"
347+
),
333348
.binaryTarget(
334349
name: "ONNXRuntimeiOSBinary",
335350
url: "https://github.com/RunanywhereAI/runanywhere-sdks/releases/download/v\(sdkVersion)/onnxruntime-ios-v\(sdkVersion).zip",

examples/ios/RunAnywhereAI/RunAnywhereAI.xcodeproj/project.pbxproj

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
58ABEDD22ED16DA40058D033 /* RunAnywhereONNX in Frameworks */ = {isa = PBXBuildFile; productRef = 58ABEDD12ED16DA40058D033 /* RunAnywhereONNX */; };
1616
58LLAMACPP12ED16DA40058D0 /* RunAnywhereLlamaCPP in Frameworks */ = {isa = PBXBuildFile; productRef = 58LLAMACPP02ED16DA40058D0 /* RunAnywhereLlamaCPP */; };
1717
58WHISPERKIT1ED16DA40058D0 /* RunAnywhereWhisperKit in Frameworks */ = {isa = PBXBuildFile; productRef = 58WHISPERKIT0ED16DA40058D0 /* RunAnywhereWhisperKit */; };
18+
58METALRT12ED16DA40058D0 /* RunAnywhereMetalRT in Frameworks */ = {isa = PBXBuildFile; productRef = 58METALRT02ED16DA40058D0 /* RunAnywhereMetalRT */; };
1819
RACACTIVITY01ACTIVITY01RAC /* DictationActivityAttributes.swift in Sources */ = {isa = PBXBuildFile; fileRef = RACACTIVITY02ACTIVITY02RAC /* DictationActivityAttributes.swift */; };
1920
RACSHARED01RACSHARED01RACS /* SharedConstants.swift in Sources */ = {isa = PBXBuildFile; fileRef = RACSHARED02RACSHARED02RACS /* SharedConstants.swift */; };
2021
RACSHARED03RACSHARED03RACS /* SharedDataBridge.swift in Sources */ = {isa = PBXBuildFile; fileRef = RACSHARED04RACSHARED04RACS /* SharedDataBridge.swift */; };
@@ -169,6 +170,7 @@
169170
541C59DA2E63772A00DD7839 /* RunAnywhere in Frameworks */,
170171
58LLAMACPP12ED16DA40058D0 /* RunAnywhereLlamaCPP in Frameworks */,
171172
58WHISPERKIT1ED16DA40058D0 /* RunAnywhereWhisperKit in Frameworks */,
173+
58METALRT12ED16DA40058D0 /* RunAnywhereMetalRT in Frameworks */,
172174
);
173175
runOnlyForDeploymentPostprocessing = 0;
174176
};
@@ -309,6 +311,7 @@
309311
58ABEDD12ED16DA40058D033 /* RunAnywhereONNX */,
310312
58LLAMACPP02ED16DA40058D0 /* RunAnywhereLlamaCPP */,
311313
58WHISPERKIT0ED16DA40058D0 /* RunAnywhereWhisperKit */,
314+
58METALRT02ED16DA40058D0 /* RunAnywhereMetalRT */,
312315
);
313316
productName = RunAnywhereAI;
314317
productReference = 5480A1F02E2F250200337F2F /* RunAnywhereAI.app */;
@@ -1080,6 +1083,11 @@
10801083
package = 58E021172E52A86000B722EF /* XCLocalSwiftPackageReference "../../.." */;
10811084
productName = RunAnywhereWhisperKit;
10821085
};
1086+
58METALRT02ED16DA40058D0 /* RunAnywhereMetalRT */ = {
1087+
isa = XCSwiftPackageProductDependency;
1088+
package = 58E021172E52A86000B722EF /* XCLocalSwiftPackageReference "../../.." */;
1089+
productName = RunAnywhereMetalRT;
1090+
};
10831091
/* End XCSwiftPackageProductDependency section */
10841092
};
10851093
rootObject = 5480A1E82E2F250200337F2F /* Project object */;

examples/ios/RunAnywhereAI/RunAnywhereAI/App/RunAnywhereAIApp.swift

Lines changed: 130 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -273,6 +273,26 @@ struct RunAnywhereAIApp: App {
273273
)
274274
}
275275

276+
// LFM2.5-1.2B-Instruct - General-purpose instruction-tuned LFM (Liquid AI)
277+
if let lfm25InstructQ4URL = URL(string: "https://huggingface.co/LiquidAI/LFM2.5-1.2B-Instruct-GGUF/resolve/main/LFM2.5-1.2B-Instruct-Q4_K_M.gguf") {
278+
RunAnywhere.registerModel(
279+
id: "lfm25-1.2b-instruct-q4_k_m",
280+
name: "LiquidAI LFM2.5 1.2B Instruct Q4_K_M",
281+
url: lfm25InstructQ4URL,
282+
framework: .llamaCpp,
283+
memoryRequirement: 900_000_000
284+
)
285+
}
286+
if let lfm25InstructQ8URL = URL(string: "https://huggingface.co/LiquidAI/LFM2.5-1.2B-Instruct-GGUF/resolve/main/LFM2.5-1.2B-Instruct-Q8_0.gguf") {
287+
RunAnywhere.registerModel(
288+
id: "lfm25-1.2b-instruct-q8_0",
289+
name: "LiquidAI LFM2.5 1.2B Instruct Q8_0",
290+
url: lfm25InstructQ8URL,
291+
framework: .llamaCpp,
292+
memoryRequirement: 1_400_000_000
293+
)
294+
}
295+
276296
// Qwen3 models
277297
if let qwen3_06bURL = URL(string: "https://huggingface.co/unsloth/Qwen3-0.6B-GGUF/resolve/main/Qwen3-0.6B-Q4_K_M.gguf") {
278298
RunAnywhere.registerModel(
@@ -334,24 +354,118 @@ struct RunAnywhereAIApp: App {
334354
logger.info("✅ LLM models registered (including tool-calling optimized models)")
335355

336356
// ============================================================================
337-
// Register MetalRT LLM models (custom Metal GPU kernels, framework-hint only)
357+
// Register MetalRT models (custom Metal GPU kernels, framework-hint only)
338358
// These models use MetalRT's safetensors format, NOT GGUF.
359+
// Models are from runanywhere/ HuggingFace org, packaged as tar.gz archives.
339360
// ============================================================================
340-
// TODO: Add MetalRT model download URLs once hosted
341-
// For now, models are loaded from local paths during development.
342-
// Example registration (uncomment when URLs are available):
343-
//
344-
// if let qwen3MetalRTURL = URL(string: "https://huggingface.co/.../Qwen3-0.6B-MLX-4bit.tar.gz") {
345-
// RunAnywhere.registerModel(
346-
// id: "qwen3-0.6b-metalrt",
347-
// name: "Qwen3 0.6B (MetalRT)",
348-
// url: qwen3MetalRTURL,
349-
// framework: .metalrt,
350-
// memoryRequirement: 400_000_000
351-
// )
352-
// }
353-
354-
logger.info("✅ MetalRT models registered (framework-hint only)")
361+
#if canImport(MetalRTRuntime)
362+
363+
// --- MetalRT LLM models ---
364+
// All MetalRT iOS models are hosted at: huggingface.co/runanywhere/metalrt-ios
365+
let metalrtBase = "https://huggingface.co/runanywhere/metalrt-ios/resolve/main"
366+
367+
if let url = URL(string: "\(metalrtBase)/qwen3-0.6b-metalrt.tar.gz") {
368+
RunAnywhere.registerModel(
369+
id: "qwen3-0.6b-metalrt",
370+
name: "Qwen3 0.6B (MetalRT)",
371+
url: url,
372+
framework: .metalrt,
373+
artifactType: .archive(.tarGz, structure: .nestedDirectory),
374+
memoryRequirement: 400_000_000
375+
)
376+
}
377+
378+
if let url = URL(string: "\(metalrtBase)/qwen3-4b-metalrt.tar.gz") {
379+
RunAnywhere.registerModel(
380+
id: "qwen3-4b-metalrt",
381+
name: "Qwen3 4B (MetalRT)",
382+
url: url,
383+
framework: .metalrt,
384+
artifactType: .archive(.tarGz, structure: .nestedDirectory),
385+
memoryRequirement: 2_500_000_000
386+
)
387+
}
388+
389+
if let url = URL(string: "\(metalrtBase)/llama3-3b-metalrt.tar.gz") {
390+
RunAnywhere.registerModel(
391+
id: "llama3-3b-metalrt",
392+
name: "Llama 3.2 3B (MetalRT)",
393+
url: url,
394+
framework: .metalrt,
395+
artifactType: .archive(.tarGz, structure: .nestedDirectory),
396+
memoryRequirement: 1_800_000_000
397+
)
398+
}
399+
400+
if let url = URL(string: "\(metalrtBase)/lfm25-1.2b-metalrt.tar.gz") {
401+
RunAnywhere.registerModel(
402+
id: "lfm25-1.2b-metalrt",
403+
name: "LFM 2.5 1.2B (MetalRT)",
404+
url: url,
405+
framework: .metalrt,
406+
artifactType: .archive(.tarGz, structure: .nestedDirectory),
407+
memoryRequirement: 800_000_000
408+
)
409+
}
410+
411+
// --- MetalRT STT models (Whisper) ---
412+
413+
if let url = URL(string: "\(metalrtBase)/whisper-tiny-metalrt.tar.gz") {
414+
RunAnywhere.registerModel(
415+
id: "whisper-tiny-metalrt",
416+
name: "Whisper Tiny (MetalRT)",
417+
url: url,
418+
framework: .metalrt,
419+
modality: .speechRecognition,
420+
artifactType: .archive(.tarGz, structure: .nestedDirectory),
421+
memoryRequirement: 35_000_000
422+
)
423+
}
424+
425+
if let url = URL(string: "\(metalrtBase)/whisper-small-metalrt.tar.gz") {
426+
RunAnywhere.registerModel(
427+
id: "whisper-small-metalrt",
428+
name: "Whisper Small (MetalRT)",
429+
url: url,
430+
framework: .metalrt,
431+
modality: .speechRecognition,
432+
artifactType: .archive(.tarGz, structure: .nestedDirectory),
433+
memoryRequirement: 100_000_000
434+
)
435+
}
436+
437+
// --- MetalRT TTS model (Kokoro) ---
438+
439+
if let url = URL(string: "\(metalrtBase)/kokoro-metalrt.tar.gz") {
440+
RunAnywhere.registerModel(
441+
id: "kokoro-metalrt",
442+
name: "Kokoro TTS (MetalRT)",
443+
url: url,
444+
framework: .metalrt,
445+
modality: .speechSynthesis,
446+
artifactType: .archive(.tarGz, structure: .nestedDirectory),
447+
memoryRequirement: 350_000_000
448+
)
449+
}
450+
451+
// --- MetalRT VLM model (Qwen3-VL) ---
452+
453+
if let url = URL(string: "\(metalrtBase)/qwen3-vl-2b-metalrt.tar.gz") {
454+
RunAnywhere.registerModel(
455+
id: "qwen3-vl-2b-metalrt",
456+
name: "Qwen3-VL 2B (MetalRT)",
457+
url: url,
458+
framework: .metalrt,
459+
modality: .multimodal,
460+
artifactType: .archive(.tarGz, structure: .nestedDirectory),
461+
memoryRequirement: 1_800_000_000
462+
)
463+
}
464+
465+
logger.info("✅ MetalRT models registered")
466+
#else
467+
logger.info("ℹ️ MetalRT not available (MetalRTRuntime not linked)")
468+
#endif
355469

356470
// Register VLM (Vision Language) models
357471
// VLM models require 2 files: main model + mmproj (vision projector)

examples/ios/RunAnywhereAI/RunAnywhereAI/Features/Benchmarks/Models/BenchmarkTypes.swift

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,8 @@ struct BenchmarkMetrics: Codable, Sendable {
123123
// LLM-specific
124124
var ttftMs: Double?
125125
var tokensPerSecond: Double?
126+
var prefillTokensPerSecond: Double?
127+
var decodeTokensPerSecond: Double?
126128
var inputTokens: Int?
127129
var outputTokens: Int?
128130

examples/ios/RunAnywhereAI/RunAnywhereAI/Features/Benchmarks/Services/BenchmarkRunner.swift

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,7 @@ final class BenchmarkRunner {
112112

113113
func runBenchmarks(
114114
categories: Set<BenchmarkCategory>,
115+
modelIds: Set<String>? = nil,
115116
onProgress: @escaping @Sendable (BenchmarkProgressUpdate) -> Void
116117
) async throws -> BenchmarkRunOutput {
117118
let preflight = try await preflight(categories: categories)
@@ -129,8 +130,9 @@ final class BenchmarkRunner {
129130
for category in BenchmarkCategory.allCases where categories.contains(category) {
130131
guard let provider = providers[category],
131132
let models = preflight.availableCategories[category] else { continue }
133+
// Restrict the run to the caller-selected model IDs; a nil `modelIds` means "use every available model".
// Optional.map + ?? replaces the previous nil-check followed by a force unwrap inside the closure.
let filteredModels = modelIds.map { ids in models.filter { ids.contains($0.id) } } ?? models
132134
let scenarioList = provider.scenarios()
133-
for model in models {
135+
for model in filteredModels {
134136
for scenario in scenarioList {
135137
workItems.append((category, model, scenario))
136138
}

examples/ios/RunAnywhereAI/RunAnywhereAI/Features/Benchmarks/Services/LLMBenchmarkProvider.swift

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,9 @@ struct LLMBenchmarkProvider: BenchmarkScenarioProvider {
2929

3030
let memBefore = SyntheticInputGenerator.availableMemoryBytes()
3131

32+
// Ensure clean state: unload any model left over from Chat or a previous run
33+
try? await RunAnywhere.unloadModel()
34+
3235
// Load
3336
let loadStart = Date()
3437
try await RunAnywhere.loadModel(model.id)
@@ -45,20 +48,32 @@ struct LLMBenchmarkProvider: BenchmarkScenarioProvider {
4548

4649
// Benchmark
4750
let benchStart = Date()
48-
let options = LLMGenerationOptions(maxTokens: maxTokens, temperature: 0.0)
51+
let systemPrompt = "You are a helpful assistant. Always give extremely detailed, thorough responses. Never stop early. Use the full response length available to you. Elaborate on every point with examples and explanations."
52+
let options = LLMGenerationOptions(maxTokens: maxTokens, temperature: 0.0, systemPrompt: systemPrompt)
4953
let streamResult = try await RunAnywhere.generateStream(
50-
"Explain the concept of machine learning in detail.",
54+
"Write a very long and detailed explanation of how neural networks work, covering perceptrons, activation functions, backpropagation, gradient descent, loss functions, convolutional layers, recurrent layers, transformers, attention mechanisms, and training procedures. Be as thorough as possible.",
5155
options: options
5256
)
5357
for try await _ in streamResult.stream {}
5458
let result = try await streamResult.result.value
5559

56-
metrics.endToEndLatencyMs = Date().timeIntervalSince(benchStart) * 1000
60+
let e2eMs = Date().timeIntervalSince(benchStart) * 1000
61+
metrics.endToEndLatencyMs = e2eMs
5762
metrics.ttftMs = result.timeToFirstTokenMs
5863
metrics.tokensPerSecond = result.tokensPerSecond
5964
metrics.inputTokens = result.inputTokens
6065
metrics.outputTokens = result.tokensUsed
6166

67+
if let ttft = result.timeToFirstTokenMs, ttft > 0 {
68+
let decodeMs = e2eMs - ttft
69+
if decodeMs > 0, result.tokensUsed > 0 {
70+
metrics.decodeTokensPerSecond = Double(result.tokensUsed) / (decodeMs / 1000.0)
71+
}
72+
if result.inputTokens > 0 {
73+
metrics.prefillTokensPerSecond = Double(result.inputTokens) / (ttft / 1000.0)
74+
}
75+
}
76+
6277
let memAfter = SyntheticInputGenerator.availableMemoryBytes()
6378
metrics.memoryDeltaBytes = memBefore - memAfter
6479

examples/ios/RunAnywhereAI/RunAnywhereAI/Features/Benchmarks/Utilities/BenchmarkReportFormatter.swift

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,8 @@ enum BenchmarkReportFormatter {
8282
lines.append("- Warmup: \(String(format: "%.0f", m.warmupTimeMs))ms")
8383
}
8484
lines.append("- End-to-end: \(String(format: "%.0f", m.endToEndLatencyMs))ms")
85+
if let decode = m.decodeTokensPerSecond { lines.append("- Decode: \(String(format: "%.1f", decode)) tok/s") }
86+
if let prefill = m.prefillTokensPerSecond { lines.append("- Prefill: \(String(format: "%.1f", prefill)) tok/s") }
8587
if let tps = m.tokensPerSecond { lines.append("- Tokens/s: \(String(format: "%.1f", tps))") }
8688
if let ttft = m.ttftMs { lines.append("- TTFT: \(String(format: "%.0f", ttft))ms") }
8789
if let inp = m.inputTokens { lines.append("- Input tokens: \(inp)") }
@@ -129,7 +131,7 @@ enum BenchmarkReportFormatter {
129131
// MARK: - File Export: CSV
130132

131133
static func writeCSV(run: BenchmarkRun) -> URL {
132-
var csv = "Category,Scenario,Model,Framework,LoadMs,WarmupMs,E2EMs,TPS,TTFT,RTF,AudioLen,AudioDur,Chars,PromptTok,CompTok,GenMs,MemDeltaBytes,Success,Error\n"
134+
var csv = "Category,Scenario,Model,Framework,LoadMs,WarmupMs,E2EMs,DecodeTPS,PrefillTPS,TPS,TTFT,InTokens,OutTokens,RTF,AudioLen,AudioDur,Chars,PromptTok,CompTok,GenMs,MemDeltaBytes,Success,Error\n"
133135
for r in run.results {
134136
let m = r.metrics
135137
var row: [String] = []
@@ -140,8 +142,12 @@ enum BenchmarkReportFormatter {
140142
row.append(String(format: "%.0f", m.loadTimeMs))
141143
row.append(String(format: "%.0f", m.warmupTimeMs))
142144
row.append(String(format: "%.0f", m.endToEndLatencyMs))
145+
row.append(m.decodeTokensPerSecond.map { String(format: "%.1f", $0) } ?? "")
146+
row.append(m.prefillTokensPerSecond.map { String(format: "%.1f", $0) } ?? "")
143147
row.append(m.tokensPerSecond.map { String(format: "%.1f", $0) } ?? "")
144148
row.append(m.ttftMs.map { String(format: "%.0f", $0) } ?? "")
149+
row.append(m.inputTokens.map { "\($0)" } ?? "")
150+
row.append(m.outputTokens.map { "\($0)" } ?? "")
145151
row.append(m.realTimeFactor.map { String(format: "%.2f", $0) } ?? "")
146152
row.append(m.audioLengthSeconds.map { String(format: "%.1f", $0) } ?? "")
147153
row.append(m.audioDurationSeconds.map { String(format: "%.1f", $0) } ?? "")

0 commit comments

Comments
 (0)