Skip to content

Commit 7094a8c

Browse files
Fix/372 streaming model import (#395)
* fix(ios): use actual token count for tok/sec calculation in streaming Replace character-based token estimation (chars / 4) with actual token count tracked by the streaming metrics collector. The collector already counts each token yielded by the C++ streaming callback via recordToken(), making the character estimate unnecessary and inaccurate. Fixes #339 * feat(web): add streaming for large model imports to prevent OOM errors * Update sdk/runanywhere-web/packages/llamacpp/src/Foundation/LlamaCppBridge.ts Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com> --------- Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com>
1 parent 246aabe commit 7094a8c

12 files changed

Lines changed: 174 additions & 33 deletions

File tree

sdk/runanywhere-swift/Sources/RunAnywhere/Public/Extensions/LLM/RunAnywhere+TextGeneration.swift

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -368,14 +368,15 @@ private actor LLMStreamingMetricsCollector {
368368
timeToFirstTokenMs = firstToken.timeIntervalSince(start) * 1000
369369
}
370370

371-
let inputTokens = max(1, promptLength / 4)
372-
let outputTokens = max(1, fullText.count / 4)
373-
let tokensPerSecond = latencyMs > 0 ? Double(outputTokens) / (latencyMs / 1000.0) : 0
371+
// Use actual token count from streaming callbacks, not character estimation (fixes #339)
372+
let outputTokens = max(1, tokenCount)
373+
let totalTimeSec = latencyMs / 1000.0
374+
let tokensPerSecond = totalTimeSec > 0 ? Double(outputTokens) / totalTimeSec : 0
374375

375376
return LLMGenerationResult(
376377
text: fullText,
377378
thinkingContent: nil,
378-
inputTokens: inputTokens,
379+
inputTokens: 0,
379380
tokensUsed: outputTokens,
380381
modelUsed: modelId,
381382
latencyMs: latencyMs,

sdk/runanywhere-web/packages/core/src/Infrastructure/LocalFileStorage.ts

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -330,6 +330,25 @@ export class LocalFileStorage {
330330
}
331331
}
332332

333+
/**
334+
* Load model data from the local filesystem as a ReadableStream.
335+
* @param key - Model identifier
336+
* @returns Readable stream of the model data, or null if not found
337+
*/
338+
async loadModelStream(key: string): Promise<ReadableStream<Uint8Array> | null> {
339+
if (!this.dirHandle || !this._isReady) return null;
340+
341+
try {
342+
const filename = this.sanitizeFilename(key);
343+
const fileHandle = await this.dirHandle.getFileHandle(filename);
344+
const file = await fileHandle.getFile();
345+
logger.info(`Loading model stream from local storage: ${key} (${(file.size / 1024 / 1024).toFixed(1)} MB)`);
346+
return file.stream() as unknown as ReadableStream<Uint8Array>;
347+
} catch {
348+
return null; // File not found
349+
}
350+
}
351+
333352
/**
334353
* Check if a model file exists in local storage.
335354
* @param key - Model identifier

sdk/runanywhere-web/packages/core/src/Infrastructure/ModelDownloader.ts

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -533,6 +533,43 @@ export class ModelDownloader {
533533
return null;
534534
}
535535

536+
/** Load data from storage as a ReadableStream. Priority: local filesystem > OPFS > memory cache. */
537+
async loadStreamFromOPFS(key: string): Promise<ReadableStream<Uint8Array> | null> {
538+
// Try local filesystem first
539+
if (this.localFileStorage?.isReady) {
540+
const localStream = await this.localFileStorage.loadModelStream(key);
541+
if (localStream) {
542+
logger.debug(`Loading ${key} stream from local storage`);
543+
return localStream;
544+
}
545+
}
546+
547+
// Try OPFS
548+
const opfsStream = await this.storage.loadModelStream(key);
549+
if (opfsStream) {
550+
logger.debug(`Loading ${key} stream from OPFS`);
551+
return opfsStream;
552+
}
553+
554+
// Clean up corrupted 0-byte entries - we can't easily check length on the stream without consuming it,
555+
// so we skip the 0-byte check here for now and rely on loadFromOPFS to clean them up.
556+
557+
// Fall back to in-memory cache
558+
const cached = this.memoryCache.get(key);
559+
if (cached) {
560+
const sizeMB = cached.length / 1024 / 1024;
561+
logger.debug(`Loading ${key} stream from memory cache (${sizeMB.toFixed(1)} MB)`);
562+
return new ReadableStream({
563+
start(controller) {
564+
controller.enqueue(cached);
565+
controller.close();
566+
}
567+
});
568+
}
569+
570+
return null;
571+
}
572+
536573
/** Check existence in local storage, OPFS, or in-memory cache. */
537574
async existsInOPFS(key: string): Promise<boolean> {
538575
if (this.localFileStorage?.isReady) {

sdk/runanywhere-web/packages/core/src/Infrastructure/ModelLoaderTypes.ts

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -33,14 +33,18 @@ export interface ModelLoadContext {
3333
/**
3434
* Primary model file data (read from storage).
3535
*
36-
* NOTE: This requires the full model file in memory as a Uint8Array.
37-
* For the *import* path, files are streamed directly to OPFS/LocalFileStorage
38-
* to avoid peak memory issues. However, when *loading* into a WASM backend,
39-
* the full buffer is needed because Emscripten's FS.writeFile() requires it.
40-
* A future optimisation could use Emscripten's FS.createLazyFile() or
41-
* direct OPFS-to-WASM piping to avoid this buffering.
36+
* Note: This is optional. Backend loaders that support streaming
37+
* should prefer `dataStream` to avoid large memory allocations.
4238
*/
43-
data: Uint8Array;
39+
data?: Uint8Array;
40+
41+
/**
42+
* Primary model file data as a ReadableStream.
43+
*
44+
* For large models (e.g. LLMs 2-8GB), use this stream and pipe the chunks
45+
* to the WASM backend in pieces, completely avoiding full-file buffering in JS.
46+
*/
47+
dataStream?: ReadableStream<Uint8Array>;
4448

4549
/**
4650
* Download a file from a URL. Used for on-demand fetching of

sdk/runanywhere-web/packages/core/src/Infrastructure/ModelManager.ts

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -326,9 +326,9 @@ class ModelManagerImpl {
326326
if (!data) throw new Error('Model not downloaded — please download the model first.');
327327
await this.loadVADModel(model, data);
328328
} else {
329-
const data = await this.downloader.loadFromOPFS(modelId);
330-
if (!data) throw new Error('Model not downloaded — please download the model first.');
331-
await this.loadLLMModel(model, modelId, data);
329+
const dataStream = await this.downloader.loadStreamFromOPFS(modelId);
330+
if (!dataStream) throw new Error('Model not downloaded — please download the model first.');
331+
await this.loadLLMModel(model, modelId, undefined, dataStream);
332332
}
333333

334334
this.loadedByCategory.set(category, modelId);
@@ -461,10 +461,11 @@ class ModelManagerImpl {
461461
/**
462462
* Build a ModelLoadContext for passing to backend loaders.
463463
*/
464-
private buildLoadContext(model: ManagedModel, data: Uint8Array): ModelLoadContext {
464+
private buildLoadContext(model: ManagedModel, data?: Uint8Array, dataStream?: ReadableStream<Uint8Array>): ModelLoadContext {
465465
return {
466466
model,
467467
data,
468+
dataStream,
468469
downloadFile: (url: string) => this.downloader.downloadFile(url),
469470
loadFile: (fileKey: string) => this.downloader.loadFromOPFS(fileKey),
470471
storeFile: (fileKey: string, fileData: Uint8Array) => this.downloader.storeInOPFS(fileKey, fileData),
@@ -477,9 +478,9 @@ class ModelManagerImpl {
477478
* The loader (in @runanywhere/web-llamacpp) handles writing to its own
478479
* Emscripten FS and calling the C API.
479480
*/
480-
private async loadLLMModel(model: ManagedModel, _modelId: string, data: Uint8Array): Promise<void> {
481+
private async loadLLMModel(model: ManagedModel, _modelId: string, data?: Uint8Array, dataStream?: ReadableStream<Uint8Array>): Promise<void> {
481482
if (!this.llmLoader) throw new Error('No LLM loader registered. Register the @runanywhere/web-llamacpp package.');
482-
const ctx = this.buildLoadContext(model, data);
483+
const ctx = this.buildLoadContext(model, data, dataStream);
483484
await this.llmLoader.loadModelFromData(ctx);
484485
logger.info(`LLM model loaded: ${model.id}`);
485486
}
@@ -497,9 +498,9 @@ class ModelManagerImpl {
497498
if (!mmprojFile) {
498499
// No mmproj — load as text-only LLM
499500
logger.warning(`No mmproj found, loading as text-only LLM: ${modelId}`);
500-
const data = await this.downloader.loadFromOPFS(modelId);
501-
if (!data) throw new Error('Model not downloaded.');
502-
await this.loadLLMModel(model, modelId, data);
501+
const dataStream = await this.downloader.loadStreamFromOPFS(modelId);
502+
if (!dataStream) throw new Error('Model not downloaded.');
503+
await this.loadLLMModel(model, modelId, undefined, dataStream);
503504
return;
504505
}
505506

sdk/runanywhere-web/packages/core/src/Infrastructure/OPFSStorage.ts

Lines changed: 23 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -59,8 +59,8 @@ export class OPFSStorage {
5959
*/
6060
static get isSupported(): boolean {
6161
return typeof navigator !== 'undefined' &&
62-
'storage' in navigator &&
63-
'getDirectory' in (navigator.storage || {});
62+
'storage' in navigator &&
63+
'getDirectory' in (navigator.storage || {});
6464
}
6565

6666
/**
@@ -186,6 +186,27 @@ export class OPFSStorage {
186186
}
187187
}
188188

189+
/**
190+
* Load model data from OPFS as a ReadableStream.
191+
*
192+
* @param key - Model identifier or nested path
193+
* @returns Readable stream of the model data, or null if not found
194+
*/
195+
async loadModelStream(key: string): Promise<ReadableStream<Uint8Array> | null> {
196+
if (!this.modelsDir) return null;
197+
198+
try {
199+
const dir = await this.resolveParentDir(key, /* create */ false);
200+
const filename = this.resolveFilename(key);
201+
const fileHandle = await dir.getFileHandle(filename);
202+
const file = await fileHandle.getFile();
203+
logger.info(`Loading model stream from OPFS: ${key} (${(file.size / 1024 / 1024).toFixed(1)} MB)`);
204+
return file.stream() as unknown as ReadableStream<Uint8Array>;
205+
} catch {
206+
return null; // File not found
207+
}
208+
}
209+
189210
/**
190211
* Check if a model exists in OPFS.
191212
*

sdk/runanywhere-web/packages/llamacpp/src/Extensions/RunAnywhere+TextGeneration.ts

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -70,13 +70,19 @@ class TextGenerationImpl {
7070
}
7171

7272
/**
73-
* Load an LLM model from raw data via ModelLoadContext.
73+
* Load an LLM model from raw data or stream via ModelLoadContext.
7474
* Implements LLMModelLoader interface for ModelManager integration.
7575
*/
7676
async loadModelFromData(ctx: ModelLoadContext): Promise<void> {
7777
const bridge = this.requireBridge();
7878
const modelPath = `/models/${ctx.model.id}.gguf`;
79-
bridge.writeFile(modelPath, ctx.data);
79+
if (ctx.dataStream) {
80+
await bridge.writeFileStream(modelPath, ctx.dataStream);
81+
} else if (ctx.data) {
82+
bridge.writeFile(modelPath, ctx.data);
83+
} else {
84+
throw new Error('No data provided to loadModelFromData');
85+
}
8086
await this.loadModel(modelPath, ctx.model.id, ctx.model.name);
8187
}
8288

@@ -417,7 +423,7 @@ class TextGenerationImpl {
417423
resolve({ value: undefined as unknown as string, done: true });
418424
}
419425

420-
rejectResult?.(streamError);
426+
rejectResult?.(streamError!);
421427

422428
m.removeFunction(tokenCbPtr);
423429
m.removeFunction(completeCbPtr);

sdk/runanywhere-web/packages/llamacpp/src/Foundation/LlamaCppBridge.ts

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -326,6 +326,43 @@ export class LlamaCppBridge {
326326
}
327327
}
328328

329+
/**
330+
* Write a model from a ReadableStream to this WASM module's Emscripten virtual filesystem.
331+
* Useful for loading models without buffering the entire file in JS memory.
332+
*/
333+
async writeFileStream(path: string, stream: ReadableStream<Uint8Array>): Promise<void> {
334+
const m = this.module as any;
335+
const FS = m.FS;
336+
if (!FS) throw new Error('Emscripten FS not available on module');
337+
338+
const dir = path.substring(0, path.lastIndexOf('/'));
339+
if (dir && typeof m.FS_createPath === 'function') {
340+
m.FS_createPath('/', dir.replace(/^\//, ''), true, true);
341+
}
342+
343+
try { FS.unlink(path); } catch { /* ignore */ }
344+
345+
logger.debug(`Streaming to LlamaCpp FS: ${path}...`);
346+
const fileStream = FS.open(path, 'w+');
347+
try {
348+
const reader = stream.getReader();
349+
let totalBytes = 0;
350+
try {
351+
while (true) {
352+
const { done, value } = await reader.read();
353+
if (done) break;
354+
FS.write(fileStream, value, 0, value.length, undefined);
355+
totalBytes += value.length;
356+
}
357+
logger.debug(`Finished streaming ${totalBytes} bytes to LlamaCpp FS: ${path}`);
358+
} finally {
359+
reader.releaseLock();
360+
}
361+
} finally {
362+
FS.close(fileStream);
363+
}
364+
}
365+
329366
/**
330367
* Remove a file from this WASM module's filesystem.
331368
*/

sdk/runanywhere-web/packages/llamacpp/src/types/LLMTypes.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ export interface LLMGenerationOptions {
1818
}
1919

2020
export interface LLMGenerationResult {
21+
[key: string]: unknown;
2122
text: string;
2223
thinkingContent?: string;
2324
inputTokens: number;

sdk/runanywhere-web/packages/onnx/src/Extensions/STTTypes.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@ export interface STTParaformerFiles {
4848
}
4949

5050
export interface STTTranscriptionResult {
51+
[key: string]: unknown;
5152
text: string;
5253
confidence: number;
5354
detectedLanguage?: string;

0 commit comments

Comments
 (0)