Skip to content

Commit 7094a8c

Browse files
Fix/372 streaming model import (#395)
* fix(ios): use actual token count for tok/sec calculation in streaming Replace character-based token estimation (chars / 4) with actual token count tracked by the streaming metrics collector. The collector already counts each token yielded by the C++ streaming callback via recordToken(), making the character estimate unnecessary and inaccurate. Fixes #339 * feat(web): add streaming for large model imports to prevent OOM errors * Update sdk/runanywhere-web/packages/llamacpp/src/Foundation/LlamaCppBridge.ts Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com> --------- Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com>
1 parent 246aabe commit 7094a8c

12 files changed

Lines changed: 174 additions & 33 deletions

File tree

sdk/runanywhere-swift/Sources/RunAnywhere/Public/Extensions/LLM/RunAnywhere+TextGeneration.swift

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -368,14 +368,15 @@ private actor LLMStreamingMetricsCollector {
368368
timeToFirstTokenMs = firstToken.timeIntervalSince(start) * 1000
369369
}
370370

371-
let inputTokens = max(1, promptLength / 4)
372-
let outputTokens = max(1, fullText.count / 4)
373-
let tokensPerSecond = latencyMs > 0 ? Double(outputTokens) / (latencyMs / 1000.0) : 0
371+
// Use actual token count from streaming callbacks, not character estimation (fixes #339)
372+
let outputTokens = max(1, tokenCount)
373+
let totalTimeSec = latencyMs / 1000.0
374+
let tokensPerSecond = totalTimeSec > 0 ? Double(outputTokens) / totalTimeSec : 0
374375

375376
return LLMGenerationResult(
376377
text: fullText,
377378
thinkingContent: nil,
378-
inputTokens: inputTokens,
379+
inputTokens: 0,
379380
tokensUsed: outputTokens,
380381
modelUsed: modelId,
381382
latencyMs: latencyMs,

sdk/runanywhere-web/packages/core/src/Infrastructure/LocalFileStorage.ts

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -330,6 +330,25 @@ export class LocalFileStorage {
330330
}
331331
}
332332

333+
/**
334+
* Load model data from the local filesystem as a ReadableStream.
335+
* @param key - Model identifier
336+
* @returns Readable stream of the model data, or null if not found
337+
*/
338+
async loadModelStream(key: string): Promise<ReadableStream<Uint8Array> | null> {
339+
if (!this.dirHandle || !this._isReady) return null;
340+
341+
try {
342+
const filename = this.sanitizeFilename(key);
343+
const fileHandle = await this.dirHandle.getFileHandle(filename);
344+
const file = await fileHandle.getFile();
345+
logger.info(`Loading model stream from local storage: ${key} (${(file.size / 1024 / 1024).toFixed(1)} MB)`);
346+
return file.stream() as unknown as ReadableStream<Uint8Array>;
347+
} catch {
348+
return null; // File not found
349+
}
350+
}
351+
333352
/**
334353
* Check if a model file exists in local storage.
335354
* @param key - Model identifier

sdk/runanywhere-web/packages/core/src/Infrastructure/ModelDownloader.ts

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -533,6 +533,43 @@ export class ModelDownloader {
533533
return null;
534534
}
535535

536+
/** Load data from storage as a ReadableStream. Priority: local filesystem > OPFS > memory cache. */
537+
async loadStreamFromOPFS(key: string): Promise<ReadableStream<Uint8Array> | null> {
538+
// Try local filesystem first
539+
if (this.localFileStorage?.isReady) {
540+
const localStream = await this.localFileStorage.loadModelStream(key);
541+
if (localStream) {
542+
logger.debug(`Loading ${key} stream from local storage`);
543+
return localStream;
544+
}
545+
}
546+
547+
// Try OPFS
548+
const opfsStream = await this.storage.loadModelStream(key);
549+
if (opfsStream) {
550+
logger.debug(`Loading ${key} stream from OPFS`);
551+
return opfsStream;
552+
}
553+
554+
// Clean up corrupted 0-byte entries - we can't easily check length on the stream without consuming it,
555+
// so we skip the 0-byte check here for now and rely on loadFromOPFS to clean them up.
556+
557+
// Fall back to in-memory cache
558+
const cached = this.memoryCache.get(key);
559+
if (cached) {
560+
const sizeMB = cached.length / 1024 / 1024;
561+
logger.debug(`Loading ${key} stream from memory cache (${sizeMB.toFixed(1)} MB)`);
562+
return new ReadableStream({
563+
start(controller) {
564+
controller.enqueue(cached);
565+
controller.close();
566+
}
567+
});
568+
}
569+
570+
return null;
571+
}
572+
536573
/** Check existence in local storage, OPFS, or in-memory cache. */
537574
async existsInOPFS(key: string): Promise<boolean> {
538575
if (this.localFileStorage?.isReady) {

sdk/runanywhere-web/packages/core/src/Infrastructure/ModelLoaderTypes.ts

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -33,14 +33,18 @@ export interface ModelLoadContext {
3333
/**
3434
* Primary model file data (read from storage).
3535
*
36-
* NOTE: This requires the full model file in memory as a Uint8Array.
37-
* For the *import* path, files are streamed directly to OPFS/LocalFileStorage
38-
* to avoid peak memory issues. However, when *loading* into a WASM backend,
39-
* the full buffer is needed because Emscripten's FS.writeFile() requires it.
40-
* A future optimisation could use Emscripten's FS.createLazyFile() or
41-
* direct OPFS-to-WASM piping to avoid this buffering.
36+
* Note: This is optional. Backend loaders that support streaming
37+
* should prefer `dataStream` to avoid large memory allocations.
4238
*/
43-
data: Uint8Array;
39+
data?: Uint8Array;
40+
41+
/**
42+
* Primary model file data as a ReadableStream.
43+
*
44+
* For large models (e.g. LLMs 2-8GB), use this stream and pipe the chunks
45+
* to the WASM backend in pieces, completely avoiding full-file buffering in JS.
46+
*/
47+
dataStream?: ReadableStream<Uint8Array>;
4448

4549
/**
4650
* Download a file from a URL. Used for on-demand fetching of

sdk/runanywhere-web/packages/core/src/Infrastructure/ModelManager.ts

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -326,9 +326,9 @@ class ModelManagerImpl {
326326
if (!data) throw new Error('Model not downloaded — please download the model first.');
327327
await this.loadVADModel(model, data);
328328
} else {
329-
const data = await this.downloader.loadFromOPFS(modelId);
330-
if (!data) throw new Error('Model not downloaded — please download the model first.');
331-
await this.loadLLMModel(model, modelId, data);
329+
const dataStream = await this.downloader.loadStreamFromOPFS(modelId);
330+
if (!dataStream) throw new Error('Model not downloaded — please download the model first.');
331+
await this.loadLLMModel(model, modelId, undefined, dataStream);
332332
}
333333

334334
this.loadedByCategory.set(category, modelId);
@@ -461,10 +461,11 @@ class ModelManagerImpl {
461461
/**
462462
* Build a ModelLoadContext for passing to backend loaders.
463463
*/
464-
private buildLoadContext(model: ManagedModel, data: Uint8Array): ModelLoadContext {
464+
private buildLoadContext(model: ManagedModel, data?: Uint8Array, dataStream?: ReadableStream<Uint8Array>): ModelLoadContext {
465465
return {
466466
model,
467467
data,
468+
dataStream,
468469
downloadFile: (url: string) => this.downloader.downloadFile(url),
469470
loadFile: (fileKey: string) => this.downloader.loadFromOPFS(fileKey),
470471
storeFile: (fileKey: string, fileData: Uint8Array) => this.downloader.storeInOPFS(fileKey, fileData),
@@ -477,9 +478,9 @@ class ModelManagerImpl {
477478
* The loader (in @runanywhere/web-llamacpp) handles writing to its own
478479
* Emscripten FS and calling the C API.
479480
*/
480-
private async loadLLMModel(model: ManagedModel, _modelId: string, data: Uint8Array): Promise<void> {
481+
private async loadLLMModel(model: ManagedModel, _modelId: string, data?: Uint8Array, dataStream?: ReadableStream<Uint8Array>): Promise<void> {
481482
if (!this.llmLoader) throw new Error('No LLM loader registered. Register the @runanywhere/web-llamacpp package.');
482-
const ctx = this.buildLoadContext(model, data);
483+
const ctx = this.buildLoadContext(model, data, dataStream);
483484
await this.llmLoader.loadModelFromData(ctx);
484485
logger.info(`LLM model loaded: ${model.id}`);
485486
}
@@ -497,9 +498,9 @@ class ModelManagerImpl {
497498
if (!mmprojFile) {
498499
// No mmproj — load as text-only LLM
499500
logger.warning(`No mmproj found, loading as text-only LLM: ${modelId}`);
500-
const data = await this.downloader.loadFromOPFS(modelId);
501-
if (!data) throw new Error('Model not downloaded.');
502-
await this.loadLLMModel(model, modelId, data);
501+
const dataStream = await this.downloader.loadStreamFromOPFS(modelId);
502+
if (!dataStream) throw new Error('Model not downloaded.');
503+
await this.loadLLMModel(model, modelId, undefined, dataStream);
503504
return;
504505
}
505506

sdk/runanywhere-web/packages/core/src/Infrastructure/OPFSStorage.ts

Lines changed: 23 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -59,8 +59,8 @@ export class OPFSStorage {
5959
*/
6060
static get isSupported(): boolean {
6161
return typeof navigator !== 'undefined' &&
62-
'storage' in navigator &&
63-
'getDirectory' in (navigator.storage || {});
62+
'storage' in navigator &&
63+
'getDirectory' in (navigator.storage || {});
6464
}
6565

6666
/**
@@ -186,6 +186,27 @@ export class OPFSStorage {
186186
}
187187
}
188188

189+
/**
190+
* Load model data from OPFS as a ReadableStream.
191+
*
192+
* @param key - Model identifier or nested path
193+
* @returns Readable stream of the model data, or null if not found
194+
*/
195+
async loadModelStream(key: string): Promise<ReadableStream<Uint8Array> | null> {
196+
if (!this.modelsDir) return null;
197+
198+
try {
199+
const dir = await this.resolveParentDir(key, /* create */ false);
200+
const filename = this.resolveFilename(key);
201+
const fileHandle = await dir.getFileHandle(filename);
202+
const file = await fileHandle.getFile();
203+
logger.info(`Loading model stream from OPFS: ${key} (${(file.size / 1024 / 1024).toFixed(1)} MB)`);
204+
return file.stream() as unknown as ReadableStream<Uint8Array>;
205+
} catch {
206+
return null; // File not found
207+
}
208+
}
209+
189210
/**
190211
* Check if a model exists in OPFS.
191212
*

sdk/runanywhere-web/packages/llamacpp/src/Extensions/RunAnywhere+TextGeneration.ts

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -70,13 +70,19 @@ class TextGenerationImpl {
7070
}
7171

7272
/**
73-
* Load an LLM model from raw data via ModelLoadContext.
73+
* Load an LLM model from raw data or stream via ModelLoadContext.
7474
* Implements LLMModelLoader interface for ModelManager integration.
7575
*/
7676
async loadModelFromData(ctx: ModelLoadContext): Promise<void> {
7777
const bridge = this.requireBridge();
7878
const modelPath = `/models/${ctx.model.id}.gguf`;
79-
bridge.writeFile(modelPath, ctx.data);
79+
if (ctx.dataStream) {
80+
await bridge.writeFileStream(modelPath, ctx.dataStream);
81+
} else if (ctx.data) {
82+
bridge.writeFile(modelPath, ctx.data);
83+
} else {
84+
throw new Error('No data provided to loadModelFromData');
85+
}
8086
await this.loadModel(modelPath, ctx.model.id, ctx.model.name);
8187
}
8288

@@ -417,7 +423,7 @@ class TextGenerationImpl {
417423
resolve({ value: undefined as unknown as string, done: true });
418424
}
419425

420-
rejectResult?.(streamError);
426+
rejectResult?.(streamError!);
421427

422428
m.removeFunction(tokenCbPtr);
423429
m.removeFunction(completeCbPtr);

sdk/runanywhere-web/packages/llamacpp/src/Foundation/LlamaCppBridge.ts

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -326,6 +326,43 @@ export class LlamaCppBridge {
326326
}
327327
}
328328

329+
/**
330+
* Write a model from a ReadableStream to this WASM module's Emscripten virtual filesystem.
331+
* Useful for loading models without buffering the entire file in JS memory.
332+
*/
333+
async writeFileStream(path: string, stream: ReadableStream<Uint8Array>): Promise<void> {
334+
const m = this.module as any;
335+
const FS = m.FS;
336+
if (!FS) throw new Error('Emscripten FS not available on module');
337+
338+
const dir = path.substring(0, path.lastIndexOf('/'));
339+
if (dir && typeof m.FS_createPath === 'function') {
340+
m.FS_createPath('/', dir.replace(/^\//, ''), true, true);
341+
}
342+
343+
try { FS.unlink(path); } catch { /* ignore */ }
344+
345+
logger.debug(`Streaming to LlamaCpp FS: ${path}...`);
346+
const fileStream = FS.open(path, 'w+');
347+
try {
348+
const reader = stream.getReader();
349+
let totalBytes = 0;
350+
try {
351+
while (true) {
352+
const { done, value } = await reader.read();
353+
if (done) break;
354+
FS.write(fileStream, value, 0, value.length, undefined);
355+
totalBytes += value.length;
356+
}
357+
logger.debug(`Finished streaming ${totalBytes} bytes to LlamaCpp FS: ${path}`);
358+
} finally {
359+
reader.releaseLock();
360+
}
361+
} finally {
362+
FS.close(fileStream);
363+
}
364+
}
365+
329366
/**
330367
* Remove a file from this WASM module's filesystem.
331368
*/

sdk/runanywhere-web/packages/llamacpp/src/types/LLMTypes.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ export interface LLMGenerationOptions {
1818
}
1919

2020
export interface LLMGenerationResult {
21+
[key: string]: unknown;
2122
text: string;
2223
thinkingContent?: string;
2324
inputTokens: number;

sdk/runanywhere-web/packages/onnx/src/Extensions/STTTypes.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@ export interface STTParaformerFiles {
4848
}
4949

5050
export interface STTTranscriptionResult {
51+
[key: string]: unknown;
5152
text: string;
5253
confidence: number;
5354
detectedLanguage?: string;

0 commit comments

Comments
 (0)