Skip to content

Commit c050c22

Browse files
fix(ios-sdk): propagate STTOptions.language to ONNX decoder via withCString (#404)
* fix(ios-sdk): propagate STTOptions.language to C bridge via withCString STTOptions.language was silently ignored during transcription, causing the Whisper ONNX decoder to always default to English regardless of the language passed by the caller. Root causes: - Dangling pointer: (options.language as NSString).utf8String stored a pointer into a temporary NSString. ARC released it after the assignment, leaving cOptions.language pointing to freed memory before the C call. The backend read garbage and fell back to 'en'. - Incomplete mapping: only language and sample_rate were forwarded to rac_stt_options_t. detect_language, enable_punctuation, enable_diarization, enable_timestamps, audio_format, and max_speakers were silently zero-initialized regardless of caller input. - processStreamingAudio never set a language field at all. Fix: add a private STTOptions.withCOptions<T> helper that uses withCString to guarantee pointer validity for the full duration of the C call and maps all fields in one place. Apply it to transcribeWithOptions, transcribeStream, and processStreamingAudio. Fixes #350 * refactor(ios-sdk): remove duplicate withCOptions, use existing STTTypes.swift impl The private withCOptions extension in RunAnywhere+STT.swift duplicated the public STTOptions.withCOptions already defined in STTTypes.swift:127. Both used withCString for pointer safety and mapped all rac_stt_options_t fields identically. Remove the duplicate and update the three call sites to use the public method's UnsafePointer signature (cOptionsPtr instead of &cOptions). 
* fix(ios-sdk): address review comments on processStreamingAudio and STTStreamingContext - Change processStreamingAudio signature from language: String to options: STTOptions = STTOptions() so detectLanguage and all other fields are preserved, matching transcribeWithOptions/transcribeStream - Add missing isLoaded guard to processStreamingAudio before getHandle(), surfacing an actionable notInitialized error instead of processingFailed - Add clarifying comment on sttResult in processStreamingAudio: value is intentionally unused as the C layer delivers results via CppEventBridge - Fix STTStreamingContext data race: replace @unchecked Sendable + bare var with OSAllocatedUnfairLock-protected finalText so C callback writes and async continuation reads are properly synchronised * fix(ios-sdk): address final review comments on STTStreamingContext and guard order - Mark onPartialResult as @Sendable so STTStreamingContext's explicit Sendable conformance is sound; a non-@Sendable stored closure makes the conformance unsound under Swift's strict concurrency checking - Swap guard order in processStreamingAudio to match transcribeWithOptions and transcribeStream: getHandle() first, then isLoaded check
1 parent e621246 commit c050c22

1 file changed

Lines changed: 71 additions & 65 deletions

File tree

sdk/runanywhere-swift/Sources/RunAnywhere/Public/Extensions/STT/RunAnywhere+STT.swift

Lines changed: 71 additions & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
@preconcurrency import AVFoundation
1111
import CRACommons
1212
import Foundation
13+
import os
1314

1415
// MARK: - STT Operations
1516

@@ -77,21 +78,18 @@ public extension RunAnywhere {
7778
let audioSizeBytes = audioData.count
7879
let audioLengthSec = estimateAudioLength(dataSize: audioSizeBytes)
7980

80-
// Build C options
81-
var cOptions = rac_stt_options_t()
82-
cOptions.language = (options.language as NSString).utf8String
83-
cOptions.sample_rate = Int32(options.sampleRate)
84-
8581
// Transcribe (C++ emits events)
8682
var sttResult = rac_stt_result_t()
87-
let transcribeResult = audioData.withUnsafeBytes { audioPtr in
88-
rac_stt_component_transcribe(
89-
handle,
90-
audioPtr.baseAddress,
91-
audioData.count,
92-
&cOptions,
93-
&sttResult
94-
)
83+
let transcribeResult = options.withCOptions { cOptionsPtr in
84+
audioData.withUnsafeBytes { audioPtr in
85+
rac_stt_component_transcribe(
86+
handle,
87+
audioPtr.baseAddress,
88+
audioData.count,
89+
cOptionsPtr,
90+
&sttResult
91+
)
92+
}
9593
}
9694

9795
guard transcribeResult == RAC_SUCCESS else {
@@ -213,39 +211,36 @@ public extension RunAnywhere {
213211
let context = STTStreamingContext(onPartialResult: onPartialResult)
214212
let contextPtr = Unmanaged.passRetained(context).toOpaque()
215213

216-
// Build C options
217-
var cOptions = rac_stt_options_t()
218-
cOptions.language = (options.language as NSString).utf8String
219-
cOptions.sample_rate = Int32(options.sampleRate)
220-
221214
// Stream transcription with callback
222-
let result = audioData.withUnsafeBytes { audioPtr in
223-
rac_stt_component_transcribe_stream(
224-
handle,
225-
audioPtr.baseAddress,
226-
audioData.count,
227-
&cOptions,
228-
{ partialText, isFinal, userData in
229-
guard let userData = userData else { return }
230-
let ctx = Unmanaged<STTStreamingContext>.fromOpaque(userData).takeUnretainedValue()
231-
232-
let text = partialText.map { String(cString: $0) } ?? ""
233-
let partialResult = STTTranscriptionResult(
234-
transcript: text,
235-
confidence: nil,
236-
timestamps: nil,
237-
language: nil,
238-
alternatives: nil
239-
)
240-
241-
ctx.onPartialResult(partialResult)
242-
243-
if isFinal == RAC_TRUE {
244-
ctx.finalText = text
245-
}
246-
},
247-
contextPtr
248-
)
215+
let result = options.withCOptions { cOptionsPtr in
216+
audioData.withUnsafeBytes { audioPtr in
217+
rac_stt_component_transcribe_stream(
218+
handle,
219+
audioPtr.baseAddress,
220+
audioData.count,
221+
cOptionsPtr,
222+
{ partialText, isFinal, userData in
223+
guard let userData = userData else { return }
224+
let ctx = Unmanaged<STTStreamingContext>.fromOpaque(userData).takeUnretainedValue()
225+
226+
let text = partialText.map { String(cString: $0) } ?? ""
227+
let partialResult = STTTranscriptionResult(
228+
transcript: text,
229+
confidence: nil,
230+
timestamps: nil,
231+
language: nil,
232+
alternatives: nil
233+
)
234+
235+
ctx.onPartialResult(partialResult)
236+
237+
if isFinal == RAC_TRUE {
238+
ctx.finalText = text
239+
}
240+
},
241+
contextPtr
242+
)
243+
}
249244
}
250245

251246
// Release context
@@ -276,30 +271,34 @@ public extension RunAnywhere {
276271
}
277272

278273
/// Process audio samples for streaming transcription
279-
/// - Parameter samples: Audio samples
280-
static func processStreamingAudio(_ samples: [Float]) async throws {
274+
/// - Parameters:
275+
/// - samples: Audio samples
276+
/// - options: Transcription options (default: STTOptions())
277+
static func processStreamingAudio(_ samples: [Float], options: STTOptions = STTOptions()) async throws {
281278
guard isSDKInitialized else {
282279
throw SDKError.general(.notInitialized, "SDK not initialized")
283280
}
284281

285282
let handle = try await CppBridge.STT.shared.getHandle()
286283

287-
var cOptions = rac_stt_options_t()
288-
cOptions.sample_rate = Int32(RAC_STT_DEFAULT_SAMPLE_RATE)
289-
290-
let data = samples.withUnsafeBufferPointer { buffer in
291-
Data(buffer: buffer)
284+
guard await CppBridge.STT.shared.isLoaded else {
285+
throw SDKError.stt(.notInitialized, "STT model not loaded")
292286
}
293287

288+
let data = samples.withUnsafeBufferPointer { Data(buffer: $0) }
289+
290+
// sttResult is intentionally unused: the C layer delivers results via CppEventBridge events.
294291
var sttResult = rac_stt_result_t()
295-
let transcribeResult = data.withUnsafeBytes { audioPtr in
296-
rac_stt_component_transcribe(
297-
handle,
298-
audioPtr.baseAddress,
299-
data.count,
300-
&cOptions,
301-
&sttResult
302-
)
292+
let transcribeResult = options.withCOptions { cOptionsPtr in
293+
data.withUnsafeBytes { audioPtr in
294+
rac_stt_component_transcribe(
295+
handle,
296+
audioPtr.baseAddress,
297+
data.count,
298+
cOptionsPtr,
299+
&sttResult
300+
)
301+
}
303302
}
304303

305304
if transcribeResult != RAC_SUCCESS {
@@ -325,12 +324,19 @@ public extension RunAnywhere {
325324

326325
// MARK: - Streaming Context Helper
327326

328-
/// Context class for bridging C callbacks to Swift closures
329-
private final class STTStreamingContext: @unchecked Sendable {
330-
let onPartialResult: (STTTranscriptionResult) -> Void
331-
var finalText: String = ""
327+
/// Context class for bridging C callbacks to Swift closures.
328+
/// `finalText` is written by the C callback thread and read by the async
329+
/// continuation — protected by OSAllocatedUnfairLock to prevent data races.
330+
private final class STTStreamingContext: Sendable {
331+
let onPartialResult: @Sendable (STTTranscriptionResult) -> Void
332+
private let _finalText = OSAllocatedUnfairLock(initialState: "")
333+
334+
var finalText: String {
335+
get { _finalText.withLock { $0 } }
336+
set { _finalText.withLock { $0 = newValue } }
337+
}
332338

333-
init(onPartialResult: @escaping (STTTranscriptionResult) -> Void) {
339+
init(onPartialResult: @Sendable @escaping (STTTranscriptionResult) -> Void) {
334340
self.onPartialResult = onPartialResult
335341
}
336342
}

0 commit comments

Comments
 (0)