Skip to content

Commit 85d1f98

Browse files
Merge PR #471: VAD model loading infrastructure + iOS VAD playground
Introduces first-class VAD model-loading path in C commons (new rac_vad_service_t vtable and rac_vad_component_load_model/is_loaded/unload API), registers ONNX as a VAD provider (Silero), exposes it through the Swift SDK, ships an iOS demo playground, and bundles YapRun keyboard refinements. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2 parents ae5fea9 + 1792730 commit 85d1f98

39 files changed

Lines changed: 1736 additions & 150 deletions

Playground/YapRun/YapRun/Features/Playground/PlaygroundView.swift

Lines changed: 34 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -41,6 +41,13 @@ struct PlaygroundView: View {
4141
}
4242
}
4343
.task { await viewModel.checkModelStatus() }
44+
.onDisappear {
45+
// Release audio resources when leaving the tab to prevent conflicts
46+
// with FlowSessionManager's AudioCaptureManager.
47+
if viewModel.isRecording {
48+
Task { await viewModel.toggleRecording() }
49+
}
50+
}
4451
.onReceive(NotificationCenter.default.publisher(for: UIApplication.willEnterForegroundNotification)) { _ in
4552
Task { await viewModel.checkModelStatus() }
4653
}
@@ -126,18 +133,38 @@ struct PlaygroundView: View {
126133
if viewModel.isRecording {
127134
// Recording indicator
128135
VStack(spacing: 12) {
129-
// Elapsed time
130-
Text(formatTime(viewModel.elapsedSeconds))
131-
.font(.system(size: 20, weight: .semibold, design: .monospaced))
132-
.foregroundStyle(Color.red)
136+
// Elapsed time + speech indicator
137+
HStack(spacing: 8) {
138+
Text(formatTime(viewModel.elapsedSeconds))
139+
.font(.system(size: 20, weight: .semibold, design: .monospaced))
140+
.foregroundStyle(Color.red)
141+
142+
if viewModel.isAutoStopEnabled {
143+
Circle()
144+
.fill(viewModel.speechDetected ? Color.green : Color.gray.opacity(0.4))
145+
.frame(width: 8, height: 8)
146+
}
147+
}
133148

134149
// Waveform bars
135150
WaveformBars(level: viewModel.audioLevel)
136151
}
137152
} else {
138-
Text(viewModel.transcription.isEmpty ? "Tap to record" : "Tap to record again")
139-
.font(.subheadline)
140-
.foregroundStyle(AppColors.textTertiary)
153+
VStack(spacing: 12) {
154+
Text(viewModel.transcription.isEmpty ? "Tap to record" : "Tap to record again")
155+
.font(.subheadline)
156+
.foregroundStyle(AppColors.textTertiary)
157+
158+
// Auto-stop toggle
159+
Toggle(isOn: $viewModel.isAutoStopEnabled) {
160+
Label("Auto-stop on silence", systemImage: "waveform.badge.minus")
161+
.font(.caption)
162+
.foregroundStyle(AppColors.textSecondary)
163+
}
164+
.toggleStyle(.switch)
165+
.tint(AppColors.primaryGreen)
166+
.frame(width: 240)
167+
}
141168
}
142169
}
143170
}

Playground/YapRun/YapRun/Features/Playground/PlaygroundViewModel.swift

Lines changed: 85 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -25,11 +25,21 @@ final class PlaygroundViewModel {
2525
var errorMessage: String?
2626
var modelName: String?
2727

28+
// MARK: - Auto-Stop (VAD)
29+
30+
var isAutoStopEnabled = false
31+
var speechDetected = false
32+
2833
// MARK: - Private
2934

3035
private let audioCapture = AudioCaptureManager()
3136
private var audioBuffer = Foundation.Data()
3237
private var timerTask: Task<Void, Never>?
38+
private var vadMonitorTask: Task<Void, Never>?
39+
private var vadProcessedBytes = 0
40+
private var silenceStartTime: Date?
41+
private var hasSpeechBeenDetected = false
42+
private let autoStopSilenceDuration: TimeInterval = 2.0
3343
private let logger = Logger(subsystem: "com.runanywhere.yaprun", category: "Playground")
3444

3545
// MARK: - Model Check
@@ -58,6 +68,15 @@ final class PlaygroundViewModel {
5868
return
5969
}
6070

71+
// Prevent conflict with active voice keyboard session (iOS-only).
72+
// FlowSessionManager is compiled `#if os(iOS)` so this check is skipped on macOS.
73+
#if os(iOS)
74+
guard !FlowSessionManager.shared.isActive else {
75+
errorMessage = "Voice keyboard session is active. End it first."
76+
return
77+
}
78+
#endif
79+
6180
let permitted = await audioCapture.requestPermission()
6281
guard permitted else {
6382
errorMessage = "Microphone access is required."
@@ -71,7 +90,7 @@ final class PlaygroundViewModel {
7190

7291
do {
7392
// AudioCaptureManager dispatches this callback on DispatchQueue.main
74-
try audioCapture.startRecording { [weak self] data in
93+
try await audioCapture.startRecording { [weak self] data in
7594
MainActor.assumeIsolated {
7695
guard let self else { return }
7796
self.audioBuffer.append(data)
@@ -80,7 +99,10 @@ final class PlaygroundViewModel {
8099
}
81100
isRecording = true
82101
startTimer()
83-
logger.info("Recording started")
102+
if isAutoStopEnabled {
103+
startVADMonitoring()
104+
}
105+
logger.info("Recording started (autoStop=\(self.isAutoStopEnabled))")
84106
} catch {
85107
errorMessage = "Could not start microphone: \(error.localizedDescription)"
86108
logger.error("Recording start failed: \(error.localizedDescription)")
@@ -91,8 +113,11 @@ final class PlaygroundViewModel {
91113
audioCapture.stopRecording()
92114
isRecording = false
93115
audioLevel = 0
116+
speechDetected = false
94117
timerTask?.cancel()
95118
timerTask = nil
119+
vadMonitorTask?.cancel()
120+
vadMonitorTask = nil
96121

97122
guard !audioBuffer.isEmpty else {
98123
errorMessage = "No audio was captured."
@@ -133,5 +158,63 @@ final class PlaygroundViewModel {
133158
audioBuffer = Foundation.Data()
134159
errorMessage = nil
135160
elapsedSeconds = 0
161+
speechDetected = false
162+
}
163+
164+
// MARK: - VAD Monitoring
165+
166+
private func startVADMonitoring() {
167+
vadProcessedBytes = 0
168+
hasSpeechBeenDetected = false
169+
silenceStartTime = nil
170+
speechDetected = false
171+
172+
vadMonitorTask = Task { [weak self] in
173+
while !Task.isCancelled {
174+
try? await Task.sleep(nanoseconds: 100_000_000) // 100ms
175+
guard let self, !Task.isCancelled, self.isRecording else { break }
176+
await self.processVADChunk()
177+
}
178+
}
179+
}
180+
181+
private func processVADChunk() async {
182+
let currentSize = audioBuffer.count
183+
guard currentSize > vadProcessedBytes else { return }
184+
185+
let newData = audioBuffer.subdata(in: vadProcessedBytes..<currentSize)
186+
vadProcessedBytes = currentSize
187+
188+
let samples = convertInt16ToFloat(newData)
189+
guard !samples.isEmpty else { return }
190+
191+
do {
192+
let isSpeech = try await RunAnywhere.detectSpeech(in: samples)
193+
speechDetected = isSpeech
194+
195+
if isSpeech {
196+
hasSpeechBeenDetected = true
197+
silenceStartTime = nil
198+
} else if hasSpeechBeenDetected {
199+
if silenceStartTime == nil {
200+
silenceStartTime = Date()
201+
} else if let start = silenceStartTime,
202+
Date().timeIntervalSince(start) >= autoStopSilenceDuration
203+
{
204+
logger.info("Auto-stop: \(self.autoStopSilenceDuration)s silence after speech")
205+
await stopAndTranscribe()
206+
}
207+
}
208+
} catch {
209+
logger.error("VAD error: \(error.localizedDescription)")
210+
}
211+
}
212+
213+
private func convertInt16ToFloat(_ data: Foundation.Data) -> [Float] {
214+
let sampleCount = data.count / MemoryLayout<Int16>.size
215+
return data.withUnsafeBytes { rawBuffer in
216+
let int16Buffer = rawBuffer.bindMemory(to: Int16.self)
217+
return (0..<sampleCount).map { Float(int16Buffer[$0]) / 32768.0 }
218+
}
136219
}
137220
}

Playground/YapRun/YapRun/Features/VoiceKeyboard/FlowSessionManager.swift

Lines changed: 25 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -159,7 +159,7 @@ final class FlowSessionManager: ObservableObject {
159159
// Start AVAudioEngine while foregrounded
160160
do {
161161
// AudioCaptureManager dispatches this callback on DispatchQueue.main
162-
try audioCapture.startRecording { [weak self] data in
162+
try await audioCapture.startRecording { [weak self] data in
163163
MainActor.assumeIsolated {
164164
guard let self else { return }
165165
SharedDataBridge.shared.audioLevel = self.audioCapture.audioLevel
@@ -191,6 +191,7 @@ final class FlowSessionManager: ObservableObject {
191191
logger.warning("startListening received in unexpected phase: \(self.sessionPhase.description)")
192192
return
193193
}
194+
logger.info("startListening received; transitioning to listening")
194195
audioBuffer = Data()
195196
transition(to: .listening)
196197
SharedDataBridge.shared.sessionState = "listening"
@@ -206,6 +207,7 @@ final class FlowSessionManager: ObservableObject {
206207
return
207208
}
208209

210+
logger.info("stopListening received; transitioning to transcribing")
209211
transition(to: .transcribing)
210212
SharedDataBridge.shared.sessionState = "transcribing"
211213
if #available(iOS 16.1, *) {
@@ -251,8 +253,20 @@ final class FlowSessionManager: ObservableObject {
251253
logger.info("Transcribing \(audio.count) bytes")
252254

253255
do {
254-
let text = try await RunAnywhere.transcribe(audio)
256+
let text = try await Task.detached(priority: .userInitiated) {
257+
try await RunAnywhere.transcribe(audio)
258+
}.value
255259
logger.info("Transcription complete: \"\(text)\"")
260+
261+
// Task.detached drops structured cancellation, so endSession()/killSession()
262+
// cannot cancel the transcription in flight. If the session was torn down
263+
// while transcription was running, discard the result rather than writing
264+
// it to SharedDataBridge after the session is idle.
265+
guard case .transcribing = sessionPhase else {
266+
logger.info("Session no longer transcribing — discarding result")
267+
return
268+
}
269+
256270
wordCount += text.split(separator: " ").count
257271

258272
if #available(iOS 16.1, *) {
@@ -277,9 +291,15 @@ final class FlowSessionManager: ObservableObject {
277291
SharedDataBridge.shared.lastInsertedText = text
278292
SharedDataBridge.shared.sessionState = "done"
279293

280-
DarwinNotificationCenter.shared.post(
281-
name: SharedConstants.DarwinNotifications.transcriptionReady
282-
)
294+
// Small delay to give the cross-process UserDefaults write time to propagate
295+
// before posting the Darwin notification, so the keyboard extension reads the
296+
// updated value rather than a stale one. 50ms is a heuristic, not a guarantee.
297+
Task {
298+
try? await Task.sleep(nanoseconds: 50_000_000) // 50ms
299+
DarwinNotificationCenter.shared.post(
300+
name: SharedConstants.DarwinNotifications.transcriptionReady
301+
)
302+
}
283303

284304
transition(to: .done(text))
285305
appendHistory(text: text)

Playground/YapRun/YapRun/Shared/SharedConstants.swift

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -39,6 +39,8 @@ enum SharedConstants {
3939
// app → keyboard
4040
static let transcriptionReady = "com.runanywhere.yaprun.keyboard.transcriptionReady"
4141
static let sessionReady = "com.runanywhere.yaprun.session.ready"
42+
static let stateChanged = "com.runanywhere.yaprun.session.stateChanged"
43+
static let audioLevelChanged = "com.runanywhere.yaprun.audioLevelChanged"
4244
// keyboard → app
4345
static let startListening = "com.runanywhere.yaprun.keyboard.startListening"
4446
static let stopListening = "com.runanywhere.yaprun.keyboard.stopListening"

Playground/YapRun/YapRun/Shared/SharedDataBridge.swift

Lines changed: 13 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -74,20 +74,22 @@ final class SharedDataBridge {
7474

7575
var sessionState: String {
7676
get {
77-
defaults?.synchronize()
7877
return defaults?.string(forKey: SharedConstants.Keys.sessionState) ?? "idle"
7978
}
8079
set {
8180
defaults?.set(newValue, forKey: SharedConstants.Keys.sessionState)
82-
defaults?.synchronize()
81+
// Push-notify the keyboard extension (or main app) immediately so it
82+
// doesn't have to wait for the next poll timer tick to see the change.
83+
DarwinNotificationCenter.shared.post(
84+
name: SharedConstants.DarwinNotifications.stateChanged
85+
)
8386
}
8487
}
8588

8689
// MARK: - Transcription Result
8790

8891
var transcribedText: String? {
8992
get {
90-
defaults?.synchronize()
9193
return defaults?.string(forKey: SharedConstants.Keys.transcribedText)
9294
}
9395
set {
@@ -96,7 +98,6 @@ final class SharedDataBridge {
9698
} else {
9799
defaults?.removeObject(forKey: SharedConstants.Keys.transcribedText)
98100
}
99-
defaults?.synchronize()
100101
}
101102
}
102103

@@ -118,7 +119,14 @@ final class SharedDataBridge {
118119

119120
var audioLevel: Float {
120121
get { defaults?.float(forKey: SharedConstants.Keys.audioLevel) ?? 0 }
121-
set { defaults?.set(newValue, forKey: SharedConstants.Keys.audioLevel) }
122+
set {
123+
defaults?.set(newValue, forKey: SharedConstants.Keys.audioLevel)
124+
// Push-notify the keyboard extension so it can read the cached value
125+
// instead of polling UserDefaults on every waveform tick.
126+
DarwinNotificationCenter.shared.post(
127+
name: SharedConstants.DarwinNotifications.audioLevelChanged
128+
)
129+
}
122130
}
123131

124132
// MARK: - Heartbeat

Playground/YapRun/YapRun/macOS/Services/MacDictationService.swift

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -166,7 +166,7 @@ final class MacDictationService {
166166

167167
do {
168168
// AudioCaptureManager dispatches this callback on DispatchQueue.main
169-
try audioCapture.startRecording { [weak self] data in
169+
try await audioCapture.startRecording { [weak self] data in
170170
MainActor.assumeIsolated {
171171
guard let self else { return }
172172
self.audioBuffer.append(data)

0 commit comments

Comments
 (0)