Skip to content

Commit 525243e

Browse files
VAD model loading infrastructure + iOS VAD playground
- Add rac_vad_component_load_model/is_loaded/unload C API - Add rac_vad_service_t vtable (mirrors STT service pattern) - Wire ONNX backend as VAD provider (Silero VAD) - Dispatch vad_component through service registry when model loaded - Add VADViewModel + VoiceActivityDetectionView to iOS demo app - Bridge: CppBridge+VAD, ModelTypes plumbing, RunAnywhere+ModelManagement - Misc: AudioCaptureManager tweaks, WhisperKitSTTService, voice session Also includes YapRun playground/keyboard extension changes.
1 parent b81095e commit 525243e

39 files changed

Lines changed: 1695 additions & 152 deletions

Playground/YapRun/YapRun/Features/Playground/PlaygroundView.swift

Lines changed: 34 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,13 @@ struct PlaygroundView: View {
4141
}
4242
}
4343
.task { await viewModel.checkModelStatus() }
44+
.onDisappear {
45+
// Release audio resources when leaving the tab to prevent conflicts
46+
// with FlowSessionManager's AudioCaptureManager.
47+
if viewModel.isRecording {
48+
Task { await viewModel.toggleRecording() }
49+
}
50+
}
4451
.onReceive(NotificationCenter.default.publisher(for: UIApplication.willEnterForegroundNotification)) { _ in
4552
Task { await viewModel.checkModelStatus() }
4653
}
@@ -126,18 +133,38 @@ struct PlaygroundView: View {
126133
if viewModel.isRecording {
127134
// Recording indicator
128135
VStack(spacing: 12) {
129-
// Elapsed time
130-
Text(formatTime(viewModel.elapsedSeconds))
131-
.font(.system(size: 20, weight: .semibold, design: .monospaced))
132-
.foregroundStyle(Color.red)
136+
// Elapsed time + speech indicator
137+
HStack(spacing: 8) {
138+
Text(formatTime(viewModel.elapsedSeconds))
139+
.font(.system(size: 20, weight: .semibold, design: .monospaced))
140+
.foregroundStyle(Color.red)
141+
142+
if viewModel.isAutoStopEnabled {
143+
Circle()
144+
.fill(viewModel.speechDetected ? Color.green : Color.gray.opacity(0.4))
145+
.frame(width: 8, height: 8)
146+
}
147+
}
133148

134149
// Waveform bars
135150
WaveformBars(level: viewModel.audioLevel)
136151
}
137152
} else {
138-
Text(viewModel.transcription.isEmpty ? "Tap to record" : "Tap to record again")
139-
.font(.subheadline)
140-
.foregroundStyle(AppColors.textTertiary)
153+
VStack(spacing: 12) {
154+
Text(viewModel.transcription.isEmpty ? "Tap to record" : "Tap to record again")
155+
.font(.subheadline)
156+
.foregroundStyle(AppColors.textTertiary)
157+
158+
// Auto-stop toggle
159+
Toggle(isOn: $viewModel.isAutoStopEnabled) {
160+
Label("Auto-stop on silence", systemImage: "waveform.badge.minus")
161+
.font(.caption)
162+
.foregroundStyle(AppColors.textSecondary)
163+
}
164+
.toggleStyle(.switch)
165+
.tint(AppColors.primaryGreen)
166+
.frame(width: 240)
167+
}
141168
}
142169
}
143170
}

Playground/YapRun/YapRun/Features/Playground/PlaygroundViewModel.swift

Lines changed: 82 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,11 +25,21 @@ final class PlaygroundViewModel {
2525
var errorMessage: String?
2626
var modelName: String?
2727

28+
// MARK: - Auto-Stop (VAD)
29+
30+
var isAutoStopEnabled = false
31+
var speechDetected = false
32+
2833
// MARK: - Private
2934

3035
private let audioCapture = AudioCaptureManager()
3136
private var audioBuffer = Foundation.Data()
3237
private var timerTask: Task<Void, Never>?
38+
private var vadMonitorTask: Task<Void, Never>?
39+
private var vadProcessedBytes = 0
40+
private var silenceStartTime: Date?
41+
private var hasSpeechBeenDetected = false
42+
private let autoStopSilenceDuration: TimeInterval = 2.0
3343
private let logger = Logger(subsystem: "com.runanywhere.yaprun", category: "Playground")
3444

3545
// MARK: - Model Check
@@ -58,6 +68,12 @@ final class PlaygroundViewModel {
5868
return
5969
}
6070

71+
// Prevent conflict with active voice keyboard session
72+
guard !FlowSessionManager.shared.isActive else {
73+
errorMessage = "Voice keyboard session is active. End it first."
74+
return
75+
}
76+
6177
let permitted = await audioCapture.requestPermission()
6278
guard permitted else {
6379
errorMessage = "Microphone access is required."
@@ -71,7 +87,7 @@ final class PlaygroundViewModel {
7187

7288
do {
7389
// AudioCaptureManager dispatches this callback on DispatchQueue.main
74-
try audioCapture.startRecording { [weak self] data in
90+
try await audioCapture.startRecording { [weak self] data in
7591
MainActor.assumeIsolated {
7692
guard let self else { return }
7793
self.audioBuffer.append(data)
@@ -80,7 +96,10 @@ final class PlaygroundViewModel {
8096
}
8197
isRecording = true
8298
startTimer()
83-
logger.info("Recording started")
99+
if isAutoStopEnabled {
100+
startVADMonitoring()
101+
}
102+
logger.info("Recording started (autoStop=\(self.isAutoStopEnabled))")
84103
} catch {
85104
errorMessage = "Could not start microphone: \(error.localizedDescription)"
86105
logger.error("Recording start failed: \(error.localizedDescription)")
@@ -91,8 +110,11 @@ final class PlaygroundViewModel {
91110
audioCapture.stopRecording()
92111
isRecording = false
93112
audioLevel = 0
113+
speechDetected = false
94114
timerTask?.cancel()
95115
timerTask = nil
116+
vadMonitorTask?.cancel()
117+
vadMonitorTask = nil
96118

97119
guard !audioBuffer.isEmpty else {
98120
errorMessage = "No audio was captured."
@@ -133,5 +155,63 @@ final class PlaygroundViewModel {
133155
audioBuffer = Foundation.Data()
134156
errorMessage = nil
135157
elapsedSeconds = 0
158+
speechDetected = false
159+
}
160+
161+
// MARK: - VAD Monitoring
162+
163+
/// Spawns a background task that wakes every 100ms and feeds newly captured
/// audio to the VAD via `processVADChunk()`. Started only when
/// `isAutoStopEnabled` is on; cancelled via `vadMonitorTask?.cancel()` when
/// recording stops.
private func startVADMonitoring() {
    // Reset per-recording VAD state so speech/silence tracking from a
    // previous recording cannot leak into this one.
    vadProcessedBytes = 0
    hasSpeechBeenDetected = false
    silenceStartTime = nil
    speechDetected = false

    vadMonitorTask = Task { [weak self] in
        while !Task.isCancelled {
            try? await Task.sleep(nanoseconds: 100_000_000) // 100ms
            // Re-check cancellation after the sleep: `try?` swallows the
            // CancellationError thrown by a mid-sleep cancel. Also bail out
            // once recording has ended or `self` has been deallocated.
            guard let self, !Task.isCancelled, self.isRecording else { break }
            await self.processVADChunk()
        }
    }
}
177+
178+
/// Runs VAD over audio captured since the last poll and drives the
/// auto-stop state machine: once speech has been heard, a continuous
/// silence of `autoStopSilenceDuration` seconds ends the recording.
private func processVADChunk() async {
    let bufferedBytes = audioBuffer.count
    guard bufferedBytes > vadProcessedBytes else { return }

    // Take only the bytes that arrived since the previous chunk.
    let chunk = audioBuffer.subdata(in: vadProcessedBytes..<bufferedBytes)
    vadProcessedBytes = bufferedBytes

    let pcm = convertInt16ToFloat(chunk)
    guard !pcm.isEmpty else { return }

    do {
        let voiced = try await RunAnywhere.detectSpeech(in: pcm)
        speechDetected = voiced

        if voiced {
            // Any speech resets the silence clock.
            hasSpeechBeenDetected = true
            silenceStartTime = nil
        } else if hasSpeechBeenDetected {
            // Only count silence that follows speech; leading silence
            // before the user starts talking never triggers auto-stop.
            guard let silenceBegan = silenceStartTime else {
                silenceStartTime = Date()
                return
            }
            if Date().timeIntervalSince(silenceBegan) >= autoStopSilenceDuration {
                logger.info("Auto-stop: \(self.autoStopSilenceDuration)s silence after speech")
                await stopAndTranscribe()
            }
        }
    } catch {
        logger.error("VAD error: \(error.localizedDescription)")
    }
}
209+
210+
/// Converts raw 16-bit signed PCM bytes (host byte order) into normalized
/// Float samples for the VAD model.
///
/// - Parameter data: Raw PCM bytes; a trailing incomplete sample (odd byte)
///   is ignored.
/// - Returns: One Float per complete Int16 sample, scaled by 1/32768 into
///   the range [-1.0, 1.0).
private func convertInt16ToFloat(_ data: Foundation.Data) -> [Float] {
    let sampleCount = data.count / MemoryLayout<Int16>.size
    guard sampleCount > 0 else { return [] }
    return data.withUnsafeBytes { rawBuffer in
        // `loadUnaligned` avoids the alignment requirement that
        // `bindMemory(to:)` + subscripting imposes: `Data`'s backing
        // buffer is not guaranteed to be 2-byte aligned.
        (0..<sampleCount).map { index in
            let sample = rawBuffer.loadUnaligned(
                fromByteOffset: index * MemoryLayout<Int16>.size,
                as: Int16.self
            )
            return Float(sample) / 32768.0
        }
    }
}
137217
}

Playground/YapRun/YapRun/Features/VoiceKeyboard/FlowSessionManager.swift

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -159,7 +159,7 @@ final class FlowSessionManager: ObservableObject {
159159
// Start AVAudioEngine while foregrounded
160160
do {
161161
// AudioCaptureManager dispatches this callback on DispatchQueue.main
162-
try audioCapture.startRecording { [weak self] data in
162+
try await audioCapture.startRecording { [weak self] data in
163163
MainActor.assumeIsolated {
164164
guard let self else { return }
165165
SharedDataBridge.shared.audioLevel = self.audioCapture.audioLevel
@@ -191,6 +191,7 @@ final class FlowSessionManager: ObservableObject {
191191
logger.warning("startListening received in unexpected phase: \(self.sessionPhase.description)")
192192
return
193193
}
194+
logger.info("startListening received; transitioning to listening")
194195
audioBuffer = Data()
195196
transition(to: .listening)
196197
SharedDataBridge.shared.sessionState = "listening"
@@ -206,6 +207,7 @@ final class FlowSessionManager: ObservableObject {
206207
return
207208
}
208209

210+
logger.info("stopListening received; transitioning to transcribing")
209211
transition(to: .transcribing)
210212
SharedDataBridge.shared.sessionState = "transcribing"
211213
if #available(iOS 16.1, *) {
@@ -251,7 +253,9 @@ final class FlowSessionManager: ObservableObject {
251253
logger.info("Transcribing \(audio.count) bytes")
252254

253255
do {
254-
let text = try await RunAnywhere.transcribe(audio)
256+
let text = try await Task.detached(priority: .userInitiated) {
257+
try await RunAnywhere.transcribe(audio)
258+
}.value
255259
logger.info("Transcription complete: \"\(text)\"")
256260
wordCount += text.split(separator: " ").count
257261

@@ -277,9 +281,15 @@ final class FlowSessionManager: ObservableObject {
277281
SharedDataBridge.shared.lastInsertedText = text
278282
SharedDataBridge.shared.sessionState = "done"
279283

280-
DarwinNotificationCenter.shared.post(
281-
name: SharedConstants.DarwinNotifications.transcriptionReady
282-
)
284+
// Small delay to allow cross-process UserDefaults propagation before posting
285+
// the Darwin notification. Cross-process propagation can take up to ~100ms on
286+
// modern iOS, so this 50ms head start is best-effort, not a guarantee — TODO confirm.
287+
Task {
288+
try? await Task.sleep(nanoseconds: 50_000_000) // 50ms
289+
DarwinNotificationCenter.shared.post(
290+
name: SharedConstants.DarwinNotifications.transcriptionReady
291+
)
292+
}
283293

284294
transition(to: .done(text))
285295
appendHistory(text: text)

Playground/YapRun/YapRun/Shared/SharedConstants.swift

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,8 @@ enum SharedConstants {
3939
// app → keyboard
4040
static let transcriptionReady = "com.runanywhere.yaprun.keyboard.transcriptionReady"
4141
static let sessionReady = "com.runanywhere.yaprun.session.ready"
42+
static let stateChanged = "com.runanywhere.yaprun.session.stateChanged"
43+
static let audioLevelChanged = "com.runanywhere.yaprun.audioLevelChanged"
4244
// keyboard → app
4345
static let startListening = "com.runanywhere.yaprun.keyboard.startListening"
4446
static let stopListening = "com.runanywhere.yaprun.keyboard.stopListening"

Playground/YapRun/YapRun/Shared/SharedDataBridge.swift

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -74,20 +74,22 @@ final class SharedDataBridge {
7474

7575
var sessionState: String {
7676
get {
77-
defaults?.synchronize()
7877
return defaults?.string(forKey: SharedConstants.Keys.sessionState) ?? "idle"
7978
}
8079
set {
8180
defaults?.set(newValue, forKey: SharedConstants.Keys.sessionState)
82-
defaults?.synchronize()
81+
// Push-notify the keyboard extension (or main app) immediately so it
82+
// doesn't have to wait for the next poll timer tick to see the change.
83+
DarwinNotificationCenter.shared.post(
84+
name: SharedConstants.DarwinNotifications.stateChanged
85+
)
8386
}
8487
}
8588

8689
// MARK: - Transcription Result
8790

8891
var transcribedText: String? {
8992
get {
90-
defaults?.synchronize()
9193
return defaults?.string(forKey: SharedConstants.Keys.transcribedText)
9294
}
9395
set {
@@ -96,7 +98,6 @@ final class SharedDataBridge {
9698
} else {
9799
defaults?.removeObject(forKey: SharedConstants.Keys.transcribedText)
98100
}
99-
defaults?.synchronize()
100101
}
101102
}
102103

@@ -118,7 +119,14 @@ final class SharedDataBridge {
118119

119120
var audioLevel: Float {
120121
get { defaults?.float(forKey: SharedConstants.Keys.audioLevel) ?? 0 }
121-
set { defaults?.set(newValue, forKey: SharedConstants.Keys.audioLevel) }
122+
set {
123+
defaults?.set(newValue, forKey: SharedConstants.Keys.audioLevel)
124+
// Push-notify the keyboard extension so it can read the cached value
125+
// instead of polling UserDefaults on every waveform tick.
126+
DarwinNotificationCenter.shared.post(
127+
name: SharedConstants.DarwinNotifications.audioLevelChanged
128+
)
129+
}
122130
}
123131

124132
// MARK: - Heartbeat

Playground/YapRun/YapRun/macOS/Services/MacDictationService.swift

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -166,7 +166,7 @@ final class MacDictationService {
166166

167167
do {
168168
// AudioCaptureManager dispatches this callback on DispatchQueue.main
169-
try audioCapture.startRecording { [weak self] data in
169+
try await audioCapture.startRecording { [weak self] data in
170170
MainActor.assumeIsolated {
171171
guard let self else { return }
172172
self.audioBuffer.append(data)

0 commit comments

Comments
 (0)