Skip to content

Commit a636697

Browse files
Fixing platform TTS handling in the voice agent
1 parent 28577d9 commit a636697

2 files changed

Lines changed: 64 additions & 41 deletions

File tree

sdk/runanywhere-commons/src/features/voice_agent/voice_agent.cpp

Lines changed: 55 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -608,23 +608,29 @@ rac_result_t rac_voice_agent_process_voice_turn(rac_voice_agent_handle_t handle,
608608
}
609609

610610
// Step 4: Convert Float32 PCM to WAV format for playback
611-
// TTS returns raw Float32 samples, but audio players need WAV format
611+
// Platform TTS (e.g. System TTS) plays audio directly and returns no PCM data.
612+
// Only convert when actual audio data is returned (e.g. Piper/ONNX TTS).
612613
void* wav_data = nullptr;
613614
size_t wav_size = 0;
614-
result = rac_audio_float32_to_wav(tts_result.audio_data, tts_result.audio_size,
615-
tts_result.sample_rate > 0 ? tts_result.sample_rate
616-
: RAC_TTS_DEFAULT_SAMPLE_RATE,
617-
&wav_data, &wav_size);
618615

619-
if (result != RAC_SUCCESS) {
620-
RAC_LOG_ERROR("VoiceAgent", "Failed to convert audio to WAV format");
621-
rac_stt_result_free(&stt_result);
622-
rac_llm_result_free(&llm_result);
623-
rac_tts_result_free(&tts_result);
624-
return result;
625-
}
616+
if (tts_result.audio_data != nullptr && tts_result.audio_size > 0) {
617+
result = rac_audio_float32_to_wav(tts_result.audio_data, tts_result.audio_size,
618+
tts_result.sample_rate > 0 ? tts_result.sample_rate
619+
: RAC_TTS_DEFAULT_SAMPLE_RATE,
620+
&wav_data, &wav_size);
626621

627-
RAC_LOG_DEBUG("VoiceAgent", "Converted PCM to WAV format");
622+
if (result != RAC_SUCCESS) {
623+
RAC_LOG_ERROR("VoiceAgent", "Failed to convert audio to WAV format");
624+
rac_stt_result_free(&stt_result);
625+
rac_llm_result_free(&llm_result);
626+
rac_tts_result_free(&tts_result);
627+
return result;
628+
}
629+
630+
RAC_LOG_DEBUG("VoiceAgent", "Converted PCM to WAV format");
631+
} else {
632+
RAC_LOG_DEBUG("VoiceAgent", "Platform TTS played audio directly — no PCM data to convert");
633+
}
628634

629635
// Build result (mirrors Swift's VoiceAgentResult)
630636
out_result->speech_detected = RAC_TRUE;
@@ -726,25 +732,29 @@ rac_result_t rac_voice_agent_process_stream(rac_voice_agent_handle_t handle, con
726732
}
727733

728734
// Step 4: Convert Float32 PCM to WAV format for playback
735+
// Platform TTS plays audio directly and returns no PCM data — skip conversion.
729736
void* wav_data = nullptr;
730737
size_t wav_size = 0;
731-
result = rac_audio_float32_to_wav(tts_result.audio_data, tts_result.audio_size,
732-
tts_result.sample_rate > 0 ? tts_result.sample_rate
733-
: RAC_TTS_DEFAULT_SAMPLE_RATE,
734-
&wav_data, &wav_size);
735738

736-
if (result != RAC_SUCCESS) {
737-
rac_stt_result_free(&stt_result);
738-
rac_llm_result_free(&llm_result);
739-
rac_tts_result_free(&tts_result);
740-
rac_voice_agent_event_t error_event = {};
741-
error_event.type = RAC_VOICE_AGENT_EVENT_ERROR;
742-
error_event.data.error_code = result;
743-
callback(&error_event, user_data);
744-
return result;
739+
if (tts_result.audio_data != nullptr && tts_result.audio_size > 0) {
740+
result = rac_audio_float32_to_wav(tts_result.audio_data, tts_result.audio_size,
741+
tts_result.sample_rate > 0 ? tts_result.sample_rate
742+
: RAC_TTS_DEFAULT_SAMPLE_RATE,
743+
&wav_data, &wav_size);
744+
745+
if (result != RAC_SUCCESS) {
746+
rac_stt_result_free(&stt_result);
747+
rac_llm_result_free(&llm_result);
748+
rac_tts_result_free(&tts_result);
749+
rac_voice_agent_event_t error_event = {};
750+
error_event.type = RAC_VOICE_AGENT_EVENT_ERROR;
751+
error_event.data.error_code = result;
752+
callback(&error_event, user_data);
753+
return result;
754+
}
745755
}
746756

747-
// Emit audio synthesized event (with WAV data)
757+
// Emit audio synthesized event (with WAV data, or empty for platform TTS)
748758
rac_voice_agent_event_t audio_event = {};
749759
audio_event.type = RAC_VOICE_AGENT_EVENT_AUDIO_SYNTHESIZED;
750760
audio_event.data.audio.audio_data = wav_data;
@@ -845,23 +855,27 @@ rac_result_t rac_voice_agent_synthesize_speech(rac_voice_agent_handle_t handle,
845855
return result;
846856
}
847857

848-
// Convert Float32 PCM to WAV format for playback
849-
void* wav_data = nullptr;
850-
size_t wav_size = 0;
851-
result = rac_audio_float32_to_wav(tts_result.audio_data, tts_result.audio_size,
852-
tts_result.sample_rate > 0 ? tts_result.sample_rate
853-
: RAC_TTS_DEFAULT_SAMPLE_RATE,
854-
&wav_data, &wav_size);
858+
// Platform TTS plays audio directly and returns no PCM data — skip conversion.
859+
if (tts_result.audio_data != nullptr && tts_result.audio_size > 0) {
860+
void* wav_data = nullptr;
861+
size_t wav_size = 0;
862+
result = rac_audio_float32_to_wav(tts_result.audio_data, tts_result.audio_size,
863+
tts_result.sample_rate > 0 ? tts_result.sample_rate
864+
: RAC_TTS_DEFAULT_SAMPLE_RATE,
865+
&wav_data, &wav_size);
855866

856-
if (result != RAC_SUCCESS) {
857-
rac_tts_result_free(&tts_result);
858-
return result;
859-
}
867+
if (result != RAC_SUCCESS) {
868+
rac_tts_result_free(&tts_result);
869+
return result;
870+
}
860871

861-
*out_audio = wav_data;
862-
*out_audio_size = wav_size;
872+
*out_audio = wav_data;
873+
*out_audio_size = wav_size;
874+
} else {
875+
*out_audio = nullptr;
876+
*out_audio_size = 0;
877+
}
863878

864-
// Free the original PCM data
865879
rac_tts_result_free(&tts_result);
866880

867881
return RAC_SUCCESS;

sdk/runanywhere-swift/Sources/RunAnywhere/Features/TTS/System/SystemTTSService.swift

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,15 @@ public final class SystemTTSService: NSObject {
5757
return try await Task.detached { @MainActor [self] in
5858
logger.info("Speaking: '\(text.prefix(50))...'")
5959

60+
// The audio session may still be in .record mode from the Voice Agent's
61+
// audio capture phase. Switch to .playback so AVSpeechSynthesizer can
62+
// actually route audio to the speaker.
63+
#if os(iOS) || os(tvOS)
64+
let audioSession = AVAudioSession.sharedInstance()
65+
try audioSession.setCategory(.playback, mode: .default, options: [.duckOthers])
66+
try audioSession.setActive(true)
67+
#endif
68+
6069
let utterance = createUtterance(text: text, options: options)
6170

6271
return try await withCheckedThrowingContinuation { (continuation: CheckedContinuation<Data, Error>) in

0 commit comments

Comments
 (0)