|
5 | 5 |
|
6 | 6 | #include "rac_stt_metalrt.h" |
7 | 7 |
|
| 8 | +#include <cstdint> |
8 | 9 | #include <cstdlib> |
9 | 10 | #include <cstring> |
| 11 | +#include <vector> |
10 | 12 |
|
11 | 13 | #include "metalrt_c_api.h" |
12 | 14 |
|
@@ -65,13 +67,21 @@ rac_result_t rac_stt_metalrt_transcribe(rac_handle_t handle, const void* audio_d |
65 | 67 | auto* impl = static_cast<rac_stt_metalrt_impl*>(handle); |
66 | 68 | if (!impl->loaded) return RAC_ERROR_BACKEND_NOT_READY; |
67 | 69 |
|
68 | | - // MetalRT expects float32 samples + sample count + sample rate |
69 | | - // RAC STT passes raw audio bytes — assume float32 PCM at 16kHz |
70 | | - const auto* samples = static_cast<const float*>(audio_data); |
71 | | - int n_samples = static_cast<int>(audio_size / sizeof(float)); |
| 70 | + // SDK audio capture sends Int16 PCM at 16 kHz. |
| 71 | + // Convert to Float32 normalized [-1.0, 1.0] for metalrt_whisper_transcribe. |
| 72 | + const auto* int16_samples = static_cast<const int16_t*>(audio_data); |
| 73 | + int n_samples = static_cast<int>(audio_size / sizeof(int16_t)); |
72 | 74 | int sample_rate = 16000; |
73 | 75 |
|
74 | | - const char* text = metalrt_whisper_transcribe(impl->handle, samples, n_samples, sample_rate); |
| 76 | + std::vector<float> float_samples(n_samples); |
| 77 | + for (int i = 0; i < n_samples; i++) { |
| 78 | + float_samples[i] = static_cast<float>(int16_samples[i]) / 32768.0f; |
| 79 | + } |
| 80 | + |
| 81 | + RAC_LOG_INFO(LOG_CAT, "Transcribing %d samples (%.1fs) at %d Hz", |
| 82 | + n_samples, static_cast<float>(n_samples) / sample_rate, sample_rate); |
| 83 | + |
| 84 | + const char* text = metalrt_whisper_transcribe(impl->handle, float_samples.data(), n_samples, sample_rate); |
75 | 85 | if (!text) { |
76 | 86 | rac_error_set_details("metalrt_whisper_transcribe returned null"); |
77 | 87 | return RAC_ERROR_INFERENCE_FAILED; |
|
0 commit comments