File tree Expand file tree Collapse file tree
Expand file tree Collapse file tree Original file line number Diff line number Diff line change @@ -177,7 +177,12 @@ RAC_LLAMACPP_API rac_result_t rac_llm_llamacpp_generate_stream(
177177 * @param options Generation options
178178 * @param callback Callback for each token
179179 * @param user_data User context passed to callback
180- * @param timing_out Output: Benchmark timing (can be NULL for no timing)
180+ * @param timing_out Output: Benchmark timing struct, caller-allocated.
181+ * Must remain valid for the duration of the call.
182+ * Caller should initialize via rac_benchmark_timing_init() before passing.
183+ * On success, the t2, t3, and t5 fields are all populated.
184+ * On failure, status is set but timing fields may be partial.
185+ * Pass NULL to skip timing (zero overhead).
181186 * @return RAC_SUCCESS or error code
182187 */
183188RAC_LLAMACPP_API rac_result_t rac_llm_llamacpp_generate_stream_with_timing (
Original file line number Diff line number Diff line change @@ -67,9 +67,12 @@ typedef struct rac_benchmark_timing {
6767 int32_t output_tokens;
6868
6969 /* *
70- * Status of the request:
71- * - 0: Success
72- * - Non-zero: Error code (from rac_result_t)
70+ * Status of the benchmark request.
71+ * Uses RAC_BENCHMARK_STATUS_* codes:
72+ * - RAC_BENCHMARK_STATUS_SUCCESS (0): Completed successfully
73+ * - RAC_BENCHMARK_STATUS_ERROR (1): Failed
74+ * - RAC_BENCHMARK_STATUS_TIMEOUT (2): Timed out
75+ * - RAC_BENCHMARK_STATUS_CANCELLED (3): Cancelled
7376 */
7477 int32_t status;
7578
Original file line number Diff line number Diff line change @@ -217,7 +217,13 @@ RAC_API rac_result_t rac_llm_component_generate_stream(
217217 * @param complete_callback Called when generation completes
218218 * @param error_callback Called on error
219219 * @param user_data User context passed to callbacks
220- * @param timing_out Output: Benchmark timing (can be NULL for no timing)
220+ * @param timing_out Output: Benchmark timing struct, caller-allocated.
221+ * Must remain valid for the duration of the call.
222+ * Caller should initialize via rac_benchmark_timing_init() before passing.
223+ * The component fills t0/t4/t6; the backend fills t2/t3/t5.
224+ * On success, all timing fields are populated.
225+ * On failure, status is set but timing fields may be partial.
226+ * Pass NULL to skip timing (zero overhead).
221227 * @return RAC_SUCCESS or error code
222228 */
223229RAC_API rac_result_t rac_llm_component_generate_stream_with_timing (
Original file line number Diff line number Diff line change @@ -883,6 +883,11 @@ bool LlamaCppTextGeneration::generate_stream_with_timing(const TextGenerationReq
883883
884884 if (llama_decode (context_, batch) != 0 ) {
885885 LOGE (" llama_decode failed for prompt" );
886+ if (timing_out != nullptr ) {
887+ int64_t now = rac_monotonic_now_ms ();
888+ timing_out->t3_prefill_end_ms = now;
889+ timing_out->t5_last_token_ms = now;
890+ }
886891 llama_batch_free (batch);
887892 return false ;
888893 }
You can’t perform that action at this time.
0 commit comments