Skip to content

Commit 9116544

Browse files
Merge PR #469: benchmark timing infrastructure (rebased from #343)
Adds opt-in benchmark timing infrastructure (6-timestamp struct with t0 and t2..t6 — t1 is intentionally skipped, monotonic clock, JSON/CSV logging, thread-safe percentile stats collector, extended device-metrics provider) plumbed through llm_component → rac_llm_service → llamacpp backend + JNI so LLM inference can be timed end-to-end with zero overhead when timing_out=NULL. Conflict resolution: in llm_component.cpp's StreamingMetricsContext, combined pr-472's cancel_flag pointer with pr-469's timing_out pointer — both are orthogonal optional instrumentation hooks. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2 parents df983f6 + 5907ce5 commit 9116544

24 files changed

Lines changed: 2642 additions & 6 deletions

sdk/runanywhere-commons/CMakeLists.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -393,6 +393,10 @@ set(RAC_CORE_SOURCES
393393
src/core/rac_core.cpp
394394
src/core/rac_error.cpp
395395
src/core/rac_time.cpp
396+
src/core/rac_benchmark.cpp
397+
src/core/rac_benchmark_metrics.cpp
398+
src/core/rac_benchmark_log.cpp
399+
src/core/rac_benchmark_stats.cpp
396400
src/core/rac_memory.cpp
397401
src/core/rac_logger.cpp
398402
src/core/rac_audio_utils.cpp

sdk/runanywhere-commons/include/rac/backends/rac_llm_llamacpp.h

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
#ifndef RAC_LLM_LLAMACPP_H
1212
#define RAC_LLM_LLAMACPP_H
1313

14+
#include "rac/core/rac_benchmark.h"
1415
#include "rac/core/rac_error.h"
1516
#include "rac/core/rac_types.h"
1617
#include "rac/features/llm/rac_llm.h"
@@ -163,6 +164,32 @@ RAC_LLAMACPP_API rac_result_t rac_llm_llamacpp_generate_stream(
163164
rac_handle_t handle, const char* prompt, const rac_llm_options_t* options,
164165
rac_llm_llamacpp_stream_callback_fn callback, void* user_data);
165166

167+
/**
168+
* Generates text with streaming callback and benchmark timing.
169+
*
170+
* Same as rac_llm_llamacpp_generate_stream but also records the backend-side
* timestamps of rac_benchmark_timing_t:
171+
* - t2: Before prefill (llama_decode for prompt batch)
172+
* - t3: After prefill completes
173+
* - t5: When decode loop exits (last token)
174+
*
* The remaining timestamps (t0, t4, t6) are documented in rac_benchmark.h as
* being recorded by the component layer, not by the backend.
*
175+
* @param handle Service handle
176+
* @param prompt Input prompt text
177+
* @param options Generation options
178+
* @param callback Callback for each token
179+
* @param user_data User context passed to callback
180+
* @param timing_out Output: Benchmark timing struct, caller-allocated.
181+
* Must remain valid for the duration of the call.
182+
* Caller should initialize via rac_benchmark_timing_init() before passing.
183+
* On success, the t2, t3 and t5 fields are populated.
184+
* On failure, the status field is set but timing fields may be partial.
185+
* Pass NULL to skip timing (zero overhead).
186+
* @return RAC_SUCCESS or error code
187+
*/
188+
RAC_LLAMACPP_API rac_result_t rac_llm_llamacpp_generate_stream_with_timing(
189+
rac_handle_t handle, const char* prompt, const rac_llm_options_t* options,
190+
rac_llm_llamacpp_stream_callback_fn callback, void* user_data,
191+
rac_benchmark_timing_t* timing_out);
192+
166193
/**
167194
* Cancels ongoing generation.
168195
*
Lines changed: 136 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,136 @@
1+
/**
2+
* @file rac_benchmark.h
3+
* @brief RunAnywhere Commons - Benchmark Timing Support
4+
*
5+
* This header provides types and functions for benchmark timing instrumentation.
6+
* The timing struct captures key timestamps during LLM inference for performance
7+
* measurement and analysis.
8+
*
9+
* Design principles:
10+
* - Zero overhead when not benchmarking: timing is opt-in via pointer parameter
11+
* - Monotonic clock: uses steady_clock for accurate cross-platform timing
12+
* - All timestamps are relative to a process-local epoch (not wall-clock)
13+
*/
14+
15+
#ifndef RAC_BENCHMARK_H
16+
#define RAC_BENCHMARK_H
17+
18+
#include "rac/core/rac_types.h"
19+
20+
#ifdef __cplusplus
21+
extern "C" {
22+
#endif
23+
24+
// =============================================================================
25+
// BENCHMARK TIMING STRUCT
26+
// =============================================================================
27+
28+
/**
29+
* Benchmark timing structure for LLM inference.
30+
*
31+
* Captures timestamps at key points during inference:
32+
* - t0: Request start (component API entry)
33+
* - t2: Prefill start (backend, before llama_decode for prompt)
34+
* - t3: Prefill end (backend, after llama_decode returns)
35+
* - t4: First token (component, first token callback)
36+
* - t5: Last token (backend, decode loop exits)
37+
* - t6: Request end (component, before complete callback)
38+
*
39+
* All timestamps are in milliseconds from a process-local epoch.
40+
* Use rac_monotonic_now_ms() to get comparable timestamps.
41+
*
* Derived metrics, as documented for rac_benchmark_timing_to_json():
* ttft = t4 - t0, prefill = t3 - t2, decode = t5 - t3, e2e = t6 - t0.
*
42+
* Note: t1 is intentionally skipped to match the specification.
*
* NOTE(review): rac_benchmark_timing_init() zeroes every field, and the epoch
* is the first call to rac_monotonic_now_ms(), so a timestamp taken at the
* epoch is presumably also 0 - confirm how consumers tell "unset" from "0 ms".
43+
*/
44+
typedef struct rac_benchmark_timing {
45+
/** t0: Request start (ms since process-local epoch) - recorded at component API entry */
46+
int64_t t0_request_start_ms;
47+
48+
/** t2: Prefill start (ms since process-local epoch) - recorded before llama_decode for prompt batch */
49+
int64_t t2_prefill_start_ms;
50+
51+
/** t3: Prefill end (ms since process-local epoch) - recorded after llama_decode returns for prompt */
52+
int64_t t3_prefill_end_ms;
53+
54+
/** t4: First token (ms since process-local epoch) - recorded when first token callback is invoked */
55+
int64_t t4_first_token_ms;
56+
57+
/** t5: Last token (ms since process-local epoch) - recorded when decode loop exits */
58+
int64_t t5_last_token_ms;
59+
60+
/** t6: Request end (ms since process-local epoch) - recorded before complete callback */
61+
int64_t t6_request_end_ms;
62+
63+
/** Number of tokens in the prompt */
64+
int32_t prompt_tokens;
65+
66+
/** Number of tokens generated (numerator of the decode throughput metric) */
67+
int32_t output_tokens;
68+
69+
/**
70+
* Status of the benchmark request.
71+
* Uses RAC_BENCHMARK_STATUS_* codes:
72+
* - RAC_BENCHMARK_STATUS_SUCCESS (0): Completed successfully
73+
* - RAC_BENCHMARK_STATUS_ERROR (1): Failed
74+
* - RAC_BENCHMARK_STATUS_TIMEOUT (2): Timed out
75+
* - RAC_BENCHMARK_STATUS_CANCELLED (3): Cancelled
* A zero-initialized struct therefore reads as SUCCESS until a run updates it.
76+
*/
77+
int32_t status;
78+
79+
/**
80+
* Specific error code when status is not RAC_BENCHMARK_STATUS_SUCCESS.
81+
* Uses rac_result_t error codes (e.g., RAC_ERROR_NOT_SUPPORTED).
82+
* Set to RAC_SUCCESS (0) when status is RAC_BENCHMARK_STATUS_SUCCESS.
83+
*/
84+
rac_result_t error_code;
85+
86+
} rac_benchmark_timing_t;
87+
88+
// =============================================================================
89+
// BENCHMARK STATUS CODES
90+
// =============================================================================
91+
92+
/** rac_benchmark_timing_t::status value: request completed successfully (0, also the zero-initialized value) */
93+
#define RAC_BENCHMARK_STATUS_SUCCESS ((int32_t)0)
94+
95+
/** rac_benchmark_timing_t::status value: request failed due to error (see the error_code field) */
96+
#define RAC_BENCHMARK_STATUS_ERROR ((int32_t)1)
97+
98+
/** rac_benchmark_timing_t::status value: request timed out */
99+
#define RAC_BENCHMARK_STATUS_TIMEOUT ((int32_t)2)
100+
101+
/** rac_benchmark_timing_t::status value: request was cancelled */
102+
#define RAC_BENCHMARK_STATUS_CANCELLED ((int32_t)3)
103+
104+
// =============================================================================
105+
// MONOTONIC TIME API
106+
// =============================================================================
107+
108+
/**
109+
* Gets the current monotonic time in milliseconds.
110+
*
111+
* Uses std::chrono::steady_clock for accurate, monotonic timing that is not
112+
* affected by system clock changes. The returned value is relative to a
113+
* process-local epoch (the first call to this function).
* It is not a wall-clock time: values are only meaningful as differences
* between timestamps taken within the same process.
114+
*
115+
* This function is thread-safe and lock-free on all supported platforms.
116+
*
117+
* @return Current monotonic time in milliseconds from process-local epoch
118+
*/
119+
RAC_API int64_t rac_monotonic_now_ms(void);
120+
121+
// =============================================================================
122+
// UTILITY FUNCTIONS
123+
// =============================================================================
124+
125+
/**
126+
* Initializes a benchmark timing struct to zero values.
*
* With every field zero, status equals RAC_BENCHMARK_STATUS_SUCCESS (0) and
* error_code equals RAC_SUCCESS (0), so a freshly initialized struct reads as
* "success" with all timestamps and token counts at 0.
127+
*
128+
* @param timing Pointer to timing struct to initialize
* NOTE(review): handling of a NULL pointer is not specified here - confirm
* against the implementation before passing NULL.
129+
*/
130+
RAC_API void rac_benchmark_timing_init(rac_benchmark_timing_t* timing);
131+
132+
#ifdef __cplusplus
133+
}
134+
#endif
135+
136+
#endif /* RAC_BENCHMARK_H */
Lines changed: 119 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,119 @@
1+
/**
2+
* @file rac_benchmark_log.h
3+
* @brief RunAnywhere Commons - Benchmark Logging and Serialization
4+
*
5+
* Provides functions to serialize benchmark timing data as JSON or CSV,
6+
* and to log benchmark results via the RAC logging system.
7+
*
8+
* All functions return rac_result_t for consistent error handling.
9+
* Serialization functions write a heap-allocated string to an out-parameter
10+
* (caller must free() on success).
11+
*
12+
* Usage:
13+
* // Log timing summary
14+
* rac_benchmark_timing_log(&timing, "inference_run_1");
15+
*
16+
* // Export as JSON
17+
* char* json = NULL;
18+
* if (rac_benchmark_timing_to_json(&timing, &json) == RAC_SUCCESS) {
19+
* // ... use json ...
20+
* free(json);
21+
* }
22+
*
23+
* // Export as CSV
24+
* char* header = NULL;
25+
* char* row = NULL;
26+
* rac_benchmark_timing_to_csv(NULL, RAC_TRUE, &header);
27+
* rac_benchmark_timing_to_csv(&timing, RAC_FALSE, &row);
* // ... use header and row; each is NULL if its call failed ...
28+
* free(header);
29+
* free(row);
30+
*/
31+
32+
#ifndef RAC_BENCHMARK_LOG_H
33+
#define RAC_BENCHMARK_LOG_H
34+
35+
#include "rac/core/rac_benchmark.h"
36+
#include "rac/core/rac_error.h"
37+
#include "rac/core/rac_types.h"
38+
39+
#ifdef __cplusplus
40+
extern "C" {
41+
#endif
42+
43+
// =============================================================================
44+
// JSON SERIALIZATION
45+
// =============================================================================
46+
47+
/**
48+
* Serializes a benchmark timing struct as a JSON string.
49+
*
50+
* Includes all timing fields plus derived metrics:
51+
* - ttft_ms: Time to first token (t4 - t0)
52+
* - prefill_ms: Prefill duration (t3 - t2)
53+
* - decode_ms: Decode duration (t5 - t3)
54+
* - e2e_ms: End-to-end latency (t6 - t0)
55+
* - decode_tps: Decode throughput in tokens per second (output_tokens / decode_ms * 1000)
* NOTE(review): the value emitted for decode_tps when decode_ms is 0 is not
* documented here - confirm against the implementation.
56+
*
57+
* On success, *out_json is set to a heap-allocated string that the caller
58+
* must release via free(). On failure, *out_json is set to NULL
* (when out_json itself is non-NULL).
59+
*
60+
* @param timing Timing struct to serialize (must not be NULL)
61+
* @param out_json Output pointer that receives the JSON string (must not be NULL)
62+
* @return RAC_SUCCESS on success,
63+
* RAC_ERROR_NULL_POINTER if timing or out_json is NULL,
64+
* RAC_ERROR_OUT_OF_MEMORY if allocation fails
65+
*/
66+
RAC_API rac_result_t rac_benchmark_timing_to_json(const rac_benchmark_timing_t* timing,
67+
char** out_json);
68+
69+
// =============================================================================
70+
// CSV SERIALIZATION
71+
// =============================================================================
72+
73+
/**
74+
* Serializes a benchmark timing struct as a CSV data row, or emits the CSV header row.
75+
*
76+
* When header is RAC_TRUE, emits the CSV header row (timing may be NULL).
77+
* When header is RAC_FALSE, emits a data row (timing must not be NULL).
78+
*
79+
* On success, *out_csv is set to a heap-allocated string that the caller
80+
* must release via free(). On failure, *out_csv is set to NULL
* (when out_csv itself is non-NULL).
81+
*
82+
* @param timing Timing struct to serialize (ignored when header is RAC_TRUE,
83+
* otherwise must not be NULL)
84+
* @param header If RAC_TRUE, emits the CSV header row instead of data
85+
* @param out_csv Output pointer that receives the CSV string (must not be NULL)
86+
* @return RAC_SUCCESS on success,
87+
* RAC_ERROR_NULL_POINTER if out_csv is NULL, or if header is RAC_FALSE
88+
* and timing is NULL,
89+
* RAC_ERROR_OUT_OF_MEMORY if allocation fails
90+
*/
91+
RAC_API rac_result_t rac_benchmark_timing_to_csv(const rac_benchmark_timing_t* timing,
92+
rac_bool_t header,
93+
char** out_csv);
94+
95+
// =============================================================================
96+
// LOGGING
97+
// =============================================================================
98+
99+
/**
100+
* Logs a benchmark timing summary via the RAC logging system.
101+
*
102+
* Outputs key metrics at INFO level under the "Benchmark" category:
103+
* - TTFT (time to first token), prefill time, decode time, E2E (end-to-end) latency
104+
* - Token counts and throughput
105+
* - Status and error code
* The same derived metrics are documented with their timestamp formulas on
* rac_benchmark_timing_to_json().
106+
*
107+
* @param timing Timing struct to log (must not be NULL)
108+
* @param label Optional label for this benchmark run (may be NULL)
109+
* @return RAC_SUCCESS on success,
110+
* RAC_ERROR_NULL_POINTER if timing is NULL
111+
*/
112+
RAC_API rac_result_t rac_benchmark_timing_log(const rac_benchmark_timing_t* timing,
113+
const char* label);
114+
115+
#ifdef __cplusplus
116+
}
117+
#endif
118+
119+
#endif /* RAC_BENCHMARK_LOG_H */

0 commit comments

Comments
 (0)