Skip to content

Commit 56a4f65

Browse files
abhisekupadhyaya authored and sanchitmonga22 committed
basic metrics commons implemented
1 parent b81095e commit 56a4f65

13 files changed

Lines changed: 888 additions & 0 deletions

File tree

sdk/runanywhere-commons/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -275,6 +275,7 @@ set(RAC_CORE_SOURCES
275275
src/core/rac_core.cpp
276276
src/core/rac_error.cpp
277277
src/core/rac_time.cpp
278+
src/core/rac_benchmark.cpp
278279
src/core/rac_memory.cpp
279280
src/core/rac_logger.cpp
280281
src/core/rac_audio_utils.cpp

sdk/runanywhere-commons/include/rac/backends/rac_llm_llamacpp.h

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
#ifndef RAC_LLM_LLAMACPP_H
1212
#define RAC_LLM_LLAMACPP_H
1313

14+
#include "rac/core/rac_benchmark.h"
1415
#include "rac/core/rac_error.h"
1516
#include "rac/core/rac_types.h"
1617
#include "rac/features/llm/rac_llm.h"
@@ -163,6 +164,27 @@ RAC_LLAMACPP_API rac_result_t rac_llm_llamacpp_generate_stream(
163164
rac_handle_t handle, const char* prompt, const rac_llm_options_t* options,
164165
rac_llm_llamacpp_stream_callback_fn callback, void* user_data);
165166

167+
/**
168+
* Generates text with streaming callback and benchmark timing.
169+
*
170+
* Same as rac_llm_llamacpp_generate_stream but captures benchmark timing:
171+
* - t2: Before prefill (llama_decode for prompt batch)
172+
* - t3: After prefill completes
173+
* - t5: When decode loop exits (last token)
* Only the backend timestamps (t2, t3, t5) are listed for this function; t0, t4
* and t6 are described in rac_benchmark.h as component-level timestamps.
* Timestamps are in milliseconds, comparable with rac_monotonic_now_ms().
174+
*
175+
* @param handle Service handle
176+
* @param prompt Input prompt text
177+
* @param options Generation options
178+
* @param callback Callback for each token
179+
* @param user_data User context passed to callback
180+
* @param timing_out Output: Benchmark timing (can be NULL for no timing)
181+
* @return RAC_SUCCESS or error code
182+
*/
183+
RAC_LLAMACPP_API rac_result_t rac_llm_llamacpp_generate_stream_with_timing(
184+
rac_handle_t handle, const char* prompt, const rac_llm_options_t* options,
185+
rac_llm_llamacpp_stream_callback_fn callback, void* user_data,
186+
rac_benchmark_timing_t* timing_out);
187+
166188
/**
167189
* Cancels ongoing generation.
168190
*
Lines changed: 126 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,126 @@
1+
/**
2+
* @file rac_benchmark.h
3+
* @brief RunAnywhere Commons - Benchmark Timing Support
4+
*
5+
* This header provides types and functions for benchmark timing instrumentation.
6+
* The timing struct captures key timestamps during LLM inference for performance
7+
* measurement and analysis.
8+
*
9+
* Design principles:
10+
* - Zero overhead when not benchmarking: timing is opt-in via pointer parameter
11+
* - Monotonic clock: uses steady_clock for accurate cross-platform timing
12+
* - All timestamps are relative to a process-local epoch (not wall-clock)
13+
*/
14+
15+
#ifndef RAC_BENCHMARK_H
16+
#define RAC_BENCHMARK_H
17+
18+
#include "rac/core/rac_types.h"
19+
20+
#ifdef __cplusplus
21+
extern "C" {
22+
#endif
23+
24+
// =============================================================================
25+
// BENCHMARK TIMING STRUCT
26+
// =============================================================================
27+
28+
/**
29+
* Benchmark timing structure for LLM inference.
30+
*
31+
* Captures timestamps at key points during inference:
32+
* - t0: Request start (component API entry)
33+
* - t2: Prefill start (backend, before llama_decode for prompt)
34+
* - t3: Prefill end (backend, after llama_decode returns)
35+
* - t4: First token (component, first token callback)
36+
* - t5: Last token (backend, decode loop exits)
37+
* - t6: Request end (component, before complete callback)
38+
*
39+
* All timestamps are in milliseconds from a process-local epoch.
40+
* Use rac_monotonic_now_ms() to get comparable timestamps.
41+
*
42+
* Note: t1 is intentionally skipped to match the specification.
43+
*/
44+
typedef struct rac_benchmark_timing {
45+
/** t0: Request start - recorded at component API entry (component layer) */
46+
int64_t t0_request_start_ms;
47+
48+
/** t2: Prefill start - recorded before llama_decode for prompt batch (backend) */
49+
int64_t t2_prefill_start_ms;
50+
51+
/** t3: Prefill end - recorded after llama_decode returns for prompt (backend) */
52+
int64_t t3_prefill_end_ms;
53+
54+
/** t4: First token - recorded when first token callback is invoked (component layer) */
55+
int64_t t4_first_token_ms;
56+
57+
/** t5: Last token - recorded when decode loop exits (backend) */
58+
int64_t t5_last_token_ms;
59+
60+
/** t6: Request end - recorded before complete callback (component layer) */
61+
int64_t t6_request_end_ms;
62+
63+
/** Number of tokens in the prompt */
64+
int32_t prompt_tokens;
65+
66+
/** Number of tokens generated */
67+
int32_t output_tokens;
68+
69+
/**
70+
* Status of the request:
71+
* - 0: Success
72+
* - Non-zero: Error code (from rac_result_t)
*
* NOTE(review): this header also defines RAC_BENCHMARK_STATUS_* codes
* (ERROR=1, TIMEOUT=2, CANCELLED=3). Confirm whether non-zero values stored
* here are rac_result_t codes or those constants; the two descriptions
* currently disagree.
73+
*/
74+
int32_t status;
75+
76+
} rac_benchmark_timing_t;
77+
78+
// =============================================================================
79+
// BENCHMARK STATUS CODES
80+
// =============================================================================
81+
82+
/** Benchmark request completed successfully */
83+
#define RAC_BENCHMARK_STATUS_SUCCESS ((int32_t)0)
84+
85+
/** Benchmark request failed due to error */
86+
#define RAC_BENCHMARK_STATUS_ERROR ((int32_t)1)
87+
88+
/** Benchmark request timed out */
89+
#define RAC_BENCHMARK_STATUS_TIMEOUT ((int32_t)2)
90+
91+
/** Benchmark request was cancelled */
92+
#define RAC_BENCHMARK_STATUS_CANCELLED ((int32_t)3)
93+
94+
// =============================================================================
95+
// MONOTONIC TIME API
96+
// =============================================================================
97+
98+
/**
99+
* Gets the current monotonic time in milliseconds.
100+
*
101+
* Uses std::chrono::steady_clock for accurate, monotonic timing that is not
102+
* affected by system clock changes. The returned value is relative to a
103+
* process-local epoch (the first call to this function).
* Values are therefore only comparable with other values obtained in the same
* process; they are not wall-clock times.
104+
*
105+
* This function is thread-safe and lock-free on all supported platforms.
* NOTE(review): the epoch is described as being established on the first call;
* confirm that the implementation's one-time initialization is in fact
* lock-free before relying on this guarantee.
106+
*
107+
* @return Current monotonic time in milliseconds from process-local epoch
108+
*/
109+
RAC_API int64_t rac_monotonic_now_ms(void);
110+
111+
// =============================================================================
112+
// UTILITY FUNCTIONS
113+
// =============================================================================
114+
115+
/**
116+
* Initializes a benchmark timing struct to zero values.
117+
*
* Presumably meant to be called before the struct is passed as a timing_out
* argument, so that timestamps a layer does not record read as 0 - confirm
* against callers.
* NOTE(review): behavior when timing is NULL is not documented here; verify
* against the implementation before passing NULL.
*
118+
* @param timing Pointer to timing struct to initialize
119+
*/
120+
RAC_API void rac_benchmark_timing_init(rac_benchmark_timing_t* timing);
121+
122+
#ifdef __cplusplus
123+
}
124+
#endif
125+
126+
#endif /* RAC_BENCHMARK_H */

sdk/runanywhere-commons/include/rac/features/llm/rac_llm_component.h

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
#define RAC_LLM_COMPONENT_H
1414

1515
#include "rac/core/capabilities/rac_lifecycle.h"
16+
#include "rac/core/rac_benchmark.h"
1617
#include "rac/core/rac_error.h"
1718
#include "rac/features/llm/rac_llm_types.h"
1819

@@ -196,6 +197,36 @@ RAC_API rac_result_t rac_llm_component_generate_stream(
196197
rac_llm_component_complete_callback_fn complete_callback,
197198
rac_llm_component_error_callback_fn error_callback, void* user_data);
198199

200+
/**
201+
* @brief Generate text with streaming and benchmark timing
202+
*
203+
* Same as rac_llm_component_generate_stream but with optional benchmark timing.
204+
* When timing_out is non-NULL, captures detailed timing information:
205+
* - t0: Request start (set at API entry)
206+
* - t4: First token (set in token callback)
207+
* - t6: Request end (set before complete callback)
208+
*
209+
* Backend timestamps (t2, t3, t5) are captured by the backend if it supports timing.
210+
*
211+
* Zero overhead when timing_out is NULL - behaves exactly like generate_stream.
*
* NOTE(review): t4 and t6 are written from the callback path, so timing_out
* presumably has to remain valid until the complete or error callback has run -
* confirm the required lifetime against the implementation.
212+
*
213+
* @param handle Component handle
214+
* @param prompt Input prompt
215+
* @param options Generation options (can be NULL for defaults)
216+
* @param token_callback Called for each generated token
217+
* @param complete_callback Called when generation completes
218+
* @param error_callback Called on error
219+
* @param user_data User context passed to callbacks
220+
* @param timing_out Output: Benchmark timing (can be NULL for no timing)
221+
* @return RAC_SUCCESS or error code
222+
*/
223+
RAC_API rac_result_t rac_llm_component_generate_stream_with_timing(
224+
rac_handle_t handle, const char* prompt, const rac_llm_options_t* options,
225+
rac_llm_component_token_callback_fn token_callback,
226+
rac_llm_component_complete_callback_fn complete_callback,
227+
rac_llm_component_error_callback_fn error_callback, void* user_data,
228+
rac_benchmark_timing_t* timing_out);
229+
199230
/**
200231
* @brief Get lifecycle state
201232
*

sdk/runanywhere-commons/include/rac/features/llm/rac_llm_service.h

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
#ifndef RAC_LLM_SERVICE_H
1111
#define RAC_LLM_SERVICE_H
1212

13+
#include "rac/core/rac_benchmark.h"
1314
#include "rac/core/rac_error.h"
1415
#include "rac/features/llm/rac_llm_types.h"
1516

@@ -38,6 +39,21 @@ typedef struct rac_llm_service_ops {
3839
const rac_llm_options_t* options,
3940
rac_llm_stream_callback_fn callback, void* user_data);
4041

42+
/**
43+
* Generate text with streaming callback and benchmark timing.
44+
* Optional: backends that don't support timing can leave this NULL.
45+
* If NULL, rac_llm_generate_stream_with_timing falls back to generate_stream.
46+
*
47+
* Backends that implement this should capture:
48+
* - t2: Before prefill (llama_decode for prompt)
49+
* - t3: After prefill completes
50+
* - t5: When decode loop exits (last token)
51+
*/
52+
rac_result_t (*generate_stream_with_timing)(void* impl, const char* prompt,
53+
const rac_llm_options_t* options,
54+
rac_llm_stream_callback_fn callback, void* user_data,
55+
rac_benchmark_timing_t* timing_out);
56+
4157
/** Get service info */
4258
rac_result_t (*get_info)(void* impl, rac_llm_info_t* out_info);
4359

@@ -146,6 +162,32 @@ RAC_API rac_result_t rac_llm_generate_stream(rac_handle_t handle, const char* pr
146162
const rac_llm_options_t* options,
147163
rac_llm_stream_callback_fn callback, void* user_data);
148164

165+
/**
166+
* @brief Stream generate text with benchmark timing
167+
*
168+
* Same as rac_llm_generate_stream but with optional benchmark timing.
169+
* If timing_out is non-NULL and the backend supports timing, captures:
170+
* - t2: Before prefill
171+
* - t3: After prefill
172+
* - t5: Last token generated
173+
*
174+
* If the backend doesn't implement generate_stream_with_timing, falls back
175+
* to generate_stream (timing_out will have t2/t3/t5 as zeros).
* NOTE(review): the "zeros" presumably rely on timing_out having been
* initialized beforehand (e.g. with rac_benchmark_timing_init) - confirm
* whether this function zeroes the struct itself on the fallback path.
176+
*
177+
* @param handle Service handle
178+
* @param prompt Input prompt
179+
* @param options Generation options (can be NULL for defaults)
180+
* @param callback Callback for each token
181+
* @param user_data User context passed to callback
182+
* @param timing_out Output: Benchmark timing (can be NULL for no timing)
183+
* @return RAC_SUCCESS or error code
184+
*/
185+
RAC_API rac_result_t rac_llm_generate_stream_with_timing(rac_handle_t handle, const char* prompt,
186+
const rac_llm_options_t* options,
187+
rac_llm_stream_callback_fn callback,
188+
void* user_data,
189+
rac_benchmark_timing_t* timing_out);
190+
149191
/**
150192
* @brief Get service information
151193
*

0 commit comments

Comments
 (0)