
Commit 5907ce5

fix(commons): address PR #469 review — 18 bot comments (greptile + coderabbit)
Quick fixes:
- rac_llm_llamacpp.cpp: copy system_prompt in timed C-API; add try/catch around generate_stream_with_timing to prevent C++ exceptions from unwinding across extern "C" boundary (UB on WASM).
- rac_benchmark_stats.cpp: use Bessel-corrected stddev (÷ N−1) for sample variability, not population stddev.
- llm_component.cpp: use component->actual_framework (not preferred) in timing telemetry; null-check strdup before marking benchmark successful; backfill timing_out->prompt_tokens/output_tokens when backend falls back to plain streaming so decode-TPS stats are correct.
- runanywhere_commons_jni.cpp (timing entrypoint): parse configJson for max_tokens/temperature/top_p/system_prompt; try onToken([B)Z before (Ljava/lang/String;)Z and propagate onTokenExpectsBytes into ctx; wait on cv for completion (10 min timeout) before DeleteGlobalRef to close a use-after-free race.

Structural:
- llamacpp_backend.{h,cpp}: consolidate generate_stream and generate_stream_with_timing into one function with optional rac_benchmark_timing_t*. Gates all timestamp writes on timing_out. Eliminates ~185 lines of duplicated code and restores three already-drifted fixes: KV-cache clear + decode_failed_ reset before each request (#356 regression), sampler chain rebuild on param change, and chunked prefill via batch_size_ (avoids n_ctx allocation spike).
- rac_benchmark_log.{h,cpp}: migrate timing_to_json / timing_to_csv / timing_log from char*/void to rac_result_t + out-params, matching the project-wide include/rac/ convention.
- rac_benchmark_stats.{h,cpp}: extend rac_benchmark_summary_t with full mean/stddev/min/max aggregates for prefill, decode-TPS, and E2E — not just TTFT.
- rac_benchmark_metrics.cpp: replace two-slot atomic provider pointer with mutex-guarded std::shared_ptr so teardown is lifetime-safe against in-flight rac_benchmark_capture_metrics calls.

Tests: 32/32 rac_benchmark_tests pass (+3 new coverage tests).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent bacb277 commit 5907ce5
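
The rac_benchmark_stats.cpp item above replaces population standard deviation with the sample (Bessel-corrected) form. A minimal C++ sketch of that computation; the helper name and types are illustrative, not the repository's actual code:

#include <cmath>
#include <vector>

// Illustrative only: sample standard deviation with Bessel's correction,
// dividing by N - 1 instead of N. The helper name is hypothetical.
static double sample_stddev(const std::vector<double>& samples) {
    const size_t n = samples.size();
    if (n < 2) return 0.0;  // variability is undefined for fewer than two samples
    double mean = 0.0;
    for (double x : samples) mean += x;
    mean /= static_cast<double>(n);
    double sum_sq = 0.0;
    for (double x : samples) sum_sq += (x - mean) * (x - mean);
    return std::sqrt(sum_sq / static_cast<double>(n - 1));  // Bessel-corrected
}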

12 files changed

Lines changed: 379 additions & 283 deletions
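
For the rac_benchmark_metrics.cpp change, the pattern described is a mutex-guarded std::shared_ptr slot in place of the atomic provider pointer, so an in-flight rac_benchmark_capture_metrics call keeps its provider alive across a concurrent teardown. A hedged sketch of that pattern, with hypothetical type and function names rather than the actual file's:

#include <memory>
#include <mutex>

// Hypothetical stand-in for the real metrics provider type.
struct MetricsProvider {
    void capture() { /* ... */ }
};

static std::mutex g_provider_mutex;
static std::shared_ptr<MetricsProvider> g_provider;

// Teardown / replacement: the old provider is destroyed only after the last
// reader drops its reference, never out from under an in-flight call.
void set_provider(std::shared_ptr<MetricsProvider> next) {
    std::lock_guard<std::mutex> lock(g_provider_mutex);
    g_provider = std::move(next);
}

void capture_metrics() {
    std::shared_ptr<MetricsProvider> local;
    {
        std::lock_guard<std::mutex> lock(g_provider_mutex);
        local = g_provider;  // take a strong reference under the lock
    }
    if (local) {
        local->capture();    // safe even if set_provider(nullptr) runs concurrently
    }
}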


sdk/runanywhere-commons/include/rac/core/rac_benchmark_log.h

Lines changed: 47 additions & 15 deletions
@@ -5,18 +5,26 @@
  * Provides functions to serialize benchmark timing data as JSON or CSV,
  * and to log benchmark results via the RAC logging system.
  *
+ * All functions return rac_result_t for consistent error handling.
+ * Serialization functions write a heap-allocated string to an out-parameter
+ * (caller must free() on success).
+ *
  * Usage:
  *   // Log timing summary
  *   rac_benchmark_timing_log(&timing, "inference_run_1");
  *
  *   // Export as JSON
- *   char* json = rac_benchmark_timing_to_json(&timing);
- *   // ... use json ...
- *   free(json);
+ *   char* json = NULL;
+ *   if (rac_benchmark_timing_to_json(&timing, &json) == RAC_SUCCESS) {
+ *       // ... use json ...
+ *       free(json);
+ *   }
  *
  *   // Export as CSV
- *   char* header = rac_benchmark_timing_to_csv(NULL, RAC_TRUE);
- *   char* row = rac_benchmark_timing_to_csv(&timing, RAC_FALSE);
+ *   char* header = NULL;
+ *   char* row = NULL;
+ *   rac_benchmark_timing_to_csv(NULL, RAC_TRUE, &header);
+ *   rac_benchmark_timing_to_csv(&timing, RAC_FALSE, &row);
  *   free(header);
  *   free(row);
  */
@@ -25,6 +33,7 @@
 #define RAC_BENCHMARK_LOG_H
 
 #include "rac/core/rac_benchmark.h"
+#include "rac/core/rac_error.h"
 #include "rac/core/rac_types.h"
 
 #ifdef __cplusplus
@@ -45,10 +54,17 @@ extern "C" {
  * - e2e_ms: End-to-end latency (t6 - t0)
  * - decode_tps: Decode throughput (output_tokens / decode_ms * 1000)
  *
- * @param timing Timing struct to serialize (NULL returns NULL)
- * @return Heap-allocated JSON string (caller must free()), or NULL on error
+ * On success, *out_json is set to a heap-allocated string that the caller
+ * must release via free(). On failure, *out_json is set to NULL.
+ *
+ * @param timing Timing struct to serialize (must not be NULL)
+ * @param out_json Output pointer that receives the JSON string (must not be NULL)
+ * @return RAC_SUCCESS on success,
+ *         RAC_ERROR_NULL_POINTER if timing or out_json is NULL,
+ *         RAC_ERROR_OUT_OF_MEMORY if allocation fails
  */
-RAC_API char* rac_benchmark_timing_to_json(const rac_benchmark_timing_t* timing);
+RAC_API rac_result_t rac_benchmark_timing_to_json(const rac_benchmark_timing_t* timing,
+                                                  char** out_json);
 
 // =============================================================================
 // CSV SERIALIZATION
@@ -57,11 +73,24 @@ RAC_API char* rac_benchmark_timing_to_json(const rac_benchmark_timing_t* timing)
 /**
  * Serializes a benchmark timing struct as a CSV row.
  *
- * @param timing Timing struct to serialize (ignored when header is RAC_TRUE)
- * @param header If RAC_TRUE, returns the CSV header row instead of data
- * @return Heap-allocated CSV string (caller must free()), or NULL on error
+ * When header is RAC_TRUE, emits the CSV header row (timing may be NULL).
+ * When header is RAC_FALSE, emits a data row (timing must not be NULL).
+ *
+ * On success, *out_csv is set to a heap-allocated string that the caller
+ * must release via free(). On failure, *out_csv is set to NULL.
+ *
+ * @param timing Timing struct to serialize (ignored when header is RAC_TRUE,
+ *               otherwise must not be NULL)
+ * @param header If RAC_TRUE, emits the CSV header row instead of data
+ * @param out_csv Output pointer that receives the CSV string (must not be NULL)
+ * @return RAC_SUCCESS on success,
+ *         RAC_ERROR_NULL_POINTER if out_csv is NULL, or if header is RAC_FALSE
+ *         and timing is NULL,
+ *         RAC_ERROR_OUT_OF_MEMORY if allocation fails
  */
-RAC_API char* rac_benchmark_timing_to_csv(const rac_benchmark_timing_t* timing, rac_bool_t header);
+RAC_API rac_result_t rac_benchmark_timing_to_csv(const rac_benchmark_timing_t* timing,
+                                                 rac_bool_t header,
+                                                 char** out_csv);
 
 // =============================================================================
 // LOGGING
@@ -75,10 +104,13 @@ RAC_API char* rac_benchmark_timing_to_csv(const rac_benchmark_timing_t* timing,
  * - Token counts and throughput
  * - Status and error code
  *
- * @param timing Timing struct to log (NULL is a no-op)
- * @param label Optional label for this benchmark run (can be NULL)
+ * @param timing Timing struct to log (must not be NULL)
+ * @param label Optional label for this benchmark run (may be NULL)
+ * @return RAC_SUCCESS on success,
+ *         RAC_ERROR_NULL_POINTER if timing is NULL
  */
-RAC_API void rac_benchmark_timing_log(const rac_benchmark_timing_t* timing, const char* label);
+RAC_API rac_result_t rac_benchmark_timing_log(const rac_benchmark_timing_t* timing,
+                                              const char* label);
 
 #ifdef __cplusplus
 }
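
A rough sketch of what an implementation conforming to this contract looks like on the rac_benchmark_log.cpp side: validate arguments, leave *out_json NULL on every failure path, and report allocation failure through the result code. This is illustrative pseudocode against the documented contract, not the repository's implementation; the two fields serialized here are a reduced subset and the error-code choice for formatting failure is an assumption.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

rac_result_t rac_benchmark_timing_to_json(const rac_benchmark_timing_t* timing,
                                          char** out_json) {
    if (out_json == NULL) return RAC_ERROR_NULL_POINTER;
    *out_json = NULL;  // defined state on every failure path
    if (timing == NULL) return RAC_ERROR_NULL_POINTER;

    char buf[128];
    int n = snprintf(buf, sizeof(buf), "{\"prompt_tokens\":%d,\"output_tokens\":%d}",
                     (int)timing->prompt_tokens, (int)timing->output_tokens);
    if (n < 0 || n >= (int)sizeof(buf)) return RAC_ERROR_OUT_OF_MEMORY;  // error code here is an assumption

    char* json = (char*)malloc((size_t)n + 1);
    if (json == NULL) return RAC_ERROR_OUT_OF_MEMORY;
    memcpy(json, buf, (size_t)n + 1);
    *out_json = json;  // caller releases with free()
    return RAC_SUCCESS;
}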

sdk/runanywhere-commons/include/rac/core/rac_benchmark_stats.h

Lines changed: 12 additions & 0 deletions
@@ -68,16 +68,28 @@ typedef struct rac_benchmark_summary {
     double prefill_p50_ms;
     double prefill_p95_ms;
     double prefill_p99_ms;
+    double prefill_min_ms;
+    double prefill_max_ms;
+    double prefill_mean_ms;
+    double prefill_stddev_ms;
 
     // Decode throughput stats (output_tokens / (t5 - t3) * 1000)
     double decode_tps_p50;
     double decode_tps_p95;
     double decode_tps_p99;
+    double decode_tps_min;
+    double decode_tps_max;
+    double decode_tps_mean;
+    double decode_tps_stddev;
 
     // End-to-end latency stats (t6 - t0)
     double e2e_p50_ms;
     double e2e_p95_ms;
     double e2e_p99_ms;
+    double e2e_min_ms;
+    double e2e_max_ms;
+    double e2e_mean_ms;
+    double e2e_stddev_ms;
 
     /** Number of observations where E2E > mean + 2*stddev */
     int32_t outlier_count;
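
The comment on the decode-throughput block above fixes the per-run formula: output_tokens / (t5 - t3) * 1000. A small sketch of deriving one such sample from the timing struct; the zero-window guard is an added assumption, not part of the source:

// Illustrative only: one decode-TPS observation from a completed run's timing.
static double decode_tps_sample(const rac_benchmark_timing_t& t) {
    const double decode_ms =
        static_cast<double>(t.t5_last_token_ms - t.t3_prefill_end_ms);
    if (decode_ms <= 0.0) return 0.0;  // guard against an empty decode window (assumption)
    return static_cast<double>(t.output_tokens) / decode_ms * 1000.0;  // tokens per second
}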

sdk/runanywhere-commons/src/backends/llamacpp/llamacpp_backend.cpp

Lines changed: 29 additions & 186 deletions
@@ -598,7 +598,8 @@ TextGenerationResult LlamaCppTextGeneration::generate(const TextGenerationReques
 
 bool LlamaCppTextGeneration::generate_stream(const TextGenerationRequest& request,
                                              TextStreamCallback callback,
-                                             int* out_prompt_tokens) {
+                                             int* out_prompt_tokens,
+                                             rac_benchmark_timing_t* timing_out) {
     std::lock_guard<std::mutex> lock(mutex_);
 
     if (!is_ready()) {
@@ -632,6 +633,12 @@ bool LlamaCppTextGeneration::generate_stream(const TextGenerationRequest& reques
 
     if (available_tokens <= 0) {
         RAC_LOG_ERROR("LLM.LlamaCpp","Prompt too long: %d tokens, context size: %d", prompt_tokens, n_ctx);
+        if (timing_out != nullptr) {
+            int64_t now = rac_monotonic_now_ms();
+            timing_out->t2_prefill_start_ms = now;
+            timing_out->t3_prefill_end_ms = now;
+            timing_out->t5_last_token_ms = now;
+        }
         return false;
     }
 
@@ -643,6 +650,11 @@ bool LlamaCppTextGeneration::generate_stream(const TextGenerationRequest& reques
     RAC_LOG_INFO("LLM.LlamaCpp", "generate_stream: processing %d prompt tokens in chunks of %d", prompt_tokens, n_batch);
     llama_batch batch = llama_batch_init(n_batch, 0, 1);
 
+    // t2: Record prefill start (before the first llama_decode on the prompt chunks)
+    if (timing_out != nullptr) {
+        timing_out->t2_prefill_start_ms = rac_monotonic_now_ms();
+    }
+
     for (int chunk_start = 0; chunk_start < prompt_tokens; chunk_start += n_batch) {
         batch.n_tokens = 0;
         int chunk_end = std::min(chunk_start + n_batch, prompt_tokens);
@@ -655,12 +667,22 @@ bool LlamaCppTextGeneration::generate_stream(const TextGenerationRequest& reques
 
         if (llama_decode(context_, batch) != 0) {
             RAC_LOG_ERROR("LLM.LlamaCpp", "llama_decode failed for prompt chunk [%d..%d)", chunk_start, chunk_end);
+            if (timing_out != nullptr) {
+                int64_t now = rac_monotonic_now_ms();
+                timing_out->t3_prefill_end_ms = now;
+                timing_out->t5_last_token_ms = now;
+            }
             llama_batch_free(batch);
             return false;
         }
     }
     RAC_LOG_INFO("LLM.LlamaCpp", "generate_stream: prompt decoded successfully");
 
+    // t3: Record prefill end (after the prompt prefill loop completes)
+    if (timing_out != nullptr) {
+        timing_out->t3_prefill_end_ms = rac_monotonic_now_ms();
+    }
+
     // Configure sampler with request parameters — skip rebuild if params unchanged
     {
         const bool params_match = sampler_ &&
@@ -809,6 +831,12 @@ bool LlamaCppTextGeneration::generate_stream(const TextGenerationRequest& reques
         }
     }
 
+    // t5: Record last token time (decode loop exit)
+    if (timing_out != nullptr) {
+        timing_out->t5_last_token_ms = rac_monotonic_now_ms();
+        timing_out->output_tokens = static_cast<int32_t>(tokens_generated);
+    }
+
     // Flush any remaining partial UTF-8 bytes (e.g. trailing multi-byte char at end of generation)
     if (!cancel_requested_.load() && !stop_sequence_hit && !partial_utf8_buffer.empty()) {
         stop_window.append(partial_utf8_buffer);
@@ -828,191 +856,6 @@ bool LlamaCppTextGeneration::generate_stream(const TextGenerationRequest& reques
     return !cancel_requested_.load();
 }
 
-bool LlamaCppTextGeneration::generate_stream_with_timing(const TextGenerationRequest& request,
-                                                         TextStreamCallback callback,
-                                                         int* out_prompt_tokens,
-                                                         rac_benchmark_timing_t* timing_out) {
-    std::lock_guard<std::mutex> lock(mutex_);
-
-    if (!is_ready()) {
-        LOGE("Model not ready for generation");
-        return false;
-    }
-
-    cancel_requested_.store(false);
-
-    std::string prompt = build_prompt(request);
-    LOGI("Generating with timing, prompt length: %zu", prompt.length());
-
-    const auto tokens_list = common_tokenize(context_, prompt, true, true);
-
-    int n_ctx = llama_n_ctx(context_);
-    int prompt_tokens = static_cast<int>(tokens_list.size());
-
-    if (out_prompt_tokens) {
-        *out_prompt_tokens = prompt_tokens;
-    }
-
-    int available_tokens = n_ctx - prompt_tokens - 4;
-
-    if (available_tokens <= 0) {
-        LOGE("Prompt too long: %d tokens, context size: %d", prompt_tokens, n_ctx);
-        return false;
-    }
-
-    int effective_max_tokens = std::min(request.max_tokens, available_tokens);
-    if (effective_max_tokens < request.max_tokens) {
-        LOGI("Capping max_tokens: %d → %d (context=%d, prompt=%d tokens)", request.max_tokens,
-             effective_max_tokens, n_ctx, prompt_tokens);
-    }
-    LOGI("Generation with timing: prompt_tokens=%d, max_tokens=%d, context=%d", prompt_tokens,
-         effective_max_tokens, n_ctx);
-
-    llama_batch batch = llama_batch_init(n_ctx, 0, 1);
-
-    batch.n_tokens = 0;
-    for (size_t i = 0; i < tokens_list.size(); i++) {
-        common_batch_add(batch, tokens_list[i], i, {0}, false);
-    }
-    batch.logits[batch.n_tokens - 1] = true;
-
-    // t2: Record prefill start (before llama_decode for prompt)
-    if (timing_out != nullptr) {
-        timing_out->t2_prefill_start_ms = rac_monotonic_now_ms();
-    }
-
-    if (llama_decode(context_, batch) != 0) {
-        LOGE("llama_decode failed for prompt");
-        if (timing_out != nullptr) {
-            int64_t now = rac_monotonic_now_ms();
-            timing_out->t3_prefill_end_ms = now;
-            timing_out->t5_last_token_ms = now;
-        }
-        llama_batch_free(batch);
-        return false;
-    }
-
-    // t3: Record prefill end (after llama_decode returns)
-    if (timing_out != nullptr) {
-        timing_out->t3_prefill_end_ms = rac_monotonic_now_ms();
-    }
-
-    llama_sampler_reset(sampler_);
-
-    const auto vocab = llama_model_get_vocab(model_);
-
-    static const std::vector<std::string> STOP_SEQUENCES = {
-        "<|im_end|>", "<|eot_id|>", "</s>", "<|end|>", "<|endoftext|>",
-        "\n\nUser:", "\n\nHuman:",
-    };
-
-    static const size_t MAX_STOP_LEN = []{
-        size_t m = 0;
-        for (const auto& s : STOP_SEQUENCES) m = std::max(m, s.size());
-        return m;
-    }();
-
-    std::string stop_window;
-    stop_window.reserve(MAX_STOP_LEN * 2);
-
-    std::string partial_utf8_buffer;
-    partial_utf8_buffer.reserve(8);
-
-    int n_cur = batch.n_tokens;
-    int tokens_generated = 0;
-    bool stop_sequence_hit = false;
-
-    while (tokens_generated < effective_max_tokens && !cancel_requested_.load()) {
-        const llama_token new_token_id = llama_sampler_sample(sampler_, context_, -1);
-
-        llama_sampler_accept(sampler_, new_token_id);
-
-        if (llama_vocab_is_eog(vocab, new_token_id)) {
-            LOGI("End of generation token received");
-            break;
-        }
-
-        const std::string new_token_chars =
-            common_token_to_piece(context_, new_token_id);
-
-        partial_utf8_buffer.append(new_token_chars);
-
-        Utf8State scanner_state;
-        size_t valid_upto = 0;
-        for (size_t i = 0; i < partial_utf8_buffer.size(); ++i) {
-            scanner_state.process(static_cast<uint8_t>(partial_utf8_buffer[i]));
-            if (scanner_state.state == 0) {
-                valid_upto = i + 1;
-            }
-        }
-
-        if (valid_upto > 0) {
-            std::string valid_chunk = partial_utf8_buffer.substr(0, valid_upto);
-            stop_window.append(valid_chunk);
-            partial_utf8_buffer.erase(0, valid_upto);
-
-            size_t found_stop_pos = std::string::npos;
-            for (const auto& stop_seq : STOP_SEQUENCES) {
-                size_t pos = stop_window.find(stop_seq);
-                if (pos != std::string::npos) {
-                    if (found_stop_pos == std::string::npos || pos < found_stop_pos) {
-                        found_stop_pos = pos;
-                    }
-                }
-            }
-
-            if (found_stop_pos != std::string::npos) {
-                LOGI("Stop sequence detected");
-                stop_sequence_hit = true;
-                if (found_stop_pos > 0) {
-                    if (!callback(stop_window.substr(0, found_stop_pos))) {
-                        cancel_requested_.store(true);
-                    }
-                }
-                break;
-            }
-
-            if (stop_window.size() > MAX_STOP_LEN) {
-                size_t safe_len = stop_window.size() - MAX_STOP_LEN;
-                if (!callback(stop_window.substr(0, safe_len))) {
-                    LOGI("Generation cancelled by callback");
-                    cancel_requested_.store(true);
-                    break;
-                }
-                stop_window.erase(0, safe_len);
-            }
-        }
-
-        batch.n_tokens = 0;
-        common_batch_add(batch, new_token_id, n_cur, {0}, true);
-
-        n_cur++;
-        tokens_generated++;
-
-        if (llama_decode(context_, batch) != 0) {
-            LOGE("llama_decode failed during generation");
-            break;
-        }
-    }
-
-    // t5: Record last token time (decode loop exit)
-    if (timing_out != nullptr) {
-        timing_out->t5_last_token_ms = rac_monotonic_now_ms();
-        timing_out->output_tokens = static_cast<int32_t>(tokens_generated);
-    }
-
-    if (!cancel_requested_.load() && !stop_sequence_hit && !stop_window.empty()) {
-        callback(stop_window);
-    }
-
-    llama_memory_clear(llama_get_memory(context_), true);
-
-    llama_batch_free(batch);
-
-    LOGI("Generation with timing complete: %d tokens", tokens_generated);
-    return !cancel_requested_.load();
-}
-
 void LlamaCppTextGeneration::cancel() {
     cancel_requested_.store(true);
     RAC_LOG_INFO("LLM.LlamaCpp","Generation cancel requested");
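
The rac_llm_llamacpp.cpp item in the commit message wraps the call into this streaming path in a try/catch so that a C++ exception never unwinds across the extern "C" boundary (undefined behavior, and a hard failure on WASM). A hedged sketch of that guard; the entrypoint name, its parameters, the logging header, and the specific error code returned are illustrative, not the actual C API:

#include <exception>
#include "rac/core/rac_error.h"  // declares rac_result_t per the header diff above

// Illustrative only: shield the C ABI from C++ exceptions thrown by the backend.
extern "C" rac_result_t rac_llm_generate_with_timing_example(void* backend_handle,
                                                             rac_benchmark_timing_t* timing_out) {
    (void)backend_handle;
    (void)timing_out;
    try {
        // ... call the C++ side here, e.g. the consolidated generate_stream(...)
        //     with timing_out forwarded ...
        return RAC_SUCCESS;
    } catch (const std::exception& e) {
        RAC_LOG_ERROR("LLM.LlamaCpp", "generation threw: %s", e.what());
        return RAC_ERROR_INTERNAL;  // hypothetical error code; the real one may differ
    } catch (...) {
        return RAC_ERROR_INTERNAL;  // never let unknown exceptions cross extern "C"
    }
}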
