
Commit 5907ce5

fix(commons): address PR #469 review — 18 bot comments (greptile + coderabbit)
Quick fixes:
- rac_llm_llamacpp.cpp: copy system_prompt in timed C-API; add try/catch around generate_stream_with_timing to prevent C++ exceptions from unwinding across extern "C" boundary (UB on WASM).
- rac_benchmark_stats.cpp: use Bessel-corrected stddev (÷ N−1) for sample variability, not population stddev.
- llm_component.cpp: use component->actual_framework (not preferred) in timing telemetry; null-check strdup before marking benchmark successful; backfill timing_out->prompt_tokens/output_tokens when backend falls back to plain streaming so decode-TPS stats are correct.
- runanywhere_commons_jni.cpp (timing entrypoint): parse configJson for max_tokens/temperature/top_p/system_prompt; try onToken([B)Z before (Ljava/lang/String;)Z and propagate onTokenExpectsBytes into ctx; wait on cv for completion (10 min timeout) before DeleteGlobalRef to close a use-after-free race.

Structural:
- llamacpp_backend.{h,cpp}: consolidate generate_stream and generate_stream_with_timing into one function with optional rac_benchmark_timing_t*. Gates all timestamp writes on timing_out. Eliminates ~185 lines of duplicated code and restores three already-drifted fixes: KV-cache clear + decode_failed_ reset before each request (#356 regression), sampler chain rebuild on param change, and chunked prefill via batch_size_ (avoids n_ctx allocation spike).
- rac_benchmark_log.{h,cpp}: migrate timing_to_json / timing_to_csv / timing_log from char*/void to rac_result_t + out-params, matching the project-wide include/rac/ convention.
- rac_benchmark_stats.{h,cpp}: extend rac_benchmark_summary_t with full mean/stddev/min/max aggregates for prefill, decode-TPS, and E2E — not just TTFT.
- rac_benchmark_metrics.cpp: replace two-slot atomic provider pointer with mutex-guarded std::shared_ptr so teardown is lifetime-safe against in-flight rac_benchmark_capture_metrics calls.

Tests: 32/32 rac_benchmark_tests pass (+3 new coverage tests).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent bacb277 commit 5907ce5
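
The rac_benchmark_stats.cpp item above replaces population standard deviation with the sample (Bessel-corrected) form. A minimal C++ sketch of that computation; the helper name and types are illustrative, not the repository's actual code:

#include <cmath>
#include <vector>

// Illustrative only: sample standard deviation with Bessel's correction,
// dividing by N - 1 instead of N. The helper name is hypothetical.
static double sample_stddev(const std::vector<double>& samples) {
    const size_t n = samples.size();
    if (n < 2) return 0.0;  // variability is undefined for fewer than two samples
    double mean = 0.0;
    for (double x : samples) mean += x;
    mean /= static_cast<double>(n);
    double sum_sq = 0.0;
    for (double x : samples) sum_sq += (x - mean) * (x - mean);
    return std::sqrt(sum_sq / static_cast<double>(n - 1));  // Bessel-corrected
}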

12 files changed

Lines changed: 379 additions & 283 deletions
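
For the rac_benchmark_metrics.cpp change, the pattern described is a mutex-guarded std::shared_ptr slot in place of the atomic provider pointer, so an in-flight rac_benchmark_capture_metrics call keeps its provider alive across a concurrent teardown. A hedged sketch of that pattern, with hypothetical type and function names rather than the actual file's:

#include <memory>
#include <mutex>

// Hypothetical stand-in for the real metrics provider type.
struct MetricsProvider {
    void capture() { /* ... */ }
};

static std::mutex g_provider_mutex;
static std::shared_ptr<MetricsProvider> g_provider;

// Teardown / replacement: the old provider is destroyed only after the last
// reader drops its reference, never out from under an in-flight call.
void set_provider(std::shared_ptr<MetricsProvider> next) {
    std::lock_guard<std::mutex> lock(g_provider_mutex);
    g_provider = std::move(next);
}

void capture_metrics() {
    std::shared_ptr<MetricsProvider> local;
    {
        std::lock_guard<std::mutex> lock(g_provider_mutex);
        local = g_provider;  // take a strong reference under the lock
    }
    if (local) {
        local->capture();    // safe even if set_provider(nullptr) runs concurrently
    }
}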


sdk/runanywhere-commons/include/rac/core/rac_benchmark_log.h

Lines changed: 47 additions & 15 deletions
@@ -5,18 +5,26 @@
  * Provides functions to serialize benchmark timing data as JSON or CSV,
  * and to log benchmark results via the RAC logging system.
  *
+ * All functions return rac_result_t for consistent error handling.
+ * Serialization functions write a heap-allocated string to an out-parameter
+ * (caller must free() on success).
+ *
  * Usage:
  *   // Log timing summary
  *   rac_benchmark_timing_log(&timing, "inference_run_1");
  *
  *   // Export as JSON
- *   char* json = rac_benchmark_timing_to_json(&timing);
- *   // ... use json ...
- *   free(json);
+ *   char* json = NULL;
+ *   if (rac_benchmark_timing_to_json(&timing, &json) == RAC_SUCCESS) {
+ *       // ... use json ...
+ *       free(json);
+ *   }
  *
  *   // Export as CSV
- *   char* header = rac_benchmark_timing_to_csv(NULL, RAC_TRUE);
- *   char* row = rac_benchmark_timing_to_csv(&timing, RAC_FALSE);
+ *   char* header = NULL;
+ *   char* row = NULL;
+ *   rac_benchmark_timing_to_csv(NULL, RAC_TRUE, &header);
+ *   rac_benchmark_timing_to_csv(&timing, RAC_FALSE, &row);
  *   free(header);
  *   free(row);
  */
@@ -25,6 +33,7 @@
 #define RAC_BENCHMARK_LOG_H
 
 #include "rac/core/rac_benchmark.h"
+#include "rac/core/rac_error.h"
 #include "rac/core/rac_types.h"
 
 #ifdef __cplusplus
@@ -45,10 +54,17 @@ extern "C" {
  * - e2e_ms: End-to-end latency (t6 - t0)
  * - decode_tps: Decode throughput (output_tokens / decode_ms * 1000)
  *
- * @param timing Timing struct to serialize (NULL returns NULL)
- * @return Heap-allocated JSON string (caller must free()), or NULL on error
+ * On success, *out_json is set to a heap-allocated string that the caller
+ * must release via free(). On failure, *out_json is set to NULL.
+ *
+ * @param timing Timing struct to serialize (must not be NULL)
+ * @param out_json Output pointer that receives the JSON string (must not be NULL)
+ * @return RAC_SUCCESS on success,
+ *         RAC_ERROR_NULL_POINTER if timing or out_json is NULL,
+ *         RAC_ERROR_OUT_OF_MEMORY if allocation fails
  */
-RAC_API char* rac_benchmark_timing_to_json(const rac_benchmark_timing_t* timing);
+RAC_API rac_result_t rac_benchmark_timing_to_json(const rac_benchmark_timing_t* timing,
+                                                  char** out_json);
 
 // =============================================================================
 // CSV SERIALIZATION
@@ -57,11 +73,24 @@ RAC_API char* rac_benchmark_timing_to_json(const rac_benchmark_timing_t* timing)
 /**
  * Serializes a benchmark timing struct as a CSV row.
  *
- * @param timing Timing struct to serialize (ignored when header is RAC_TRUE)
- * @param header If RAC_TRUE, returns the CSV header row instead of data
- * @return Heap-allocated CSV string (caller must free()), or NULL on error
+ * When header is RAC_TRUE, emits the CSV header row (timing may be NULL).
+ * When header is RAC_FALSE, emits a data row (timing must not be NULL).
+ *
+ * On success, *out_csv is set to a heap-allocated string that the caller
+ * must release via free(). On failure, *out_csv is set to NULL.
+ *
+ * @param timing Timing struct to serialize (ignored when header is RAC_TRUE,
+ *               otherwise must not be NULL)
+ * @param header If RAC_TRUE, emits the CSV header row instead of data
+ * @param out_csv Output pointer that receives the CSV string (must not be NULL)
+ * @return RAC_SUCCESS on success,
+ *         RAC_ERROR_NULL_POINTER if out_csv is NULL, or if header is RAC_FALSE
+ *         and timing is NULL,
+ *         RAC_ERROR_OUT_OF_MEMORY if allocation fails
  */
-RAC_API char* rac_benchmark_timing_to_csv(const rac_benchmark_timing_t* timing, rac_bool_t header);
+RAC_API rac_result_t rac_benchmark_timing_to_csv(const rac_benchmark_timing_t* timing,
+                                                 rac_bool_t header,
+                                                 char** out_csv);
 
 // =============================================================================
 // LOGGING
@@ -75,10 +104,13 @@ RAC_API char* rac_benchmark_timing_to_csv(const rac_benchmark_timing_t* timing,
  * - Token counts and throughput
  * - Status and error code
  *
- * @param timing Timing struct to log (NULL is a no-op)
- * @param label Optional label for this benchmark run (can be NULL)
+ * @param timing Timing struct to log (must not be NULL)
+ * @param label Optional label for this benchmark run (may be NULL)
+ * @return RAC_SUCCESS on success,
+ *         RAC_ERROR_NULL_POINTER if timing is NULL
  */
-RAC_API void rac_benchmark_timing_log(const rac_benchmark_timing_t* timing, const char* label);
+RAC_API rac_result_t rac_benchmark_timing_log(const rac_benchmark_timing_t* timing,
+                                              const char* label);
 
 #ifdef __cplusplus
 }
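
A rough sketch of what an implementation conforming to this contract looks like on the rac_benchmark_log.cpp side: validate arguments, leave *out_json NULL on every failure path, and report allocation failure through the result code. This is illustrative pseudocode against the documented contract, not the repository's implementation; the two fields serialized here are a reduced subset and the error-code choice for formatting failure is an assumption.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

rac_result_t rac_benchmark_timing_to_json(const rac_benchmark_timing_t* timing,
                                          char** out_json) {
    if (out_json == NULL) return RAC_ERROR_NULL_POINTER;
    *out_json = NULL;  // defined state on every failure path
    if (timing == NULL) return RAC_ERROR_NULL_POINTER;

    char buf[128];
    int n = snprintf(buf, sizeof(buf), "{\"prompt_tokens\":%d,\"output_tokens\":%d}",
                     (int)timing->prompt_tokens, (int)timing->output_tokens);
    if (n < 0 || n >= (int)sizeof(buf)) return RAC_ERROR_OUT_OF_MEMORY;  // error code here is an assumption

    char* json = (char*)malloc((size_t)n + 1);
    if (json == NULL) return RAC_ERROR_OUT_OF_MEMORY;
    memcpy(json, buf, (size_t)n + 1);
    *out_json = json;  // caller releases with free()
    return RAC_SUCCESS;
}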

sdk/runanywhere-commons/include/rac/core/rac_benchmark_stats.h

Lines changed: 12 additions & 0 deletions
@@ -68,16 +68,28 @@ typedef struct rac_benchmark_summary {
     double prefill_p50_ms;
     double prefill_p95_ms;
     double prefill_p99_ms;
+    double prefill_min_ms;
+    double prefill_max_ms;
+    double prefill_mean_ms;
+    double prefill_stddev_ms;
 
     // Decode throughput stats (output_tokens / (t5 - t3) * 1000)
     double decode_tps_p50;
     double decode_tps_p95;
     double decode_tps_p99;
+    double decode_tps_min;
+    double decode_tps_max;
+    double decode_tps_mean;
+    double decode_tps_stddev;
 
     // End-to-end latency stats (t6 - t0)
     double e2e_p50_ms;
     double e2e_p95_ms;
     double e2e_p99_ms;
+    double e2e_min_ms;
+    double e2e_max_ms;
+    double e2e_mean_ms;
+    double e2e_stddev_ms;
 
     /** Number of observations where E2E > mean + 2*stddev */
     int32_t outlier_count;
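
The comment on the decode-throughput block above fixes the per-run formula: output_tokens / (t5 - t3) * 1000. A small sketch of deriving one such sample from the timing struct; the zero-window guard is an added assumption, not part of the source:

// Illustrative only: one decode-TPS observation from a completed run's timing.
static double decode_tps_sample(const rac_benchmark_timing_t& t) {
    const double decode_ms =
        static_cast<double>(t.t5_last_token_ms - t.t3_prefill_end_ms);
    if (decode_ms <= 0.0) return 0.0;  // guard against an empty decode window (assumption)
    return static_cast<double>(t.output_tokens) / decode_ms * 1000.0;  // tokens per second
}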

sdk/runanywhere-commons/src/backends/llamacpp/llamacpp_backend.cpp

Lines changed: 29 additions & 186 deletions
@@ -598,7 +598,8 @@ TextGenerationResult LlamaCppTextGeneration::generate(const TextGenerationReques
 
 bool LlamaCppTextGeneration::generate_stream(const TextGenerationRequest& request,
                                              TextStreamCallback callback,
-                                             int* out_prompt_tokens) {
+                                             int* out_prompt_tokens,
+                                             rac_benchmark_timing_t* timing_out) {
     std::lock_guard<std::mutex> lock(mutex_);
 
     if (!is_ready()) {
@@ -632,6 +633,12 @@ bool LlamaCppTextGeneration::generate_stream(const TextGenerationRequest& reques
 
     if (available_tokens <= 0) {
         RAC_LOG_ERROR("LLM.LlamaCpp","Prompt too long: %d tokens, context size: %d", prompt_tokens, n_ctx);
+        if (timing_out != nullptr) {
+            int64_t now = rac_monotonic_now_ms();
+            timing_out->t2_prefill_start_ms = now;
+            timing_out->t3_prefill_end_ms = now;
+            timing_out->t5_last_token_ms = now;
+        }
         return false;
     }
 
@@ -643,6 +650,11 @@ bool LlamaCppTextGeneration::generate_stream(const TextGenerationRequest& reques
     RAC_LOG_INFO("LLM.LlamaCpp", "generate_stream: processing %d prompt tokens in chunks of %d", prompt_tokens, n_batch);
     llama_batch batch = llama_batch_init(n_batch, 0, 1);
 
+    // t2: Record prefill start (before the first llama_decode on the prompt chunks)
+    if (timing_out != nullptr) {
+        timing_out->t2_prefill_start_ms = rac_monotonic_now_ms();
+    }
+
     for (int chunk_start = 0; chunk_start < prompt_tokens; chunk_start += n_batch) {
         batch.n_tokens = 0;
         int chunk_end = std::min(chunk_start + n_batch, prompt_tokens);
@@ -655,12 +667,22 @@ bool LlamaCppTextGeneration::generate_stream(const TextGenerationRequest& reques
 
         if (llama_decode(context_, batch) != 0) {
             RAC_LOG_ERROR("LLM.LlamaCpp", "llama_decode failed for prompt chunk [%d..%d)", chunk_start, chunk_end);
+            if (timing_out != nullptr) {
+                int64_t now = rac_monotonic_now_ms();
+                timing_out->t3_prefill_end_ms = now;
+                timing_out->t5_last_token_ms = now;
+            }
             llama_batch_free(batch);
             return false;
         }
     }
     RAC_LOG_INFO("LLM.LlamaCpp", "generate_stream: prompt decoded successfully");
 
+    // t3: Record prefill end (after the prompt prefill loop completes)
+    if (timing_out != nullptr) {
+        timing_out->t3_prefill_end_ms = rac_monotonic_now_ms();
+    }
+
     // Configure sampler with request parameters — skip rebuild if params unchanged
     {
         const bool params_match = sampler_ &&
@@ -809,6 +831,12 @@ bool LlamaCppTextGeneration::generate_stream(const TextGenerationRequest& reques
         }
     }
 
+    // t5: Record last token time (decode loop exit)
+    if (timing_out != nullptr) {
+        timing_out->t5_last_token_ms = rac_monotonic_now_ms();
+        timing_out->output_tokens = static_cast<int32_t>(tokens_generated);
+    }
+
     // Flush any remaining partial UTF-8 bytes (e.g. trailing multi-byte char at end of generation)
     if (!cancel_requested_.load() && !stop_sequence_hit && !partial_utf8_buffer.empty()) {
         stop_window.append(partial_utf8_buffer);
@@ -828,191 +856,6 @@ bool LlamaCppTextGeneration::generate_stream(const TextGenerationRequest& reques
     return !cancel_requested_.load();
 }
 
-bool LlamaCppTextGeneration::generate_stream_with_timing(const TextGenerationRequest& request,
-                                                         TextStreamCallback callback,
-                                                         int* out_prompt_tokens,
-                                                         rac_benchmark_timing_t* timing_out) {
-    std::lock_guard<std::mutex> lock(mutex_);
-
-    if (!is_ready()) {
-        LOGE("Model not ready for generation");
-        return false;
-    }
-
-    cancel_requested_.store(false);
-
-    std::string prompt = build_prompt(request);
-    LOGI("Generating with timing, prompt length: %zu", prompt.length());
-
-    const auto tokens_list = common_tokenize(context_, prompt, true, true);
-
-    int n_ctx = llama_n_ctx(context_);
-    int prompt_tokens = static_cast<int>(tokens_list.size());
-
-    if (out_prompt_tokens) {
-        *out_prompt_tokens = prompt_tokens;
-    }
-
-    int available_tokens = n_ctx - prompt_tokens - 4;
-
-    if (available_tokens <= 0) {
-        LOGE("Prompt too long: %d tokens, context size: %d", prompt_tokens, n_ctx);
-        return false;
-    }
-
-    int effective_max_tokens = std::min(request.max_tokens, available_tokens);
-    if (effective_max_tokens < request.max_tokens) {
-        LOGI("Capping max_tokens: %d → %d (context=%d, prompt=%d tokens)", request.max_tokens,
-             effective_max_tokens, n_ctx, prompt_tokens);
-    }
-    LOGI("Generation with timing: prompt_tokens=%d, max_tokens=%d, context=%d", prompt_tokens,
-         effective_max_tokens, n_ctx);
-
-    llama_batch batch = llama_batch_init(n_ctx, 0, 1);
-
-    batch.n_tokens = 0;
-    for (size_t i = 0; i < tokens_list.size(); i++) {
-        common_batch_add(batch, tokens_list[i], i, {0}, false);
-    }
-    batch.logits[batch.n_tokens - 1] = true;
-
-    // t2: Record prefill start (before llama_decode for prompt)
-    if (timing_out != nullptr) {
-        timing_out->t2_prefill_start_ms = rac_monotonic_now_ms();
-    }
-
-    if (llama_decode(context_, batch) != 0) {
-        LOGE("llama_decode failed for prompt");
-        if (timing_out != nullptr) {
-            int64_t now = rac_monotonic_now_ms();
-            timing_out->t3_prefill_end_ms = now;
-            timing_out->t5_last_token_ms = now;
-        }
-        llama_batch_free(batch);
-        return false;
-    }
-
-    // t3: Record prefill end (after llama_decode returns)
-    if (timing_out != nullptr) {
-        timing_out->t3_prefill_end_ms = rac_monotonic_now_ms();
-    }
-
-    llama_sampler_reset(sampler_);
-
-    const auto vocab = llama_model_get_vocab(model_);
-
-    static const std::vector<std::string> STOP_SEQUENCES = {
-        "<|im_end|>", "<|eot_id|>", "</s>", "<|end|>", "<|endoftext|>",
-        "\n\nUser:", "\n\nHuman:",
-    };
-
-    static const size_t MAX_STOP_LEN = []{
-        size_t m = 0;
-        for (const auto& s : STOP_SEQUENCES) m = std::max(m, s.size());
-        return m;
-    }();
-
-    std::string stop_window;
-    stop_window.reserve(MAX_STOP_LEN * 2);
-
-    std::string partial_utf8_buffer;
-    partial_utf8_buffer.reserve(8);
-
-    int n_cur = batch.n_tokens;
-    int tokens_generated = 0;
-    bool stop_sequence_hit = false;
-
-    while (tokens_generated < effective_max_tokens && !cancel_requested_.load()) {
-        const llama_token new_token_id = llama_sampler_sample(sampler_, context_, -1);
-
-        llama_sampler_accept(sampler_, new_token_id);
-
-        if (llama_vocab_is_eog(vocab, new_token_id)) {
-            LOGI("End of generation token received");
-            break;
-        }
-
-        const std::string new_token_chars =
-            common_token_to_piece(context_, new_token_id);
-
-        partial_utf8_buffer.append(new_token_chars);
-
-        Utf8State scanner_state;
-        size_t valid_upto = 0;
-        for (size_t i = 0; i < partial_utf8_buffer.size(); ++i) {
-            scanner_state.process(static_cast<uint8_t>(partial_utf8_buffer[i]));
-            if (scanner_state.state == 0) {
-                valid_upto = i + 1;
-            }
-        }
-
-        if (valid_upto > 0) {
-            std::string valid_chunk = partial_utf8_buffer.substr(0, valid_upto);
-            stop_window.append(valid_chunk);
-            partial_utf8_buffer.erase(0, valid_upto);
-
-            size_t found_stop_pos = std::string::npos;
-            for (const auto& stop_seq : STOP_SEQUENCES) {
-                size_t pos = stop_window.find(stop_seq);
-                if (pos != std::string::npos) {
-                    if (found_stop_pos == std::string::npos || pos < found_stop_pos) {
-                        found_stop_pos = pos;
-                    }
-                }
-            }
-
-            if (found_stop_pos != std::string::npos) {
-                LOGI("Stop sequence detected");
-                stop_sequence_hit = true;
-                if (found_stop_pos > 0) {
-                    if (!callback(stop_window.substr(0, found_stop_pos))) {
-                        cancel_requested_.store(true);
-                    }
-                }
-                break;
-            }
-
-            if (stop_window.size() > MAX_STOP_LEN) {
-                size_t safe_len = stop_window.size() - MAX_STOP_LEN;
-                if (!callback(stop_window.substr(0, safe_len))) {
-                    LOGI("Generation cancelled by callback");
-                    cancel_requested_.store(true);
-                    break;
-                }
-                stop_window.erase(0, safe_len);
-            }
-        }
-
-        batch.n_tokens = 0;
-        common_batch_add(batch, new_token_id, n_cur, {0}, true);
-
-        n_cur++;
-        tokens_generated++;
-
-        if (llama_decode(context_, batch) != 0) {
-            LOGE("llama_decode failed during generation");
-            break;
-        }
-    }
-
-    // t5: Record last token time (decode loop exit)
-    if (timing_out != nullptr) {
-        timing_out->t5_last_token_ms = rac_monotonic_now_ms();
-        timing_out->output_tokens = static_cast<int32_t>(tokens_generated);
-    }
-
-    if (!cancel_requested_.load() && !stop_sequence_hit && !stop_window.empty()) {
-        callback(stop_window);
-    }
-
-    llama_memory_clear(llama_get_memory(context_), true);
-
-    llama_batch_free(batch);
-
-    LOGI("Generation with timing complete: %d tokens", tokens_generated);
-    return !cancel_requested_.load();
-}
-
 void LlamaCppTextGeneration::cancel() {
     cancel_requested_.store(true);
     RAC_LOG_INFO("LLM.LlamaCpp","Generation cancel requested");
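
The rac_llm_llamacpp.cpp item in the commit message wraps the call into this streaming path in a try/catch so that a C++ exception never unwinds across the extern "C" boundary (undefined behavior, and a hard failure on WASM). A hedged sketch of that guard; the entrypoint name, its parameters, the logging header, and the specific error code returned are illustrative, not the actual C API:

#include <exception>
#include "rac/core/rac_error.h"  // declares rac_result_t per the header diff above

// Illustrative only: shield the C ABI from C++ exceptions thrown by the backend.
extern "C" rac_result_t rac_llm_generate_with_timing_example(void* backend_handle,
                                                             rac_benchmark_timing_t* timing_out) {
    (void)backend_handle;
    (void)timing_out;
    try {
        // ... call the C++ side here, e.g. the consolidated generate_stream(...)
        //     with timing_out forwarded ...
        return RAC_SUCCESS;
    } catch (const std::exception& e) {
        RAC_LOG_ERROR("LLM.LlamaCpp", "generation threw: %s", e.what());
        return RAC_ERROR_INTERNAL;  // hypothetical error code; the real one may differ
    } catch (...) {
        return RAC_ERROR_INTERNAL;  // never let unknown exceptions cross extern "C"
    }
}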
