Skip to content

Commit bacb277

Browse files
AmanSwar authored and sanchitmonga22 committed
fix: address CodeRabbit review — stop-sequence, format specifiers, race condition, token counts
- Stop-sequence sliding window (llamacpp_backend.cpp): port the Utf8State/stop_window approach from generate_stream to the timing variant generate_stream_with_timing, matching the non-timing variant's behavior exactly. - PRId32 format (rac_benchmark_log.cpp): change %d to PRId32 for all int32_t fields to match the PRId64 convention for int64_t fields. - Mutex guard (rac_benchmark_metrics.cpp): add static std::mutex around the double-buffer write path in rac_benchmark_set_metrics_provider to prevent torn fn/user_data pairs. Reader side remains lock-free. - Actual token counts (llamacpp_backend.cpp + llm_component.cpp): write tokens_generated to timing_out->output_tokens in the backend; read backend-populated prompt_tokens/output_tokens in the component layer instead of overwriting with estimate_tokens() heuristics.
1 parent c2acc46 commit bacb277

4 files changed

Lines changed: 85 additions & 38 deletions

File tree

sdk/runanywhere-commons/src/backends/llamacpp/llamacpp_backend.cpp

Lines changed: 64 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -900,10 +900,27 @@ bool LlamaCppTextGeneration::generate_stream_with_timing(const TextGenerationReq
900900
llama_sampler_reset(sampler_);
901901

902902
const auto vocab = llama_model_get_vocab(model_);
903-
std::string cached_token_chars;
904-
std::string accumulated_text;
903+
904+
static const std::vector<std::string> STOP_SEQUENCES = {
905+
"<|im_end|>", "<|eot_id|>", "</s>", "<|end|>", "<|endoftext|>",
906+
"\n\nUser:", "\n\nHuman:",
907+
};
908+
909+
static const size_t MAX_STOP_LEN = []{
910+
size_t m = 0;
911+
for (const auto& s : STOP_SEQUENCES) m = std::max(m, s.size());
912+
return m;
913+
}();
914+
915+
std::string stop_window;
916+
stop_window.reserve(MAX_STOP_LEN * 2);
917+
918+
std::string partial_utf8_buffer;
919+
partial_utf8_buffer.reserve(8);
920+
905921
int n_cur = batch.n_tokens;
906922
int tokens_generated = 0;
923+
bool stop_sequence_hit = false;
907924

908925
while (tokens_generated < effective_max_tokens && !cancel_requested_.load()) {
909926
const llama_token new_token_id = llama_sampler_sample(sampler_, context_, -1);
@@ -915,41 +932,55 @@ bool LlamaCppTextGeneration::generate_stream_with_timing(const TextGenerationReq
915932
break;
916933
}
917934

918-
auto new_token_chars = common_token_to_piece(context_, new_token_id);
919-
cached_token_chars += new_token_chars;
920-
accumulated_text += new_token_chars;
921-
922-
static const std::vector<std::string> stop_sequences = {
923-
"<|im_end|>",
924-
"<|eot_id|>",
925-
"</s>",
926-
"<|end|>",
927-
"<|endoftext|>",
928-
"\n\nUser:",
929-
"\n\nHuman:",
930-
};
935+
const std::string new_token_chars =
936+
common_token_to_piece(context_, new_token_id);
931937

932-
bool hit_stop_sequence = false;
933-
for (const auto& stop_seq : stop_sequences) {
934-
size_t pos = accumulated_text.find(stop_seq);
935-
if (pos != std::string::npos) {
936-
LOGI("Stop sequence detected: %s", stop_seq.c_str());
937-
hit_stop_sequence = true;
938-
break;
938+
partial_utf8_buffer.append(new_token_chars);
939+
940+
Utf8State scanner_state;
941+
size_t valid_upto = 0;
942+
for (size_t i = 0; i < partial_utf8_buffer.size(); ++i) {
943+
scanner_state.process(static_cast<uint8_t>(partial_utf8_buffer[i]));
944+
if (scanner_state.state == 0) {
945+
valid_upto = i + 1;
939946
}
940947
}
941948

942-
if (hit_stop_sequence) {
943-
break;
944-
}
949+
if (valid_upto > 0) {
950+
std::string valid_chunk = partial_utf8_buffer.substr(0, valid_upto);
951+
stop_window.append(valid_chunk);
952+
partial_utf8_buffer.erase(0, valid_upto);
945953

946-
if (is_valid_utf8(cached_token_chars.c_str())) {
947-
if (!callback(cached_token_chars)) {
948-
LOGI("Generation cancelled by callback");
949-
cancel_requested_.store(true);
954+
size_t found_stop_pos = std::string::npos;
955+
for (const auto& stop_seq : STOP_SEQUENCES) {
956+
size_t pos = stop_window.find(stop_seq);
957+
if (pos != std::string::npos) {
958+
if (found_stop_pos == std::string::npos || pos < found_stop_pos) {
959+
found_stop_pos = pos;
960+
}
961+
}
962+
}
963+
964+
if (found_stop_pos != std::string::npos) {
965+
LOGI("Stop sequence detected");
966+
stop_sequence_hit = true;
967+
if (found_stop_pos > 0) {
968+
if (!callback(stop_window.substr(0, found_stop_pos))) {
969+
cancel_requested_.store(true);
970+
}
971+
}
950972
break;
951973
}
952-
cached_token_chars.clear();
974+
975+
if (stop_window.size() > MAX_STOP_LEN) {
976+
size_t safe_len = stop_window.size() - MAX_STOP_LEN;
977+
if (!callback(stop_window.substr(0, safe_len))) {
978+
LOGI("Generation cancelled by callback");
979+
cancel_requested_.store(true);
980+
break;
981+
}
982+
stop_window.erase(0, safe_len);
983+
}
953984
}
954985

955986
batch.n_tokens = 0;
@@ -967,10 +998,11 @@ bool LlamaCppTextGeneration::generate_stream_with_timing(const TextGenerationReq
967998
// t5: Record last token time (decode loop exit)
968999
if (timing_out != nullptr) {
9691000
timing_out->t5_last_token_ms = rac_monotonic_now_ms();
1001+
timing_out->output_tokens = static_cast<int32_t>(tokens_generated);
9701002
}
9711003

972-
if (!cached_token_chars.empty() && is_valid_utf8(cached_token_chars.c_str())) {
973-
callback(cached_token_chars);
1004+
if (!cancel_requested_.load() && !stop_sequence_hit && !stop_window.empty()) {
1005+
callback(stop_window);
9741006
}
9751007

9761008
llama_memory_clear(llama_get_memory(context_), true);

sdk/runanywhere-commons/src/core/rac_benchmark_log.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -117,7 +117,7 @@ char* rac_benchmark_timing_to_csv(const rac_benchmark_timing_t* timing, rac_bool
117117
char buf[512];
118118
snprintf(buf, sizeof(buf),
119119
"%" PRId64 ",%" PRId64 ",%" PRId64 ",%" PRId64 ",%" PRId64 ",%" PRId64
120-
",%d,%d,%d,%d,%.2f,%.2f,%.2f,%.2f,%.2f",
120+
",%" PRId32 ",%" PRId32 ",%" PRId32 ",%" PRId32 ",%.2f,%.2f,%.2f,%.2f,%.2f",
121121
timing->t0_request_start_ms, timing->t2_prefill_start_ms,
122122
timing->t3_prefill_end_ms, timing->t4_first_token_ms, timing->t5_last_token_ms,
123123
timing->t6_request_end_ms, timing->prompt_tokens, timing->output_tokens,

sdk/runanywhere-commons/src/core/rac_benchmark_metrics.cpp

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111

1212
#include <atomic>
1313
#include <cstring>
14+
#include <mutex>
1415

1516
namespace {
1617

@@ -45,12 +46,15 @@ void rac_benchmark_extended_metrics_init(rac_benchmark_extended_metrics_t* metri
4546

4647
void rac_benchmark_set_metrics_provider(rac_benchmark_metrics_provider_fn provider,
4748
void* user_data) {
49+
static std::mutex write_mutex;
50+
4851
if (provider == nullptr) {
4952
g_provider.store(nullptr, std::memory_order_release);
5053
return;
5154
}
5255

53-
// Use double-buffering to avoid data races on the provider struct
56+
// Serialize the rare registration path to prevent torn fn/user_data pairs
57+
std::lock_guard<std::mutex> lock(write_mutex);
5458
int idx = g_provider_index.load(std::memory_order_relaxed);
5559
int next = 1 - idx;
5660
g_provider_storage[next].fn = provider;

sdk/runanywhere-commons/src/features/llm/llm_component.cpp

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -947,8 +947,20 @@ extern "C" rac_result_t rac_llm_component_generate_stream_with_timing(
947947

948948
rac_llm_result_t final_result = {};
949949
final_result.text = strdup(ctx.full_text.c_str());
950-
final_result.prompt_tokens = ctx.prompt_tokens;
951-
final_result.completion_tokens = estimate_tokens(ctx.full_text.c_str());
950+
951+
// Use actual backend token counts if available, fall back to estimates
952+
if (timing_out != nullptr && timing_out->prompt_tokens > 0) {
953+
final_result.prompt_tokens = timing_out->prompt_tokens;
954+
} else {
955+
final_result.prompt_tokens = ctx.prompt_tokens;
956+
}
957+
958+
if (timing_out != nullptr && timing_out->output_tokens > 0) {
959+
final_result.completion_tokens = timing_out->output_tokens;
960+
} else {
961+
final_result.completion_tokens = estimate_tokens(ctx.full_text.c_str());
962+
}
963+
952964
final_result.total_tokens = final_result.prompt_tokens + final_result.completion_tokens;
953965
final_result.total_time_ms = total_time_ms;
954966

@@ -972,8 +984,7 @@ extern "C" rac_result_t rac_llm_component_generate_stream_with_timing(
972984
// Record t6 (request end) before complete callback
973985
if (timing_out != nullptr) {
974986
timing_out->t6_request_end_ms = rac_monotonic_now_ms();
975-
timing_out->prompt_tokens = final_result.prompt_tokens;
976-
timing_out->output_tokens = final_result.completion_tokens;
987+
// prompt_tokens and output_tokens already set by backend
977988
timing_out->status = RAC_BENCHMARK_STATUS_SUCCESS;
978989
timing_out->error_code = RAC_SUCCESS;
979990
}

0 commit comments

Comments (0)